Skip to content

Commit f6e23eb

Browse files
authored
fix bug in sentence piece (#77)
* fix bug in sentence piece

* fmt
1 parent 7eadc74 commit f6e23eb

5 files changed

Lines changed: 139 additions & 16 deletions

File tree

src/decode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,5 @@ pub mod tokens;
55

66
pub use ctc::{ctc_greedy_decode, CtcDecoderResult};
77
pub use greedy::GreedyDecoder;
8-
pub use sentencepiece::sentencepiece_to_text;
8+
pub use sentencepiece::{parse_byte_token, sentencepiece_to_text};
99
pub use tokens::{load_vocab, SymbolTable};

src/decode/sentencepiece.rs

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,84 @@ pub fn sentencepiece_to_text(tokens: &[&str]) -> String {
1111
// Clean up contraction spacing (e.g. "can 't" → "can't")
1212
text.replace(" '", "'")
1313
}
14+
15+
/// Parse a byte-level BPE token like `<0xE5>` into its byte value.
///
/// SentencePiece tokenizers emit these for characters outside the base vocabulary
/// (e.g. CJK characters are split into individual UTF-8 bytes).
///
/// Returns `None` for anything that is not exactly `<0x` + two hex digits + `>`.
pub fn parse_byte_token(token: &str) -> Option<u8> {
    // Peel off the fixed framing; either strip failing means "not a byte token".
    let hex = token.strip_prefix("<0x")?.strip_suffix('>')?;
    // Exactly two hex digits — rejects "<0x>", "<0xAAA>", etc.
    if hex.len() != 2 {
        return None;
    }
    u8::from_str_radix(hex, 16).ok()
}
27+
28+
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolve a token stream into text: `<0xNN>` byte tokens contribute their
    /// raw byte, every other token contributes its UTF-8 bytes directly.
    /// Mirrors the byte-reassembly loop the model decoders perform, so the two
    /// CJK tests below exercise exactly that logic without duplicating it.
    fn reassemble(tokens: &[&str]) -> String {
        let mut bytes: Vec<u8> = Vec::new();
        for token in tokens {
            if let Some(byte_val) = parse_byte_token(token) {
                bytes.push(byte_val);
            } else {
                bytes.extend(token.as_bytes());
            }
        }
        String::from_utf8_lossy(&bytes).into_owned()
    }

    #[test]
    fn test_parse_byte_token_valid() {
        assert_eq!(parse_byte_token("<0xE5>"), Some(0xE5));
        assert_eq!(parse_byte_token("<0xB0>"), Some(0xB0));
        assert_eq!(parse_byte_token("<0xBC>"), Some(0xBC));
        // Boundary values of the byte range.
        assert_eq!(parse_byte_token("<0x00>"), Some(0x00));
        assert_eq!(parse_byte_token("<0xFF>"), Some(0xFF));
    }

    #[test]
    fn test_parse_byte_token_invalid() {
        assert_eq!(parse_byte_token("hello"), None);
        assert_eq!(parse_byte_token("<|en|>"), None);
        assert_eq!(parse_byte_token("<0x>"), None);
        assert_eq!(parse_byte_token("<0xEE"), None); // missing >
        assert_eq!(parse_byte_token("<0xGG>"), None); // invalid hex
    }

    #[test]
    fn test_byte_tokens_reassemble_chinese() {
        // 尼 = E5 B0 BC; the non-byte tokens pass through unchanged.
        let tokens = ["<0xE5>", "<0xB0>", "<0xBC>", "豪", "。"];
        assert_eq!(reassemble(&tokens), "尼豪。");
    }

    #[test]
    fn test_byte_tokens_full_cjk_sequence() {
        // 你好 = E4 BD A0 E5 A5 BD — every token is a byte token.
        let tokens = ["<0xE4>", "<0xBD>", "<0xA0>", "<0xE5>", "<0xA5>", "<0xBD>"];
        assert_eq!(reassemble(&tokens), "你好");
    }

    #[test]
    fn test_sentencepiece_to_text_basic() {
        // NOTE(review): the leading char may be U+2581 (▁) lost in extraction
        // of this snippet — confirm against the repository source.
        let tokens = vec![" Hello", " world"];
        assert_eq!(sentencepiece_to_text(&tokens), "Hello world");
    }

    #[test]
    fn test_sentencepiece_to_text_contractions() {
        // Contraction spacing is collapsed: "can 't" → "can't".
        let tokens = vec![" can", " 't"];
        assert_eq!(sentencepiece_to_text(&tokens), "can't");
    }
}

src/onnx/cohere/mod.rs

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use ort::session::SessionInputValue;
1010
use ort::value::DynValue;
1111

1212
use super::{session, Quantization};
13-
use crate::decode::{load_vocab, sentencepiece_to_text, GreedyDecoder};
13+
use crate::decode::{load_vocab, parse_byte_token, GreedyDecoder};
1414
use crate::{
1515
ModelCapabilities, SpeechModel, TranscribeError, TranscribeOptions, TranscriptionResult,
1616
};
@@ -294,7 +294,7 @@ impl CohereModel {
294294
}
295295

296296
fn decode_ids(&self, token_ids: &[i64]) -> String {
297-
let pieces = token_ids
297+
let tokens: Vec<&str> = token_ids
298298
.iter()
299299
.filter_map(|&id| self.vocab.get(id as usize))
300300
.filter(|token| {
@@ -304,9 +304,24 @@ impl CohereModel {
304304
&& token.as_str() != "<pad>"
305305
})
306306
.map(|token| token.as_str())
307-
.collect::<Vec<_>>();
307+
.collect();
308+
309+
// Handle byte-level BPE tokens (<0xNN>) by collecting into a byte buffer.
310+
// SentencePiece tokenizers emit these for characters outside the base vocabulary
311+
// (e.g. CJK characters are split into individual UTF-8 bytes).
312+
let mut bytes: Vec<u8> = Vec::new();
313+
for token in &tokens {
314+
if let Some(byte_val) = parse_byte_token(token) {
315+
bytes.push(byte_val);
316+
} else {
317+
bytes.extend(token.as_bytes());
318+
}
319+
}
308320

309-
sentencepiece_to_text(&pieces)
321+
let text = String::from_utf8_lossy(&bytes);
322+
let text = text.trim();
323+
// Clean up contraction spacing (e.g. "can 't" → "can't")
324+
text.replace(" '", "'")
310325
}
311326

312327
fn decoder_input_name(&self, preferred: &str, fallbacks: &[&str]) -> String {

src/onnx/moonshine/model.rs

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use std::fs::File;
77
use std::io::BufReader;
88
use std::path::Path;
99

10-
use crate::decode::GreedyDecoder;
10+
use crate::decode::{parse_byte_token, GreedyDecoder};
1111
use crate::onnx::session;
1212
use crate::onnx::Quantization;
1313
use crate::{
@@ -417,7 +417,7 @@ impl MoonshineTokenizer {
417417
let mut bytes: Vec<u8> = Vec::new();
418418

419419
for token in &tokens {
420-
if let Some(byte_val) = Self::parse_byte_token(token) {
420+
if let Some(byte_val) = parse_byte_token(token) {
421421
bytes.push(byte_val);
422422
} else {
423423
let decoded = token.replace('\u{2581}', " ");
@@ -430,13 +430,4 @@ impl MoonshineTokenizer {
430430

431431
Ok(text.to_string())
432432
}
433-
434-
fn parse_byte_token(token: &str) -> Option<u8> {
435-
if token.starts_with("<0x") && token.ends_with('>') && token.len() == 6 {
436-
let hex = &token[3..5];
437-
u8::from_str_radix(hex, 16).ok()
438-
} else {
439-
None
440-
}
441-
}
442433
}

tests/cohere.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,3 +65,39 @@ fn test_cohere_german() {
6565
result.text
6666
);
6767
}
68+
69+
#[test]
fn test_cohere_chinese() {
    // Integration test: requires a local model and sample; skips otherwise.
    let model_path = PathBuf::from("models/cohere-int4");
    let audio_path = PathBuf::from("samples/chinese.wav");

    if !common::require_paths(&[&model_path, &audio_path]) {
        return;
    }

    let mut model =
        CohereModel::load(&model_path, &Quantization::Int4).expect("Failed to load Cohere model");

    let options = transcribe_rs::TranscribeOptions {
        language: Some("zh".into()),
        ..Default::default()
    };
    let result = model
        .transcribe_file(&audio_path, &options)
        .expect("Failed to transcribe Chinese audio with Cohere model");

    println!("Chinese transcription: {}", result.text);

    assert!(
        !result.text.trim().is_empty(),
        "Cohere Chinese transcription should not be empty"
    );

    // Output should contain actual Chinese characters, not byte tokens like <0xE5>
    assert!(
        !result.text.contains("<0x"),
        "Chinese transcription should not contain raw byte tokens, got: '{}'",
        result.text
    );
}

0 commit comments

Comments
 (0)