@@ -11,3 +11,84 @@ pub fn sentencepiece_to_text(tokens: &[&str]) -> String {
1111 // Clean up contraction spacing (e.g. "can 't" → "can't")
1212 text. replace ( " '" , "'" )
1313}
14+
/// Parse a byte-level BPE token like `<0xE5>` into its byte value.
///
/// SentencePiece tokenizers emit these for characters outside the base vocabulary
/// (e.g. CJK characters are split into individual UTF-8 bytes).
///
/// Returns `Some(byte)` only for tokens of the exact form `<0x` + two hex
/// digits + `>`; anything else (wrong length, missing delimiters, non-hex
/// digits) yields `None`.
pub fn parse_byte_token(token: &str) -> Option<u8> {
    // strip_prefix/strip_suffix avoid manual byte-index slicing; the explicit
    // two-digit check keeps the contract identical to a `len() == 6` test,
    // so e.g. "<0xE>" and "<0x0E5>" are rejected.
    let hex = token.strip_prefix("<0x")?.strip_suffix('>')?;
    if hex.len() == 2 {
        u8::from_str_radix(hex, 16).ok()
    } else {
        None
    }
}
27+
#[cfg(test)]
mod tests {
    use super::*;

    /// Decode a token stream the way Cohere's `decode_ids` does: a byte token
    /// contributes one raw byte, any other token contributes its UTF-8 bytes.
    fn decode_tokens(tokens: &[&str]) -> String {
        let mut bytes: Vec<u8> = Vec::new();
        for token in tokens {
            match parse_byte_token(token) {
                Some(byte_val) => bytes.push(byte_val),
                None => bytes.extend(token.as_bytes()),
            }
        }
        String::from_utf8_lossy(&bytes).into_owned()
    }

    #[test]
    fn test_parse_byte_token_valid() {
        let cases: [(&str, u8); 5] = [
            ("<0xE5>", 0xE5),
            ("<0xB0>", 0xB0),
            ("<0xBC>", 0xBC),
            ("<0x00>", 0x00),
            ("<0xFF>", 0xFF),
        ];
        for (token, expected) in cases {
            assert_eq!(parse_byte_token(token), Some(expected));
        }
    }

    #[test]
    fn test_parse_byte_token_invalid() {
        // Covers plain text, special tokens, empty hex, a missing '>',
        // and non-hex digits.
        for token in ["hello", "<|en|>", "<0x>", "<0xEE", "<0xGG>"] {
            assert_eq!(parse_byte_token(token), None);
        }
    }

    #[test]
    fn test_byte_tokens_reassemble_chinese() {
        // 尼 = E5 B0 BC, 豪 = E8 B1 AA
        // Simulates what Cohere's decode_ids does with byte tokens.
        let text = decode_tokens(&["<0xE5>", "<0xB0>", "<0xBC>", "豪", "。"]);
        assert_eq!(text, "尼豪。");
    }

    #[test]
    fn test_byte_tokens_full_cjk_sequence() {
        // 你好 = E4 BD A0 E5 A5 BD
        let tokens = ["<0xE4>", "<0xBD>", "<0xA0>", "<0xE5>", "<0xA5>", "<0xBD>"];
        assert_eq!(decode_tokens(&tokens), "你好");
    }

    #[test]
    fn test_sentencepiece_to_text_basic() {
        let tokens = vec![" Hello", " world"];
        assert_eq!(sentencepiece_to_text(&tokens), "Hello world");
    }

    #[test]
    fn test_sentencepiece_to_text_contractions() {
        let tokens = vec![" can", " 't"];
        assert_eq!(sentencepiece_to_text(&tokens), "can't");
    }
}