implement review comments

marinegor · marinegor · commit 3f0d4fe5fa9e · 2026-03-05T13:24:16.000+01:00
diff --git a/crates/bpe/benchmarks/performance.rs b/crates/bpe/benchmarks/performance.rs
@@ -13,13 +13,6 @@ use rand::rngs::StdRng;
 use rand::SeedableRng;
 use rand::{rng, Rng};
 
-fn get_rng(seed: u64) -> StdRng {
-    // Expand the u64 seed to 32 bytes
-    let mut seed_bytes = [0u8; 32];
-    seed_bytes[..8].copy_from_slice(&seed.to_le_bytes());
-    StdRng::from_seed(seed_bytes)
-}
-
 fn counting_benchmark(c: &mut Criterion) {
     for (name, bpe, _, _) in TOKENIZERS.iter() {
         let input = create_test_string(&bpe.bpe, 80_000);
@@ -107,10 +100,7 @@ fn encoding_benchmark(c: &mut Criterion) {
                 |b, bytes| {
                     b.iter_batched(
                         || select_test_string(&text, *bytes),
-                        |text| {
-                            bpe.bpe
-                                .encode_minimal_dropout(text.as_bytes(), 0.1, get_rng(0))
-                        },
+                        |text| bpe.bpe.encode_minimal_dropout(text.as_bytes(), 0.1, rng()),
                         criterion::BatchSize::SmallInput,
                     )
                 },
diff --git a/crates/bpe/src/byte_pair_encoding.rs b/crates/bpe/src/byte_pair_encoding.rs
@@ -558,17 +558,20 @@ impl BytePairEncoding {
     /// In more detail: the tokenization uses dynamic programming, i.e. it models the tokenization as a graph,
     /// where every position between text bytes is a node and two nodes are connected when the text slice between those two nodes matches a token.
     // It then tries to find the shortest possible path from the beginning of the text till the end, i.e. it finds the shortest possible encoding.
-// For this nodes are processed from right to left. At each node, edges starting at that node and ending on the right are tested and
-// the one producing the shortest path is stored together with the length of the shortest path to that node.
+    // For this nodes are processed from right to left. At each node, edges starting at that node and ending on the right are tested and
+    // the one producing the shortest path is stored together with the length of the shortest path to that node.
     // The length of the shortest path is stored as second value, the edge (or rather token) is stored as first value.
+    // Then, we walk through the table in the opposite direction (from left to right) along the shortest path.
+    // Note: the reason for constructing the table from back to front is that
+    // the reconstruction then outputs the path from start to end (i.e. we don't have to reverse the path afterwards).
     //
     // For the dropout (when dropout > 0.0), we uniformly drop edges from the graph, but always keep the one-byte tokens such that the graph stays connected.
     // Note: this is very different from how BPE works and cannot produce the same output as the algorithm
     // in the [paper's repository](https://github.com/VProv/BPE-Dropout/blob/master/bpe.py#L98), for two main reasons:
     //   - `encode_minimal` already doesn't follow the original heap-based BPE procedure
-    //   - randomness source in dropout works differently in rust and python
     //   - BPE-dropout authors discard all multi-byte tokens for each word separately, while this implementation does not split the "sentence" into words first
     //     and hence may include previously discarded token later down the byte stream. At the sentence level though we don't expect it to make much difference.
+    //     Also, this implementation derives its merges on the fly from the set of tokens, and hence might arrive at a different set of merges for the same dictionary.
     #[cfg(feature = "rand")]
     pub fn encode_minimal_dropout<R: rand::Rng>(
         &self,
diff --git a/crates/bpe/tests/src/lib.rs b/crates/bpe/tests/src/lib.rs
@@ -1,7 +1,5 @@
 #[cfg(test)]
 mod tests {
-    use std::time;
-
     use itertools::Itertools;
     use rand::{rng, Rng};
     use tiktoken_rs::cl100k_base_singleton;
@@ -157,31 +155,38 @@ mod tests {
         }
 
         let bpe = &cl100k_base().bpe;
-        for bytes in [10000, 20000] {
-            for _ in 0..8 {
-                let input = create_test_bytes(bpe, bytes);
-                let encoded = bpe.encode_minimal(&input);
-                let encoded_d_0_2 = bpe.encode_minimal_dropout(&input, 0.2, get_rng(0));
-                let encoded_d_0_9 = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
-                let encoded_d_1_0 = bpe.encode_minimal_dropout(&input, 1.0, get_rng(2));
-                let decoded = bpe.decode_tokens(&encoded);
-                let decoded_min = bpe.decode_tokens(&encoded_d_min);
-                let decoded_max = bpe.decode_tokens(&encoded_d_max);
-                let decoded_max_again = bpe.decode_tokens(&encoded_d_1_0);
-                println!("Input length: {}, Encoded length: {}, Encoded with dropout length: {}-{}, max {}",
-                    input.len(), encoded.len(), encoded_d_min.len(), encoded_d_max.len(), encoded_d_1_0.len());
-                assert_eq!(input, decoded);
-                assert_eq!(input, decoded_min);
-                assert_eq!(input, decoded_max);
-                assert_eq!(input, decoded_max_again);
-                assert_eq!(input.len(), encoded_d_1_0.len());
-                assert!(encoded_d_min.len() >= encoded.len());
-                assert!(encoded_d_max.len() > encoded.len());
-
-                assert_ne!(encoded, encoded_d_min);
-                assert_ne!(encoded, encoded_d_max);
-                assert_ne!(encoded_d_max, encoded_d_1_0);
-            }
+        let bytes = 10000;
+        for _ in 0..8 {
+            let input = create_test_bytes(bpe, bytes);
+            let encoded = bpe.encode_minimal(&input);
+            let encoded_d_0_2 = bpe.encode_minimal_dropout(&input, 0.2, get_rng(0));
+            let encoded_d_0_9 = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
+            let encoded_d_1_0 = bpe.encode_minimal_dropout(&input, 1.0, get_rng(1));
+            let encoded_d_0_9_again = bpe.encode_minimal_dropout(&input, 0.9, get_rng(1));
+            let decoded = bpe.decode_tokens(&encoded);
+            let decoded_min = bpe.decode_tokens(&encoded_d_0_2);
+            let decoded_max = bpe.decode_tokens(&encoded_d_0_9);
+            let decoded_max_again = bpe.decode_tokens(&encoded_d_0_9_again);
+            println!(
+                "Input length: {}, Encoded length: {}, Encoded with dropout length: {}-{}, max {}",
+                input.len(),
+                encoded.len(),
+                encoded_d_0_2.len(),
+                encoded_d_0_9.len(),
+                encoded_d_0_9_again.len()
+            );
+            assert_eq!(encoded_d_0_9, encoded_d_0_9_again);
+            assert_eq!(input, decoded);
+            assert_eq!(input, decoded_min);
+            assert_eq!(input, decoded_max);
+            assert_eq!(input, decoded_max_again);
+            assert_eq!(input.len(), encoded_d_1_0.len());
+            assert!(encoded_d_0_2.len() >= encoded.len());
+            assert!(encoded_d_0_9.len() > encoded.len());
+
+            assert_ne!(encoded, encoded_d_0_2);
+            assert_ne!(encoded, encoded_d_0_9);
+            assert_ne!(encoded_d_0_9, encoded_d_1_0);
         }
     }
 }