Skip to content

Commit 2d7ac18

Browse files
praxeo and cjpais authored
adds cohere-transcribe INT4/INT8 via onnx runtime (#75)
* feat: add cohere onnx integration * fix: scope encoder_outputs to release mutable borrow before decoder loop * eliminate per-token KV cache clones in decoder loop * add common greedy decoder, cleanup lib * int4 quant in lib * minor * Update README.md --------- Co-authored-by: praxeo <praxeo@users.noreply.github.com> Co-authored-by: CJ Pais <cj@cjpais.com>
1 parent 2eea053 commit 2d7ac18

13 files changed

Lines changed: 689 additions & 63 deletions

File tree

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ required-features = ["onnx"]
107107
name = "canary"
108108
required-features = ["onnx"]
109109

110+
[[example]]
111+
name = "cohere"
112+
required-features = ["onnx"]
113+
110114
[[example]]
111115
name = "whisperfile"
112116
required-features = ["whisperfile"]
@@ -143,6 +147,10 @@ required-features = ["onnx"]
143147
name = "canary"
144148
required-features = ["onnx"]
145149

150+
[[test]]
151+
name = "cohere"
152+
required-features = ["onnx"]
153+
146154
[[test]]
147155
name = "whisperfile"
148156
required-features = ["whisperfile"]

README.md

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# transcribe-rs
22

3-
Multi-engine speech-to-text library for Rust. Supports Parakeet, Canary, Moonshine, SenseVoice, GigaAM, Whisper, Whisperfile, and OpenAI.
3+
Multi-engine speech-to-text library for Rust. Supports Parakeet, Canary, Cohere, Moonshine, SenseVoice, GigaAM, Whisper, Whisperfile, and OpenAI.
44

55
## Breaking Changes in 0.3.0
66

@@ -24,7 +24,7 @@ No features are enabled by default. Pick the engines you need:
2424

2525
| Feature | Engines |
2626
|---------|---------|
27-
| `onnx` | Parakeet, Canary, Moonshine, SenseVoice, GigaAM (via ONNX Runtime) |
27+
| `onnx` | Parakeet, Canary, Cohere, Moonshine, SenseVoice, GigaAM (via ONNX Runtime) |
2828
| `whisper-cpp` | Whisper (local, GGML via whisper.cpp with Metal/Vulkan/CUDA) |
2929
| `whisperfile` | Whisperfile (local server wrapper) |
3030
| `openai` | OpenAI API (remote, async) |
@@ -143,6 +143,30 @@ Model variant (Flash vs V2) is auto-detected from vocabulary size. Flash models
143143
- **ITN** (inverse text normalization) — enabled by default. Converts spoken numbers to written form (e.g. "one hundred twenty three" becomes "123"). Set `use_itn: false` to disable. Only supported on V2 models; silently ignored on Flash.
144144
- **Translation** — set `target_language` to translate between supported languages.
145145

146+
### Cohere
147+
148+
```rust
149+
use transcribe_rs::onnx::cohere::{CohereModel, CohereParams};
150+
use transcribe_rs::onnx::Quantization;
151+
use std::path::PathBuf;
152+
153+
let mut model = CohereModel::load(
154+
&PathBuf::from("models/cohere-int4"),
155+
&Quantization::Int4,
156+
)?;
157+
158+
let samples = transcribe_rs::audio::read_wav_samples(&PathBuf::from("audio.wav"))?;
159+
let result = model.transcribe_with(
160+
&samples,
161+
&CohereParams {
162+
language: Some("en".to_string()),
163+
..Default::default()
164+
},
165+
)?;
166+
```
167+
168+
Available in int4 and int8 quantizations.
169+
146170
### SenseVoice
147171

148172
```rust
@@ -295,6 +319,8 @@ All audio input must be **16 kHz, mono, 16-bit PCM WAV**.
295319
| Canary 180M Flash | [HuggingFace](https://huggingface.co/istupakov/canary-180m-flash-onnx) |
296320
| Canary 1B Flash | [HuggingFace](https://huggingface.co/istupakov/canary-1b-flash-onnx) |
297321
| Canary 1B v2 | [HuggingFace](https://huggingface.co/istupakov/canary-1b-v2-onnx) |
322+
| Cohere (int4) | [HuggingFace](https://huggingface.co/cstr/cohere-transcribe-onnx-int4) |
323+
| Cohere (int8) | [HuggingFace](https://huggingface.co/tristanripke/cohere-transcribe-onnx-int8) |
298324
| SenseVoice (int8) | [blob.handy.computer](https://blob.handy.computer/sense-voice-int8.tar.gz) / [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) |
299325
| Moonshine | [blob.handy.computer (base)](https://blob.handy.computer/moonshine-base.tar.gz), [blob.handy.computer (tiny streaming en)](https://blob.handy.computer/moonshine-tiny-streaming-en.tar.gz), [blob.handy.computer (small streaming en)](https://blob.handy.computer/moonshine-small-streaming-en.tar.gz), [blob.handy.computer (medium streaming en)](https://blob.handy.computer/moonshine-medium-streaming-en.tar.gz) |
300326
| GigaAM | [HuggingFace](https://huggingface.co/istupakov/gigaam-v3-onnx/tree/main) |
@@ -321,6 +347,16 @@ models/canary-1b-v2/
321347
└── vocab.txt
322348
```
323349

350+
**Cohere** (directory):
351+
```
352+
models/cohere-int4/
353+
├── cohere-encoder.int4.onnx
354+
├── cohere-encoder.int4.onnx.data
355+
├── cohere-decoder.int4.onnx
356+
├── cohere-decoder.int4.onnx.data
357+
└── tokens.txt
358+
```
359+
324360
**SenseVoice** (directory):
325361
```
326362
models/sense-voice/
@@ -375,6 +411,7 @@ Each engine has an example in `examples/`. Run with the appropriate feature flag
375411
```bash
376412
cargo run --example parakeet --features onnx
377413
cargo run --example canary --features onnx
414+
cargo run --example cohere --features onnx
378415
cargo run --example sense_voice --features onnx
379416
cargo run --example moonshine --features onnx
380417
cargo run --example moonshine_streaming --features onnx

examples/canary.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
3434
positional
3535
.get(1)
3636
.map(|s| s.as_str())
37-
.unwrap_or("samples/jfk.wav"),
37+
.unwrap_or("samples/dots.wav"),
3838
);
3939

4040
let audio_duration = get_audio_duration(&wav_path)?;

examples/cohere.rs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
use std::path::PathBuf;
2+
use std::time::Instant;
3+
4+
use transcribe_rs::onnx::cohere::CohereModel;
5+
use transcribe_rs::onnx::Quantization;
6+
use transcribe_rs::SpeechModel;
7+
8+
fn get_audio_duration(path: &PathBuf) -> Result<f64, Box<dyn std::error::Error>> {
9+
let reader = hound::WavReader::open(path)?;
10+
let spec = reader.spec();
11+
let duration = reader.duration() as f64 / spec.sample_rate as f64;
12+
Ok(duration)
13+
}
14+
15+
fn main() -> Result<(), Box<dyn std::error::Error>> {
16+
env_logger::init();
17+
18+
let args: Vec<String> = std::env::args().collect();
19+
let quant = args.get(1).map(|s| s.as_str()).unwrap_or("int8");
20+
21+
let (model_path, quantization) = match quant {
22+
"int4" => ("models/cohere-int4", Quantization::Int4),
23+
"int8" => ("models/cohere-int8", Quantization::Int8),
24+
other => {
25+
eprintln!("Unknown quantization: {other}. Use 'int4' or 'int8'.");
26+
std::process::exit(1);
27+
}
28+
};
29+
30+
let model_path = PathBuf::from(model_path);
31+
let wav_path = PathBuf::from("samples/dots.wav");
32+
33+
let audio_duration = get_audio_duration(&wav_path)?;
34+
println!("Audio duration: {:.2}s", audio_duration);
35+
36+
println!("Using Cohere ONNX engine ({quant})");
37+
println!("Loading model: {:?}", model_path);
38+
39+
let load_start = Instant::now();
40+
let mut model = CohereModel::load(&model_path, &quantization)?;
41+
let load_duration = load_start.elapsed();
42+
println!("Model loaded in {:.2?}", load_duration);
43+
44+
println!("Transcribing file: {:?}", wav_path);
45+
let transcribe_start = Instant::now();
46+
let result = model.transcribe_file(&wav_path, &transcribe_rs::TranscribeOptions::default())?;
47+
let transcribe_duration = transcribe_start.elapsed();
48+
println!("Transcription completed in {:.2?}", transcribe_duration);
49+
50+
let speedup_factor = audio_duration / transcribe_duration.as_secs_f64();
51+
println!(
52+
"Real-time speedup: {:.2}x faster than real-time",
53+
speedup_factor
54+
);
55+
println!("Transcription result:\n{}", result.text);
56+
57+
Ok(())
58+
}

src/decode/greedy.rs

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/// Greedy autoregressive token selection with repetition detection.
///
/// Wraps the common argmax + EOS + repeat-guard pattern shared by all
/// autoregressive decoder engines (Canary, Moonshine, Cohere).
///
/// Each engine still owns its KV cache and decoder session — this struct
/// only handles token selection and stopping decisions.
#[derive(Debug, Clone)]
pub struct GreedyDecoder {
    /// Token id that terminates decoding when selected.
    eos_id: i64,
    /// Longest tolerated streak of identical consecutive tokens before
    /// decoding is aborted.
    max_consecutive_repeats: usize,
    /// Most recently emitted token; -1 before the first step (valid token
    /// ids are non-negative, so -1 can never match a real token).
    last_token: i64,
    /// Length of the current streak of identical consecutive tokens.
    consecutive_count: usize,
}

/// Default streak limit used by `GreedyDecoder::new`; override with
/// `with_max_repeats`.
const DEFAULT_MAX_CONSECUTIVE_REPEATS: usize = 8;
17+
18+
impl GreedyDecoder {
19+
pub fn new(eos_id: i64) -> Self {
20+
Self {
21+
eos_id,
22+
max_consecutive_repeats: DEFAULT_MAX_CONSECUTIVE_REPEATS,
23+
last_token: -1,
24+
consecutive_count: 0,
25+
}
26+
}
27+
28+
pub fn with_max_repeats(mut self, n: usize) -> Self {
29+
self.max_consecutive_repeats = n;
30+
self
31+
}
32+
33+
/// Given logits for the last decoder position, pick the next token.
34+
///
35+
/// Returns `Some(token_id)` to continue decoding, or `None` to stop
36+
/// (EOS reached or repetition limit hit).
37+
pub fn next_token(&mut self, logits: &[f32]) -> Option<i64> {
38+
let token = argmax(logits) as i64;
39+
40+
if token == self.eos_id {
41+
return None;
42+
}
43+
44+
if token == self.last_token {
45+
self.consecutive_count += 1;
46+
if self.consecutive_count > self.max_consecutive_repeats {
47+
log::warn!(
48+
"Greedy decode: token {} repeated {} consecutive times, stopping",
49+
token,
50+
self.consecutive_count
51+
);
52+
return None;
53+
}
54+
} else {
55+
self.consecutive_count = 1;
56+
}
57+
58+
self.last_token = token;
59+
Some(token)
60+
}
61+
}
62+
63+
/// Index of the largest value in `logits`.
///
/// Ties keep the earliest index. NaN entries never win (every `>`
/// comparison against NaN is false), so an all-NaN or empty slice
/// falls back to index 0.
fn argmax(logits: &[f32]) -> usize {
    let mut best = (0usize, f32::NEG_INFINITY);
    for (idx, &val) in logits.iter().enumerate() {
        if val > best.1 {
            best = (idx, val);
        }
    }
    best.0
}
74+
75+
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_argmax() {
        assert_eq!(argmax(&[1.0, 3.0, 2.0]), 1);
        assert_eq!(argmax(&[-1.0, -3.0, -0.5]), 2);
        assert_eq!(argmax(&[5.0]), 0);
    }

    #[test]
    fn test_eos_stops() {
        // Token 2 is EOS and carries the highest logit, so decoding halts.
        let mut decoder = GreedyDecoder::new(2);
        assert_eq!(decoder.next_token(&[0.0, 0.0, 10.0, 0.0]), None);
    }

    #[test]
    fn test_normal_token() {
        let mut decoder = GreedyDecoder::new(2);
        assert_eq!(decoder.next_token(&[0.0, 10.0, 0.0, 0.0]), Some(1));
    }

    #[test]
    fn test_repeat_limit() {
        let mut decoder = GreedyDecoder::new(99).with_max_repeats(3);
        let logits = [0.0, 10.0, 0.0]; // token 1 always wins
        // Three consecutive picks are tolerated...
        for _ in 0..3 {
            assert_eq!(decoder.next_token(&logits), Some(1));
        }
        // ...the fourth exceeds the limit and stops decoding.
        assert_eq!(decoder.next_token(&logits), None);
    }

    #[test]
    fn test_repeat_resets_on_different_token() {
        let mut decoder = GreedyDecoder::new(99).with_max_repeats(3);
        let pick_one = [0.0, 10.0, 0.0];
        let pick_zero = [10.0, 0.0, 0.0];

        assert_eq!(decoder.next_token(&pick_one), Some(1)); // streak = 1
        assert_eq!(decoder.next_token(&pick_one), Some(1)); // streak = 2
        // A different token resets the streak counter.
        assert_eq!(decoder.next_token(&pick_zero), Some(0));
        for _ in 0..3 {
            assert_eq!(decoder.next_token(&pick_one), Some(1));
        }
        assert_eq!(decoder.next_token(&pick_one), None); // streak = 4 > 3 → stop
    }

    #[test]
    fn test_nan_handling() {
        let mut decoder = GreedyDecoder::new(99);
        // NaN logits — argmax uses `>` which is false for NaN, so index 0 wins
        assert_eq!(decoder.next_token(&[f32::NAN, f32::NAN, f32::NAN]), Some(0));
    }
}

src/decode/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
mod ctc;
2+
mod greedy;
23
mod sentencepiece;
34
pub mod tokens;
45

56
pub use ctc::{ctc_greedy_decode, CtcDecoderResult};
7+
pub use greedy::GreedyDecoder;
68
pub use sentencepiece::sentencepiece_to_text;
79
pub use tokens::{load_vocab, SymbolTable};

src/onnx/canary/decoder.rs

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use ort::value::ValueType;
44
use ort::value::{DynValue, Tensor};
55

66
use super::vocab::Vocab;
7+
use crate::decode::GreedyDecoder;
78
use crate::TranscribeError;
89

910
pub fn decode_autoregressive(
@@ -26,6 +27,7 @@ pub fn decode_autoregressive(
2627
let mut decoder_mems: DynValue = Tensor::from_array(empty_cache)?.into_dyn();
2728

2829
let eos_id = vocab.eos_token_id();
30+
let mut greedy = GreedyDecoder::new(eos_id);
2931
let mut all_tokens = prompt_tokens;
3032

3133
// Limit decode steps so total tokens (prompt + generated) stays within
@@ -58,7 +60,7 @@ pub fn decode_autoregressive(
5860
])?;
5961

6062
// Extract logits in a scoped borrow, then release before remove()
61-
let next_token = {
63+
let last_logits = {
6264
let (logits_shape, logits_data) =
6365
outputs["logits"].try_extract_tensor::<f32>().map_err(|e| {
6466
TranscribeError::Inference(format!("Failed to extract logits: {e}"))
@@ -68,18 +70,19 @@ pub fn decode_autoregressive(
6870
let vocab_size = logits_shape[2] as usize;
6971

7072
let last_step_offset = (seq_len - 1) * vocab_size;
71-
let last_logits = &logits_data[last_step_offset..last_step_offset + vocab_size];
73+
logits_data[last_step_offset..last_step_offset + vocab_size].to_vec()
74+
};
7275

73-
argmax(last_logits) as i64
76+
let next_token = match greedy.next_token(&last_logits) {
77+
Some(t) => t,
78+
None => {
79+
log::debug!("Decode stopped at step {}", step);
80+
break;
81+
}
7482
};
7583

7684
log::debug!("Step {}: predicted token ID {}", step, next_token);
7785

78-
if next_token == eos_id {
79-
log::debug!("EOS token reached at step {}", step);
80-
break;
81-
}
82-
8386
all_tokens.push(next_token);
8487

8588
// Take the KV cache directly from outputs (Arc clone, no data copy)
@@ -129,27 +132,3 @@ fn extract_decoder_mems_shape(decoder: &Session) -> Result<(usize, usize), Trans
129132
))),
130133
}
131134
}
132-
133-
fn argmax(slice: &[f32]) -> usize {
134-
let mut max_idx = 0;
135-
let mut max_val = f32::NEG_INFINITY;
136-
for (i, &v) in slice.iter().enumerate() {
137-
if v > max_val {
138-
max_val = v;
139-
max_idx = i;
140-
}
141-
}
142-
max_idx
143-
}
144-
145-
#[cfg(test)]
146-
mod tests {
147-
use super::*;
148-
149-
#[test]
150-
fn test_argmax() {
151-
assert_eq!(argmax(&[1.0, 3.0, 2.0]), 1);
152-
assert_eq!(argmax(&[-1.0, -3.0, -0.5]), 2);
153-
assert_eq!(argmax(&[5.0]), 0);
154-
}
155-
}

0 commit comments

Comments (0)