Skip to content

Commit 057884a

Browse files
committed
fix: align Cohere Transcribe API with repo conventions
- Add Quantization parameter to load() for API consistency with other ONNX models (ignored since no quantized variants exist)
- Change max_new_tokens default from 448 to 256 to match Python reference
- Use #[derive(Default)] for CohereTranscribeParams per PORTING.md
- Add HuggingFace download links in README
- Accept optional CLI arg in example for testing custom wav files
- Update doc comments with accurate audio length behavior
1 parent b60be4d commit 057884a

4 files changed

Lines changed: 31 additions & 19 deletions

File tree

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,13 @@ let result = model.transcribe_file(&PathBuf::from("audio.wav"), &transcribe_rs::
216216

217217
```rust
218218
use transcribe_rs::onnx::cohere_transcribe::{CohereTranscribeModel, CohereTranscribeParams};
219+
use transcribe_rs::onnx::Quantization;
219220
use std::path::PathBuf;
220221

221-
let mut model = CohereTranscribeModel::load(&PathBuf::from("models/cohere-transcribe"))?;
222+
let mut model = CohereTranscribeModel::load(
223+
&PathBuf::from("models/cohere-transcribe"),
224+
&Quantization::default(),
225+
)?;
222226

223227
let samples = transcribe_rs::audio::read_wav_samples(&PathBuf::from("audio.wav"))?;
224228
let result = model.transcribe_with(
@@ -317,7 +321,7 @@ All audio input must be **16 kHz, mono, 16-bit PCM WAV**.
317321
| SenseVoice (int8) | [blob.handy.computer](https://blob.handy.computer/sense-voice-int8.tar.gz) / [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models) |
318322
| Moonshine | [blob.handy.computer (base)](https://blob.handy.computer/moonshine-base.tar.gz), [blob.handy.computer (tiny streaming en)](https://blob.handy.computer/moonshine-tiny-streaming-en.tar.gz), [blob.handy.computer (small streaming en)](https://blob.handy.computer/moonshine-small-streaming-en.tar.gz), [blob.handy.computer (medium streaming en)](https://blob.handy.computer/moonshine-medium-streaming-en.tar.gz) |
319323
| GigaAM | [HuggingFace](https://huggingface.co/istupakov/gigaam-v3-onnx/tree/main) |
320-
| Cohere Transcribe | Model-specific ONNX export plus `vocab.txt` |
324+
| Cohere Transcribe | [HuggingFace](https://huggingface.co/eschmidbauer/cohere-transcribe-03-2026-onnx) (ONNX export of [CohereLabs/cohere-transcribe-03-2026](https://huggingface.co/CohereLabs/cohere-transcribe-03-2026)) |
321325
| Whisper (GGML) | [HuggingFace](https://huggingface.co/ggerganov/whisper.cpp/tree/main) |
322326
| Whisperfile binary | [GitHub](https://github.com/mozilla-ai/llamafile/releases/download/0.9.3/whisperfile-0.9.3) |
323327

examples/cohere_transcribe.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use std::path::PathBuf;
22
use std::time::Instant;
33

44
use transcribe_rs::onnx::cohere_transcribe::{CohereTranscribeModel, CohereTranscribeParams};
5+
use transcribe_rs::onnx::Quantization;
56

67
fn get_audio_duration(path: &PathBuf) -> Result<f64, Box<dyn std::error::Error>> {
78
let reader = hound::WavReader::open(path)?;
@@ -14,14 +15,17 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
1415
env_logger::init();
1516

1617
let model_path = PathBuf::from("models/cohere-transcribe");
17-
let wav_path = PathBuf::from("samples/jfk.wav");
18+
let wav_path = std::env::args()
19+
.nth(1)
20+
.map(PathBuf::from)
21+
.unwrap_or_else(|| PathBuf::from("samples/jfk.wav"));
1822

1923
let audio_duration = get_audio_duration(&wav_path)?;
2024
println!("Audio duration: {:.2}s", audio_duration);
2125

2226
// Load
2327
let load_start = Instant::now();
24-
let mut model = CohereTranscribeModel::load(&model_path)?;
28+
let mut model = CohereTranscribeModel::load(&model_path, &Quantization::default())?;
2529
println!("Model loaded in {:.2?}", load_start.elapsed());
2630

2731
// Transcribe

src/onnx/cohere_transcribe/mod.rs

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use ort::value::{DynValue, Tensor};
1010

1111
use self::decoder::decode_autoregressive;
1212
use self::vocab::Vocab;
13+
use super::Quantization;
1314
use crate::features::{compute_mel, MelConfig, WindowType};
1415
use crate::{
1516
ModelCapabilities, SpeechModel, TranscribeError, TranscribeOptions, TranscriptionResult,
@@ -51,21 +52,12 @@ fn mel_config() -> MelConfig {
5152
}
5253

5354
/// Per-model inference parameters for Cohere Transcribe.
54-
#[derive(Debug, Clone)]
55+
#[derive(Debug, Clone, Default)]
5556
pub struct CohereTranscribeParams {
5657
/// Source language (ISO-639-1, e.g. "en"). Defaults to "en".
5758
pub language: Option<String>,
58-
/// Maximum number of new tokens to generate. Defaults to 448.
59-
pub max_new_tokens: usize,
60-
}
61-
62-
impl Default for CohereTranscribeParams {
63-
fn default() -> Self {
64-
Self {
65-
language: None,
66-
max_new_tokens: 448,
67-
}
68-
}
59+
/// Maximum number of new tokens to generate. Defaults to 256.
60+
pub max_new_tokens: Option<usize>,
6961
}
7062

7163
/// Cohere Transcribe speech model backed by ONNX sessions.
@@ -87,7 +79,10 @@ pub struct CohereTranscribeModel {
8779

8880
impl CohereTranscribeModel {
8981
/// Load a Cohere Transcribe model from `model_dir`.
90-
pub fn load(model_dir: &Path) -> Result<Self, TranscribeError> {
82+
///
83+
/// The `quantization` parameter is accepted for API consistency but currently
84+
/// ignored since no quantized variants of this model are available.
85+
pub fn load(model_dir: &Path, _quantization: &Quantization) -> Result<Self, TranscribeError> {
9186
if !model_dir.exists() {
9287
return Err(TranscribeError::ModelNotFound(model_dir.to_path_buf()));
9388
}
@@ -139,12 +134,19 @@ impl CohereTranscribeModel {
139134
}
140135

141136
/// Transcribe with model-specific parameters.
137+
///
138+
/// The upstream config specifies `max_audio_clip_s = 35`, but the ONNX encoder
139+
/// accepts longer audio. The autoregressive decoder is the practical limit:
140+
/// it stops at EOS or `max_new_tokens` (default 256), so very long audio may
141+
/// be truncated. For long-form transcription, use a chunked transcriber
142+
/// (e.g. `VadChunkedTranscriber` or `EnergyAdaptiveTranscriber`).
142143
pub fn transcribe_with(
143144
&mut self,
144145
samples: &[f32],
145146
params: &CohereTranscribeParams,
146147
) -> Result<TranscriptionResult, TranscribeError> {
147148
let language = params.language.as_deref().unwrap_or("en");
149+
let max_new_tokens = params.max_new_tokens.unwrap_or(256);
148150
let total_start = Instant::now();
149151

150152
// Step 1: Compute mel features
@@ -212,7 +214,7 @@ impl CohereTranscribeModel {
212214
src_len,
213215
prompt_tokens,
214216
&self.vocab,
215-
params.max_new_tokens,
217+
max_new_tokens,
216218
)?;
217219

218220
log::debug!("Decoding completed in {:.2?}", decode_start.elapsed());

tests/cohere_transcribe.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ mod common;
22

33
use std::path::PathBuf;
44
use transcribe_rs::onnx::cohere_transcribe::CohereTranscribeModel;
5+
use transcribe_rs::onnx::Quantization;
56
use transcribe_rs::SpeechModel;
67

78
#[test]
@@ -15,7 +16,8 @@ fn test_cohere_transcribe() {
1516
return;
1617
}
1718

18-
let mut model = CohereTranscribeModel::load(&model_dir).expect("Failed to load model");
19+
let mut model =
20+
CohereTranscribeModel::load(&model_dir, &Quantization::default()).expect("Failed to load model");
1921

2022
let result = model
2123
.transcribe_file(&wav_path, &transcribe_rs::TranscribeOptions::default())

0 commit comments

Comments (0)