Commit d97ae65

whisper: set no_context to prevent quality drift over a session (#79)
* whisper: set no_context to prevent quality drift over a session

  Whisper transcription quality degrades progressively over a long push-to-talk session: short clips get mis-recognized or returned empty, and language detection sticks to the previous language (e.g. RU→EN switches keep producing Russian). Reloading the model restores quality.

  The cause is whisper.cpp's default `prompt_past` behaviour: the last decoded tokens are fed back as a prompt for the next decode. That is the right thing for continuous speech (lectures, meetings), where consecutive segments are connected, but the wrong thing for push-to-talk and similar workloads, where each call to transcribe is an independent utterance, so stale prompt tokens bias the next decode. Short clips suffer most because they have less acoustic evidence to overcome the stale prompt; language switches suffer because the prompt is in the previous language and steers detection.

  Set `no_context = true` so each decode starts from a clean prompt. The user-supplied `initial_prompt` continues to work: it goes through a different field and is unaffected.

* whisper: expose no_context as a configurable field

  Per review, make `no_context` an opt-in field on `WhisperInferenceParams` so callers can override it for continuous-speech use cases (lectures, meetings, streaming) where carrying `prompt_past` across segments improves consistency. The default stays `true`, the right choice for independent utterances such as push-to-talk dictation, which is the case the previous commit fixed.

* fmt

---------

Co-authored-by: CJ Pais <cj@cjpais.com>
1 parent 343768c commit d97ae65

2 files changed

Lines changed: 8 additions & 2 deletions

File tree

src/onnx/session.rs

Lines changed: 2 additions & 2 deletions
```diff
@@ -8,11 +8,11 @@ use ort::ep::ROCm;
 use ort::ep::TensorRT;
 #[cfg(feature = "ort-webgpu")]
 use ort::ep::WebGPU;
-#[cfg(feature = "ort-xnnpack")]
-use ort::ep::XNNPACK;
 use ort::ep::CPU;
 #[cfg(feature = "ort-cuda")]
 use ort::ep::CUDA;
+#[cfg(feature = "ort-xnnpack")]
+use ort::ep::XNNPACK;
 
 use ort::session::builder::GraphOptimizationLevel;
 use ort::session::Session;
```
src/whisper_cpp/mod.rs

Lines changed: 6 additions & 0 deletions
```diff
@@ -110,6 +110,10 @@ pub struct WhisperInferenceParams {
 
     /// Initial prompt to provide context to the model.
     pub initial_prompt: Option<String>,
+
+    /// Start each decode with a clean prompt (whisper.cpp's `prompt_past`).
+    /// Default `true` suits push-to-talk; set `false` for continuous speech.
+    pub no_context: bool,
 }
 
 impl Default for WhisperInferenceParams {
@@ -126,6 +130,7 @@ impl Default for WhisperInferenceParams {
             no_speech_thold: 0.2,
             n_threads: 0,
             initial_prompt: None,
+            no_context: true,
         }
     }
 }
@@ -220,6 +225,7 @@ impl WhisperEngine {
         full_params.set_suppress_blank(params.suppress_blank);
         full_params.set_suppress_nst(params.suppress_non_speech_tokens);
         full_params.set_no_speech_thold(params.no_speech_thold);
+        full_params.set_no_context(params.no_context);
         if params.n_threads > 0 {
             full_params.set_n_threads(params.n_threads);
         }
```
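Because `no_context` lives on `WhisperInferenceParams` with a `Default` impl, continuous-speech callers can opt back into prompt carry-over via struct update syntax. A minimal self-contained sketch, assuming only the field shapes visible in the diff (the real struct carries more inference knobs, elided here):

```rust
/// Reduced stand-in for the `WhisperInferenceParams` from the diff,
/// keeping only the fields relevant to this change.
#[derive(Debug, Clone)]
pub struct WhisperInferenceParams {
    /// Initial prompt to provide context to the model.
    pub initial_prompt: Option<String>,
    /// Start each decode with a clean prompt (whisper.cpp's `prompt_past`).
    pub no_context: bool,
}

impl Default for WhisperInferenceParams {
    fn default() -> Self {
        Self {
            initial_prompt: None,
            // Independent utterances (push-to-talk) by default.
            no_context: true,
        }
    }
}

fn main() {
    // Push-to-talk dictation: take the defaults.
    let ptt = WhisperInferenceParams::default();
    assert!(ptt.no_context);

    // Lectures / meetings / streaming: opt back into carrying
    // prompt_past across segments for consistency.
    let streaming = WhisperInferenceParams {
        no_context: false,
        ..Default::default()
    };
    assert!(!streaming.no_context);
    println!(
        "no_context: ptt={} streaming={}",
        ptt.no_context, streaming.no_context
    );
}
```

The `..Default::default()` spread keeps the override local: callers name only the field they change, so new params added later do not break existing call sites.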
