Skip to content

Commit 5c30fb1

Browse files
committed
fix: address branch review findings (5 warnings, 5 infos)
- asr_text guard: only apply when EOS was seen, not on max_tokens truncation (fixes conflict with truncation test)
- Add asr_text_token_id >= 0 to load-time validation
- Mark tokenizer encode() as pub(crate) to prevent misuse on long text
- Use ..Default::default() in transcribe_raw instead of hardcoded 512
- Fix dangling OLOGY_BUG.md doc reference
- Fix cfg(test) function doc reference
- Add unit tests for language-conditioned prompt structure and None-path equivalence with standard prompt
1 parent 3fe303e commit 5c30fb1

4 files changed

Lines changed: 75 additions & 13 deletions

File tree

src/onnx/qwen3/engine.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,8 +135,8 @@ impl SpeechModel for Qwen3Model {
135135
);
136136
}
137137
let params = Qwen3Params {
138-
max_tokens: 512,
139138
language: options.language.clone(),
139+
..Default::default()
140140
};
141141
self.transcribe_with(samples, &params)
142142
}

src/onnx/qwen3/model.rs

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ impl Qwen3AsrModel {
4343
&& st.audio_start_token_id >= 0
4444
&& st.audio_end_token_id >= 0
4545
&& st.audio_pad_token_id >= 0
46+
&& st.asr_text_token_id >= 0
4647
&& !st.eos_token_ids.is_empty()
4748
&& st.eos_token_ids.iter().all(|&id| id >= 0);
4849
if !valid {
@@ -286,24 +287,27 @@ impl Qwen3AsrModel {
286287
}
287288
}
288289

289-
if !self
290+
let eos_reached = self
290291
.config
291292
.special_tokens
292293
.eos_token_ids
293-
.contains(&current_token)
294-
{
294+
.contains(&current_token);
295+
if !eos_reached {
295296
log::warn!(
296297
"Qwen3-ASR: max_tokens ({}) reached without EOS token",
297298
max_tokens
298299
);
299300
}
300301

301302
// The model should produce `language <Name> <asr_text> <transcription>`.
302-
// If the <asr_text> separator token is absent, the decoder failed to produce
303-
// a valid transcription (e.g. degenerate "ology" output from int4 quantization
304-
// noise on non-speech audio). Return empty string rather than garbage.
303+
// If the <asr_text> separator token is absent AND the model completed
304+
// normally (EOS seen), the decoder failed to produce a valid transcription
305+
// (e.g. degenerate "ology" output from int4 quantization noise on
306+
// non-speech audio). Return empty string rather than garbage.
307+
// When max_tokens truncated the output, the <asr_text> token may simply
308+
// not have been reached yet — this is not degenerate, just truncated.
305309
let asr_text_id = self.config.special_tokens.asr_text_token_id;
306-
if !output_tokens.contains(&asr_text_id) {
310+
if eos_reached && !output_tokens.contains(&asr_text_id) {
307311
let preview: Vec<_> = output_tokens.iter().take(20).collect();
308312
log::warn!(
309313
"Qwen3-ASR: no <asr_text> token in output ({} tokens, first 20: {:?}); \

src/onnx/qwen3/prompt.rs

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,11 +57,11 @@ pub(crate) fn build_prompt_ids(
5757
/// When `language_token_ids` is provided, the prompt includes a system message
5858
/// ("You are a helpful assistant.") and a user instruction ("Please transcribe
5959
/// the above {language} audio."). This conditions the decoder toward the
60-
/// specified language and avoids degenerate output on non-speech audio
61-
/// (see OLOGY_BUG.md).
60+
/// specified language and avoids degenerate output (e.g. "ology") from int4
61+
/// quantization noise on non-speech audio.
6262
///
63-
/// When `language_token_ids` is `None`, falls back to [`build_prompt_ids`]
64-
/// (empty system turn, no language instruction).
63+
/// When `language_token_ids` is `None`, builds the standard prompt with an
64+
/// empty system turn and no language instruction.
6565
pub fn build_prompt_ids_with_language(
6666
special_tokens: &SpecialTokens,
6767
audio_token_count: usize,
@@ -201,4 +201,62 @@ mod tests {
201201
assert_eq!(end, 19);
202202
assert_eq!(end - start, 10);
203203
}
204+
205+
#[test]
206+
fn test_build_prompt_ids_with_language() {
207+
let st = test_special_tokens();
208+
let lang_ids: &[i64] = &[22574]; // "English"
209+
let ids = build_prompt_ids_with_language(&st, 5, Some(lang_ids));
210+
211+
// System turn: im_start, system, newline, <SYSTEM_CONTENT>, im_end, newline
212+
assert_eq!(ids[0], st.im_start_token_id);
213+
assert_eq!(ids[1], SYSTEM_TOKEN_ID);
214+
assert_eq!(ids[2], NEWLINE_TOKEN_ID);
215+
// SYSTEM_CONTENT: 6 tokens [2610, 525, 264, 10950, 17847, 13]
216+
assert_eq!(&ids[3..9], SYSTEM_CONTENT);
217+
assert_eq!(ids[9], st.im_end_token_id);
218+
assert_eq!(ids[10], NEWLINE_TOKEN_ID);
219+
220+
// User turn: im_start, user, newline, USER_PREFIX, audio_start
221+
assert_eq!(ids[11], st.im_start_token_id);
222+
assert_eq!(ids[12], USER_TOKEN_ID);
223+
assert_eq!(ids[13], NEWLINE_TOKEN_ID);
224+
// USER_PREFIX: 5 tokens
225+
assert_eq!(&ids[14..19], USER_PREFIX);
226+
assert_eq!(ids[19], st.audio_start_token_id);
227+
228+
// 5 audio_pad tokens
229+
for i in 20..25 {
230+
assert_eq!(ids[i], st.audio_pad_token_id);
231+
}
232+
233+
// audio_end, USER_SUFFIX_PRE, language tokens, USER_SUFFIX_POST, im_end, newline
234+
assert_eq!(ids[25], st.audio_end_token_id);
235+
let suffix_pre_start = 26;
236+
let suffix_pre_end = suffix_pre_start + USER_SUFFIX_PRE.len();
237+
assert_eq!(&ids[suffix_pre_start..suffix_pre_end], USER_SUFFIX_PRE);
238+
assert_eq!(ids[suffix_pre_end], 22574); // "English"
239+
let suffix_post_start = suffix_pre_end + 1;
240+
let suffix_post_end = suffix_post_start + USER_SUFFIX_POST.len();
241+
assert_eq!(&ids[suffix_post_start..suffix_post_end], USER_SUFFIX_POST);
242+
assert_eq!(ids[suffix_post_end], st.im_end_token_id);
243+
assert_eq!(ids[suffix_post_end + 1], NEWLINE_TOKEN_ID);
244+
245+
// Assistant turn
246+
assert_eq!(ids[suffix_post_end + 2], st.im_start_token_id);
247+
assert_eq!(ids[suffix_post_end + 3], ASSISTANT_TOKEN_ID);
248+
assert_eq!(ids[suffix_post_end + 4], NEWLINE_TOKEN_ID);
249+
250+
// Audio pad range should still work
251+
let (start, end) = get_audio_pad_range(&ids, st.audio_pad_token_id).unwrap();
252+
assert_eq!(end - start, 5);
253+
}
254+
255+
#[test]
256+
fn test_language_none_matches_standard_prompt() {
257+
let st = test_special_tokens();
258+
let standard = build_prompt_ids(&st, 10);
259+
let with_none = build_prompt_ids_with_language(&st, 10, None);
260+
assert_eq!(standard, with_none);
261+
}
204262
}

src/onnx/qwen3/tokenizer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ impl Qwen3Tokenizer {
111111
/// vocabulary entry at each position. For common English words and language
112112
/// names this produces identical results to the reference BPE tokenizer.
113113
/// Results may differ on rare subword boundaries.
114-
pub fn encode(&self, text: &str) -> Vec<i64> {
114+
pub(crate) fn encode(&self, text: &str) -> Vec<i64> {
115115
let bytes = text.as_bytes();
116116
let mut ids = Vec::new();
117117
let mut pos = 0;

0 commit comments

Comments
 (0)