Skip to content

Commit 2d87ea5

Browse files
committed
refactor(sanitizer): harden JSON extraction with thought block and preamble handling
Sanitizer now strips both <think> (qwen3 native) and <thought> blocks before JSON parsing, uses a targeted "type" key search for reliable JSON extraction from noisy LLM output, and skips conversational preambles by locating the first valid commit-type prefix via VALID_TYPE_START_REGEX. Adds 9 regression tests covering thought blocks, unclosed tags, preambles, and noisy JSON with braces.
1 parent b601410 commit 2d87ea5

2 files changed

Lines changed: 184 additions & 21 deletions

File tree

src/services/sanitizer.rs

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,17 @@ static SCOPE_REGEX: LazyLock<Regex> =
2727

2828
static CODE_FENCE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"```[\s\S]*?```").unwrap());
2929

30+
static THOUGHT_BLOCK_REGEX: LazyLock<Regex> =
31+
LazyLock::new(|| Regex::new(r"(?s)<(?:thought|think)>[\s\S]*?</(?:thought|think)>").unwrap());
32+
33+
static UNCLOSED_THOUGHT_REGEX: LazyLock<Regex> =
34+
LazyLock::new(|| Regex::new(r"(?i)^\s*<(?:thought|think)>\s*").unwrap());
35+
36+
static VALID_TYPE_START_REGEX: LazyLock<Regex> = LazyLock::new(|| {
37+
let types = CommitType::ALL.join("|");
38+
Regex::new(&format!(r"(?m)(?:^|\s)({})(?:\(|!|:)", types)).unwrap()
39+
});
40+
3041
static PREAMBLE_PATTERNS: &[&str] = &[
3142
"here's the commit message",
3243
"here is the commit message",
@@ -102,7 +113,7 @@ impl CommitSanitizer {
102113
/// ` <continuation lines, indented two spaces>`
103114
///
104115
/// `str::len()` is `const fn` since Rust 1.39 — `FIRST_LINE_BUDGET` is a
105-
/// valid compile-time constant on MSRV 1.85.
116+
/// valid compile-time constant on MSRV 1.94.
106117
fn format_breaking_footer(desc: &str) -> String {
107118
const PREFIX: &str = "BREAKING CHANGE: ";
108119
const FIRST_LINE_BUDGET: usize = 72 - PREFIX.len(); // 55
@@ -136,29 +147,34 @@ impl CommitSanitizer {
136147
}
137148

138149
fn try_parse_json(raw: &str) -> std::result::Result<StructuredCommit, ()> {
139-
let trimmed = raw.trim();
140-
141-
// Direct JSON
142-
if trimmed.starts_with('{') {
143-
return serde_json::from_str(trimmed).map_err(|_| ());
144-
}
145-
146-
// JSON in code fence
147-
if let Some(start) = trimmed.find("```json") {
148-
let after_fence = &trimmed[start + 7..];
149-
if let Some(end) = after_fence.find("```") {
150-
let json = after_fence[..end].trim();
151-
return serde_json::from_str(json).map_err(|_| ());
150+
// Strip thought blocks first to avoid picking up braces inside thoughts
151+
let stripped = THOUGHT_BLOCK_REGEX.replace_all(raw, "");
152+
let stripped = UNCLOSED_THOUGHT_REGEX.replace(&stripped, "");
153+
let trimmed = stripped.trim();
154+
155+
// 1. Look for the start of our specific JSON structure
156+
// We look for the "type" key because it is mandatory and highly specific.
157+
if let Some(type_key_pos) = trimmed.find("type") {
158+
// Find the '{' that starts this object (search backwards from "type")
159+
if let Some(start_brace) = trimmed[..type_key_pos].rfind('{') {
160+
let json_candidate = &trimmed[start_brace..];
161+
// Find the last '}' to handle trailing text
162+
if let Some(end_brace) = json_candidate.rfind('}') {
163+
let json = &json_candidate[..=end_brace];
164+
if let Ok(structured) = serde_json::from_str::<StructuredCommit>(json) {
165+
return Ok(structured);
166+
}
167+
}
152168
}
153169
}
154170

155-
// Plain code fence
156-
if let Some(start) = trimmed.find("```") {
157-
let after_fence = &trimmed[start + 3..];
158-
if let Some(end) = after_fence.find("```") {
159-
let content = after_fence[..end].trim();
160-
if content.starts_with('{') {
161-
return serde_json::from_str(content).map_err(|_| ());
171+
// 2. Fall back to the first brace if the targeted search did not yield valid JSON
172+
if let Some(start_brace) = trimmed.find('{') {
173+
let json_candidate = &trimmed[start_brace..];
174+
if let Some(end_brace) = json_candidate.rfind('}') {
175+
let json = &json_candidate[..=end_brace];
176+
if let Ok(structured) = serde_json::from_str::<StructuredCommit>(json) {
177+
return Ok(structured);
162178
}
163179
}
164180
}
@@ -266,9 +282,18 @@ impl CommitSanitizer {
266282
fn clean_text(raw: &str, format: &CommitFormat) -> String {
267283
let mut cleaned = raw.to_string();
268284

285+
// Remove thought blocks
286+
cleaned = THOUGHT_BLOCK_REGEX.replace_all(&cleaned, "").to_string();
287+
cleaned = UNCLOSED_THOUGHT_REGEX.replace(&cleaned, "").to_string();
288+
269289
// Remove code fences
270290
cleaned = CODE_FENCE_REGEX.replace_all(&cleaned, "").to_string();
271291

292+
// Find the actual start of the conventional commit (skip preambles/thoughts)
293+
if let Some(mat) = VALID_TYPE_START_REGEX.find(&cleaned) {
294+
cleaned = cleaned[mat.start()..].to_string();
295+
}
296+
272297
// Remove quotes at start/end
273298
cleaned = cleaned.trim().to_string();
274299
if cleaned.starts_with('"') && cleaned.ends_with('"') && cleaned.len() >= 2 {

tests/sanitizer.rs

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -603,3 +603,141 @@ fn validator_corrections_format() {
603603
assert!(corrections.contains("Type is wrong."));
604604
assert!(corrections.contains("Breaking change missing."));
605605
}
606+
607+
#[test]
608+
fn sanitize_with_preceding_thought_block() {
609+
let raw = r#"<thought>
610+
The core change is the addition of the CommitValidator struct to enforce subject specificity and evidence-based rules.
611+
</thought>
612+
{
613+
"type": "feat",
614+
"scope": "sanitizer",
615+
"subject": "add CommitValidator for evidence-based validation",
616+
"body": "Implements deterministic validation rules against code analysis signals.",
617+
"breaking_change": null
618+
}"#;
619+
let format = CommitFormat::default();
620+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
621+
assert!(
622+
result.starts_with("feat(sanitizer): add CommitValidator for evidence-based validation")
623+
);
624+
}
625+
626+
#[test]
627+
fn sanitize_plain_text_with_thought_block() {
628+
let raw = r#"<thought>
629+
The core change is renaming the function.
630+
</thought>
631+
refactor: rename process to process_all"#;
632+
let format = CommitFormat::default();
633+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
634+
assert_eq!(result, "refactor: rename process to process_all");
635+
}
636+
637+
#[test]
638+
fn sanitize_with_thought_block_containing_braces() {
639+
let raw = r#"<thought>
640+
I should generate a JSON like this: { "foo": "bar" }
641+
</thought>
642+
{
643+
"type": "refactor",
644+
"scope": "splitter",
645+
"subject": "upgrade clustering to hybrid Jaccard similarity",
646+
"body": null,
647+
"breaking_change": null
648+
}"#;
649+
let format = CommitFormat::default();
650+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
651+
assert_eq!(
652+
result,
653+
"refactor(splitter): upgrade clustering to hybrid Jaccard similarity"
654+
);
655+
}
656+
657+
#[test]
658+
fn sanitize_with_unclosed_thought_block() {
659+
// The LLM might forget to close the tag but still output the message
660+
let raw = r#"<thought>
661+
I will refactor the splitter to use Jaccard similarity.
662+
663+
refactor(splitter): upgrade clustering to hybrid Jaccard similarity"#;
664+
let format = CommitFormat::default();
665+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
666+
assert_eq!(
667+
result,
668+
"refactor(splitter): upgrade clustering to hybrid Jaccard similarity"
669+
);
670+
}
671+
672+
#[test]
673+
fn sanitize_with_noise_containing_braces_before_json() {
674+
let raw = r#"<thought>...</thought>
675+
The diff spans several files and adds a new field { "foo": 1 } to the config.
676+
{
677+
"type": "refactor",
678+
"scope": "sanitizer",
679+
"subject": "harden JSON extraction",
680+
"body": null,
681+
"breaking_change": null
682+
}"#;
683+
let format = CommitFormat::default();
684+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
685+
assert_eq!(result, "refactor(sanitizer): harden JSON extraction");
686+
}
687+
688+
#[test]
689+
fn sanitize_with_noise_before_plain_text() {
690+
let raw = r#"The diff spans several files. refactor: improve thing"#;
691+
let format = CommitFormat::default();
692+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
693+
assert_eq!(result, "refactor: improve thing");
694+
}
695+
696+
#[test]
697+
fn sanitize_with_think_block_json() {
698+
// qwen3/ollama native thinking uses <think> tags, not <thought>
699+
let raw = r#"<think>
700+
I need to analyze the diff. The main change is adding a new struct.
701+
</think>
702+
{
703+
"type": "feat",
704+
"scope": "core",
705+
"subject": "add DiffFingerprint struct for similarity comparison",
706+
"body": null,
707+
"breaking_change": null
708+
}"#;
709+
let format = CommitFormat::default();
710+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
711+
assert_eq!(
712+
result,
713+
"feat(core): add DiffFingerprint struct for similarity comparison"
714+
);
715+
}
716+
717+
#[test]
718+
fn sanitize_with_unclosed_think_block() {
719+
let raw = r#"<think>
720+
I will analyze the changes...
721+
722+
feat: add DiffFingerprint struct for similarity comparison"#;
723+
let format = CommitFormat::default();
724+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
725+
assert_eq!(
726+
result,
727+
"feat: add DiffFingerprint struct for similarity comparison"
728+
);
729+
}
730+
731+
#[test]
732+
fn sanitize_conversational_preamble_with_json() {
733+
// Claude/Anthropic sometimes outputs conversational text before JSON
734+
let raw = r#"Let me analyze the changes in the diff.
735+
736+
{"type": "refactor", "scope": "splitter", "subject": "upgrade clustering to hybrid Jaccard similarity", "body": null, "breaking_change": null}"#;
737+
let format = CommitFormat::default();
738+
let result = CommitSanitizer::sanitize(raw, &format).unwrap();
739+
assert_eq!(
740+
result,
741+
"refactor(splitter): upgrade clustering to hybrid Jaccard similarity"
742+
);
743+
}

0 commit comments

Comments
 (0)