Skip to content

Commit 8db162d

Browse files
committed
refactor(services): add Jaccard clustering, NUL-delimited git, conflict fix
Splitter uses hybrid similarity (diff-shape + Jaccard vocabulary overlap) for more accurate file grouping. Git service uses -z flag for safe path parsing. Conflict detector uses component-based path matching and scans only added lines, with concat! to prevent self-detection.
1 parent 2d87ea5 commit 8db162d

3 files changed

Lines changed: 97 additions & 33 deletions

File tree

src/services/git.rs

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ impl GitService {
5858
pub async fn get_staged_changes(&self, max_file_lines: usize) -> Result<StagedChanges> {
5959
self.check_state()?;
6060

61-
// Two calls total (down from N+1): name-status + unified diff
61+
// Two calls total: name-status (NUL delimited) + unified diff
6262
let (status_output, diff_output) = tokio::try_join!(
63-
self.run_git(&["diff", "--cached", "--name-status", "--no-renames"]),
63+
self.run_git(&["diff", "-z", "--cached", "--name-status", "--no-renames"]),
6464
self.run_git(&[
6565
"diff",
6666
"--cached",
@@ -75,33 +75,36 @@ impl GitService {
7575
let mut files = Vec::new();
7676
let mut stats = DiffStats::default();
7777

78-
for line in status_output.lines() {
79-
if line.is_empty() {
80-
continue;
81-
}
78+
let mut parts = status_output.split('\0').filter(|s| !s.is_empty());
8279

83-
let parts: Vec<&str> = line.splitn(2, '\t').collect();
84-
if parts.len() != 2 {
85-
continue;
86-
}
80+
while let Some(status_code) = parts.next() {
81+
let path_str = match parts.next() {
82+
Some(p) => p,
83+
None => break, // Should not happen with well-formed git output
84+
};
8785

88-
let status = match parts[0] {
86+
let status = match status_code {
8987
"A" => ChangeStatus::Added,
9088
"M" => ChangeStatus::Modified,
9189
"D" => ChangeStatus::Deleted,
9290
_ => continue,
9391
};
9492

95-
let file_path = Path::new(parts[1]).to_path_buf();
93+
let file_path = PathBuf::from(path_str);
9694
let category = FileCategory::from_path(&file_path);
9795
let is_binary = Self::is_binary_path(&file_path);
9896

9997
if is_binary {
10098
continue;
10199
}
102100

101+
// For lookups in file_diffs, we need the string key.
102+
// Note: split_unified_diff currently uses paths from "diff --git a/... b/..." headers which are usually standard strings.
103+
// Complex unicode paths might mismatch if git output encoding differs, but -z guarantees strict bytes for status.
104+
let diff_key = file_path.to_string_lossy();
105+
103106
let diff = file_diffs
104-
.get(parts[1])
107+
.get(diff_key.as_ref())
105108
.map(|d| Self::truncate_diff(d, max_file_lines))
106109
.unwrap_or_default();
107110

src/services/safety.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,25 @@ pub fn scan_for_secrets(changes: &StagedChanges) -> Vec<SecretMatch> {
7979
pub fn check_for_conflicts(changes: &StagedChanges) -> bool {
8080
for file in &changes.files {
8181
// Skip docs/test files where conflict markers might be intentional examples
82-
if file.path.to_string_lossy().contains("test")
83-
|| file.path.to_string_lossy().contains("doc")
84-
|| file.path.to_string_lossy().contains("example")
85-
{
82+
// Use path components so whole-component names like "documentation" no longer match;
// NOTE(review): the `s.contains("test")` arm below still substring-matches components
// such as "testing_utils" — confirm that is intended.
83+
if file.path.components().any(|c| {
84+
let s = c.as_os_str().to_string_lossy();
85+
s == "tests" || s == "docs" || s == "examples" || s.contains("test")
86+
}) {
8687
continue;
8788
}
8889

89-
if file.diff.contains("<<<<<<<") || file.diff.contains(">>>>>>>") {
90-
return true;
90+
// Only check added lines for conflict markers
91+
for line in file.diff.lines() {
92+
if line.starts_with('+') && !line.starts_with("+++") {
93+
// Split strings to prevent self-detection in this file's own diff
94+
const CONFLICT_START: &str = concat!("<", "<<<<<<");
95+
const CONFLICT_END: &str = concat!(">", ">>>>>>");
96+
97+
if line.contains(CONFLICT_START) || line.contains(CONFLICT_END) {
98+
return true;
99+
}
100+
}
91101
}
92102
}
93103
false

src/services/splitter.rs

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
//
33
// SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
44

5-
use std::collections::HashMap;
5+
use std::collections::{HashMap, HashSet};
66
use std::path::{Path, PathBuf};
77

88
use crate::domain::{CodeSymbol, CommitType, FileCategory, FileChange, StagedChanges};
@@ -228,6 +228,44 @@ impl CommitSplitter {
228228
}
229229
}
230230

231+
/// Compute Jaccard similarity between two sets of tokens.
///
/// Returns |A ∩ B| / |A ∪ B| in [0.0, 1.0]; two empty sets are
/// treated as identical and yield 1.0.
fn jaccard_index(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    // Convention: nothing vs. nothing is a perfect match.
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }

    let overlap = a.intersection(b).count() as f64;
    let combined = a.union(b).count() as f64;

    // `combined` can only be zero when both sets are empty, which was
    // handled above; keep the guard for defensive robustness.
    if combined == 0.0 {
        0.0
    } else {
        overlap / combined
    }
}
244+
245+
/// Extract significant tokens from a diff.
246+
/// Captures the vocabulary of the change (variable names, keywords, types).
247+
fn tokenize_diff(diff: &str) -> HashSet<String> {
248+
let mut tokens = HashSet::new();
249+
for line in diff.lines() {
250+
// Only process add/remove lines, skip headers
251+
if (!line.starts_with('+') && !line.starts_with('-'))
252+
|| line.starts_with("+++")
253+
|| line.starts_with("---")
254+
{
255+
continue;
256+
}
257+
258+
// Split by non-alphanumeric characters to get words
259+
for word in line[1..].split(|c: char| !c.is_alphanumeric()) {
260+
if word.len() > 2 {
261+
// Skip tiny tokens
262+
tokens.insert(word.to_string());
263+
}
264+
}
265+
}
266+
tokens
267+
}
268+
231269
/// Group source files by diff-shape similarity.
232270
///
233271
/// Files with very similar fingerprints (same kind of transformation)
@@ -237,34 +275,47 @@ impl CommitSplitter {
237275
return vec![files.to_vec()];
238276
}
239277

240-
let fingerprints: Vec<(&FileChange, DiffFingerprint)> = files
278+
// Pre-calculate features for all files
279+
let features: Vec<(&FileChange, DiffFingerprint, HashSet<String>)> = files
241280
.iter()
242-
.map(|f| (*f, Self::diff_fingerprint(f)))
281+
.map(|f| (*f, Self::diff_fingerprint(f), Self::tokenize_diff(&f.diff)))
243282
.collect();
244283

245-
// Greedy clustering: assign each file to the first similar group, or create new
246-
let mut clusters: Vec<(DiffFingerprint, Vec<&'a FileChange>)> = Vec::new();
284+
// Greedy clustering with hybrid similarity
285+
let mut clusters: Vec<(DiffFingerprint, HashSet<String>, Vec<&'a FileChange>)> = Vec::new();
247286

248-
for (file, fp) in &fingerprints {
287+
for (file, fp, tokens) in &features {
249288
let mut assigned = false;
250289

251-
for (centroid, members) in &mut clusters {
252-
if centroid.is_similar(fp) {
253-
members.push(file);
254-
assigned = true;
255-
break;
290+
for (centroid_fp, centroid_tokens, members) in &mut clusters {
291+
// Hybrid similarity:
292+
// 1. Must be statistically similar (shape/size)
293+
// 2. Must share content vocabulary (Jaccard index)
294+
295+
if centroid_fp.is_similar(fp) {
296+
let content_sim = Self::jaccard_index(centroid_tokens, tokens);
297+
298+
// Threshold: 0.4 implies significant vocabulary overlap
299+
// e.g., sharing variable names, types, or specific syntax patterns
300+
if content_sim > 0.4 {
301+
members.push(file);
302+
// Keep the first file's tokens as the cluster centroid: for greedy
303+
// clustering this is simple and stable, whereas unioning each new
// member's tokens would let the cluster vocabulary drift over time.
304+
assigned = true;
305+
break;
306+
}
256307
}
257308
}
258309

259310
if !assigned {
260-
clusters.push((fp.clone(), vec![file]));
311+
clusters.push((fp.clone(), tokens.clone(), vec![file]));
261312
}
262313
}
263314

264315
// If clustering produced only 1 group, check if it's genuinely uniform
265316
// or if we should fall back to module-based splitting.
266317
if clusters.len() == 1 {
267-
let (centroid, cluster_files) = &clusters[0];
318+
let (centroid, _, cluster_files) = &clusters[0];
268319

269320
// If all files have non-empty, highly balanced diffs (adds ≈ removes) and are
270321
// small, they likely received the same mechanical transformation → keep grouped.
@@ -290,7 +341,7 @@ impl CommitSplitter {
290341

291342
let mut result: Vec<Vec<&'a FileChange>> = Vec::new();
292343

293-
for (_, cluster_files) in clusters {
344+
for (_, _, cluster_files) in clusters {
294345
// E2: Sub-split large clusters that span multiple modules.
295346
// If a group has >6 files across multiple modules, the shape clustering
296347
// wasn't discriminating enough — split by module to avoid mega-groups.

0 commit comments

Comments
 (0)