Skip to content

Commit a778dac

Browse files
committed
feat(eval): complete eval harness with 38 fixtures and aggregate reporting
- 26 new fixtures (38 total) covering all 11 commit types, AST features, and edge cases across Rust, TypeScript, Python, Go
- EvalSummary with per-type accuracy breakdown and overall score
- Summary prints after CLI eval runs
- 348 tests passing with eval feature
1 parent 1396b52 commit a778dac

61 files changed

Lines changed: 1599 additions & 9 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

src/eval.rs

Lines changed: 92 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -286,20 +286,23 @@ impl EvalRunner {
286286

287287
self.print_results(&results);
288288

289-
let failed = results.iter().filter(|r| !r.passed()).count();
290-
if failed > 0 {
289+
// Print aggregate summary
290+
let summary = EvalSummary::from_results(&results);
291+
eprintln!("{}", summary.format_report());
292+
293+
if summary.total_failed > 0 {
291294
eprintln!(
292-
"\n{} {} fixture(s) failed",
295+
"{} {} fixture(s) failed",
293296
style("FAIL").red().bold(),
294-
failed,
297+
summary.total_failed,
295298
);
296299
std::process::exit(1);
297300
}
298301

299302
eprintln!(
300-
"\n{} All {} fixture(s) passed",
303+
"{} All {} fixture(s) passed",
301304
style("PASS").green().bold(),
302-
results.len(),
305+
summary.total_passed,
303306
);
304307

305308
Ok(())
@@ -1015,6 +1018,89 @@ impl EvalRunner {
10151018
}
10161019
}
10171020

1021+
/// Aggregate evaluation summary with per-type accuracy breakdown.
///
/// Derives `Clone`, `PartialEq`, `Eq`, and `Default` in addition to `Debug`
/// so summaries can be copied, compared with `assert_eq!` in tests, and
/// created empty via `EvalSummary::default()`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EvalSummary {
    /// Number of eval results the summary was built from.
    pub total_fixtures: usize,
    /// Number of results that passed.
    pub total_passed: usize,
    /// Number of results that did not pass (`total_fixtures - total_passed`).
    pub total_failed: usize,
    /// Per-type accuracy: (type_name, passed, total).
    pub per_type: Vec<(String, usize, usize)>,
}
1030+
1031+
impl EvalSummary {
1032+
/// Build a summary from eval results.
1033+
#[must_use]
1034+
pub fn from_results(results: &[EvalResult]) -> Self {
1035+
let total_fixtures = results.len();
1036+
let total_passed = results.iter().filter(|r| r.passed()).count();
1037+
let total_failed = total_fixtures - total_passed;
1038+
1039+
// Group by expected_type
1040+
let mut type_map: std::collections::BTreeMap<String, (usize, usize)> =
1041+
std::collections::BTreeMap::new();
1042+
1043+
for result in results {
1044+
let key = result.expected_type.to_lowercase();
1045+
if key.is_empty() {
1046+
continue;
1047+
}
1048+
let entry = type_map.entry(key).or_insert((0, 0));
1049+
entry.1 += 1; // total
1050+
if result.passed() {
1051+
entry.0 += 1; // passed
1052+
}
1053+
}
1054+
1055+
let per_type: Vec<(String, usize, usize)> = type_map
1056+
.into_iter()
1057+
.map(|(k, (passed, total))| (k, passed, total))
1058+
.collect();
1059+
1060+
Self {
1061+
total_fixtures,
1062+
total_passed,
1063+
total_failed,
1064+
per_type,
1065+
}
1066+
}
1067+
1068+
/// Format the summary as a human-readable report.
1069+
#[must_use]
1070+
pub fn format_report(&self) -> String {
1071+
let mut report = String::new();
1072+
1073+
report.push_str("=== Eval Summary ===\n\n");
1074+
1075+
// Per-type breakdown
1076+
report.push_str("Per-type accuracy:\n");
1077+
for (type_name, passed, total) in &self.per_type {
1078+
let pct = if *total > 0 {
1079+
(*passed as f64 / *total as f64) * 100.0
1080+
} else {
1081+
0.0
1082+
};
1083+
report.push_str(&format!(
1084+
" {}: {}/{} ({:.0}%)\n",
1085+
type_name, passed, total, pct
1086+
));
1087+
}
1088+
1089+
// Overall score
1090+
let overall_pct = if self.total_fixtures > 0 {
1091+
(self.total_passed as f64 / self.total_fixtures as f64) * 100.0
1092+
} else {
1093+
0.0
1094+
};
1095+
report.push_str(&format!(
1096+
"\nOverall: {}/{} ({:.1}%)\n",
1097+
self.total_passed, self.total_fixtures, overall_pct
1098+
));
1099+
1100+
report
1101+
}
1102+
}
1103+
10181104
/// Parse a symbol kind string from TOML into `SymbolKind`.
10191105
fn parse_symbol_kind(kind: &str) -> SymbolKind {
10201106
match kind.to_lowercase().as_str() {

tests/eval.rs

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,35 @@ fn breaking_change_fixtures() {
184184
fn fixture_count() {
185185
let runner = EvalRunner::new(fixtures_dir(), None);
186186
let results = runner.run_sync().expect("eval runner should not error");
187-
// 2 original (simple-feat, style-only) + 10 new = 12
187+
// 12 original + 26 new = 38
188188
assert!(
189-
results.len() >= 12,
190-
"Expected at least 12 fixtures, found {}",
189+
results.len() >= 38,
190+
"Expected at least 38 fixtures, found {}",
191191
results.len()
192192
);
193193
}
194+
195+
/// Runs every fixture, prints the aggregate per-type accuracy report, and
/// checks that the summary totals agree with the raw results.
#[test]
fn aggregate_summary() {
    let runner = EvalRunner::new(fixtures_dir(), None);
    let results = runner.run_sync().expect("eval runner should not error");
    let summary = commitbee::eval::EvalSummary::from_results(&results);

    // Emit the report on stderr so it shows up under `--nocapture`.
    eprintln!("\n{}", summary.format_report());

    // Passed and failed counts must partition the fixture total.
    let accounted_for = summary.total_passed + summary.total_failed;
    assert_eq!(
        accounted_for, summary.total_fixtures,
        "passed + failed should equal total"
    );

    // The summary must cover exactly the results it was built from.
    let result_count = results.len();
    assert_eq!(
        summary.total_fixtures, result_count,
        "summary total should match results count"
    );
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
diff --git a/src/services/api.rs b/src/services/api.rs
2+
index abc1234..def5678 100644
3+
--- a/src/services/api.rs
4+
+++ b/src/services/api.rs
5+
@@ -5,20 +5,5 @@ use crate::error::Result;
6+
7+
pub struct ApiClient {
8+
base_url: String,
9+
}
10+
11+
-/// Send a GET request to the given path.
12+
-pub fn api_get(client: &ApiClient, path: &str) -> Result<Response> {
13+
- let url = format!("{}/{}", client.base_url, path);
14+
- reqwest::blocking::get(&url).map_err(|e| Error::Network(e.to_string()))
15+
-}
16+
-
17+
-/// Send a POST request with a JSON body.
18+
-pub fn api_post(client: &ApiClient, path: &str, body: &str) -> Result<Response> {
19+
- let url = format!("{}/{}", client.base_url, path);
20+
- reqwest::blocking::Client::new()
21+
- .post(&url)
22+
- .body(body.to_string())
23+
- .send()
24+
- .map_err(|e| Error::Network(e.to_string()))
25+
-}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# SPDX-FileCopyrightText: 2026 Sephyi <me@sephy.io>
2+
#
3+
# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
4+
5+
name = "ast-breaking-public-removal"
6+
description = "Removing public API without replacement should signal breaking change"
7+
language = "rust"
8+
category = "ast"
9+
expected_type = "refactor"
10+
expected_scope = "optional"
11+
12+
[evidence]
13+
has_new_public_api = false
14+
public_api_removed_count = 2
15+
16+
[prompt]
17+
must_contain = ["SYMBOLS CHANGED", "Removed"]
18+
19+
[breaking]
20+
expected = true
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[[symbols]]
2+
kind = "Function"
3+
name = "api_get"
4+
file = "src/services/api.rs"
5+
line = 10
6+
end_line = 13
7+
is_public = true
8+
is_added = false
9+
signature = "pub fn api_get(client: &ApiClient, path: &str) -> Result<Response>"
10+
11+
[[symbols]]
12+
kind = "Function"
13+
name = "api_post"
14+
file = "src/services/api.rs"
15+
line = 16
16+
end_line = 22
17+
is_public = true
18+
is_added = false
19+
signature = "pub fn api_post(client: &ApiClient, path: &str, body: &str) -> Result<Response>"
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
diff --git a/src/services/validator.rs b/src/services/validator.rs
2+
index abc1234..def5678 100644
3+
--- a/src/services/validator.rs
4+
+++ b/src/services/validator.rs
5+
@@ -5,8 +5,8 @@ use crate::error::Result;
6+
7+
impl Validator {
8+
- pub fn check(&self, input: &str) -> bool {
9+
- !input.is_empty() && input.len() < 1024
10+
+ pub fn check(&self, input: &str) -> bool {
11+
+ !input.is_empty() && input.len() < 1024
12+
}
13+
}
14+
15+
@@ -15,0 +16,10 @@
16+
+/// Validate a batch of inputs, returning the first error.
17+
+pub fn validate_batch(inputs: &[&str]) -> Result<()> {
18+
+ for input in inputs {
19+
+ if input.is_empty() {
20+
+ return Err(crate::error::Error::Config("empty input in batch".into()));
21+
+ }
22+
+ }
23+
+ Ok(())
24+
+}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# SPDX-FileCopyrightText: 2026 Sephyi <me@sephy.io>
2+
#
3+
# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
4+
5+
name = "ast-mixed-ws-semantic"
6+
description = "Mix of whitespace and semantic changes should produce feat, not style"
7+
language = "rust"
8+
category = "ast"
9+
expected_type = "feat"
10+
expected_scope = "optional"
11+
12+
[evidence]
13+
has_new_public_api = true
14+
15+
[prompt]
16+
must_contain = ["SYMBOLS CHANGED", "Added"]
17+
18+
[breaking]
19+
expected = false
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
[[symbols]]
2+
kind = "Function"
3+
name = "validate_batch"
4+
file = "src/services/validator.rs"
5+
line = 16
6+
end_line = 24
7+
is_public = true
8+
is_added = true
9+
signature = "pub fn validate_batch(inputs: &[&str]) -> Result<()>"
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
diff --git a/src/services/json_format.rs b/src/services/json_format.rs
2+
new file mode 100644
3+
index 0000000..abc1234
4+
--- /dev/null
5+
+++ b/src/services/json_format.rs
6+
@@ -0,0 +1,10 @@
7+
+use crate::error::Result;
8+
+
9+
+pub struct JsonFormatter;
10+
+
11+
+impl JsonFormatter {
12+
+ pub fn format(&self, data: &str) -> Result<String> {
13+
+ let parsed: serde_json::Value = serde_json::from_str(data)?;
14+
+ Ok(serde_json::to_string_pretty(&parsed)?)
15+
+ }
16+
+}
17+
diff --git a/src/services/yaml_format.rs b/src/services/yaml_format.rs
18+
new file mode 100644
19+
index 0000000..def5678
20+
--- /dev/null
21+
+++ b/src/services/yaml_format.rs
22+
@@ -0,0 +1,10 @@
23+
+use crate::error::Result;
24+
+
25+
+pub struct YamlFormatter;
26+
+
27+
+impl YamlFormatter {
28+
+ pub fn format(&self, data: &str) -> Result<String> {
29+
+ let parsed: serde_yaml::Value = serde_yaml::from_str(data)?;
30+
+ Ok(serde_yaml::to_string(&parsed)?)
31+
+ }
32+
+}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# SPDX-FileCopyrightText: 2026 Sephyi <me@sephy.io>
2+
#
3+
# SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
4+
5+
name = "ast-overloaded-symbols"
6+
description = "Multiple symbols with same name across files should all appear"
7+
language = "rust"
8+
category = "ast"
9+
expected_type = "feat"
10+
expected_scope = "optional"
11+
12+
[evidence]
13+
has_new_public_api = true
14+
15+
[prompt]
16+
must_contain = ["SYMBOLS CHANGED", "Added"]
17+
18+
[breaking]
19+
expected = false

0 commit comments

Comments
 (0)