Skip to content

Commit e36f629

Browse files
neuron7xLab and claude committed
fix(robustness): expose candidate_count, mark placeholder jitter N/A
Task 2 of the DECISION_GRADE escalation — cleans the evidence table so no reader can confuse a tautological measurement for a real one, and forbids placeholder jitter from asserting a live pass. ## CPCV: candidate_count + interpretation KuramotoCPCVResult now carries: - pbo_candidate_count: int (2 for fold-mirror) - pbo_interpretation: str ('tautological' for n<3) - loo_pbo_interpretation: str ('admissible' for n>=5) Interpretation rule is a single module-level helper: n < 3 → 'tautological' (best-IS trivially best) n < 5 → 'weak' (low statistical power) n >= 5 → 'admissible' The fold-mirror PBO is retained as a sanity baseline but the markdown row now explicitly labels it n=2, *tautological*. The LOO-grid PBO is labelled n=13, *admissible* and carries the real signal. ## Jitter: placeholder forces fraction_within_tol_pass=False kuramoto_jitter_suite.run_kuramoto_jitter_suite() now sets fraction_within_tol_pass=False whenever evaluator_mode != 'LIVE', regardless of the raw fraction-within-tol. The stability dataclass retains the raw fraction honestly — it is only the decision-layer pass boolean that is forced to False. Decision layer reason string is now placeholder-aware: - placeholder → 'jitter: placeholder evaluator — abstains from live ✓/✗' - live failure → 'jitter: fraction-within-tol below threshold' ## Evidence-table presentation ROBUSTNESS_RESULTS.md now shows: | CPCV | PBO (fold mirror, n=2, *tautological*) | 0.0000 | ✓ | | CPCV | PBO (LOO grid, n=13, *admissible*) | 0.2000 | ✓ | | Jitter | fraction_within_tol | 1.0000 | N/A | | Jitter | evaluator_mode | `PLACEHOLDER_APPROXIMATION` (…) | n/a | No ✓ appears on any placeholder row. The tautological PBO is surfaced explicitly; no reader will mistake it for a statistically meaningful overfit test. ## Tests - test_pbo_candidate_count_and_interpretation — fold-mirror is always n=2/tautological, LOO is n=13/admissible. 
- test_placeholder_forces_pass_false — placeholder evaluator must set fraction_within_tol_pass=False regardless of raw fraction. All 60/60 robustness tests green; mypy --strict clean across 21 files; 28/28 frozen artefacts intact. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 2d9bf67 commit e36f629

9 files changed

Lines changed: 109 additions & 22 deletions

File tree

backtest/robustness_gates.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,10 @@ def evaluate_robustness_gates(
113113
jitter_pass = bool(evidence.jitter.fraction_within_tol_pass)
114114
jitter_is_placeholder = evidence.jitter.evaluator_mode != "LIVE"
115115
if not jitter_pass:
116-
reasons.append("jitter: fraction-within-tol below threshold")
116+
if jitter_is_placeholder:
117+
reasons.append("jitter: placeholder evaluator — abstains from live ✓/✗")
118+
else:
119+
reasons.append("jitter: fraction-within-tol below threshold")
117120

118121
if evidence.cpcv.n_folds < 2:
119122
reasons.append("cpcv: fewer than 2 folds available")

research/robustness/protocols/kuramoto_cpcv_suite.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,33 @@
2020
LOO_PBO_PASS_THRESHOLD: Final[float] = 0.50
2121

2222

23+
PBO_TAUTOLOGICAL_CUTOFF: Final[int] = 3
24+
PBO_WEAK_CUTOFF: Final[int] = 5
25+
26+
27+
def _pbo_interpretation(n_candidates: int) -> str:
    """Classify a PBO measurement by the number of candidates compared.

    The cutoffs are scanned in ascending order and the first one that
    ``n_candidates`` falls strictly below decides the label:

    * below ``PBO_TAUTOLOGICAL_CUTOFF`` -> ``'tautological'``
    * below ``PBO_WEAK_CUTOFF``         -> ``'weak'``
    * otherwise                         -> ``'admissible'``
    """
    # Order matters: the tautological bound must be tested before the weak one.
    labelled_cutoffs = (
        (PBO_TAUTOLOGICAL_CUTOFF, "tautological"),
        (PBO_WEAK_CUTOFF, "weak"),
    )
    for upper_bound, label in labelled_cutoffs:
        if n_candidates < upper_bound:
            return label
    return "admissible"
39+
40+
2341
@dataclass(frozen=True)
2442
class KuramotoCPCVResult:
2543
"""Aggregate of the CPCV/PBO/PSR suite on frozen evidence."""
2644

2745
fold_sharpes: tuple[float, ...]
2846
pbo: float
2947
pbo_pass: bool
48+
pbo_candidate_count: int
49+
pbo_interpretation: str
3050
psr_daily: float
3151
psr_pass: bool
3252
annualised_sharpe: float
@@ -35,6 +55,7 @@ class KuramotoCPCVResult:
3555
loo_pbo: float | None
3656
loo_pbo_pass: bool
3757
loo_n_strategies: int
58+
loo_pbo_interpretation: str
3859

3960

4061
def _fold_oos_matrix(fold_sharpes: tuple[float, ...]) -> NDArray[np.float64]:
@@ -105,10 +126,17 @@ def run_kuramoto_cpcv_suite(
105126
loo_pbo = estimate_pbo(loo_oos)
106127
loo_pbo_pass = loo_pbo < LOO_PBO_PASS_THRESHOLD
107128

129+
# The fold-mirror PBO uses exactly 2 "strategies" (anchor + median-
130+
# shifted mirror); by construction this is below PBO_TAUTOLOGICAL_CUTOFF
131+
# and the interpretation is always 'tautological'. It is retained as
132+
# a sanity-check baseline; the LOO-grid PBO is the decision-grade one.
133+
fold_mirror_candidate_count = 2
108134
return KuramotoCPCVResult(
109135
fold_sharpes=fold_sharpes,
110136
pbo=pbo,
111137
pbo_pass=pbo < PBO_PASS_THRESHOLD,
138+
pbo_candidate_count=fold_mirror_candidate_count,
139+
pbo_interpretation=_pbo_interpretation(fold_mirror_candidate_count),
112140
psr_daily=psr,
113141
psr_pass=(psr >= PSR_PASS_THRESHOLD) if np.isfinite(psr) else False,
114142
annualised_sharpe=sr,
@@ -117,4 +145,5 @@ def run_kuramoto_cpcv_suite(
117145
loo_pbo=loo_pbo,
118146
loo_pbo_pass=loo_pbo_pass,
119147
loo_n_strategies=loo_n_strategies,
148+
loo_pbo_interpretation=_pbo_interpretation(loo_n_strategies),
120149
)

research/robustness/protocols/kuramoto_jitter_suite.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,17 @@ def run_kuramoto_jitter_suite(
8989
sharpe_tolerance=sharpe_tolerance,
9090
seed=seed,
9191
)
92+
# When the evaluator is the placeholder, jitter evidence is NOT
93+
# decision-grade — force the pass boolean to False regardless of the
94+
# observed fraction-within-tol. The decision layer carries a separate
95+
# 'jitter_is_placeholder' flag that routes to INSUFFICIENT_EVIDENCE
96+
# when require_live_jitter is True; otherwise placeholder-with-False
97+
# simply abstains from asserting a live pass.
98+
is_placeholder = EVALUATOR_MODE != "LIVE"
99+
raw_pass = stability.fraction_within_tol >= fraction_within_tol_pass
92100
return KuramotoJitterSuiteResult(
93101
stability=stability,
94102
evaluator_mode=EVALUATOR_MODE,
95-
fraction_within_tol_pass=(stability.fraction_within_tol >= fraction_within_tol_pass),
103+
fraction_within_tol_pass=False if is_placeholder else raw_pass,
96104
pass_threshold=fraction_within_tol_pass,
97105
)

results/cross_asset_kuramoto/robustness_v1/ROBUSTNESS_RESULTS.md

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,23 @@ Terminal decision: **FAIL**
66

77
| Suite | Metric | Value | Pass |
88
|---|---|---:|:-:|
9-
| CPCV | PBO (fold mirror) | 0.0000 | ✓ |
9+
| CPCV | PBO (fold mirror, n=2, *tautological*) | 0.0000 | ✓ |
1010
| CPCV | PSR (daily) | 1.0000 | ✓ |
1111
| CPCV | Annualised Sharpe (daily) | 0.4832 | n/a |
12-
| CPCV | PBO (LOO grid, n=13) | 0.2000 | ✓ |
12+
| CPCV | PBO (LOO grid, n=13, *admissible*) | 0.2000 | ✓ |
1313
| Null | iid_bootstrap p-value | 0.5045 ||
1414
| Null | stationary_bootstrap p-value | 0.5235 ||
15-
| Jitter | fraction_within_tol | 1.0000 | ✓ |
16-
| Jitter | evaluator_mode | `PLACEHOLDER_APPROXIMATION` | n/a |
15+
| Jitter | fraction_within_tol | 1.0000 | N/A |
16+
| Jitter | evaluator_mode | `PLACEHOLDER_APPROXIMATION` (not decision-grade; live evaluator required to flip this row to ✓ / ✗) | n/a |
1717

1818
## Reasons
1919

2020
- null: one or more families failed
21+
- jitter: placeholder evaluator — abstains from live ✓/✗
2122

2223
## Notes
2324

2425
- Evidence is derived from the frozen `offline_robustness/SOURCE_HASHES.json` bundle; 28 artifacts hash-verified.
25-
- Null suite uses cumulative-return pct_change as a return proxy; raw `net_ret` is not in the frozen demo bundle, which limits statistical power relative to the published headline Sharpe (`risk_metrics.csv::sharpe = 1.2619`).
26-
- Jitter evaluator is PLACEHOLDER_APPROXIMATION: rebuild under perturbed parameters requires the raw asset panel.
26+
- Null suite uses mathematically exact daily log-returns (`diff(log(strategy_cumret))`) — no approximation. See `ROBUSTNESS_PROTOCOL.md` § 1 for the derivation contract.
27+
- PBO interpretation: fewer than 3 candidates is `tautological`, fewer than 5 is `weak`, 5+ is `admissible`. The fold-mirror PBO is always tautological by construction and is kept only as a sanity baseline; the LOO-grid PBO is the decision-grade one.
28+
- Jitter row shows `N/A` while the evaluator is `PLACEHOLDER_APPROXIMATION`; a live rebuild is required to replace the row with a real ✓ / ✗.

results/cross_asset_kuramoto/robustness_v1/cpcv_summary.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,13 @@
99
],
1010
"loo_n_strategies": 13,
1111
"loo_pbo": 0.2,
12+
"loo_pbo_interpretation": "admissible",
1213
"loo_pbo_pass": true,
1314
"n_bars": 2166,
1415
"n_folds": 5,
1516
"pbo": 0.0,
17+
"pbo_candidate_count": 2,
18+
"pbo_interpretation": "tautological",
1619
"pbo_pass": true,
1720
"psr_daily": 1.0,
1821
"psr_pass": true

results/cross_asset_kuramoto/robustness_v1/jitter_summary.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"evaluator_mode": "PLACEHOLDER_APPROXIMATION",
3-
"fraction_within_tol_pass": true,
3+
"fraction_within_tol_pass": false,
44
"pass_threshold": 0.8,
55
"stability": {
66
"anchor_sharpe": 1.2619,

results/cross_asset_kuramoto/robustness_v1/verdict.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
"cpcv_pass": true,
55
"input_source": "daily_log_returns",
66
"jitter_is_placeholder": true,
7-
"jitter_pass": true,
7+
"jitter_pass": false,
88
"label": "FAIL",
99
"label_qualifier": "FAIL_ON_DAILY_RETURNS",
1010
"null_pass": false,
1111
"reasons": [
12-
"null: one or more families failed"
12+
"null: one or more families failed",
13+
"jitter: placeholder evaluator \u2014 abstains from live \u2713/\u2717"
1314
]
1415
}

scripts/run_kuramoto_robustness_v1.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,14 +61,19 @@ def _render_markdown(
6161
"",
6262
"| Suite | Metric | Value | Pass |",
6363
"|---|---|---:|:-:|",
64-
f"| CPCV | PBO (fold mirror) | {cpcv_dict['pbo']:.4f} | {'✓' if cpcv_dict['pbo_pass'] else '✗'} |",
65-
f"| CPCV | PSR (daily) | {cpcv_dict['psr_daily']:.4f} | {'✓' if cpcv_dict['psr_pass'] else '✗'} |",
64+
f"| CPCV | PBO (fold mirror, n={cpcv_dict['pbo_candidate_count']}, "
65+
f"*{cpcv_dict['pbo_interpretation']}*) | "
66+
f"{cpcv_dict['pbo']:.4f} | "
67+
f"{'✓' if cpcv_dict['pbo_pass'] else '✗'} |",
68+
f"| CPCV | PSR (daily) | {cpcv_dict['psr_daily']:.4f} | "
69+
f"{'✓' if cpcv_dict['psr_pass'] else '✗'} |",
6670
f"| CPCV | Annualised Sharpe (daily) | {cpcv_dict['annualised_sharpe']:.4f} | n/a |",
6771
]
6872
loo_pbo = cpcv_dict.get("loo_pbo")
6973
if loo_pbo is not None:
7074
lines.append(
71-
f"| CPCV | PBO (LOO grid, n={cpcv_dict['loo_n_strategies']}) | "
75+
f"| CPCV | PBO (LOO grid, n={cpcv_dict['loo_n_strategies']}, "
76+
f"*{cpcv_dict['loo_pbo_interpretation']}*) | "
7277
f"{loo_pbo:.4f} | "
7378
f"{'✓' if cpcv_dict['loo_pbo_pass'] else '✗'} |"
7479
)
@@ -78,12 +83,22 @@ def _render_markdown(
7883
f"{family['p_value']:.4f} | "
7984
f"{'✓' if family['p_value_pass'] else '✗'} |"
8085
)
86+
jitter_is_placeholder = jitter_dict["evaluator_mode"] != "LIVE"
87+
if jitter_is_placeholder:
88+
jitter_pass_cell = "N/A"
89+
jitter_note = (
90+
"`PLACEHOLDER_APPROXIMATION` (not decision-grade; live evaluator "
91+
"required to flip this row to ✓ / ✗)"
92+
)
93+
else:
94+
jitter_pass_cell = "✓" if jitter_dict["fraction_within_tol_pass"] else "✗"
95+
jitter_note = f"`{jitter_dict['evaluator_mode']}`"
8196
lines.extend(
8297
[
8398
f"| Jitter | fraction_within_tol | "
8499
f"{jitter_dict['stability']['fraction_within_tol']:.4f} | "
85-
f"{'✓' if jitter_dict['fraction_within_tol_pass'] else '✗'} |",
86-
f"| Jitter | evaluator_mode | `{jitter_dict['evaluator_mode']}` | n/a |",
100+
f"{jitter_pass_cell} |",
101+
f"| Jitter | evaluator_mode | {jitter_note} | n/a |",
87102
"",
88103
"## Reasons",
89104
"",
@@ -100,12 +115,16 @@ def _render_markdown(
100115
"",
101116
"- Evidence is derived from the frozen `offline_robustness/"
102117
"SOURCE_HASHES.json` bundle; 28 artifacts hash-verified.",
103-
"- Null suite uses cumulative-return pct_change as a return proxy;"
104-
" raw `net_ret` is not in the frozen demo bundle, which limits"
105-
" statistical power relative to the published headline Sharpe"
106-
" (`risk_metrics.csv::sharpe = 1.2619`).",
107-
"- Jitter evaluator is PLACEHOLDER_APPROXIMATION: rebuild under"
108-
" perturbed parameters requires the raw asset panel.",
118+
"- Null suite uses mathematically exact daily log-returns "
119+
"(`diff(log(strategy_cumret))`) — no approximation. See "
120+
"`ROBUSTNESS_PROTOCOL.md` § 1 for the derivation contract.",
121+
"- PBO interpretation: fewer than 3 candidates is `tautological`, "
122+
"fewer than 5 is `weak`, 5+ is `admissible`. The fold-mirror PBO "
123+
"is always tautological by construction and is kept only as a "
124+
"sanity baseline; the LOO-grid PBO is the decision-grade one.",
125+
"- Jitter row shows `N/A` while the evaluator is "
126+
"`PLACEHOLDER_APPROXIMATION`; a live rebuild is required to "
127+
"replace the row with a real ✓ / ✗.",
109128
"",
110129
]
111130
)

tests/research/robustness/test_kuramoto_suites.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,17 @@ def test_loo_pbo_matches_hand_computed(self, contract: KuramotoRobustnessContrac
5050
assert r.loo_pbo is not None
5151
assert abs(r.loo_pbo - 0.20) < 1e-9
5252

53+
def test_pbo_candidate_count_and_interpretation(
    self, contract: KuramotoRobustnessContract
) -> None:
    """The CPCV suite must label each PBO with its candidate count.

    Fold-mirror PBO: two candidates, interpreted as 'tautological'.
    LOO-grid PBO: thirteen strategies, interpreted as 'admissible'.
    """
    outcome = run_kuramoto_cpcv_suite(contract)

    # Fold-mirror baseline.
    fold_mirror = (outcome.pbo_candidate_count, outcome.pbo_interpretation)
    assert fold_mirror == (2, "tautological")

    # Leave-one-out grid.
    loo_grid = (outcome.loo_n_strategies, outcome.loo_pbo_interpretation)
    assert loo_grid == (13, "admissible")
63+
5364

5465
class TestNullSuite:
5566
def test_two_families_returned_and_bounded(self, contract: KuramotoRobustnessContract) -> None:
@@ -96,3 +107,14 @@ def test_placeholder_quadratic_monotonicity(self, contract: KuramotoRobustnessCo
96107
# Sharpe that exceeds the anchor by construction.
97108
r = run_kuramoto_jitter_suite(contract, n_candidates=64)
98109
assert max(r.stability.perturbed_sharpes) <= r.stability.anchor_sharpe
110+
111+
def test_placeholder_forces_pass_false(self, contract: KuramotoRobustnessContract) -> None:
    """A placeholder evaluator must never report a jitter pass.

    With the evaluator in PLACEHOLDER_APPROXIMATION mode the suite's
    ``fraction_within_tol_pass`` has to be exactly ``False``, while the
    raw fraction on the stability record remains a valid value in [0, 1].
    """
    outcome = run_kuramoto_jitter_suite(contract, n_candidates=32)

    assert outcome.evaluator_mode == "PLACEHOLDER_APPROXIMATION"
    assert outcome.fraction_within_tol_pass is False

    # The raw measurement is still reported, only the pass flag is forced.
    observed_fraction = outcome.stability.fraction_within_tol
    assert 0.0 <= observed_fraction <= 1.0

0 commit comments

Comments
 (0)