fix(robustness): expose candidate_count, mark placeholder jitter N/A

neuron7xLab · claude · neuron7xLab · commit e36f629cc89c · 2026-04-22T13:49:00.000+03:00
Task 2 of the DECISION_GRADE escalation — cleans the evidence table so
no reader can confuse a tautological measurement for a real one, and
forbids placeholder jitter from asserting a live pass.

## CPCV: candidate_count + interpretation

KuramotoCPCVResult now carries:
  - pbo_candidate_count: int       (2 for fold-mirror)
  - pbo_interpretation: str        ('tautological' for n<3)
  - loo_pbo_interpretation: str    ('admissible' for n>=5)

Interpretation rule is a single module-level helper:
  n < 3 → 'tautological'   (best-IS trivially best)
  n < 5 → 'weak'           (low statistical power)
  n >= 5 → 'admissible'

The fold-mirror PBO is retained as a sanity baseline but the markdown
row now explicitly labels it n=2, *tautological*. The LOO-grid PBO is
labelled n=13, *admissible* and carries the real signal.

## Jitter: placeholder forces fraction_within_tol_pass=False

kuramoto_jitter_suite.run_kuramoto_jitter_suite() now sets
fraction_within_tol_pass=False whenever evaluator_mode != 'LIVE',
regardless of the raw fraction-within-tol. The stability dataclass
retains the raw fraction honestly — it is only the decision-layer pass
boolean that is forced to False.

Decision layer reason string is now placeholder-aware:
  - placeholder → 'jitter: placeholder evaluator — abstains from live ✓/✗'
  - live failure → 'jitter: fraction-within-tol below threshold'

## Evidence-table presentation

ROBUSTNESS_RESULTS.md now shows:
  | CPCV | PBO (fold mirror, n=2, *tautological*) | 0.0000 | ✓ |
  | CPCV | PBO (LOO grid, n=13, *admissible*)    | 0.2000 | ✓ |
  | Jitter | fraction_within_tol                  | 1.0000 | N/A |
  | Jitter | evaluator_mode                       | `PLACEHOLDER_APPROXIMATION` (…) | n/a |

No ✓ appears on any placeholder row. The tautological PBO is surfaced
explicitly; no reader will mistake it for a statistically meaningful
overfit test.

## Tests

- test_pbo_candidate_count_and_interpretation — fold-mirror is always
  n=2/tautological, LOO is n=13/admissible.
- test_placeholder_forces_pass_false — placeholder evaluator must set
  fraction_within_tol_pass=False regardless of raw fraction.
All 60/60 robustness tests green; mypy --strict clean across 21 files;
28/28 frozen artefacts intact.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/backtest/robustness_gates.py b/backtest/robustness_gates.py
@@ -113,7 +113,10 @@ def evaluate_robustness_gates(
     jitter_pass = bool(evidence.jitter.fraction_within_tol_pass)
     jitter_is_placeholder = evidence.jitter.evaluator_mode != "LIVE"
     if not jitter_pass:
-        reasons.append("jitter: fraction-within-tol below threshold")
+        if jitter_is_placeholder:
+            reasons.append("jitter: placeholder evaluator — abstains from live ✓/✗")
+        else:
+            reasons.append("jitter: fraction-within-tol below threshold")
 
     if evidence.cpcv.n_folds < 2:
         reasons.append("cpcv: fewer than 2 folds available")
diff --git a/research/robustness/protocols/kuramoto_cpcv_suite.py b/research/robustness/protocols/kuramoto_cpcv_suite.py
@@ -20,13 +20,33 @@
 LOO_PBO_PASS_THRESHOLD: Final[float] = 0.50
 
 
+PBO_TAUTOLOGICAL_CUTOFF: Final[int] = 3
+PBO_WEAK_CUTOFF: Final[int] = 5
+
+
+def _pbo_interpretation(n_candidates: int) -> str:
+    """Label a PBO measurement by its candidate count.
+
+    Fewer than 3 strategies: 'tautological' (best-IS is trivially best).
+    Fewer than 5 strategies: 'weak' (low statistical power).
+    5 or more: 'admissible'.
+    """
+    if n_candidates < PBO_TAUTOLOGICAL_CUTOFF:
+        return "tautological"
+    if n_candidates < PBO_WEAK_CUTOFF:
+        return "weak"
+    return "admissible"
+
+
 @dataclass(frozen=True)
 class KuramotoCPCVResult:
     """Aggregate of the CPCV/PBO/PSR suite on frozen evidence."""
 
     fold_sharpes: tuple[float, ...]
     pbo: float
     pbo_pass: bool
+    pbo_candidate_count: int
+    pbo_interpretation: str
     psr_daily: float
     psr_pass: bool
     annualised_sharpe: float
@@ -35,6 +55,7 @@ class KuramotoCPCVResult:
     loo_pbo: float | None
     loo_pbo_pass: bool
     loo_n_strategies: int
+    loo_pbo_interpretation: str
 
 
 def _fold_oos_matrix(fold_sharpes: tuple[float, ...]) -> NDArray[np.float64]:
@@ -105,10 +126,17 @@ def run_kuramoto_cpcv_suite(
             loo_pbo = estimate_pbo(loo_oos)
             loo_pbo_pass = loo_pbo < LOO_PBO_PASS_THRESHOLD
 
+    # The fold-mirror PBO uses exactly 2 "strategies" (anchor + median-
+    # shifted mirror); by construction this is below PBO_TAUTOLOGICAL_CUTOFF
+    # and the interpretation is always 'tautological'. It is retained as
+    # a sanity-check baseline; the LOO-grid PBO is the decision-grade one.
+    fold_mirror_candidate_count = 2
     return KuramotoCPCVResult(
         fold_sharpes=fold_sharpes,
         pbo=pbo,
         pbo_pass=pbo < PBO_PASS_THRESHOLD,
+        pbo_candidate_count=fold_mirror_candidate_count,
+        pbo_interpretation=_pbo_interpretation(fold_mirror_candidate_count),
         psr_daily=psr,
         psr_pass=(psr >= PSR_PASS_THRESHOLD) if np.isfinite(psr) else False,
         annualised_sharpe=sr,
@@ -117,4 +145,5 @@ def run_kuramoto_cpcv_suite(
         loo_pbo=loo_pbo,
         loo_pbo_pass=loo_pbo_pass,
         loo_n_strategies=loo_n_strategies,
+        loo_pbo_interpretation=_pbo_interpretation(loo_n_strategies),
     )
diff --git a/research/robustness/protocols/kuramoto_jitter_suite.py b/research/robustness/protocols/kuramoto_jitter_suite.py
@@ -89,9 +89,17 @@ def run_kuramoto_jitter_suite(
         sharpe_tolerance=sharpe_tolerance,
         seed=seed,
     )
+    # When the evaluator is the placeholder, jitter evidence is NOT
+    # decision-grade — force the pass boolean to False regardless of the
+    # observed fraction-within-tol. The decision layer carries a separate
+    # 'jitter_is_placeholder' flag that routes to INSUFFICIENT_EVIDENCE
+    # when require_live_jitter is True; otherwise placeholder-with-False
+    # simply abstains from asserting a live pass.
+    is_placeholder = EVALUATOR_MODE != "LIVE"
+    raw_pass = stability.fraction_within_tol >= fraction_within_tol_pass
     return KuramotoJitterSuiteResult(
         stability=stability,
         evaluator_mode=EVALUATOR_MODE,
-        fraction_within_tol_pass=(stability.fraction_within_tol >= fraction_within_tol_pass),
+        fraction_within_tol_pass=False if is_placeholder else raw_pass,
         pass_threshold=fraction_within_tol_pass,
     )
diff --git a/results/cross_asset_kuramoto/robustness_v1/ROBUSTNESS_RESULTS.md b/results/cross_asset_kuramoto/robustness_v1/ROBUSTNESS_RESULTS.md
@@ -6,21 +6,23 @@ Terminal decision: **FAIL**
 
 | Suite | Metric | Value | Pass |
 |---|---|---:|:-:|
-| CPCV | PBO (fold mirror) | 0.0000 | ✓ |
+| CPCV | PBO (fold mirror, n=2, *tautological*) | 0.0000 | ✓ |
 | CPCV | PSR (daily) | 1.0000 | ✓ |
 | CPCV | Annualised Sharpe (daily) | 0.4832 | n/a |
-| CPCV | PBO (LOO grid, n=13) | 0.2000 | ✓ |
+| CPCV | PBO (LOO grid, n=13, *admissible*) | 0.2000 | ✓ |
 | Null | iid_bootstrap p-value | 0.5045 | ✗ |
 | Null | stationary_bootstrap p-value | 0.5235 | ✗ |
-| Jitter | fraction_within_tol | 1.0000 | ✓ |
-| Jitter | evaluator_mode | `PLACEHOLDER_APPROXIMATION` | n/a |
+| Jitter | fraction_within_tol | 1.0000 | N/A |
+| Jitter | evaluator_mode | `PLACEHOLDER_APPROXIMATION` (not decision-grade; live evaluator required to flip this row to ✓ / ✗) | n/a |
 
 ## Reasons
 
 - null: one or more families failed
+- jitter: placeholder evaluator — abstains from live ✓/✗
 
 ## Notes
 
 - Evidence is derived from the frozen `offline_robustness/SOURCE_HASHES.json` bundle; 28 artifacts hash-verified.
-- Null suite uses cumulative-return pct_change as a return proxy; raw `net_ret` is not in the frozen demo bundle, which limits statistical power relative to the published headline Sharpe (`risk_metrics.csv::sharpe = 1.2619`).
-- Jitter evaluator is PLACEHOLDER_APPROXIMATION: rebuild under perturbed parameters requires the raw asset panel.
+- Null suite uses mathematically exact daily log-returns (`diff(log(strategy_cumret))`) — no approximation. See `ROBUSTNESS_PROTOCOL.md` § 1 for the derivation contract.
+- PBO interpretation: fewer than 3 candidates is `tautological`, fewer than 5 is `weak`, 5+ is `admissible`. The fold-mirror PBO is always tautological by construction and is kept only as a sanity baseline; the LOO-grid PBO is the decision-grade one.
+- Jitter row shows `N/A` while the evaluator is `PLACEHOLDER_APPROXIMATION`; a live rebuild is required to replace the row with a real ✓ / ✗.
diff --git a/results/cross_asset_kuramoto/robustness_v1/cpcv_summary.json b/results/cross_asset_kuramoto/robustness_v1/cpcv_summary.json
@@ -9,10 +9,13 @@
   ],
   "loo_n_strategies": 13,
   "loo_pbo": 0.2,
+  "loo_pbo_interpretation": "admissible",
   "loo_pbo_pass": true,
   "n_bars": 2166,
   "n_folds": 5,
   "pbo": 0.0,
+  "pbo_candidate_count": 2,
+  "pbo_interpretation": "tautological",
   "pbo_pass": true,
   "psr_daily": 1.0,
   "psr_pass": true
diff --git a/results/cross_asset_kuramoto/robustness_v1/jitter_summary.json b/results/cross_asset_kuramoto/robustness_v1/jitter_summary.json
@@ -1,6 +1,6 @@
 {
   "evaluator_mode": "PLACEHOLDER_APPROXIMATION",
-  "fraction_within_tol_pass": true,
+  "fraction_within_tol_pass": false,
   "pass_threshold": 0.8,
   "stability": {
     "anchor_sharpe": 1.2619,
diff --git a/results/cross_asset_kuramoto/robustness_v1/verdict.json b/results/cross_asset_kuramoto/robustness_v1/verdict.json
@@ -4,11 +4,12 @@
   "cpcv_pass": true,
   "input_source": "daily_log_returns",
   "jitter_is_placeholder": true,
-  "jitter_pass": true,
+  "jitter_pass": false,
   "label": "FAIL",
   "label_qualifier": "FAIL_ON_DAILY_RETURNS",
   "null_pass": false,
   "reasons": [
-    "null: one or more families failed"
+    "null: one or more families failed",
+    "jitter: placeholder evaluator \u2014 abstains from live \u2713/\u2717"
   ]
 }
diff --git a/scripts/run_kuramoto_robustness_v1.py b/scripts/run_kuramoto_robustness_v1.py
@@ -61,14 +61,19 @@ def _render_markdown(
         "",
         "| Suite | Metric | Value | Pass |",
         "|---|---|---:|:-:|",
-        f"| CPCV | PBO (fold mirror) | {cpcv_dict['pbo']:.4f} | {'✓' if cpcv_dict['pbo_pass'] else '✗'} |",
-        f"| CPCV | PSR (daily) | {cpcv_dict['psr_daily']:.4f} | {'✓' if cpcv_dict['psr_pass'] else '✗'} |",
+        f"| CPCV | PBO (fold mirror, n={cpcv_dict['pbo_candidate_count']}, "
+        f"*{cpcv_dict['pbo_interpretation']}*) | "
+        f"{cpcv_dict['pbo']:.4f} | "
+        f"{'✓' if cpcv_dict['pbo_pass'] else '✗'} |",
+        f"| CPCV | PSR (daily) | {cpcv_dict['psr_daily']:.4f} | "
+        f"{'✓' if cpcv_dict['psr_pass'] else '✗'} |",
         f"| CPCV | Annualised Sharpe (daily) | {cpcv_dict['annualised_sharpe']:.4f} | n/a |",
     ]
     loo_pbo = cpcv_dict.get("loo_pbo")
     if loo_pbo is not None:
         lines.append(
-            f"| CPCV | PBO (LOO grid, n={cpcv_dict['loo_n_strategies']}) | "
+            f"| CPCV | PBO (LOO grid, n={cpcv_dict['loo_n_strategies']}, "
+            f"*{cpcv_dict['loo_pbo_interpretation']}*) | "
             f"{loo_pbo:.4f} | "
             f"{'✓' if cpcv_dict['loo_pbo_pass'] else '✗'} |"
         )
@@ -78,12 +83,22 @@ def _render_markdown(
             f"{family['p_value']:.4f} | "
             f"{'✓' if family['p_value_pass'] else '✗'} |"
         )
+    jitter_is_placeholder = jitter_dict["evaluator_mode"] != "LIVE"
+    if jitter_is_placeholder:
+        jitter_pass_cell = "N/A"
+        jitter_note = (
+            "`PLACEHOLDER_APPROXIMATION` (not decision-grade; live evaluator "
+            "required to flip this row to ✓ / ✗)"
+        )
+    else:
+        jitter_pass_cell = "✓" if jitter_dict["fraction_within_tol_pass"] else "✗"
+        jitter_note = f"`{jitter_dict['evaluator_mode']}`"
     lines.extend(
         [
             f"| Jitter | fraction_within_tol | "
             f"{jitter_dict['stability']['fraction_within_tol']:.4f} | "
-            f"{'✓' if jitter_dict['fraction_within_tol_pass'] else '✗'} |",
-            f"| Jitter | evaluator_mode | `{jitter_dict['evaluator_mode']}` | n/a |",
+            f"{jitter_pass_cell} |",
+            f"| Jitter | evaluator_mode | {jitter_note} | n/a |",
             "",
             "## Reasons",
             "",
@@ -100,12 +115,16 @@ def _render_markdown(
             "",
             "- Evidence is derived from the frozen `offline_robustness/"
             "SOURCE_HASHES.json` bundle; 28 artifacts hash-verified.",
-            "- Null suite uses cumulative-return pct_change as a return proxy;"
-            " raw `net_ret` is not in the frozen demo bundle, which limits"
-            " statistical power relative to the published headline Sharpe"
-            " (`risk_metrics.csv::sharpe = 1.2619`).",
-            "- Jitter evaluator is PLACEHOLDER_APPROXIMATION: rebuild under"
-            " perturbed parameters requires the raw asset panel.",
+            "- Null suite uses mathematically exact daily log-returns "
+            "(`diff(log(strategy_cumret))`) — no approximation. See "
+            "`ROBUSTNESS_PROTOCOL.md` § 1 for the derivation contract.",
+            "- PBO interpretation: fewer than 3 candidates is `tautological`, "
+            "fewer than 5 is `weak`, 5+ is `admissible`. The fold-mirror PBO "
+            "is always tautological by construction and is kept only as a "
+            "sanity baseline; the LOO-grid PBO is the decision-grade one.",
+            "- Jitter row shows `N/A` while the evaluator is "
+            "`PLACEHOLDER_APPROXIMATION`; a live rebuild is required to "
+            "replace the row with a real ✓ / ✗.",
             "",
         ]
     )
diff --git a/tests/research/robustness/test_kuramoto_suites.py b/tests/research/robustness/test_kuramoto_suites.py
@@ -50,6 +50,17 @@ def test_loo_pbo_matches_hand_computed(self, contract: KuramotoRobustnessContrac
         assert r.loo_pbo is not None
         assert abs(r.loo_pbo - 0.20) < 1e-9
 
+    def test_pbo_candidate_count_and_interpretation(
+        self, contract: KuramotoRobustnessContract
+    ) -> None:
+        """Fold-mirror PBO must always be flagged tautological (n=2);
+        LOO-grid PBO with n=13 must be flagged admissible."""
+        r = run_kuramoto_cpcv_suite(contract)
+        assert r.pbo_candidate_count == 2
+        assert r.pbo_interpretation == "tautological"
+        assert r.loo_n_strategies == 13
+        assert r.loo_pbo_interpretation == "admissible"
+
 
 class TestNullSuite:
     def test_two_families_returned_and_bounded(self, contract: KuramotoRobustnessContract) -> None:
@@ -96,3 +107,14 @@ def test_placeholder_quadratic_monotonicity(self, contract: KuramotoRobustnessCo
         # Sharpe that exceeds the anchor by construction.
         r = run_kuramoto_jitter_suite(contract, n_candidates=64)
         assert max(r.stability.perturbed_sharpes) <= r.stability.anchor_sharpe
+
+    def test_placeholder_forces_pass_false(self, contract: KuramotoRobustnessContract) -> None:
+        """Task 2: placeholder evaluator MUST force fraction_within_tol_pass
+        to False, regardless of the observed fraction. The decision layer
+        carries evaluator_mode separately and uses it to route to
+        INSUFFICIENT_EVIDENCE when require_live_jitter=True."""
+        r = run_kuramoto_jitter_suite(contract, n_candidates=32)
+        assert r.evaluator_mode == "PLACEHOLDER_APPROXIMATION"
+        assert r.fraction_within_tol_pass is False
+        # Sanity: the raw fraction is still reported honestly on stability.
+        assert 0.0 <= r.stability.fraction_within_tol <= 1.0

| Original file line number | Diff line number | Diff line change |
|---:|---:|---|
| | | `@@ -1,6 +1,6 @@` |
| `1` | `1` | `{` |
| `2` | `2` | `"evaluator_mode": "PLACEHOLDER_APPROXIMATION",` |
| `3` | | `- "fraction_within_tol_pass": true,` |
| | `3` | `+ "fraction_within_tol_pass": false,` |
| `4` | `4` | `"pass_threshold": 0.8,` |
| `5` | `5` | `"stability": {` |
| `6` | `6` | `"anchor_sharpe": 1.2619,` |