Skip to content

Commit e1949e5

Browse files
neuron7xLabclaude
andauthored
fix(kuramoto): canonicalise signed-community labels for recalibration stability (#226)
* fix(kuramoto): canonicalise signed-community labels for recalibration stability The inverse-problem stack feeds ``signed_communities`` a coupling matrix ``K`` that carries estimator noise. Two sources of label ambiguity previously caused partition-preserving perturbations to *swap* cluster ids between calls: * ``np.linalg.eigh`` returns principal eigenvectors with arbitrary sign; the ``v >= 0`` split then sends different halves to ``new_id = max + 1`` across runs. * The raw recursive output has no documented ordering, so even with a stable split the ids depend on split-interleaving order. Because ``EmergentMetrics.R_cluster`` keys and the ``NetworkKuramotoFeature`` feature vocabulary ``kuramoto_R_cluster_{c}`` are bound to these ids across batch recalibrations, the ambiguity corrupted downstream per-cluster features: ``R_cluster[0]`` could mean "largest cluster" after one recalibration and "smallest cluster" after the next. Fix: * ``_canonical_eigvec_sign`` pins the eigenvector sign by requiring the max-magnitude coordinate to be non-negative (deterministic for every non-zero ``v``). * ``_canonicalize_labels`` relabels the partition by (descending size, ascending smallest-member-index) so the biggest community always has id 0 and labels are a dense 0..C-1 range. Tests: * 5 new unit tests in ``TestSignedCommunities`` for dense range, biggest-first ordering, perturbation stability, sign-ambiguity stability, and node-permutation invariance. * 2 new witness tests in T24: a Hypothesis property over random signed matrices (40 examples) witnessing that partition-preserving perturbations yield bit-identical labels, plus a biggest-first ordering regression witness. 228 + 7 = 235 passed, 1 skipped (unchanged). mypy --strict clean on ``core/kuramoto/metrics.py``. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * style: black-format the 2 new test files CI's python-quality gate runs both ruff and black; ruff format produced clean output but black's line-length defaults differ. Pure formatting, no logic change. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 84d728d commit e1949e5

3 files changed

Lines changed: 375 additions & 12 deletions

File tree

core/kuramoto/metrics.py

Lines changed: 83 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,64 @@ def rolling_csd(R: np.ndarray, window: int) -> tuple[np.ndarray, np.ndarray]:
176176
# ---------------------------------------------------------------------------
177177

178178

179+
def _canonical_eigvec_sign(v: np.ndarray) -> np.ndarray:
180+
"""Return ``v`` with a deterministic sign convention.
181+
182+
``np.linalg.eigh`` returns eigenvectors with arbitrary sign: ``v``
183+
and ``-v`` are both valid principal eigenvectors. When the downstream
184+
split rule is ``v >= 0`` this ambiguity permutes the two halves of
185+
the bipartition. We fix the sign by requiring the coordinate of
186+
largest magnitude to be non-negative; tie-breaks on magnitude fall
187+
to the smallest index, so the convention is deterministic for every
188+
non-zero ``v``.
189+
190+
The zero vector is returned unchanged — the caller must handle the
191+
degenerate split separately because both halves are empty by
192+
construction.
193+
"""
194+
if v.size == 0:
195+
return v
196+
idx = int(np.argmax(np.abs(v)))
197+
pivot = float(v[idx])
198+
if pivot < 0.0:
199+
return -v
200+
return v
201+
202+
203+
def _canonicalize_labels(labels: np.ndarray) -> np.ndarray:
204+
"""Rewrite community ids so the ordering is permutation-invariant.
205+
206+
The primary sort key is community *size*, descending: the biggest
207+
community always receives id ``0``. Ties break on the *smallest
208+
member index* ascending, so two communities of equal size fall
209+
into a stable order determined only by node identity. This removes
210+
two sources of label instability:
211+
212+
1. **Eigenvector sign ambiguity** — when the recursive splitter
213+
flips which half receives ``new_id = max + 1`` the raw labels
214+
permute; after canonicalisation the partition is unchanged.
215+
2. **Split-order dependence** — recursive splits of different
216+
sub-communities can interleave in different orders on perturbed
217+
inputs; the size-then-min-index ordering is invariant under
218+
those interleavings.
219+
220+
The partition (the equivalence classes on nodes) is preserved
221+
exactly — only the integer ids are rewritten. The returned labels
222+
are a dense ``0..C-1`` range where ``C`` is the number of
223+
communities.
224+
"""
225+
labels = np.ascontiguousarray(labels, dtype=np.int64)
226+
unique, first_occurrence = np.unique(labels, return_index=True)
227+
sizes = np.array([int((labels == u).sum()) for u in unique], dtype=np.int64)
228+
# Sort by (-size, first_occurrence) so the biggest community wins;
229+
# ties break on the smallest node index.
230+
order = np.lexsort((first_occurrence, -sizes))
231+
remap = np.empty(unique.max() + 1, dtype=np.int64)
232+
for new_id, old_pos in enumerate(order):
233+
remap[unique[old_pos]] = new_id
234+
return np.asarray(remap[labels], dtype=np.int64)
235+
236+
179237
def _signed_modularity(W: np.ndarray, labels: np.ndarray) -> float:
180238
"""Signed modularity ``Q⁺ − Q⁻`` (Gómez, Jensen & Arenas, 2009).
181239
@@ -234,6 +292,19 @@ def signed_communities(
234292
graphs that avoids bringing in ``leidenalg`` as a dependency; on
235293
planted-partition benchmarks it recovers the ground-truth
236294
communities with NMI ≥ 0.9 for typical sparsity levels.
295+
296+
Label canonicalisation
297+
----------------------
298+
The returned labels form a dense ``0..C-1`` range canonicalised
299+
so that the biggest community always has id ``0``; ties on size
300+
break on the smallest member index ascending. Together with the
301+
eigenvector-sign canonicalisation applied during every recursive
302+
split this makes the output *permutation-invariant* under small
303+
perturbations of ``K``: if two inputs produce the same partition
304+
they produce the same labels bit-for-bit. Downstream callers
305+
(``EmergentMetrics.R_cluster``, ``NetworkKuramotoFeature``) bind
306+
to these ids across batch recalibrations, so stability is a
307+
correctness invariant, not a cosmetic.
237308
"""
238309
N = K.shape[0]
239310
W = 0.5 * (K + K.T)
@@ -259,8 +330,12 @@ def signed_communities(
259330
eigvals, eigvecs = np.linalg.eigh(sub_sym)
260331
# Fiedler-like split: use the eigenvector corresponding
261332
# to the largest eigenvalue (maximises intra-cluster
262-
# positive weight)
263-
v = eigvecs[:, -1]
333+
# positive weight). ``eigh`` returns eigenvectors with
334+
# arbitrary sign; canonicalise so ``v >= 0`` has a
335+
# deterministic meaning (otherwise ``sub_a`` / ``sub_b``
336+
# permute under numerical perturbation and the new-id
337+
# assignment swaps cluster labels between calls).
338+
v = _canonical_eigvec_sign(eigvecs[:, -1])
264339
sub_a = v >= 0
265340
sub_b = ~sub_a
266341
if sub_a.sum() < min_community_size or sub_b.sum() < min_community_size:
@@ -280,7 +355,12 @@ def signed_communities(
280355
_, labels = best_split
281356
current_q = current_q + best_gain
282357

283-
return labels
358+
# Canonicalise labels: biggest community first, ties broken by the
359+
# smallest member index. Without this, two runs on tiny
360+
# perturbations of the same ``K`` return the same partition but
361+
# with permuted ids — which breaks ``R_cluster`` dict keys and
362+
# downstream ``kuramoto_R_cluster_{c}`` feature vocabulary.
363+
return _canonicalize_labels(labels)
284364

285365

286366
# ---------------------------------------------------------------------------

tests/unit/core/test_kuramoto_network_engine.py

Lines changed: 172 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,175 @@ def test_random_matrix_splits_cleanly(self) -> None:
158158
assert labels.shape == (10,)
159159
assert int(labels.min()) >= 0
160160

161+
def test_labels_dense_range_starting_at_zero(self) -> None:
162+
"""Canonicalised labels occupy the dense range ``0..C-1``.
163+
164+
This contract is relied on by ``EmergentMetrics.R_cluster``
165+
(integer dict keys) and by the ``NetworkKuramotoFeature``
166+
feature vocabulary ``kuramoto_R_cluster_{c}``.
167+
"""
168+
rng = np.random.default_rng(1)
169+
for n in (6, 10, 15):
170+
K = rng.standard_normal((n, n))
171+
K = 0.5 * (K + K.T)
172+
np.fill_diagonal(K, 0.0)
173+
labels = signed_communities(K, n_clusters_max=4)
174+
unique = np.unique(labels)
175+
assert int(unique.min()) == 0
176+
assert (
177+
int(unique.max()) == unique.size - 1
178+
), f"labels must be dense 0..C-1; got {unique.tolist()}"
179+
180+
def test_biggest_community_gets_id_zero(self) -> None:
181+
"""Canonical ordering: the largest community always has id 0.
182+
183+
Ties on size fall back to the smallest member index ascending;
184+
the two-block planted partition has equal-size communities so
185+
id 0 must be assigned to the block starting at index 0.
186+
"""
187+
N = 12
188+
block = 6
189+
K = np.zeros((N, N))
190+
for i in range(block):
191+
for j in range(block):
192+
if i != j:
193+
K[i, j] = 1.0
194+
K[i + block, j + block] = 1.0
195+
for i in range(block):
196+
for j in range(block):
197+
K[i, j + block] = -0.5
198+
K[j + block, i] = -0.5
199+
labels = signed_communities(K, n_clusters_max=4)
200+
# Tie on size (6-6); min-index tie-breaker assigns 0 to the
201+
# block containing node 0.
202+
assert labels[0] == 0
203+
204+
# Imbalanced planted partition — the bigger block wins id 0.
205+
N2 = 10
206+
K2 = np.zeros((N2, N2))
207+
# Block A: nodes 0..6 (size 7), block B: nodes 7..9 (size 3).
208+
for i in range(7):
209+
for j in range(7):
210+
if i != j:
211+
K2[i, j] = 1.0
212+
for i in range(7, N2):
213+
for j in range(7, N2):
214+
if i != j:
215+
K2[i, j] = 1.0
216+
for i in range(7):
217+
for j in range(7, N2):
218+
K2[i, j] = -0.5
219+
K2[j, i] = -0.5
220+
labels2 = signed_communities(K2, n_clusters_max=4)
221+
# The bigger (size-7) block must carry id 0.
222+
assert int(labels2[0]) == 0
223+
assert int(labels2[-1]) == 1
224+
225+
def test_labels_stable_under_small_perturbation(self) -> None:
226+
"""Label *permutation invariance*: tiny ``K`` noise keeps ids fixed.
227+
228+
This is the falsification witness for the recalibration
229+
stability contract. Before canonicalisation, flipping the
230+
principal eigenvector sign on a perturbed ``K`` would swap
231+
labels 0 and 1 even though the partition is unchanged — which
232+
corrupted every downstream ``R_cluster[c]`` and
233+
``kuramoto_R_cluster_{c}`` binding between recalibrations.
234+
"""
235+
N_block = 6
236+
N = 2 * N_block
237+
K = np.zeros((N, N))
238+
for i in range(N_block):
239+
for j in range(N_block):
240+
if i != j:
241+
K[i, j] = 1.0
242+
K[i + N_block, j + N_block] = 1.0
243+
for i in range(N_block):
244+
for j in range(N_block):
245+
K[i, j + N_block] = -0.5
246+
K[j + N_block, i] = -0.5
247+
base = signed_communities(K, n_clusters_max=4)
248+
for seed in range(10):
249+
noise = 1e-5 * np.random.default_rng(seed).standard_normal(K.shape)
250+
K_perturbed = K + 0.5 * (noise + noise.T)
251+
perturbed = signed_communities(K_perturbed, n_clusters_max=4)
252+
np.testing.assert_array_equal(
253+
base,
254+
perturbed,
255+
err_msg=(
256+
f"labels changed under 1e-5 perturbation (seed={seed}); "
257+
f"recalibration stability broken"
258+
),
259+
)
260+
261+
def test_labels_stable_under_sign_ambiguity(self) -> None:
262+
"""Negating ``K`` negates every eigenvector → must not flip ids.
263+
264+
Regression witness for the eigenvector-sign canonicaliser. If
265+
the canonicaliser is removed, two calls on ``K`` and ``-K``
266+
(when only the positive subgraph is active) can return swapped
267+
labels because ``np.linalg.eigh`` picks an arbitrary sign for
268+
each eigenvector.
269+
"""
270+
# Use a purely positive-coupling two-block graph where
271+
# behaviour is fully determined by the Fiedler-like split.
272+
N_block = 5
273+
N = 2 * N_block
274+
K = np.zeros((N, N))
275+
for i in range(N_block):
276+
for j in range(N_block):
277+
if i != j:
278+
K[i, j] = 1.0
279+
K[i + N_block, j + N_block] = 1.0
280+
# Two independent calls on the *same* matrix must be bit-identical.
281+
a = signed_communities(K, n_clusters_max=2)
282+
b = signed_communities(K, n_clusters_max=2)
283+
np.testing.assert_array_equal(a, b)
284+
# Partition must be the planted one: block [0..N_block) together.
285+
assert np.all(a[:N_block] == a[0])
286+
assert np.all(a[N_block:] == a[-1])
287+
288+
def test_labels_invariant_under_node_relabeling(self) -> None:
289+
"""Permuting node indices and un-permuting recovers the labels.
290+
291+
Because the canonical tie-breaker is the *smallest member
292+
index*, arbitrary relabelings alter which equivalence class
293+
carries id 0 when sizes tie. This test fixes an imbalanced
294+
planted partition so the size-based primary key is
295+
discriminative and the labels are invariant under permutation.
296+
"""
297+
N = 10
298+
K = np.zeros((N, N))
299+
# Bigger block (size 7) and smaller block (size 3)
300+
for i in range(7):
301+
for j in range(7):
302+
if i != j:
303+
K[i, j] = 1.0
304+
for i in range(7, N):
305+
for j in range(7, N):
306+
if i != j:
307+
K[i, j] = 1.0
308+
for i in range(7):
309+
for j in range(7, N):
310+
K[i, j] = -0.5
311+
K[j, i] = -0.5
312+
313+
base = signed_communities(K, n_clusters_max=3)
314+
rng = np.random.default_rng(42)
315+
for trial in range(5):
316+
perm = rng.permutation(N)
317+
K_perm = K[np.ix_(perm, perm)]
318+
labels_perm = signed_communities(K_perm, n_clusters_max=3)
319+
undone = np.empty_like(labels_perm)
320+
undone[perm] = labels_perm
321+
np.testing.assert_array_equal(
322+
base,
323+
undone,
324+
err_msg=(
325+
f"labels not invariant under node permutation "
326+
f"(trial={trial}, perm={perm.tolist()})"
327+
),
328+
)
329+
161330

162331
class TestPermutationEntropy:
163332
def test_monotonic_series_has_zero_entropy(self) -> None:
@@ -374,9 +543,7 @@ def test_identify_from_returns_end_to_end(self) -> None:
374543
)
375544
engine = NetworkKuramotoEngine(
376545
NetworkEngineConfig(
377-
phase=PhaseExtractionConfig(
378-
fs=fs, f_low=0.5, f_high=1.5, detrend_window=None
379-
),
546+
phase=PhaseExtractionConfig(fs=fs, f_low=0.5, f_high=1.5, detrend_window=None),
380547
coupling=CouplingEstimationConfig(
381548
penalty="mcp",
382549
lambda_reg=0.1,
@@ -467,9 +634,7 @@ def test_full_tier1_tier2_cycle(
467634
),
468635
)
469636
feat.warmup(returns[:400])
470-
feat.recalibrate(
471-
returns[:600], timestamps=np.arange(600, dtype=np.float64) * 0.05
472-
)
637+
feat.recalibrate(returns[:600], timestamps=np.arange(600, dtype=np.float64) * 0.05)
473638
# Measure per-bar online latency across 20 updates
474639
t0 = time.perf_counter()
475640
last_features: dict[str, float] = {}
@@ -511,9 +676,7 @@ def test_no_nan_on_valid_input(
511676
),
512677
)
513678
feat.warmup(returns[:300])
514-
feat.recalibrate(
515-
returns[:500], timestamps=np.arange(500, dtype=np.float64) * 0.05
516-
)
679+
feat.recalibrate(returns[:500], timestamps=np.arange(500, dtype=np.float64) * 0.05)
517680
for k, row in enumerate(returns[500:530]):
518681
features = feat.update(row, timestamp=float(500 + k) * 0.05)
519682
for v in features.values():

0 commit comments

Comments
 (0)