Skip to content

Commit d01682b

Browse files
committed
Add download hash verification and restore test thresholds
- H4: download_data.py now verifies the SHA-256 hash of the archive after download and before extraction. The expected hash is currently a placeholder marked with a TODO; until a real hash is set, verification is skipped with a warning.
- M8: Restore test thresholds from 0.5 to 0.7 in test_polygraphdiscrepancy.py. The test distributions (ER 0.8 vs ER 0.1) are clearly distinct; 0.5 was essentially random chance.
1 parent 91e3076 commit d01682b

2 files changed

Lines changed: 40 additions & 4 deletions

File tree

reproducibility/download_data.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
python download_data.py --subset # Download small subset for CI
77
"""
88

9+
import hashlib
910
import shutil
1011
import subprocess
12+
from pathlib import Path
1113

1214
import typer
1315
from loguru import logger
@@ -23,6 +25,9 @@
2325
REPO_ROOT = here()
2426
DATA_DIR = REPO_ROOT / "data"
2527

28+
# SHA-256 hash of the data archive. Update after publishing a new version.
29+
EXPECTED_SHA256 = "TODO_COMPUTE_AND_SET_HASH"
30+
2631
EXPECTED_DIRS = ["AUTOGRAPH", "DIGRESS", "ESGG", "GRAN"]
2732

2833

@@ -33,6 +38,35 @@ def check_data_exists() -> bool:
3338
return all((DATA_DIR / d).exists() for d in EXPECTED_DIRS)
3439

3540

41+
def _verify_sha256(path: Path, expected: str) -> None:
    """Verify the SHA-256 hash of a file.

    The comparison ignores letter case and surrounding whitespace in
    *expected*, so a digest pasted in upper-case hex or with a trailing
    newline is still accepted.

    Verification is skipped (with a warning) while *expected* is still the
    ``TODO_COMPUTE_AND_SET_HASH`` placeholder.

    Args:
        path: Path to the file to verify.
        expected: Expected hex-encoded SHA-256 digest.

    Raises:
        ValueError: If the computed hash does not match *expected*. The file
            at *path* is deleted before the exception is raised.
    """
    if expected == "TODO_COMPUTE_AND_SET_HASH":
        logger.warning("EXPECTED_SHA256 is unset — skipping hash verification")
        return

    # hexdigest() yields lower-case hex; normalise the reference value so a
    # purely cosmetic difference is not reported as a corrupted download.
    wanted = expected.strip().lower()

    sha256 = hashlib.sha256()
    with open(path, "rb") as f:
        # Read in 1 MiB chunks so the whole file is never held in memory.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha256.update(chunk)
    computed = sha256.hexdigest()

    if computed != wanted:
        # Remove the file that failed verification before reporting it.
        path.unlink()
        raise ValueError(
            f"SHA-256 mismatch for {path.name}: "
            f"expected {wanted}, got {computed}"
        )
    logger.info("SHA-256 verified: {}", computed)
68+
69+
3670
def _download_and_extract() -> None:
3771
"""Download the archive from MPCDF DataShare and extract it."""
3872
archive_path = REPO_ROOT / "data_archive.zip"
@@ -49,6 +83,8 @@ def _download_and_extract() -> None:
4983
check=True,
5084
)
5185

86+
_verify_sha256(archive_path, EXPECTED_SHA256)
87+
5288
logger.info("Extracting archive to {}", DATA_DIR)
5389
DATA_DIR.mkdir(parents=True, exist_ok=True)
5490
shutil.unpack_archive(archive_path, DATA_DIR)

tests/test_polygraphdiscrepancy.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ def test_classifier_metric(
5353
train, test = clf_metric.compute(sparse_graphs)
5454

5555
assert isinstance(train, float) and isinstance(test, float)
56-
assert train >= 0.5, f"Train score {train} is less than 0.5"
57-
assert test >= 0.5, f"Test score {test} is less than 0.5"
56+
assert train >= 0.7, f"Train score {train} is less than 0.7"
57+
assert test >= 0.7, f"Test score {test} is less than 0.7"
5858

5959
train, test = clf_metric.compute(dense_graphs)
6060
assert train <= 0.2, f"Train score {train} is greater than 0.2"
@@ -87,8 +87,8 @@ def test_polygraphdiscrepancy(
8787
assert len(result["subscores"]) == len(descriptors)
8888
assert result["pgd"] == result["subscores"][result["pgd_descriptor"]]
8989

90-
assert result["pgd"] >= 0.5, (
91-
f"PolyGraphDiscrepancy {result['pgd']} is less than 0.5"
90+
assert result["pgd"] >= 0.7, (
91+
f"PolyGraphDiscrepancy {result['pgd']} is less than 0.7"
9292
)
9393

9494
result = pgd.compute(dense_graphs)

0 commit comments

Comments
 (0)