Added tests for snippets in markdown

Markus28 · Markus28 · commit b218ac3a7be0 · 2025-08-08T11:36:01.000+02:00
diff --git a/tests/collect_doc_snippets.py b/tests/collect_doc_snippets.py
@@ -29,12 +29,6 @@ def extract_code_blocks_from_docstring(docstring):
     return fixed_blocks
 
 
-def run_code_snippet(code, context=None):
-    """Run a snippet of code and catch any exceptions."""
-    context = context or {}
-    exec(code, context)
-
-
 def extract_all_docstrings_from_module(module):
     """Get all docstrings from a module, its classes, and functions."""
     docstrings = []
diff --git a/tests/collect_markdown_snippets.py b/tests/collect_markdown_snippets.py
@@ -0,0 +1,126 @@
+import os
+import re
+import pathlib
+import textwrap
+from typing import Iterator, List, Tuple
+
+# ----------------------------
+# Configuration
+# ----------------------------
+BASE_DIR = pathlib.Path(__file__).resolve().parent.parent
+DOCS_DIR = BASE_DIR / "docs"
+
+# Regexes matching triple-backticked code blocks:
+# - PYTHON_CODE_BLOCK_PATTERN matches only blocks explicitly labeled as Python (python|py|python3|pycon|ipython)
+# - ANY_CODE_BLOCK_PATTERN matches Python-labeled OR unlabeled blocks
+PYTHON_CODE_BLOCK_PATTERN = re.compile(
+    r"```\s*(?:python|py|python3|pycon|ipython)(?:[^\n]*)?\s*\n(.*?)```",
+    re.DOTALL,
+)
+ANY_CODE_BLOCK_PATTERN = re.compile(
+    r"```(?:\s*(?:python|py|python3|pycon|ipython)(?:[^\n]*)?)?\s*\n(.*?)```",
+    re.DOTALL,
+)
+
+# Directories to skip when walking the docs tree (VCS metadata, virtualenvs, build outputs, vendored packages)
+SKIP_DIRS = {"_build", ".git", ".venv", "build", "site", "dist", "node_modules"}
+
+
+def _extract_code_blocks_from_markdown(
+    markdown_text: str, only_python: bool = True
+) -> List[str]:
+    """Extract triple-backtick code blocks from a markdown string and dedent them.
+
+    If only_python is True, only code blocks explicitly labeled as Python are captured.
+    Otherwise, unlabeled code blocks are captured as well.
+    """
+    pattern = (
+        PYTHON_CODE_BLOCK_PATTERN if only_python else ANY_CODE_BLOCK_PATTERN
+    )
+    code_blocks = pattern.findall(markdown_text or "")  # falsy input (e.g. None) yields no blocks
+
+    fixed_blocks: List[str] = []
+    for code in code_blocks:
+        dedented_code = textwrap.dedent(code)  # strip indentation common to all lines of the block
+        fixed_blocks.append(dedented_code)
+
+    return fixed_blocks
+
+
+def _iter_markdown_files(root: pathlib.Path) -> Iterator[pathlib.Path]:
+    """Yield markdown files (.md, .mdx) under root, skipping directories named in SKIP_DIRS."""
+    if not root.exists():
+        return
+
+    for dirpath, dirnames, filenames in os.walk(root):
+        # Prune in place so os.walk does not descend into skipped directories
+        dirnames[:] = [d for d in dirnames if d not in SKIP_DIRS]
+
+        for filename in filenames:
+            if filename.lower().endswith((".md", ".mdx")):  # extension match is case-insensitive
+                yield pathlib.Path(dirpath) / filename
+
+
+def gather_markdown_snippets(only_python: bool = True) -> List[Tuple[str, str]]:
+    """Gather all triple-backtick code snippets from markdown docs in `docs/`.
+
+    For each markdown file, concatenate all discovered code blocks into a single
+    Python snippet. Returns a list of (file_identifier, concatenated_code) tuples.
+    The identifier is the path to the markdown file relative to the repo root.
+
+    Parameters
+    ----------
+    only_python: bool
+        If True (default), only capture code fences labeled as Python. If False,
+        also capture unlabeled fences.
+    """
+    print("Gathering markdown snippets from docs/ ...")
+
+    snippets: List[Tuple[str, str]] = []
+
+    if not DOCS_DIR.exists():
+        print(f"docs/ directory not found at {DOCS_DIR}")
+        return snippets
+
+    files = list(_iter_markdown_files(DOCS_DIR))
+    print(f"Found {len(files)} markdown files to process")
+
+    for index, md_path in enumerate(sorted(files)):  # sorted for a deterministic processing order
+        rel_path = md_path.relative_to(BASE_DIR)
+        print(f"[{index + 1}/{len(files)}] Processing: {rel_path}")
+        try:
+            text = md_path.read_text(encoding="utf-8")
+        except Exception as exc:
+            print(
+                f"  -> Failed to read {rel_path}: {type(exc).__name__}: {exc}"
+            )
+            # Emit a snippet that raises so the unreadable file shows up as a test case
+            snippets.append(
+                (
+                    f"{rel_path} (read error)",
+                    f"# read error\nraise IOError({repr(str(exc))})",
+                )
+            )
+            continue
+
+        code_blocks = _extract_code_blocks_from_markdown(
+            text, only_python=only_python
+        )
+        print(f"  -> Found {len(code_blocks)} code blocks")
+
+        if code_blocks:  # files without matching code blocks contribute no entry
+            # Concatenate all blocks with spacing to avoid accidental token pasting
+            concatenated = (
+                "\n\n".join(block.strip("\n") for block in code_blocks) + "\n"
+            )
+            identifier = f"{rel_path}"
+            snippets.append((identifier, concatenated))
+            print(f"    -> Added concatenated snippet for: {identifier}")
+
+    print(f"Finished gathering snippets. Total: {len(snippets)} snippets")
+    return snippets
+
+
+__all__ = [
+    "gather_markdown_snippets",
+]
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -35,9 +35,11 @@
 from polygraph.datasets.molecules import QM9
 
 from collect_doc_snippets import gather_docstring_snippets
+from collect_markdown_snippets import gather_markdown_snippets
 
 # Cache the snippets to avoid calling gather_docstring_snippets multiple times
-_cached_snippets = None
+_cached_doc_snippets = None
+_cached_md_snippets = None
 
 NO_SKIP_OPTION = "--no-skip"
 SAMPLE_SIZE_OPTION = "--sample-size"
@@ -234,18 +236,33 @@ def sample_molecules():
 
 
 def pytest_generate_tests(metafunc):
-    if "code_snippet" in metafunc.fixturenames:
-        global _cached_snippets
-        if _cached_snippets is None:
+    if "doc_snippet" in metafunc.fixturenames:
+        global _cached_doc_snippets
+        if _cached_doc_snippets is None:
             print(
                 "pytest_generate_tests: Gathering docstring snippets with debug output..."
             )
-            _cached_snippets = gather_docstring_snippets()
+            _cached_doc_snippets = gather_docstring_snippets()
             print(
-                f"pytest_generate_tests: Found {len(_cached_snippets)} snippets"
+                f"pytest_generate_tests: Found {len(_cached_doc_snippets)} snippets"
             )
         metafunc.parametrize(
-            "code_snippet",
-            _cached_snippets,
-            ids=[snippet[0] for snippet in _cached_snippets],
+            "doc_snippet",
+            _cached_doc_snippets,
+            ids=[snippet[0] for snippet in _cached_doc_snippets],
+        )
+    if "md_snippet" in metafunc.fixturenames:
+        global _cached_md_snippets
+        if _cached_md_snippets is None:
+            print(
+                "pytest_generate_tests: Gathering markdown snippets with debug output..."
+            )
+            _cached_md_snippets = gather_markdown_snippets()
+            print(
+                f"pytest_generate_tests: Found {len(_cached_md_snippets)} snippets"
+            )
+        metafunc.parametrize(
+            "md_snippet",
+            _cached_md_snippets,
+            ids=[snippet[0] for snippet in _cached_md_snippets],
         )
diff --git a/tests/test_docstring_snippets.py b/tests/test_docstring_snippets.py
diff --git a/tests/test_snippets.py b/tests/test_snippets.py
@@ -0,0 +1,26 @@
+import pytest
+import traceback
+
+
+def run_code_snippet(code, context=None):
+    """Execute a code snippet; any exception it raises propagates to the caller."""
+    context = context or {}  # a None (or empty) context is replaced by a fresh globals dict
+    exec(code, context)
+
+
+def test_docstring_snippet_runs(doc_snippet):
+    """Each code snippet from a docstring should run without error."""
+    snippet_id, code = doc_snippet
+
+    try:
+        run_code_snippet(code)
+    except Exception:  # report the snippet id together with the full traceback
+        pytest.fail(f"Snippet failed: {snippet_id}\n\n{traceback.format_exc()}")
+
+
+def test_markdown_snippet_runs(md_snippet):
+    snippet_id, code = md_snippet  # (markdown path relative to repo root, that file's concatenated code blocks)
+    try:
+        run_code_snippet(code)
+    except Exception:  # report the snippet id together with the full traceback
+        pytest.fail(f"Snippet failed: {snippet_id}\n\n{traceback.format_exc()}")