Merge pull request #24 from x-tabdeveloping/tests

x-tabdeveloping · web-flow · commit 923196b795f2 · 2024-03-13T14:36:46.000+01:00
Added Integration tests for all models
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,36 @@
+name: Tests
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  pytest:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9"]
+    # This allows a subsequently queued workflow run to interrupt previous runs
+    concurrency:
+      group: "${{ github.workflow }}-${{ matrix.python-version }} @ ${{ github.ref }}"
+      cancel-in-progress: true
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+      # You can test your matrix by printing the current Python version
+      - name: Display Python version
+        run: python3 -c "import sys; print(sys.version)"
+
+      # Install the checked-out project so its own dependency list is used
+      - name: Install dependencies
+        run: python3 -m pip install --upgrade ".[pyro-ppl]" pandas pytest
+
+      - name: Run tests
+        run: python3 -m pytest tests/
+
diff --git a/tests/test_integration.py b/tests/test_integration.py
@@ -0,0 +1,57 @@
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+from sentence_transformers import SentenceTransformer
+from sklearn.datasets import fetch_20newsgroups
+
+from turftopic import (
+    GMM,
+    AutoEncodingTopicModel,
+    ClusteringTopicModel,
+    KeyNMF,
+    SemanticSignalSeparation,
+)
+
+# Corpus: misc.forsale from 20 Newsgroups; headers, footers, quotes stripped.
+newsgroups = fetch_20newsgroups(
+    subset="all",
+    categories=["misc.forsale"],
+    remove=("headers", "footers", "quotes"),
+)
+texts = newsgroups.data
+# Encode the corpus once at import time; trf is also handed to most models.
+trf = SentenceTransformer("all-MiniLM-L6-v2")
+embeddings = np.asarray(trf.encode(texts))
+
+models = [
+    GMM(5, encoder=trf),
+    SemanticSignalSeparation(5, encoder=trf),
+    KeyNMF(5, encoder=trf),
+    ClusteringTopicModel(
+        encoder=trf,
+        n_reduce_to=5,
+        reduction_method="agglomerative",
+        feature_importance="c-tf-idf",
+    ),
+    ClusteringTopicModel(
+        encoder=trf,
+        n_reduce_to=5,
+        reduction_method="smallest",
+        feature_importance="centroid",
+    ),
+    AutoEncodingTopicModel(5, combined=True),
+]
+
+
+@pytest.mark.parametrize("model", models)
+def test_fit_export_table(model):
+    """Fit the model, export its topics as CSV and check it parses to rows."""
+    model.fit_transform(texts, embeddings=embeddings)
+    table = model.export_topics(format="csv")
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        out_path = Path(tmpdirname) / "topics.csv"
+        out_path.write_text(table, encoding="utf-8")
+        assert not pd.read_csv(out_path).empty