Skip to content

Commit a37354d

Browse files
authored
Merge pull request #28 from rbroc/dynamic-clustering
add dynamic topic modeling to clustering models
2 parents 870b3a7 + 21944dd commit a37354d

10 files changed

Lines changed: 216 additions & 34 deletions

File tree

docs/clustering.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ top2vec = ClusteringTopicModel(
188188
Theoretically the model descriptions above should result in the same behaviour as the other two packages, but there might be minor changes in implementation.
189189
We do not intend to keep up with changes in Top2Vec's and BERTopic's internal implementation details indefinitely.
190190

191+
### _(Optional)_ 5. Dynamic Modeling
192+
193+
Clustering models are also capable of dynamic topic modeling. This happens by fitting a clustering model over the entire corpus, as we expect that there is only one semantic model generating the documents.
194+
To gain temporal representations for topics, the corpus is divided into equal or arbitrarily chosen time slices, and then term importances are estimated using Soft-c-TF-IDF, c-TF-IDF, or distances from cluster centroid for each of the time slices separately. When distance from cluster centroids is used to estimate topic importances in dynamic modeling, cluster centroids are computed based on documents and terms present within a given time slice.
195+
191196
## Considerations
192197

193198
### Strengths

docs/dynamic.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Dynamic topic models in Turftopic have a unified interface.
2828
To fit a dynamic topic model you will need a corpus, that has been annotated with timestamps.
2929
The timestamps need to be Python `datetime` objects, but pandas `Timestamp` objects are also supported.
3030

31-
Models that have dynamic modeling capabilities have a `fit_transform_dynamic()` method, that fits the model on the corpus over time.
31+
Models that have dynamic modeling capabilities (currently, `GMM` and `ClusteringTopicModel`) have a `fit_transform_dynamic()` method, that fits the model on the corpus over time.
3232

3333
```python
3434
from datetime import datetime

tests/test_integration.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
import tempfile
23
from pathlib import Path
34

@@ -15,6 +16,21 @@
1516
SemanticSignalSeparation,
1617
)
1718

19+
20+
def generate_dates(
    n_dates: int,
) -> list[datetime]:
    """Generate `n_dates` random dates for testing dynamic topic models.

    Days are drawn from 1-28 so that every (year, month) combination is
    valid (February included); months span 1-12 and years 2000-2019.

    Parameters
    ----------
    n_dates: int
        Number of random dates to generate.

    Returns
    -------
    list[datetime]
        List of `n_dates` random datetimes.
    """
    # NOTE(review): uses the legacy global numpy RNG, so results depend on
    # np.random.seed state — fine for a smoke-test fixture.
    return [
        datetime(
            year=np.random.randint(low=2000, high=2020),
            month=np.random.randint(low=1, high=13),
            day=np.random.randint(low=1, high=29),
        )
        for _ in range(n_dates)
    ]
32+
33+
1834
newsgroups = fetch_20newsgroups(
1935
subset="all",
2036
categories=[
@@ -25,12 +41,13 @@
2541
texts = newsgroups.data
2642
trf = SentenceTransformer("all-MiniLM-L6-v2")
2743
embeddings = np.asarray(trf.encode(texts))
44+
timestamps = generate_dates(n_dates=len(texts))
2845

2946
models = [
3047
GMM(5, encoder=trf),
3148
SemanticSignalSeparation(5, encoder=trf),
32-
KeyNMF(5, encoder=trf, keyword_scope='document'),
33-
KeyNMF(5, encoder=trf, keyword_scope='corpus'),
49+
KeyNMF(5, encoder=trf, keyword_scope="document"),
50+
KeyNMF(5, encoder=trf, keyword_scope="corpus"),
3451
ClusteringTopicModel(
3552
n_reduce_to=5,
3653
feature_importance="c-tf-idf",
@@ -46,6 +63,22 @@
4663
AutoEncodingTopicModel(5, combined=True),
4764
]
4865

66+
dynamic_models = [
67+
GMM(5, encoder=trf),
68+
ClusteringTopicModel(
69+
n_reduce_to=5,
70+
feature_importance="centroid",
71+
encoder=trf,
72+
reduction_method="smallest",
73+
),
74+
ClusteringTopicModel(
75+
n_reduce_to=5,
76+
feature_importance="soft-c-tf-idf",
77+
encoder=trf,
78+
reduction_method="smallest"
79+
),
80+
]
81+
4982

5083
@pytest.mark.parametrize("model", models)
5184
def test_fit_export_table(model):
@@ -56,3 +89,18 @@ def test_fit_export_table(model):
5689
with out_path.open("w") as out_file:
5790
out_file.write(table)
5891
df = pd.read_csv(out_path)
92+
93+
94+
@pytest.mark.parametrize("model", dynamic_models)
def test_fit_dynamic(model):
    """Smoke test: dynamic models fit over time slices and export parseable CSV."""
    document_topic = model.fit_transform_dynamic(
        texts,
        embeddings=embeddings,
        timestamps=timestamps,
    )
    csv_table = model.export_topics(format="csv")
    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = Path(tmp_dir) / "topics.csv"
        with csv_path.open("w") as csv_file:
            csv_file.write(csv_table)
        # Round-trip through pandas to verify the export is valid CSV.
        frame = pd.read_csv(csv_path)

turftopic/base.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
2323
class ContextualModel(ABC, TransformerMixin, BaseEstimator):
2424
"""Base class for contextual topic models in Turftopic."""
2525

26-
def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
26+
def get_topics(
27+
self, top_k: int = 10
28+
) -> List[Tuple[Any, List[Tuple[str, float]]]]:
2729
"""Returns high-level topic representations in form of the top K words
2830
in each topic.
2931
@@ -135,8 +137,12 @@ def _highest_ranking_docs(
135137
except AttributeError:
136138
pass
137139
kth = min(top_k, document_topic_matrix.shape[0] - 1)
138-
highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
139-
highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
140+
highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
141+
:kth
142+
]
143+
highest = highest[
144+
np.argsort(-document_topic_matrix[highest, topic_id])
145+
]
140146
scores = document_topic_matrix[highest, topic_id]
141147
columns = []
142148
columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
171177
topic_id, raw_documents, document_topic_matrix, top_k
172178
)
173179
table = Table(show_lines=True)
174-
table.add_column("Document", justify="left", style="magenta", max_width=100)
180+
table.add_column(
181+
"Document", justify="left", style="magenta", max_width=100
182+
)
175183
table.add_column("Score", style="blue", justify="right")
176184
for row in rows:
177185
table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
223231
) -> list[list[str]]:
224232
if topic_dist is None:
225233
if text is None:
226-
raise ValueError("You should either pass a text or a distribution.")
234+
raise ValueError(
235+
"You should either pass a text or a distribution."
236+
)
227237
try:
228238
topic_dist = self.transform([text])
229239
except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
248258
rows.append([topic_names[ind], f"{score:.2f}"])
249259
return [columns, *rows]
250260

251-
def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
261+
def print_topic_distribution(
262+
self, text=None, topic_dist=None, top_k: int = 10
263+
):
252264
"""Pretty prints topic distribution in a document.
253265
254266
Parameters
@@ -330,7 +342,9 @@ def fit_transform(
330342
"""
331343
pass
332344

333-
def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
345+
def fit(
346+
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
347+
):
334348
"""Fits model on the given corpus.
335349
336350
Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
396410
if embeddings is None:
397411
embeddings = self.encode_documents(corpus)
398412
try:
399-
document_topic_matrix = self.transform(corpus, embeddings=embeddings)
413+
document_topic_matrix = self.transform(
414+
corpus, embeddings=embeddings
415+
)
400416
except (AttributeError, NotFittedError):
401-
document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
417+
document_topic_matrix = self.fit_transform(
418+
corpus, embeddings=embeddings
419+
)
402420
dtm = self.vectorizer.transform(corpus) # type: ignore
403421
res: TopicData = {
404422
"corpus": corpus,

turftopic/dynamic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,9 @@ def print_topics_over_time(
199199
show_scores: bool, default False
200200
Indicates whether to show importance scores for each word.
201201
"""
202-
columns, *rows = self._topics_over_time(top_k, show_scores, date_format)
202+
columns, *rows = self._topics_over_time(
203+
top_k, show_scores, date_format
204+
)
203205
table = Table(show_lines=True)
204206
for column in columns:
205207
table.add_column(column)

turftopic/encoders/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
"OpenAIEmbeddings",
1010
"VoyageEmbeddings",
1111
"ExternalEncoder",
12-
"E5Encoder"
12+
"E5Encoder",
1313
]

turftopic/encoders/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import itertools
2+
from typing import Iterable, List
3+
4+
5+
def batched(iterable, n: int) -> Iterable[List[str]]:
    """Batch data into lists of length n. The last batch may be shorter.

    >>> ["".join(b) for b in batched("ABCDEFG", 3)]
    ['ABC', 'DEF', 'G']

    Parameters
    ----------
    iterable
        Any iterable to be consumed in batches.
    n: int
        Maximum size of each batch.

    Yields
    ------
    list
        Consecutive batches of up to n items.

    Raises
    ------
    ValueError
        If n is smaller than one.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # islice pulls at most n items per pass; the walrus stops the loop on
    # the first empty batch, i.e. when the iterator is exhausted.
    while batch := list(itertools.islice(it, n)):
        yield batch

0 commit comments

Comments
 (0)