
Commit 450184b

Added random_state argument to all models so results are exactly reproducible.

1 parent 6d5ac33 · commit 450184b

4 files changed: 45 additions & 10 deletions
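A minimal sketch of the intended usage (the top-level imports and the positional n_components argument are assumptions inferred from the diffs below, not guaranteed by this commit):

from turftopic import (
    GMM,
    AutoEncodingTopicModel,
    ClusteringTopicModel,
    KeyNMF,
)

# Every model now accepts the same keyword argument; fixing it to an int
# should make fitting deterministic across runs.
models = [
    ClusteringTopicModel(random_state=42),
    AutoEncodingTopicModel(10, random_state=42),
    GMM(10, random_state=42),
    KeyNMF(10, random_state=42),
]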


turftopic/models/cluster.py

Lines changed: 18 additions & 4 deletions
@@ -137,6 +137,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
         The specified reduction method will be used to merge them.
         By default, topics are not merged.
     reduction_method: 'agglomerative', 'smallest'
+        Method used to reduce the number of topics post-hoc.
+        When 'agglomerative', BERTopic's topic reduction method is used,
+        where topic vectors are hierarchically clustered.
+        When 'smallest', the smallest topic gets merged into the closest
+        non-outlier cluster until the desired number of topics
+        is reached, similarly to Top2Vec.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -154,8 +162,10 @@ def __init__(
         reduction_method: Literal[
             "agglomerative", "smallest"
         ] = "agglomerative",
+        random_state: Optional[int] = None,
     ):
         self.encoder = encoder
+        self.random_state = random_state
         if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
             raise ValueError(feature_message)
         if isinstance(encoder, int):
@@ -174,7 +184,7 @@ def __init__(
         self.clustering = clustering
         if dimensionality_reduction is None:
             self.dimensionality_reduction = TSNE(
-                n_components=2, metric="cosine"
+                n_components=2, metric="cosine", random_state=random_state
             )
         else:
             self.dimensionality_reduction = dimensionality_reduction
@@ -196,7 +206,9 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
         )
         old_labels = [label for label in self.classes_ if label != -1]
         new_labels = AgglomerativeClustering(
-            n_clusters=n_reduce_to, metric="cosine", linkage="average"
+            n_clusters=n_reduce_to,
+            metric="cosine",
+            linkage="average",
         ).fit_predict(interesting_topic_vectors)
         res = {}
         if -1 in self.classes_:
@@ -235,7 +247,9 @@ def _estimate_parameters(
             self.labels_, classes=self.classes_
         )
         if self.feature_importance == "soft-c-tf-idf":
-            self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
+            self.components_ = soft_ctf_idf(
+                document_topic_matrix, doc_term_matrix
+            )  # type: ignore
         elif self.feature_importance == "centroid":
             self.components_ = cluster_centroid_distance(
                 self.topic_vectors_,
@@ -327,7 +341,7 @@ def fit_transform_dynamic(
         if embeddings is None:
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            if hasattr(self, 'components_'):
+            if hasattr(self, "components_"):
                 doc_topic_matrix = label_binarize(
                     self.labels_, classes=self.classes_
                 )
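In ClusteringTopicModel the seed is only forwarded to the default TSNE reducer, so for the reduction step a run with random_state set is roughly what you would get by seeding the reducer yourself. A sketch under that assumption (sklearn's TSNE, as in the diff; a custom reducer you pass in still needs its own seed):

from sklearn.manifold import TSNE

from turftopic import ClusteringTopicModel

# Seeded default reducer...
model = ClusteringTopicModel(random_state=42)
# ...roughly equivalent to constructing it by hand:
model = ClusteringTopicModel(
    dimensionality_reduction=TSNE(
        n_components=2, metric="cosine", random_state=42
    )
)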

turftopic/models/ctm.py

Lines changed: 7 additions & 1 deletion
@@ -1,4 +1,6 @@
 import math
+import random
+import sys
 from typing import Optional, Union
 
 import numpy as np
@@ -129,6 +131,8 @@ class AutoEncodingTopicModel(ContextualModel):
         Learning rate for the optimizer.
     n_epochs: int, default 50
         Number of epochs to run during training.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -144,8 +148,10 @@ def __init__(
         batch_size: int = 42,
         learning_rate: float = 1e-2,
         n_epochs: int = 50,
+        random_state: Optional[int] = None,
     ):
         self.n_components = n_components
+        self.random_state = random_state
         self.encoder = encoder
         if isinstance(encoder, str):
             self.encoder_ = SentenceTransformer(encoder)
@@ -205,7 +211,7 @@ def fit(
             status.update("Extracting terms.")
             document_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
-            seed = 0
+            seed = self.random_state or random.randint(0, sys.maxsize - 1)
             torch.manual_seed(seed)
             pyro.set_rng_seed(seed)
             device = torch.device(
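Here the fit path seeds torch and pyro from a single value; a standalone sketch of that fallback logic (seed_everything is a hypothetical helper name, not part of the commit):

import random
import sys

import pyro
import torch


def seed_everything(random_state=None):
    # Use the given seed, or draw a fresh one when random_state is None.
    seed = random_state or random.randint(0, sys.maxsize - 1)
    torch.manual_seed(seed)
    pyro.set_rng_seed(seed)
    return seed

With this pattern a fit without random_state draws a new seed every time, so only runs that set the argument are reproducible.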

turftopic/models/gmm.py

Lines changed: 9 additions & 2 deletions
@@ -54,6 +54,8 @@ class GMM(ContextualModel, DynamicTopicModel):
         result in Gaussian components.
         For even larger datasets you can use IncrementalPCA to reduce
         memory load.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
 
     Attributes
     ----------
@@ -71,11 +73,13 @@ def __init__(
         dimensionality_reduction: Optional[TransformerMixin] = None,
         weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
         gamma: Optional[float] = None,
+        random_state: Optional[int] = None,
     ):
         self.n_components = n_components
         self.encoder = encoder
         self.weight_prior = weight_prior
         self.gamma = gamma
+        self.random_state = random_state
         if isinstance(encoder, str):
             self.encoder_ = SentenceTransformer(encoder)
         else:
@@ -94,9 +98,12 @@ def __init__(
                     else "dirichlet_process"
                 ),
                 weight_concentration_prior=gamma,
+                random_state=self.random_state,
             )
         else:
-            mixture = GaussianMixture(n_components)
+            mixture = GaussianMixture(
+                n_components, random_state=self.random_state
+            )
         if dimensionality_reduction is not None:
             self.gmm_ = make_pipeline(dimensionality_reduction, mixture)
         else:
@@ -162,7 +169,7 @@ def fit_transform_dynamic(
         bins: Union[int, list[datetime]] = 10,
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
-        if hasattr(self, 'components_'):
+        if hasattr(self, "components_"):
             doc_topic_matrix = self.transform(
                 raw_documents, embeddings=embeddings
             )
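A sketch of a fully seeded GMM setup (the PCA reducer is a hypothetical choice for illustration): the random_state argument reaches the underlying GaussianMixture or BayesianGaussianMixture, while a dimensionality reduction you pass in has to carry its own seed:

from sklearn.decomposition import PCA

from turftopic import GMM

model = GMM(
    10,
    dimensionality_reduction=PCA(n_components=20, random_state=42),
    random_state=42,
)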

turftopic/models/keynmf.py

Lines changed: 11 additions & 3 deletions
@@ -79,6 +79,8 @@ class KeyNMF(ContextualModel):
         is performed on the whole vocabulary ('corpus') or only
         using words that are included in the document ('document').
         Setting this to 'corpus' allows for multilingual topics.
+    random_state: int, default None
+        Random state to use so that results are exactly reproducible.
     """
 
     def __init__(
@@ -90,7 +92,9 @@ def __init__(
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
         keyword_scope: str = "document",
+        random_state: Optional[int] = None,
     ):
+        self.random_state = random_state
         if keyword_scope not in ["document", "corpus"]:
             raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
@@ -105,7 +109,7 @@ def __init__(
         else:
             self.vectorizer = vectorizer
         self.dict_vectorizer_ = DictVectorizer()
-        self.nmf_ = NMF(n_components)
+        self.nmf_ = NMF(n_components, random_state=self.random_state)
         self.keyword_scope = keyword_scope
 
     def extract_keywords(
@@ -172,7 +176,9 @@ def minibatch_train(
         console=None,
     ):
         self.dict_vectorizer_.fit(keywords)
-        self.nmf_ = MiniBatchNMF(self.n_components)
+        self.nmf_ = MiniBatchNMF(
+            self.n_components, random_state=self.random_state
+        )
         epoch_costs = []
         for i_epoch in range(max_epochs):
             epoch_cost = 0
@@ -220,7 +226,9 @@ def big_fit(
             console.log("Keywords extracted.")
             keywords = KeywordIterator(keyword_file)
             status.update("Fitting NMF.")
-            self.minibatch_train(keywords, max_epochs, batch_size, console=console)  # type: ignore
+            self.minibatch_train(
+                keywords, max_epochs, batch_size, console=console
+            )  # type: ignore
             console.log("NMF fitted.")
             return self
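Finally, a quick end-to-end reproducibility check, sketched under the assumptions that KeyNMF is importable from the package root, that fit_transform returns a document-topic matrix, and that the default embedding model encodes deterministically (the toy corpus is purely illustrative):

import numpy as np

from turftopic import KeyNMF

corpus = [
    "stock markets fell sharply today",
    "investors worry about rising interest rates",
    "the home team won the championship game",
    "fans celebrated the victory downtown",
]

# Two fits with the same seed should yield identical factorizations.
doc_topic_a = KeyNMF(2, random_state=42).fit_transform(corpus)
doc_topic_b = KeyNMF(2, random_state=42).fit_transform(corpus)
np.testing.assert_allclose(doc_topic_a, doc_topic_b)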
