Commit 4fcab17

linting
1 parent 22129ff commit 4fcab17

7 files changed: 107 additions & 55 deletions

File tree

    tests/test_integration.py
    turftopic/base.py
    turftopic/dynamic.py
    turftopic/encoders/__init__.py
    turftopic/models/cluster.py
    turftopic/models/gmm.py
    turftopic/models/keynmf.py
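All hunks below apply the same mechanical style: single quotes become double quotes, trailing commas are added to multi-line literals, and calls or signatures that run past roughly 79 characters are wrapped onto indented continuation lines. This is consistent with an automatic formatter such as black run with a 79-character line length, though the commit message only says "linting", so the exact tool and configuration are an assumption.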

tests/test_integration.py

Lines changed: 18 additions & 16 deletions
@@ -13,22 +13,22 @@
     AutoEncodingTopicModel,
     ClusteringTopicModel,
     KeyNMF,
-    SemanticSignalSeparation
+    SemanticSignalSeparation,
 )


 def generate_dates(
-    n_dates: int,
+    n_dates: int,
 ) -> list[datetime]:
-    """ Generate random dates to test dynamic models """
-    dates = []
-    for n in range(n_dates):
-        d = np.random.randint(low=1, high=29)
-        m = np.random.randint(low=1, high=13)
-        y = np.random.randint(low=2000, high=2020)
-        date = datetime(year=y, month=m, day=d)
-        dates.append(date)
-    return dates
+    """Generate random dates to test dynamic models"""
+    dates = []
+    for n in range(n_dates):
+        d = np.random.randint(low=1, high=29)
+        m = np.random.randint(low=1, high=13)
+        y = np.random.randint(low=2000, high=2020)
+        date = datetime(year=y, month=m, day=d)
+        dates.append(date)
+    return dates


 newsgroups = fetch_20newsgroups(
@@ -46,8 +46,8 @@ def generate_dates(
 models = [
     GMM(5, encoder=trf),
     SemanticSignalSeparation(5, encoder=trf),
-    KeyNMF(5, encoder=trf, keyword_scope='document'),
-    KeyNMF(5, encoder=trf, keyword_scope='corpus'),
+    KeyNMF(5, encoder=trf, keyword_scope="document"),
+    KeyNMF(5, encoder=trf, keyword_scope="corpus"),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="c-tf-idf",
@@ -69,14 +69,14 @@ def generate_dates(
         n_reduce_to=5,
         feature_importance="centroid",
         encoder=trf,
-        reduction_method="smallest"
+        reduction_method="smallest",
     ),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="soft-c-tf-idf",
         encoder=trf,
         reduction_method="smallest"
-    )
+    ),
 ]


@@ -94,7 +94,9 @@ def test_fit_export_table(model):
 @pytest.mark.parametrize("model", dynamic_models)
 def test_fit_dynamic(model):
     doc_topic_matrix = model.fit_transform_dynamic(
-        texts, embeddings=embeddings, timestamps=timestamps,
+        texts,
+        embeddings=embeddings,
+        timestamps=timestamps,
     )
     table = model.export_topics(format="csv")
     with tempfile.TemporaryDirectory() as tmpdirname:
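Read as a whole, the reformatted test helper from the first hunk above is the following; the imports are added here so the snippet stands alone, and the behavior is unchanged. The dynamic-model test then passes such dates as timestamps to fit_transform_dynamic, as the last hunk shows.

from datetime import datetime

import numpy as np


def generate_dates(
    n_dates: int,
) -> list[datetime]:
    """Generate random dates to test dynamic models"""
    dates = []
    for n in range(n_dates):
        d = np.random.randint(low=1, high=29)
        m = np.random.randint(low=1, high=13)
        y = np.random.randint(low=2000, high=2020)
        date = datetime(year=y, month=m, day=d)
        dates.append(date)
    return dates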

turftopic/base.py

Lines changed: 27 additions & 9 deletions
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
 class ContextualModel(ABC, TransformerMixin, BaseEstimator):
     """Base class for contextual topic models in Turftopic."""

-    def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
+    def get_topics(
+        self, top_k: int = 10
+    ) -> List[Tuple[Any, List[Tuple[str, float]]]]:
         """Returns high-level topic representations in form of the top K words
         in each topic.

@@ -135,8 +137,12 @@ def _highest_ranking_docs(
         except AttributeError:
             pass
         kth = min(top_k, document_topic_matrix.shape[0] - 1)
-        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
-        highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
+        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
+            :kth
+        ]
+        highest = highest[
+            np.argsort(-document_topic_matrix[highest, topic_id])
+        ]
         scores = document_topic_matrix[highest, topic_id]
         columns = []
         columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
             topic_id, raw_documents, document_topic_matrix, top_k
         )
         table = Table(show_lines=True)
-        table.add_column("Document", justify="left", style="magenta", max_width=100)
+        table.add_column(
+            "Document", justify="left", style="magenta", max_width=100
+        )
         table.add_column("Score", style="blue", justify="right")
         for row in rows:
             table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
     ) -> list[list[str]]:
         if topic_dist is None:
             if text is None:
-                raise ValueError("You should either pass a text or a distribution.")
+                raise ValueError(
+                    "You should either pass a text or a distribution."
+                )
             try:
                 topic_dist = self.transform([text])
             except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
             rows.append([topic_names[ind], f"{score:.2f}"])
         return [columns, *rows]

-    def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
+    def print_topic_distribution(
+        self, text=None, topic_dist=None, top_k: int = 10
+    ):
         """Pretty prints topic distribution in a document.

         Parameters
@@ -330,7 +342,9 @@ def fit_transform(
         """
         pass

-    def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
+    def fit(
+        self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
+    ):
         """Fits model on the given corpus.

         Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
         if embeddings is None:
             embeddings = self.encode_documents(corpus)
         try:
-            document_topic_matrix = self.transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.transform(
+                corpus, embeddings=embeddings
+            )
         except (AttributeError, NotFittedError):
-            document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.fit_transform(
+                corpus, embeddings=embeddings
+            )
         dtm = self.vectorizer.transform(corpus)  # type: ignore
         res: TopicData = {
             "corpus": corpus,

turftopic/dynamic.py

Lines changed: 3 additions & 1 deletion
@@ -199,7 +199,9 @@ def print_topics_over_time(
         show_scores: bool, default False
             Indicates whether to show importance scores for each word.
         """
-        columns, *rows = self._topics_over_time(top_k, show_scores, date_format)
+        columns, *rows = self._topics_over_time(
+            top_k, show_scores, date_format
+        )
         table = Table(show_lines=True)
         for column in columns:
             table.add_column(column)
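print_topics_over_time belongs to the dynamic topic model mixin whose call was rewrapped above. A sketch of the dynamic workflow it caps off, assuming GMM as the dynamic model and one timestamp per document in the spirit of the integration tests; the exact keyword names of print_topics_over_time beyond top_k and show_scores are not shown in this hunk:

from datetime import datetime

import numpy as np
from sklearn.datasets import fetch_20newsgroups

from turftopic import GMM

corpus = fetch_20newsgroups(subset="test").data[:300]
# One random timestamp per document, mirroring generate_dates() in the tests
timestamps = [
    datetime(
        year=np.random.randint(2000, 2020),
        month=np.random.randint(1, 13),
        day=np.random.randint(1, 29),
    )
    for _ in corpus
]

model = GMM(5)
model.fit_transform_dynamic(corpus, timestamps=timestamps)

# Prints the top words for each topic in each time bin; show_scores=True
# would also print each word's importance score (see the docstring above)
model.print_topics_over_time(top_k=5)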

turftopic/encoders/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -9,5 +9,5 @@
     "OpenAIEmbeddings",
     "VoyageEmbeddings",
     "ExternalEncoder",
-    "E5Encoder"
+    "E5Encoder",
 ]

turftopic/models/cluster.py

Lines changed: 40 additions & 20 deletions
@@ -53,7 +53,9 @@ def smallest_hierarchical_join(
     classes = list(classes_)
     while len(classes) > n_to:
         smallest = np.argmin(topic_sizes)
-        dist = cosine_distances(np.atleast_2d(topic_vectors[smallest]), topic_vectors)
+        dist = cosine_distances(
+            np.atleast_2d(topic_vectors[smallest]), topic_vectors
+        )
         closest = np.argsort(dist[0])[1]
         merge_inst.append((classes[smallest], classes[closest]))
         classes.pop(smallest)
@@ -68,7 +70,8 @@ def smallest_hierarchical_join(


 def calculate_topic_vectors(
-    cluster_labels: np.ndarray, embeddings: np.ndarray,
+    cluster_labels: np.ndarray,
+    embeddings: np.ndarray,
     time_index: Optional[np.ndarray] = None,
 ) -> np.ndarray:
     """Calculates topic centroids."""
@@ -138,20 +141,22 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):

     def __init__(
         self,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         clustering: Optional[ClusterMixin] = None,
         feature_importance: Literal[
             "c-tf-idf", "soft-c-tf-idf", "centroid"
         ] = "soft-c-tf-idf",
         n_reduce_to: Optional[int] = None,
-        reduction_method: Literal["agglomerative", "smallest"] = "agglomerative",
+        reduction_method: Literal[
+            "agglomerative", "smallest"
+        ] = "agglomerative",
     ):
         self.encoder = encoder
-        if feature_importance not in ["c-tf-idf",
-                                      "soft-c-tf-idf",
-                                      "centroid"]:
+        if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
             raise ValueError(feature_message)
         if isinstance(encoder, int):
             raise TypeError(integer_message)
@@ -168,7 +173,9 @@ def __init__(
         else:
             self.clustering = clustering
         if dimensionality_reduction is None:
-            self.dimensionality_reduction = TSNE(n_components=2, metric="cosine")
+            self.dimensionality_reduction = TSNE(
+                n_components=2, metric="cosine"
+            )
         else:
             self.dimensionality_reduction = dimensionality_reduction
         self.feature_importance = feature_importance
@@ -225,7 +232,9 @@ def _estimate_parameters(
         self.vocab_embeddings = self.encoder_.encode(
             self.vectorizer.get_feature_names_out()
         )  # type: ignore
-        document_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+        document_topic_matrix = label_binarize(
+            self.labels_, classes=self.classes_
+        )
         if self.feature_importance == "soft-c-tf-idf":
             self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
         elif self.feature_importance == "centroid":
@@ -266,7 +275,9 @@ def fit_predict(
             self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
             status.update("Reducing Dimensionality")
-            reduced_embeddings = self.dimensionality_reduction.fit_transform(embeddings)
+            reduced_embeddings = self.dimensionality_reduction.fit_transform(
+                embeddings
+            )
             console.log("Dimensionality reduction done.")
             status.update("Clustering documents")
             self.labels_ = self.clustering.fit_predict(reduced_embeddings)
@@ -279,7 +290,9 @@ def fit_predict(
             console.log("Parameter estimation done.")
         if self.n_reduce_to is not None:
             n_topics = self.classes_.shape[0]
-            status.update(f"Reducing topics from {n_topics} to {self.n_reduce_to}")
+            status.update(
+                f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+            )
             if self.reduction_method == "agglomerative":
                 self.labels_ = self._merge_agglomerative(self.n_reduce_to)
             else:
@@ -316,25 +329,32 @@ def fit_transform_dynamic(
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
             if self.components_ is not None:
-                doc_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+                doc_topic_matrix = label_binarize(
+                    self.labels_, classes=self.classes_
+                )
             else:
-                doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+                doc_topic_matrix = self.fit_transform(
+                    raw_documents, embeddings=embeddings
+                )
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             topic_importances = topic_importances / topic_importances.sum()
             t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
             t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
             if "c-tf-idf" in self.feature_importance:
-                if self.feature_importance == 'soft-c-tf-idf':
+                if self.feature_importance == "soft-c-tf-idf":
                     components = soft_ctf_idf(
-                        t_doc_topic_matrix,
-                        t_doc_term_matrix
+                        t_doc_topic_matrix, t_doc_term_matrix
                     )
-                elif self.feature_importance == 'c-tf-idf':
+                elif self.feature_importance == "c-tf-idf":
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
-            elif self.feature_importance == 'centroid':
+            elif self.feature_importance == "centroid":
                 time_index = time_labels == i_timebin
                 t_topic_vectors = calculate_topic_vectors(
-                    self.labels_, embeddings, time_index,
+                    self.labels_,
+                    embeddings,
+                    time_index,
                 )
                 topic_mask = np.isnan(t_topic_vectors).all(
                     axis=1, keepdims=True
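The constructor rewrapped above keeps its parameters intact. A minimal sketch of the pipeline these hunks touch, with the "smallest" reduction strategy that smallest_hierarchical_join implements (repeatedly merging the smallest cluster into its closest neighbour by cosine distance between topic vectors until n_reduce_to topics remain); the corpus slice is illustrative:

from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(
    subset="test", categories=["sci.space", "rec.autos"]
).data[:500]

# Defaults visible in the hunks above: a MiniLM sentence-transformer encoder
# and TSNE(n_components=2, metric="cosine") for dimensionality reduction.
model = ClusteringTopicModel(
    feature_importance="centroid",
    n_reduce_to=5,
    reduction_method="smallest",
)
model.fit(corpus)
topics = model.get_topics(top_k=10)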

turftopic/models/gmm.py

Lines changed: 15 additions & 5 deletions
@@ -64,7 +64,9 @@ class GMM(ContextualModel, DynamicTopicModel):
     def __init__(
         self,
         n_components: int,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
@@ -118,7 +120,9 @@ def fit_transform(
             console.log("Mixture model fitted.")
             status.update("Estimating term importances.")
             document_topic_matrix = self.gmm_.predict_proba(embeddings)
-            self.components_ = soft_ctf_idf(document_topic_matrix, document_term_matrix)
+            self.components_ = soft_ctf_idf(
+                document_topic_matrix, document_term_matrix
+            )
             console.log("Model fitting done.")
         return document_topic_matrix

@@ -160,14 +164,20 @@ def fit_transform_dynamic(
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
         if self.components_ is not None:
-            doc_topic_matrix = self.transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.transform(
+                raw_documents, embeddings=embeddings
+            )
         else:
-            doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.fit_transform(
+                raw_documents, embeddings=embeddings
+            )
         document_term_matrix = self.vectorizer.transform(raw_documents)
         temporal_components = []
         temporal_importances = []
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             # Normalizing
             topic_importances = topic_importances / topic_importances.sum()
             components = soft_ctf_idf(
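GMM's rewrapped lines sit in fit_transform and fit_transform_dynamic; both accept precomputed embeddings, which the integration tests pass in so the corpus is not re-encoded for every model. A sketch of that pattern; the weight_prior value is illustrative, taken from the options listed in the constructor above:

from sklearn.datasets import fetch_20newsgroups

from turftopic import GMM

corpus = fetch_20newsgroups(subset="test", categories=["sci.med"]).data[:300]

model = GMM(5, weight_prior="dirichlet")

# Encode once, then reuse the embeddings across fit and transform calls
embeddings = model.encode_documents(corpus)
doc_topic_matrix = model.fit_transform(corpus, embeddings=embeddings)

# Once fitted, transform() scores documents without refitting the mixture
new_scores = model.transform(corpus, embeddings=embeddings)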

turftopic/models/keynmf.py

Lines changed: 3 additions & 3 deletions
@@ -89,9 +89,9 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
-        keyword_scope: str = 'document',
+        keyword_scope: str = "document",
     ):
-        if keyword_scope not in ['document', 'corpus']:
+        if keyword_scope not in ["document", "corpus"]:
             raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
         self.top_n = top_n
@@ -123,7 +123,7 @@ def extract_keywords(
         for i in range(total):
             terms = document_term_matrix[i, :].todense()
             embedding = embeddings[i].reshape(1, -1)
-            if self.keyword_scope == 'document':
+            if self.keyword_scope == "document":
                 mask = terms > 0
             else:
                 tot_freq = document_term_matrix.sum(axis=0)
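Only the quote style changes here; the keyword_scope validation and masking behave as before. A sketch of the two accepted values, based on the validation and masking branches shown above (the corpus slice is illustrative):

from sklearn.datasets import fetch_20newsgroups

from turftopic import KeyNMF

corpus = fetch_20newsgroups(
    subset="test", categories=["talk.politics.misc"]
).data[:200]

# "document": keyword candidates are masked to terms occurring in each document;
# "corpus": candidates appear to be drawn from corpus-level term frequencies instead
model = KeyNMF(5, keyword_scope="corpus")
model.fit(corpus)

# Anything else is rejected at construction time:
# KeyNMF(5, keyword_scope="sentence")  # ValueError: keyword_scope must be 'document' or 'corpus'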
