Commit 4fcab17

linting
1 parent 22129ff commit 4fcab17

7 files changed: 107 additions & 55 deletions

File tree

    tests/test_integration.py
    turftopic/base.py
    turftopic/dynamic.py
    turftopic/encoders/__init__.py
    turftopic/models/cluster.py
    turftopic/models/gmm.py
    turftopic/models/keynmf.py
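All hunks below apply the same mechanical style: single quotes become double quotes, trailing commas are added to multi-line literals, and calls or signatures that run past roughly 79 characters are wrapped onto indented continuation lines. This is consistent with an automatic formatter such as black run with a 79-character line length, though the commit message only says "linting", so the exact tool and configuration are an assumption.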

tests/test_integration.py

Lines changed: 18 additions & 16 deletions
@@ -13,22 +13,22 @@
     AutoEncodingTopicModel,
     ClusteringTopicModel,
     KeyNMF,
-    SemanticSignalSeparation
+    SemanticSignalSeparation,
 )


 def generate_dates(
-    n_dates: int,
+    n_dates: int,
 ) -> list[datetime]:
-    """ Generate random dates to test dynamic models """
-    dates = []
-    for n in range(n_dates):
-        d = np.random.randint(low=1, high=29)
-        m = np.random.randint(low=1, high=13)
-        y = np.random.randint(low=2000, high=2020)
-        date = datetime(year=y, month=m, day=d)
-        dates.append(date)
-    return dates
+    """Generate random dates to test dynamic models"""
+    dates = []
+    for n in range(n_dates):
+        d = np.random.randint(low=1, high=29)
+        m = np.random.randint(low=1, high=13)
+        y = np.random.randint(low=2000, high=2020)
+        date = datetime(year=y, month=m, day=d)
+        dates.append(date)
+    return dates


 newsgroups = fetch_20newsgroups(
@@ -46,8 +46,8 @@ def generate_dates(
 models = [
     GMM(5, encoder=trf),
     SemanticSignalSeparation(5, encoder=trf),
-    KeyNMF(5, encoder=trf, keyword_scope='document'),
-    KeyNMF(5, encoder=trf, keyword_scope='corpus'),
+    KeyNMF(5, encoder=trf, keyword_scope="document"),
+    KeyNMF(5, encoder=trf, keyword_scope="corpus"),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="c-tf-idf",
@@ -69,14 +69,14 @@ def generate_dates(
         n_reduce_to=5,
         feature_importance="centroid",
         encoder=trf,
-        reduction_method="smallest"
+        reduction_method="smallest",
     ),
     ClusteringTopicModel(
         n_reduce_to=5,
         feature_importance="soft-c-tf-idf",
         encoder=trf,
         reduction_method="smallest"
-    )
+    ),
 ]


@@ -94,7 +94,9 @@ def test_fit_export_table(model):
 @pytest.mark.parametrize("model", dynamic_models)
 def test_fit_dynamic(model):
     doc_topic_matrix = model.fit_transform_dynamic(
-        texts, embeddings=embeddings, timestamps=timestamps,
+        texts,
+        embeddings=embeddings,
+        timestamps=timestamps,
     )
     table = model.export_topics(format="csv")
     with tempfile.TemporaryDirectory() as tmpdirname:
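Read as a whole, the reformatted test helper from the first hunk above is the following; the imports are added here so the snippet stands alone, and the behavior is unchanged. The dynamic-model test then passes such dates as timestamps to fit_transform_dynamic, as the last hunk shows.

from datetime import datetime

import numpy as np


def generate_dates(
    n_dates: int,
) -> list[datetime]:
    """Generate random dates to test dynamic models"""
    dates = []
    for n in range(n_dates):
        d = np.random.randint(low=1, high=29)
        m = np.random.randint(low=1, high=13)
        y = np.random.randint(low=2000, high=2020)
        date = datetime(year=y, month=m, day=d)
        dates.append(date)
    return dates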

turftopic/base.py

Lines changed: 27 additions & 9 deletions
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
 class ContextualModel(ABC, TransformerMixin, BaseEstimator):
     """Base class for contextual topic models in Turftopic."""

-    def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
+    def get_topics(
+        self, top_k: int = 10
+    ) -> List[Tuple[Any, List[Tuple[str, float]]]]:
         """Returns high-level topic representations in form of the top K words
         in each topic.

@@ -135,8 +137,12 @@ def _highest_ranking_docs(
         except AttributeError:
             pass
         kth = min(top_k, document_topic_matrix.shape[0] - 1)
-        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
-        highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
+        highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
+            :kth
+        ]
+        highest = highest[
+            np.argsort(-document_topic_matrix[highest, topic_id])
+        ]
         scores = document_topic_matrix[highest, topic_id]
         columns = []
         columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
             topic_id, raw_documents, document_topic_matrix, top_k
         )
         table = Table(show_lines=True)
-        table.add_column("Document", justify="left", style="magenta", max_width=100)
+        table.add_column(
+            "Document", justify="left", style="magenta", max_width=100
+        )
         table.add_column("Score", style="blue", justify="right")
         for row in rows:
             table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
     ) -> list[list[str]]:
         if topic_dist is None:
             if text is None:
-                raise ValueError("You should either pass a text or a distribution.")
+                raise ValueError(
+                    "You should either pass a text or a distribution."
+                )
             try:
                 topic_dist = self.transform([text])
             except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
             rows.append([topic_names[ind], f"{score:.2f}"])
         return [columns, *rows]

-    def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
+    def print_topic_distribution(
+        self, text=None, topic_dist=None, top_k: int = 10
+    ):
         """Pretty prints topic distribution in a document.

         Parameters
@@ -330,7 +342,9 @@ def fit_transform(
         """
         pass

-    def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
+    def fit(
+        self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
+    ):
         """Fits model on the given corpus.

         Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
         if embeddings is None:
             embeddings = self.encode_documents(corpus)
         try:
-            document_topic_matrix = self.transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.transform(
+                corpus, embeddings=embeddings
+            )
         except (AttributeError, NotFittedError):
-            document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
+            document_topic_matrix = self.fit_transform(
+                corpus, embeddings=embeddings
+            )
         dtm = self.vectorizer.transform(corpus)  # type: ignore
         res: TopicData = {
             "corpus": corpus,

turftopic/dynamic.py

Lines changed: 3 additions & 1 deletion
@@ -199,7 +199,9 @@ def print_topics_over_time(
         show_scores: bool, default False
             Indicates whether to show importance scores for each word.
         """
-        columns, *rows = self._topics_over_time(top_k, show_scores, date_format)
+        columns, *rows = self._topics_over_time(
+            top_k, show_scores, date_format
+        )
         table = Table(show_lines=True)
         for column in columns:
             table.add_column(column)
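print_topics_over_time belongs to the dynamic topic model mixin whose call was rewrapped above. A sketch of the dynamic workflow it caps off, assuming GMM as the dynamic model and one timestamp per document in the spirit of the integration tests; the exact keyword names of print_topics_over_time beyond top_k and show_scores are not shown in this hunk:

from datetime import datetime

import numpy as np
from sklearn.datasets import fetch_20newsgroups

from turftopic import GMM

corpus = fetch_20newsgroups(subset="test").data[:300]
# One random timestamp per document, mirroring generate_dates() in the tests
timestamps = [
    datetime(
        year=np.random.randint(2000, 2020),
        month=np.random.randint(1, 13),
        day=np.random.randint(1, 29),
    )
    for _ in corpus
]

model = GMM(5)
model.fit_transform_dynamic(corpus, timestamps=timestamps)

# Prints the top words for each topic in each time bin; show_scores=True
# would also print each word's importance score (see the docstring above)
model.print_topics_over_time(top_k=5)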

turftopic/encoders/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -9,5 +9,5 @@
     "OpenAIEmbeddings",
     "VoyageEmbeddings",
     "ExternalEncoder",
-    "E5Encoder"
+    "E5Encoder",
 ]

turftopic/models/cluster.py

Lines changed: 40 additions & 20 deletions
@@ -53,7 +53,9 @@ def smallest_hierarchical_join(
     classes = list(classes_)
     while len(classes) > n_to:
         smallest = np.argmin(topic_sizes)
-        dist = cosine_distances(np.atleast_2d(topic_vectors[smallest]), topic_vectors)
+        dist = cosine_distances(
+            np.atleast_2d(topic_vectors[smallest]), topic_vectors
+        )
         closest = np.argsort(dist[0])[1]
         merge_inst.append((classes[smallest], classes[closest]))
         classes.pop(smallest)
@@ -68,7 +70,8 @@ def smallest_hierarchical_join(


 def calculate_topic_vectors(
-    cluster_labels: np.ndarray, embeddings: np.ndarray,
+    cluster_labels: np.ndarray,
+    embeddings: np.ndarray,
     time_index: Optional[np.ndarray] = None,
 ) -> np.ndarray:
     """Calculates topic centroids."""
@@ -138,20 +141,22 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):

     def __init__(
         self,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         clustering: Optional[ClusterMixin] = None,
         feature_importance: Literal[
             "c-tf-idf", "soft-c-tf-idf", "centroid"
         ] = "soft-c-tf-idf",
         n_reduce_to: Optional[int] = None,
-        reduction_method: Literal["agglomerative", "smallest"] = "agglomerative",
+        reduction_method: Literal[
+            "agglomerative", "smallest"
+        ] = "agglomerative",
     ):
         self.encoder = encoder
-        if feature_importance not in ["c-tf-idf",
-                                      "soft-c-tf-idf",
-                                      "centroid"]:
+        if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
             raise ValueError(feature_message)
         if isinstance(encoder, int):
             raise TypeError(integer_message)
@@ -168,7 +173,9 @@ def __init__(
         else:
             self.clustering = clustering
         if dimensionality_reduction is None:
-            self.dimensionality_reduction = TSNE(n_components=2, metric="cosine")
+            self.dimensionality_reduction = TSNE(
+                n_components=2, metric="cosine"
+            )
         else:
             self.dimensionality_reduction = dimensionality_reduction
         self.feature_importance = feature_importance
@@ -225,7 +232,9 @@ def _estimate_parameters(
         self.vocab_embeddings = self.encoder_.encode(
             self.vectorizer.get_feature_names_out()
         )  # type: ignore
-        document_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+        document_topic_matrix = label_binarize(
+            self.labels_, classes=self.classes_
+        )
         if self.feature_importance == "soft-c-tf-idf":
             self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix)  # type: ignore
         elif self.feature_importance == "centroid":
@@ -266,7 +275,9 @@ def fit_predict(
             self.doc_term_matrix = self.vectorizer.fit_transform(raw_documents)
             console.log("Term extraction done.")
             status.update("Reducing Dimensionality")
-            reduced_embeddings = self.dimensionality_reduction.fit_transform(embeddings)
+            reduced_embeddings = self.dimensionality_reduction.fit_transform(
+                embeddings
+            )
             console.log("Dimensionality reduction done.")
             status.update("Clustering documents")
             self.labels_ = self.clustering.fit_predict(reduced_embeddings)
@@ -279,7 +290,9 @@ def fit_predict(
             console.log("Parameter estimation done.")
         if self.n_reduce_to is not None:
             n_topics = self.classes_.shape[0]
-            status.update(f"Reducing topics from {n_topics} to {self.n_reduce_to}")
+            status.update(
+                f"Reducing topics from {n_topics} to {self.n_reduce_to}"
+            )
             if self.reduction_method == "agglomerative":
                 self.labels_ = self._merge_agglomerative(self.n_reduce_to)
             else:
@@ -316,25 +329,32 @@ def fit_transform_dynamic(
             embeddings = self.encoder_.encode(raw_documents)
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
             if self.components_ is not None:
-                doc_topic_matrix = label_binarize(self.labels_, classes=self.classes_)
+                doc_topic_matrix = label_binarize(
+                    self.labels_, classes=self.classes_
+                )
             else:
-                doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+                doc_topic_matrix = self.fit_transform(
+                    raw_documents, embeddings=embeddings
+                )
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             topic_importances = topic_importances / topic_importances.sum()
             t_doc_term_matrix = self.doc_term_matrix[time_labels == i_timebin]
             t_doc_topic_matrix = doc_topic_matrix[time_labels == i_timebin]
             if "c-tf-idf" in self.feature_importance:
-                if self.feature_importance == 'soft-c-tf-idf':
+                if self.feature_importance == "soft-c-tf-idf":
                     components = soft_ctf_idf(
-                        t_doc_topic_matrix,
-                        t_doc_term_matrix
+                        t_doc_topic_matrix, t_doc_term_matrix
                     )
-                elif self.feature_importance == 'c-tf-idf':
+                elif self.feature_importance == "c-tf-idf":
                     components = ctf_idf(t_doc_topic_matrix, t_doc_term_matrix)
-            elif self.feature_importance == 'centroid':
+            elif self.feature_importance == "centroid":
                 time_index = time_labels == i_timebin
                 t_topic_vectors = calculate_topic_vectors(
-                    self.labels_, embeddings, time_index,
+                    self.labels_,
+                    embeddings,
+                    time_index,
                 )
                 topic_mask = np.isnan(t_topic_vectors).all(
                     axis=1, keepdims=True
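The constructor rewrapped above keeps its parameters intact. A minimal sketch of the pipeline these hunks touch, with the "smallest" reduction strategy that smallest_hierarchical_join implements (repeatedly merging the smallest cluster into its closest neighbour by cosine distance between topic vectors until n_reduce_to topics remain); the corpus slice is illustrative:

from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(
    subset="test", categories=["sci.space", "rec.autos"]
).data[:500]

# Defaults visible in the hunks above: a MiniLM sentence-transformer encoder
# and TSNE(n_components=2, metric="cosine") for dimensionality reduction.
model = ClusteringTopicModel(
    feature_importance="centroid",
    n_reduce_to=5,
    reduction_method="smallest",
)
model.fit(corpus)
topics = model.get_topics(top_k=10)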

turftopic/models/gmm.py

Lines changed: 15 additions & 5 deletions
@@ -64,7 +64,9 @@ class GMM(ContextualModel, DynamicTopicModel):
     def __init__(
         self,
         n_components: int,
-        encoder: Union[Encoder, str] = "sentence-transformers/all-MiniLM-L6-v2",
+        encoder: Union[
+            Encoder, str
+        ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         dimensionality_reduction: Optional[TransformerMixin] = None,
         weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
@@ -118,7 +120,9 @@ def fit_transform(
             console.log("Mixture model fitted.")
             status.update("Estimating term importances.")
             document_topic_matrix = self.gmm_.predict_proba(embeddings)
-            self.components_ = soft_ctf_idf(document_topic_matrix, document_term_matrix)
+            self.components_ = soft_ctf_idf(
+                document_topic_matrix, document_term_matrix
+            )
             console.log("Model fitting done.")
         return document_topic_matrix

@@ -160,14 +164,20 @@ def fit_transform_dynamic(
     ):
         time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
         if self.components_ is not None:
-            doc_topic_matrix = self.transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.transform(
+                raw_documents, embeddings=embeddings
+            )
         else:
-            doc_topic_matrix = self.fit_transform(raw_documents, embeddings=embeddings)
+            doc_topic_matrix = self.fit_transform(
+                raw_documents, embeddings=embeddings
+            )
         document_term_matrix = self.vectorizer.transform(raw_documents)
         temporal_components = []
         temporal_importances = []
         for i_timebin in np.arange(len(self.time_bin_edges) - 1):
-            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(axis=0)
+            topic_importances = doc_topic_matrix[time_labels == i_timebin].sum(
+                axis=0
+            )
             # Normalizing
             topic_importances = topic_importances / topic_importances.sum()
             components = soft_ctf_idf(
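GMM's rewrapped lines sit in fit_transform and fit_transform_dynamic; both accept precomputed embeddings, which the integration tests pass in so the corpus is not re-encoded for every model. A sketch of that pattern; the weight_prior value is illustrative, taken from the options listed in the constructor above:

from sklearn.datasets import fetch_20newsgroups

from turftopic import GMM

corpus = fetch_20newsgroups(subset="test", categories=["sci.med"]).data[:300]

model = GMM(5, weight_prior="dirichlet")

# Encode once, then reuse the embeddings across fit and transform calls
embeddings = model.encode_documents(corpus)
doc_topic_matrix = model.fit_transform(corpus, embeddings=embeddings)

# Once fitted, transform() scores documents without refitting the mixture
new_scores = model.transform(corpus, embeddings=embeddings)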

turftopic/models/keynmf.py

Lines changed: 3 additions & 3 deletions
@@ -89,9 +89,9 @@ def __init__(
         ] = "sentence-transformers/all-MiniLM-L6-v2",
         vectorizer: Optional[CountVectorizer] = None,
         top_n: int = 25,
-        keyword_scope: str = 'document',
+        keyword_scope: str = "document",
     ):
-        if keyword_scope not in ['document', 'corpus']:
+        if keyword_scope not in ["document", "corpus"]:
             raise ValueError("keyword_scope must be 'document' or 'corpus'")
         self.n_components = n_components
         self.top_n = top_n
@@ -123,7 +123,7 @@ def extract_keywords(
         for i in range(total):
             terms = document_term_matrix[i, :].todense()
             embedding = embeddings[i].reshape(1, -1)
-            if self.keyword_scope == 'document':
+            if self.keyword_scope == "document":
                 mask = terms > 0
             else:
                 tot_freq = document_term_matrix.sum(axis=0)
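Only the quote style changes here; the keyword_scope validation and masking behave as before. A sketch of the two accepted values, based on the validation and masking branches shown above (the corpus slice is illustrative):

from sklearn.datasets import fetch_20newsgroups

from turftopic import KeyNMF

corpus = fetch_20newsgroups(
    subset="test", categories=["talk.politics.misc"]
).data[:200]

# "document": keyword candidates are masked to terms occurring in each document;
# "corpus": candidates appear to be drawn from corpus-level term frequencies instead
model = KeyNMF(5, keyword_scope="corpus")
model.fit(corpus)

# Anything else is rejected at construction time:
# KeyNMF(5, keyword_scope="sentence")  # ValueError: keyword_scope must be 'document' or 'corpus'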
