Skip to content

Commit a37354d

Browse files
authored
Merge pull request #28 from rbroc/dynamic-clustering
add dynamic topic modeling to clustering models
2 parents 870b3a7 + 21944dd commit a37354d

10 files changed

Lines changed: 216 additions & 34 deletions

File tree

docs/clustering.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ top2vec = ClusteringTopicModel(
188188
Theoretically the model descriptions above should result in the same behaviour as the other two packages, but there might be minor changes in implementation.
189189
We do not intend to keep up with changes in Top2Vec's and BERTopic's internal implementation details indefinitely.
190190

191+
### _(Optional)_ 5. Dynamic Modeling
192+
193+
Clustering models are also capable of dynamic topic modeling. This happens by fitting a clustering model over the entire corpus, as we expect that there is only one semantic model generating the documents.
194+
To gain temporal representations for topics, the corpus is divided into equal or arbitrarily chosen time slices, and then term importances are estimated using Soft-c-TF-IDF, c-TF-IDF, or distances from cluster centroid for each of the time slices separately. When distance from cluster centroids is used to estimate topic importances in dynamic modeling, cluster centroids are computed based on documents and terms present within a given time slice.
195+
191196
## Considerations
192197

193198
### Strengths

docs/dynamic.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ Dynamic topic models in Turftopic have a unified interface.
2828
To fit a dynamic topic model you will need a corpus, that has been annotated with timestamps.
2929
The timestamps need to be Python `datetime` objects, but pandas `Timestamp` objects are also supported.
3030

31-
Models that have dynamic modeling capabilities have a `fit_transform_dynamic()` method, that fits the model on the corpus over time.
31+
Models that have dynamic modeling capabilities (currently, `GMM` and `ClusteringTopicModel`) have a `fit_transform_dynamic()` method, that fits the model on the corpus over time.
3232

3333
```python
3434
from datetime import datetime

tests/test_integration.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime
12
import tempfile
23
from pathlib import Path
34

@@ -15,6 +16,21 @@
1516
SemanticSignalSeparation,
1617
)
1718

19+
20+
def generate_dates(
    n_dates: int,
) -> list[datetime]:
    """Generate `n_dates` random dates for testing dynamic topic models.

    Days are drawn from 1-28 so that every (year, month) combination is
    valid (February included); months span 1-12 and years 2000-2019.

    Parameters
    ----------
    n_dates: int
        Number of random dates to generate.

    Returns
    -------
    list[datetime]
        List of `n_dates` random datetimes.
    """
    # NOTE(review): uses the legacy global numpy RNG, so results depend on
    # np.random.seed state — fine for a smoke-test fixture.
    return [
        datetime(
            year=np.random.randint(low=2000, high=2020),
            month=np.random.randint(low=1, high=13),
            day=np.random.randint(low=1, high=29),
        )
        for _ in range(n_dates)
    ]
32+
33+
1834
newsgroups = fetch_20newsgroups(
1935
subset="all",
2036
categories=[
@@ -25,12 +41,13 @@
2541
texts = newsgroups.data
2642
trf = SentenceTransformer("all-MiniLM-L6-v2")
2743
embeddings = np.asarray(trf.encode(texts))
44+
timestamps = generate_dates(n_dates=len(texts))
2845

2946
models = [
3047
GMM(5, encoder=trf),
3148
SemanticSignalSeparation(5, encoder=trf),
32-
KeyNMF(5, encoder=trf, keyword_scope='document'),
33-
KeyNMF(5, encoder=trf, keyword_scope='corpus'),
49+
KeyNMF(5, encoder=trf, keyword_scope="document"),
50+
KeyNMF(5, encoder=trf, keyword_scope="corpus"),
3451
ClusteringTopicModel(
3552
n_reduce_to=5,
3653
feature_importance="c-tf-idf",
@@ -46,6 +63,22 @@
4663
AutoEncodingTopicModel(5, combined=True),
4764
]
4865

66+
dynamic_models = [
67+
GMM(5, encoder=trf),
68+
ClusteringTopicModel(
69+
n_reduce_to=5,
70+
feature_importance="centroid",
71+
encoder=trf,
72+
reduction_method="smallest",
73+
),
74+
ClusteringTopicModel(
75+
n_reduce_to=5,
76+
feature_importance="soft-c-tf-idf",
77+
encoder=trf,
78+
reduction_method="smallest"
79+
),
80+
]
81+
4982

5083
@pytest.mark.parametrize("model", models)
5184
def test_fit_export_table(model):
@@ -56,3 +89,18 @@ def test_fit_export_table(model):
5689
with out_path.open("w") as out_file:
5790
out_file.write(table)
5891
df = pd.read_csv(out_path)
92+
93+
94+
@pytest.mark.parametrize("model", dynamic_models)
def test_fit_dynamic(model):
    """Smoke test: dynamic models fit over time slices and export parseable CSV."""
    document_topic = model.fit_transform_dynamic(
        texts,
        embeddings=embeddings,
        timestamps=timestamps,
    )
    csv_table = model.export_topics(format="csv")
    with tempfile.TemporaryDirectory() as tmp_dir:
        csv_path = Path(tmp_dir) / "topics.csv"
        with csv_path.open("w") as csv_file:
            csv_file.write(csv_table)
        # Round-trip through pandas to verify the export is valid CSV.
        frame = pd.read_csv(csv_path)

turftopic/base.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ def remove_whitespace(text: str) -> str:
2323
class ContextualModel(ABC, TransformerMixin, BaseEstimator):
2424
"""Base class for contextual topic models in Turftopic."""
2525

26-
def get_topics(self, top_k: int = 10) -> List[Tuple[Any, List[Tuple[str, float]]]]:
26+
def get_topics(
27+
self, top_k: int = 10
28+
) -> List[Tuple[Any, List[Tuple[str, float]]]]:
2729
"""Returns high-level topic representations in form of the top K words
2830
in each topic.
2931
@@ -135,8 +137,12 @@ def _highest_ranking_docs(
135137
except AttributeError:
136138
pass
137139
kth = min(top_k, document_topic_matrix.shape[0] - 1)
138-
highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[:kth]
139-
highest = highest[np.argsort(-document_topic_matrix[highest, topic_id])]
140+
highest = np.argpartition(-document_topic_matrix[:, topic_id], kth)[
141+
:kth
142+
]
143+
highest = highest[
144+
np.argsort(-document_topic_matrix[highest, topic_id])
145+
]
140146
scores = document_topic_matrix[highest, topic_id]
141147
columns = []
142148
columns.append("Document")
@@ -171,7 +177,9 @@ def print_highest_ranking_documents(
171177
topic_id, raw_documents, document_topic_matrix, top_k
172178
)
173179
table = Table(show_lines=True)
174-
table.add_column("Document", justify="left", style="magenta", max_width=100)
180+
table.add_column(
181+
"Document", justify="left", style="magenta", max_width=100
182+
)
175183
table.add_column("Score", style="blue", justify="right")
176184
for row in rows:
177185
table.add_row(*row)
@@ -223,7 +231,9 @@ def _topic_distribution(
223231
) -> list[list[str]]:
224232
if topic_dist is None:
225233
if text is None:
226-
raise ValueError("You should either pass a text or a distribution.")
234+
raise ValueError(
235+
"You should either pass a text or a distribution."
236+
)
227237
try:
228238
topic_dist = self.transform([text])
229239
except AttributeError:
@@ -248,7 +258,9 @@ def _topic_distribution(
248258
rows.append([topic_names[ind], f"{score:.2f}"])
249259
return [columns, *rows]
250260

251-
def print_topic_distribution(self, text=None, topic_dist=None, top_k: int = 10):
261+
def print_topic_distribution(
262+
self, text=None, topic_dist=None, top_k: int = 10
263+
):
252264
"""Pretty prints topic distribution in a document.
253265
254266
Parameters
@@ -330,7 +342,9 @@ def fit_transform(
330342
"""
331343
pass
332344

333-
def fit(self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None):
345+
def fit(
346+
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None
347+
):
334348
"""Fits model on the given corpus.
335349
336350
Parameters
@@ -396,9 +410,13 @@ def prepare_topic_data(
396410
if embeddings is None:
397411
embeddings = self.encode_documents(corpus)
398412
try:
399-
document_topic_matrix = self.transform(corpus, embeddings=embeddings)
413+
document_topic_matrix = self.transform(
414+
corpus, embeddings=embeddings
415+
)
400416
except (AttributeError, NotFittedError):
401-
document_topic_matrix = self.fit_transform(corpus, embeddings=embeddings)
417+
document_topic_matrix = self.fit_transform(
418+
corpus, embeddings=embeddings
419+
)
402420
dtm = self.vectorizer.transform(corpus) # type: ignore
403421
res: TopicData = {
404422
"corpus": corpus,

turftopic/dynamic.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,9 @@ def print_topics_over_time(
199199
show_scores: bool, default False
200200
Indicates whether to show importance scores for each word.
201201
"""
202-
columns, *rows = self._topics_over_time(top_k, show_scores, date_format)
202+
columns, *rows = self._topics_over_time(
203+
top_k, show_scores, date_format
204+
)
203205
table = Table(show_lines=True)
204206
for column in columns:
205207
table.add_column(column)

turftopic/encoders/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
"OpenAIEmbeddings",
1010
"VoyageEmbeddings",
1111
"ExternalEncoder",
12-
"E5Encoder"
12+
"E5Encoder",
1313
]

turftopic/encoders/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import itertools
2+
from typing import Iterable, List
3+
4+
5+
def batched(iterable, n: int) -> Iterable[List[str]]:
    """Batch data into lists of length n. The last batch may be shorter.

    >>> ["".join(b) for b in batched("ABCDEFG", 3)]
    ['ABC', 'DEF', 'G']

    Parameters
    ----------
    iterable
        Any iterable to be consumed in batches.
    n: int
        Maximum size of each batch.

    Yields
    ------
    list
        Consecutive batches of up to n items.

    Raises
    ------
    ValueError
        If n is smaller than one.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # islice pulls at most n items per pass; the walrus stops the loop on
    # the first empty batch, i.e. when the iterator is exhausted.
    while batch := list(itertools.islice(it, n)):
        yield batch

0 commit comments

Comments
 (0)