Skip to content

Commit 5731bdc

Browse files
Merge pull request #38 from x-tabdeveloping/s3_arguments
Random States for models and max_iter for S3
2 parents 56aeff0 + ca4a95a commit 5731bdc

5 files changed

Lines changed: 68 additions & 23 deletions

File tree

turftopic/models/cluster.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,14 @@ class ClusteringTopicModel(ContextualModel, ClusterMixin, DynamicTopicModel):
137137
The specified reduction method will be used to merge them.
138138
By default, topics are not merged.
139139
reduction_method: 'agglomerative', 'smallest'
140+
Method used to reduce the number of topics post-hoc.
141+
When 'agglomerative', BERTopic's topic reduction method is used,
142+
where topic vectors are hierarchically clustered.
143+
When 'smallest', the smallest topic gets merged into the closest
144+
non-outlier cluster until the desired number
145+
is achieved similarly to Top2Vec.
146+
random_state: int, default None
147+
Random state to use so that results are exactly reproducible.
140148
"""
141149

142150
def __init__(
@@ -154,8 +162,10 @@ def __init__(
154162
reduction_method: Literal[
155163
"agglomerative", "smallest"
156164
] = "agglomerative",
165+
random_state: Optional[int] = None,
157166
):
158167
self.encoder = encoder
168+
self.random_state = random_state
159169
if feature_importance not in ["c-tf-idf", "soft-c-tf-idf", "centroid"]:
160170
raise ValueError(feature_message)
161171
if isinstance(encoder, int):
@@ -174,7 +184,7 @@ def __init__(
174184
self.clustering = clustering
175185
if dimensionality_reduction is None:
176186
self.dimensionality_reduction = TSNE(
177-
n_components=2, metric="cosine"
187+
n_components=2, metric="cosine", random_state=random_state
178188
)
179189
else:
180190
self.dimensionality_reduction = dimensionality_reduction
@@ -196,7 +206,9 @@ def _merge_agglomerative(self, n_reduce_to: int) -> np.ndarray:
196206
)
197207
old_labels = [label for label in self.classes_ if label != -1]
198208
new_labels = AgglomerativeClustering(
199-
n_clusters=n_reduce_to, metric="cosine", linkage="average"
209+
n_clusters=n_reduce_to,
210+
metric="cosine",
211+
linkage="average",
200212
).fit_predict(interesting_topic_vectors)
201213
res = {}
202214
if -1 in self.classes_:
@@ -235,7 +247,9 @@ def _estimate_parameters(
235247
self.labels_, classes=self.classes_
236248
)
237249
if self.feature_importance == "soft-c-tf-idf":
238-
self.components_ = soft_ctf_idf(document_topic_matrix, doc_term_matrix) # type: ignore
250+
self.components_ = soft_ctf_idf(
251+
document_topic_matrix, doc_term_matrix
252+
) # type: ignore
239253
elif self.feature_importance == "centroid":
240254
self.components_ = cluster_centroid_distance(
241255
self.topic_vectors_,
@@ -327,7 +341,7 @@ def fit_transform_dynamic(
327341
if embeddings is None:
328342
embeddings = self.encoder_.encode(raw_documents)
329343
for i_timebin in np.arange(len(self.time_bin_edges) - 1):
330-
if hasattr(self, 'components_'):
344+
if hasattr(self, "components_"):
331345
doc_topic_matrix = label_binarize(
332346
self.labels_, classes=self.classes_
333347
)

turftopic/models/ctm.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import math
2+
import random
23
from typing import Optional, Union
34

45
import numpy as np
@@ -129,6 +130,8 @@ class AutoEncodingTopicModel(ContextualModel):
129130
Learning rate for the optimizer.
130131
n_epochs: int, default 50
131132
Number of epochs to run during training.
133+
random_state: int, default None
134+
Random state to use so that results are exactly reproducible.
132135
"""
133136

134137
def __init__(
@@ -144,8 +147,10 @@ def __init__(
144147
batch_size: int = 42,
145148
learning_rate: float = 1e-2,
146149
n_epochs: int = 50,
150+
random_state: Optional[int] = None,
147151
):
148152
self.n_components = n_components
153+
self.random_state = random_state
149154
self.encoder = encoder
150155
if isinstance(encoder, str):
151156
self.encoder_ = SentenceTransformer(encoder)
@@ -205,7 +210,7 @@ def fit(
205210
status.update("Extracting terms.")
206211
document_term_matrix = self.vectorizer.fit_transform(raw_documents)
207212
console.log("Term extraction done.")
208-
seed = 0
213+
seed = self.random_state or random.randint(0, 10_000)
209214
torch.manual_seed(seed)
210215
pyro.set_rng_seed(seed)
211216
device = torch.device(

turftopic/models/decomp.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
import numpy as np
44
from rich.console import Console
55
from sentence_transformers import SentenceTransformer
6-
from sklearn.decomposition import PCA, FastICA
6+
from sklearn.base import TransformerMixin
7+
from sklearn.decomposition import FastICA
78
from sklearn.feature_extraction.text import CountVectorizer
89

910
from turftopic.base import ContextualModel, Encoder
@@ -20,33 +21,40 @@ class SemanticSignalSeparation(ContextualModel):
2021
2122
corpus: list[str] = ["some text", "more text", ...]
2223
23-
model = SemanticSignalSeparation(10, objective="independence").fit(corpus)
24+
model = SemanticSignalSeparation(10).fit(corpus)
2425
model.print_topics()
2526
```
2627
2728
Parameters
2829
----------
29-
n_components: int
30+
n_components: int, default 10
3031
Number of topics.
3132
encoder: str or SentenceTransformer
3233
Model to encode documents/terms, all-MiniLM-L6-v2 is the default.
3334
vectorizer: CountVectorizer, default None
3435
Vectorizer used for term extraction.
3536
Can be used to prune or filter the vocabulary.
36-
objective: 'orthogonality' or 'independence', default 'independence'
37-
Indicates what the components should be optimized for.
38-
When 'orthogonality', PCA is used to discover components,
39-
when 'independence', ICA is used to discover components.
37+
decomposition: TransformerMixin, default None
38+
Custom decomposition method to use.
39+
Can be an instance of FastICA or PCA, or basically any dimensionality
40+
reduction method. Has to have `fit_transform` and `fit` methods.
41+
If not specified, FastICA is used.
42+
max_iter: int, default 200
43+
Maximum number of iterations for ICA.
44+
random_state: int, default None
45+
Random state to use so that results are exactly reproducible.
4046
"""
4147

4248
def __init__(
4349
self,
44-
n_components: int,
50+
n_components: int = 10,
4551
encoder: Union[
4652
Encoder, str
4753
] = "sentence-transformers/all-MiniLM-L6-v2",
4854
vectorizer: Optional[CountVectorizer] = None,
49-
objective: Literal["orthogonality", "independence"] = "independence",
55+
decomposition: Optional[TransformerMixin] = None,
56+
max_iter: int = 200,
57+
random_state: Optional[int] = None,
5058
):
5159
self.n_components = n_components
5260
self.encoder = encoder
@@ -58,11 +66,14 @@ def __init__(
5866
self.vectorizer = default_vectorizer()
5967
else:
6068
self.vectorizer = vectorizer
61-
self.objective = objective
62-
if objective == "independence":
63-
self.decomposition = FastICA(n_components)
69+
self.max_iter = max_iter
70+
self.random_state = random_state
71+
if decomposition is None:
72+
self.decomposition = FastICA(
73+
n_components, max_iter=max_iter, random_state=random_state
74+
)
6475
else:
65-
self.decomposition = PCA(n_components)
76+
self.decomposition = decomposition
6677

6778
def fit_transform(
6879
self, raw_documents, y=None, embeddings: Optional[np.ndarray] = None

turftopic/models/gmm.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ class GMM(ContextualModel, DynamicTopicModel):
5454
result in Gaussian components.
5555
For even larger datasets you can use IncrementalPCA to reduce
5656
memory load.
57+
random_state: int, default None
58+
Random state to use so that results are exactly reproducible.
5759
5860
Attributes
5961
----------
@@ -71,11 +73,13 @@ def __init__(
7173
dimensionality_reduction: Optional[TransformerMixin] = None,
7274
weight_prior: Literal["dirichlet", "dirichlet_process", None] = None,
7375
gamma: Optional[float] = None,
76+
random_state: Optional[int] = None,
7477
):
7578
self.n_components = n_components
7679
self.encoder = encoder
7780
self.weight_prior = weight_prior
7881
self.gamma = gamma
82+
self.random_state = random_state
7983
if isinstance(encoder, str):
8084
self.encoder_ = SentenceTransformer(encoder)
8185
else:
@@ -94,9 +98,12 @@ def __init__(
9498
else "dirichlet_process"
9599
),
96100
weight_concentration_prior=gamma,
101+
random_state=self.random_state,
97102
)
98103
else:
99-
mixture = GaussianMixture(n_components)
104+
mixture = GaussianMixture(
105+
n_components, random_state=self.random_state
106+
)
100107
if dimensionality_reduction is not None:
101108
self.gmm_ = make_pipeline(dimensionality_reduction, mixture)
102109
else:
@@ -162,7 +169,7 @@ def fit_transform_dynamic(
162169
bins: Union[int, list[datetime]] = 10,
163170
):
164171
time_labels, self.time_bin_edges = bin_timestamps(timestamps, bins)
165-
if hasattr(self, 'components_'):
172+
if hasattr(self, "components_"):
166173
doc_topic_matrix = self.transform(
167174
raw_documents, embeddings=embeddings
168175
)

turftopic/models/keynmf.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ class KeyNMF(ContextualModel):
7979
is performed on the whole vocabulary ('corpus') or only
8080
using words that are included in the document ('document').
8181
Setting this to 'corpus' allows for multilingual topics.
82+
random_state: int, default None
83+
Random state to use so that results are exactly reproducible.
8284
"""
8385

8486
def __init__(
@@ -90,7 +92,9 @@ def __init__(
9092
vectorizer: Optional[CountVectorizer] = None,
9193
top_n: int = 25,
9294
keyword_scope: str = "document",
95+
random_state: Optional[int] = None,
9396
):
97+
self.random_state = random_state
9498
if keyword_scope not in ["document", "corpus"]:
9599
raise ValueError("keyword_scope must be 'document' or 'corpus'")
96100
self.n_components = n_components
@@ -105,7 +109,7 @@ def __init__(
105109
else:
106110
self.vectorizer = vectorizer
107111
self.dict_vectorizer_ = DictVectorizer()
108-
self.nmf_ = NMF(n_components)
112+
self.nmf_ = NMF(n_components, random_state=self.random_state)
109113
self.keyword_scope = keyword_scope
110114

111115
def extract_keywords(
@@ -172,7 +176,9 @@ def minibatch_train(
172176
console=None,
173177
):
174178
self.dict_vectorizer_.fit(keywords)
175-
self.nmf_ = MiniBatchNMF(self.n_components)
179+
self.nmf_ = MiniBatchNMF(
180+
self.n_components, random_state=self.random_state
181+
)
176182
epoch_costs = []
177183
for i_epoch in range(max_epochs):
178184
epoch_cost = 0
@@ -220,7 +226,9 @@ def big_fit(
220226
console.log("Keywords extracted.")
221227
keywords = KeywordIterator(keyword_file)
222228
status.update("Fitting NMF.")
223-
self.minibatch_train(keywords, max_epochs, batch_size, console=console) # type: ignore
229+
self.minibatch_train(
230+
keywords, max_epochs, batch_size, console=console
231+
) # type: ignore
224232
console.log("NMF fitted.")
225233
return self
226234

0 commit comments

Comments (0)