Commit f7e48d5

Removed Bayes importance method and added NPMI
1 parent 15919a0 commit f7e48d5

3 files changed

Lines changed: 29 additions & 41 deletions


turftopic/feature_importance.py

Lines changed: 19 additions & 29 deletions
```diff
@@ -207,37 +207,27 @@ def ctf_idf(
     return np.stack(components), idf_diag
 
 
-def bayes_rule(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+def npmi(
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    smoothing: int = 1,
 ) -> np.ndarray:
-    """Computes feature importance based on Bayes' rule.
-    The importance of a word for a topic is the probability of the topic conditional on the word.
-
-    $$p(t|w) = \\frac{p(w|t) * p(t)}{p(w)}$$
-
-    Parameters
-    ----------
-    doc_topic_matrix: np.ndarray
-        Document-topic matrix of shape (n_documents, n_topics)
-    doc_term_matrix: np.ndarray
-        Document-term matrix of shape (n_documents, vocab_size)
-
-    Returns
-    -------
-    ndarray of shape (n_topics, vocab_size)
-        Term importance matrix.
-    """
     eps = np.finfo(float).eps
-    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0)))
+    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) + smoothing
     p_w = p_w / p_w.sum()
     p_w[p_w <= 0] = eps
-    p_t = doc_topic_matrix.sum(axis=0)
+    p_t = doc_topic_matrix.sum(axis=0) + smoothing
     p_t = p_t / p_t.sum()
-    term_importance = doc_topic_matrix.T @ doc_term_matrix
-    overall_in_topic = np.abs(term_importance).sum(axis=1)
-    overall_in_topic[overall_in_topic <= 0] = eps
-    p_wt = (term_importance.T / (overall_in_topic)).T
-    p_wt /= p_wt.sum(axis=1)[:, None]
-    p_tw = (p_wt.T * p_t).T / p_w
-    p_tw /= np.nansum(p_tw, axis=0)
-    return p_tw
+    labels = np.argmax(doc_topic_matrix, axis=1)
+    p_wt = []
+    for i in np.arange(doc_topic_matrix.shape[1]):
+        _p_w = np.squeeze(np.asarray(doc_term_matrix[labels == i].sum(axis=0)))
+        _p_w = _p_w / _p_w.sum()
+        _p_w[_p_w <= 0] = eps
+        p_wt.append(_p_w)
+    p_wt = np.stack(p_wt)
+    log_p_wt = np.log2(p_wt)
+    numerator = log_p_wt - np.log2(p_w)
+    denominator = -(log_p_wt.T - np.log2(p_t)).T
+    res = numerator / denominator
+    return res
```
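The new `npmi` function replaces the Bayes-rule importance: documents are hard-assigned to the cluster with the highest topic weight, a smoothed word distribution is estimated per cluster, and each word's within-cluster probability is compared to its corpus-wide probability in log space. Transcribing the arithmetic in the diff, the returned score for topic $t$ and word $w$ is

$$score(t, w) = \frac{\log_2 p(w|t) - \log_2 p(w)}{-\left(\log_2 p(w|t) - \log_2 p(t)\right)}$$

Words that never occur in a cluster are clamped to machine epsilon before the logarithm is taken, so the scores stay finite. The sketch below shows how the function might be called; the import path and signature come from the diff above, while the toy matrices and the shape check are illustrative assumptions.

```python
# Illustrative sketch only: toy matrices for exercising the new npmi() function.
# The import path and signature follow the diff above; the data is made up.
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import npmi

# 3 documents, 2 topics: each row holds a document's topic weights.
doc_topic_matrix = np.array(
    [
        [0.9, 0.1],  # mostly topic 0
        [0.2, 0.8],  # mostly topic 1
        [0.7, 0.3],  # mostly topic 0
    ]
)
# 3 documents, 3 vocabulary items: raw term counts per document.
doc_term_matrix = spr.csr_matrix(
    [
        [2, 0, 1],
        [0, 3, 1],
        [1, 0, 2],
    ]
)

importance = npmi(doc_topic_matrix, doc_term_matrix, smoothing=1)
print(importance.shape)  # -> (2, 3): one row of word scores per topic
```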

turftopic/models/_hierarchical_clusters.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -10,11 +10,11 @@
 
 from turftopic.base import ContextualModel
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.hierarchical import TopicNode
@@ -221,8 +221,8 @@ def _estimate_children_components(self) -> dict[int, np.ndarray]:
                 self.model.embeddings,
                 self.model.vocab_embeddings,
             )
-        elif self.model.feature_importance == "bayes":
-            components = bayes_rule(
+        elif self.model.feature_importance == "npmi":
+            components = npmi(
                 document_topic_matrix, self.model.doc_term_matrix
             )
         else:
```

turftopic/models/cluster.py

Lines changed: 7 additions & 9 deletions
```diff
@@ -23,11 +23,11 @@
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.encoders.multimodal import MultimodalEncoder
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.models._hierarchical_clusters import (
@@ -64,7 +64,7 @@
     "soft-c-tf-idf",
     "c-tf-idf",
     "centroid",
-    "bayes",
+    "npmi",
     "linear",
     "fighting-words",
 ]
@@ -157,7 +157,7 @@ class ClusteringTopicModel(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
         'fighting-words' calculates word importances based on the Fighting Words
@@ -293,7 +293,7 @@ def estimate_components(
             'c-tf-idf' uses BERTopic's c-tf-idf.
             'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
             be very similar to 'c-tf-idf'.
-            'bayes' uses Bayes' rule.
+            'npmi' uses normalized pointwise mutual information between clusters and words.
            'linear' calculates most predictive directions in embedding space and projects
            words onto them.
            'fighting-words' calculates word importances based on the Fighting Words
@@ -564,7 +564,7 @@ def estimate_temporal_components(
            'c-tf-idf' uses BERTopic's c-tf-idf.
            'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
            be very similar to 'c-tf-idf'.
-            'bayes' uses Bayes' rule.
+            'npmi' uses normalized pointwise mutual information between clusters and words.
            'linear' calculates most predictive directions in embedding space and projects
            words onto them.
            'fighting-words' calculates word importances based on the Fighting Words
@@ -605,10 +605,8 @@ def estimate_temporal_components(
                 self.temporal_components_[i_timebin], _ = soft_ctf_idf(
                     t_doc_topic, t_dtm, return_idf=True
                 )
-            elif feature_importance == "bayes":
-                self.temporal_components_[i_timebin] = bayes_rule(
-                    t_doc_topic, t_dtm
-                )
+            elif feature_importance == "npmi":
+                self.temporal_components_[i_timebin] = npmi(t_doc_topic, t_dtm)
             elif feature_importance == "fighting-words":
                 self.temporal_components_[i_timebin] = fighting_words(
                     t_doc_topic, t_dtm
```
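For orientation, this is roughly how the renamed option would be selected from user code. The `feature_importance="npmi"` value and the `ClusteringTopicModel` name come from the diff above; the corpus loading and the `print_topics()` call are assumptions about the surrounding turftopic API, so treat this as a sketch rather than the documented usage.

```python
# Illustrative sketch: choosing the new "npmi" term-importance method.
# feature_importance="npmi" comes from the diff; the corpus and print_topics()
# are assumptions about the surrounding turftopic API.
from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data

model = ClusteringTopicModel(feature_importance="npmi")
model.fit(corpus)
model.print_topics()

# The docstring changes above also suggest the same option is accepted when
# re-estimating components after fitting (estimate_components /
# estimate_temporal_components), without re-running the clustering itself.
```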
