Commit f7e48d5

Removed Bayes importance method and added NPMI
1 parent 15919a0 commit f7e48d5

3 files changed

Lines changed: 29 additions & 41 deletions


turftopic/feature_importance.py

Lines changed: 19 additions & 29 deletions
```diff
@@ -207,37 +207,27 @@ def ctf_idf(
     return np.stack(components), idf_diag
 
 
-def bayes_rule(
-    doc_topic_matrix: np.ndarray, doc_term_matrix: spr.csr_matrix
+def npmi(
+    doc_topic_matrix: np.ndarray,
+    doc_term_matrix: spr.csr_matrix,
+    smoothing: int = 1,
 ) -> np.ndarray:
-    """Computes feature importance based on Bayes' rule.
-    The importance of a word for a topic is the probability of the topic conditional on the word.
-
-    $$p(t|w) = \\frac{p(w|t) * p(t)}{p(w)}$$
-
-    Parameters
-    ----------
-    doc_topic_matrix: np.ndarray
-        Document-topic matrix of shape (n_documents, n_topics)
-    doc_term_matrix: np.ndarray
-        Document-term matrix of shape (n_documents, vocab_size)
-
-    Returns
-    -------
-    ndarray of shape (n_topics, vocab_size)
-        Term importance matrix.
-    """
     eps = np.finfo(float).eps
-    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0)))
+    p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) + smoothing
     p_w = p_w / p_w.sum()
     p_w[p_w <= 0] = eps
-    p_t = doc_topic_matrix.sum(axis=0)
+    p_t = doc_topic_matrix.sum(axis=0) + smoothing
     p_t = p_t / p_t.sum()
-    term_importance = doc_topic_matrix.T @ doc_term_matrix
-    overall_in_topic = np.abs(term_importance).sum(axis=1)
-    overall_in_topic[overall_in_topic <= 0] = eps
-    p_wt = (term_importance.T / (overall_in_topic)).T
-    p_wt /= p_wt.sum(axis=1)[:, None]
-    p_tw = (p_wt.T * p_t).T / p_w
-    p_tw /= np.nansum(p_tw, axis=0)
-    return p_tw
+    labels = np.argmax(doc_topic_matrix, axis=1)
+    p_wt = []
+    for i in np.arange(doc_topic_matrix.shape[1]):
+        _p_w = np.squeeze(np.asarray(doc_term_matrix[labels == i].sum(axis=0)))
+        _p_w = _p_w / _p_w.sum()
+        _p_w[_p_w <= 0] = eps
+        p_wt.append(_p_w)
+    p_wt = np.stack(p_wt)
+    log_p_wt = np.log2(p_wt)
+    numerator = log_p_wt - np.log2(p_w)
+    denominator = -(log_p_wt.T - np.log2(p_t)).T
+    res = numerator / denominator
+    return res
```
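The new `npmi` function replaces the Bayes-rule importance: documents are hard-assigned to the cluster with the highest topic weight, a smoothed word distribution is estimated per cluster, and each word's within-cluster probability is compared to its corpus-wide probability in log space. Transcribing the arithmetic in the diff, the returned score for topic $t$ and word $w$ is

$$score(t, w) = \frac{\log_2 p(w|t) - \log_2 p(w)}{-\left(\log_2 p(w|t) - \log_2 p(t)\right)}$$

Words that never occur in a cluster are clamped to machine epsilon before the logarithm is taken, so the scores stay finite. The sketch below shows how the function might be called; the import path and signature come from the diff above, while the toy matrices and the shape check are illustrative assumptions.

```python
# Illustrative sketch only: toy matrices for exercising the new npmi() function.
# The import path and signature follow the diff above; the data is made up.
import numpy as np
import scipy.sparse as spr

from turftopic.feature_importance import npmi

# 3 documents, 2 topics: each row holds a document's topic weights.
doc_topic_matrix = np.array(
    [
        [0.9, 0.1],  # mostly topic 0
        [0.2, 0.8],  # mostly topic 1
        [0.7, 0.3],  # mostly topic 0
    ]
)
# 3 documents, 3 vocabulary items: raw term counts per document.
doc_term_matrix = spr.csr_matrix(
    [
        [2, 0, 1],
        [0, 3, 1],
        [1, 0, 2],
    ]
)

importance = npmi(doc_topic_matrix, doc_term_matrix, smoothing=1)
print(importance.shape)  # -> (2, 3): one row of word scores per topic
```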

turftopic/models/_hierarchical_clusters.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -10,11 +10,11 @@
 
 from turftopic.base import ContextualModel
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.hierarchical import TopicNode
@@ -221,8 +221,8 @@ def _estimate_children_components(self) -> dict[int, np.ndarray]:
                 self.model.embeddings,
                 self.model.vocab_embeddings,
             )
-        elif self.model.feature_importance == "bayes":
-            components = bayes_rule(
+        elif self.model.feature_importance == "npmi":
+            components = npmi(
                 document_topic_matrix, self.model.doc_term_matrix
             )
         else:
```

turftopic/models/cluster.py

Lines changed: 7 additions & 9 deletions
```diff
@@ -23,11 +23,11 @@
 from turftopic.dynamic import DynamicTopicModel
 from turftopic.encoders.multimodal import MultimodalEncoder
 from turftopic.feature_importance import (
-    bayes_rule,
     cluster_centroid_distance,
     ctf_idf,
     fighting_words,
     linear_classifier,
+    npmi,
     soft_ctf_idf,
 )
 from turftopic.models._hierarchical_clusters import (
@@ -64,7 +64,7 @@
     "soft-c-tf-idf",
     "c-tf-idf",
     "centroid",
-    "bayes",
+    "npmi",
     "linear",
     "fighting-words",
 ]
@@ -157,7 +157,7 @@ class ClusteringTopicModel(
         'c-tf-idf' uses BERTopic's c-tf-idf.
         'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
         be very similar to 'c-tf-idf'.
-        'bayes' uses Bayes' rule.
+        'npmi' uses normalized pointwise mutual information between clusters and words.
         'linear' calculates most predictive directions in embedding space and projects
         words onto them.
         'fighting-words' calculates word importances based on the Fighting Words
@@ -293,7 +293,7 @@ def estimate_components(
             'c-tf-idf' uses BERTopic's c-tf-idf.
             'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
             be very similar to 'c-tf-idf'.
-            'bayes' uses Bayes' rule.
+            'npmi' uses normalized pointwise mutual information between clusters and words.
            'linear' calculates most predictive directions in embedding space and projects
            words onto them.
            'fighting-words' calculates word importances based on the Fighting Words
@@ -564,7 +564,7 @@ def estimate_temporal_components(
            'c-tf-idf' uses BERTopic's c-tf-idf.
            'soft-c-tf-idf' uses Soft c-TF-IDF from GMM, the results should
            be very similar to 'c-tf-idf'.
-            'bayes' uses Bayes' rule.
+            'npmi' uses normalized pointwise mutual information between clusters and words.
            'linear' calculates most predictive directions in embedding space and projects
            words onto them.
            'fighting-words' calculates word importances based on the Fighting Words
@@ -605,10 +605,8 @@ def estimate_temporal_components(
                 self.temporal_components_[i_timebin], _ = soft_ctf_idf(
                     t_doc_topic, t_dtm, return_idf=True
                 )
-            elif feature_importance == "bayes":
-                self.temporal_components_[i_timebin] = bayes_rule(
-                    t_doc_topic, t_dtm
-                )
+            elif feature_importance == "npmi":
+                self.temporal_components_[i_timebin] = npmi(t_doc_topic, t_dtm)
             elif feature_importance == "fighting-words":
                 self.temporal_components_[i_timebin] = fighting_words(
                     t_doc_topic, t_dtm
```
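For orientation, this is roughly how the renamed option would be selected from user code. The `feature_importance="npmi"` value and the `ClusteringTopicModel` name come from the diff above; the corpus loading and the `print_topics()` call are assumptions about the surrounding turftopic API, so treat this as a sketch rather than the documented usage.

```python
# Illustrative sketch: choosing the new "npmi" term-importance method.
# feature_importance="npmi" comes from the diff; the corpus and print_topics()
# are assumptions about the surrounding turftopic API.
from sklearn.datasets import fetch_20newsgroups

from turftopic import ClusteringTopicModel

corpus = fetch_20newsgroups(
    subset="all", remove=("headers", "footers", "quotes")
).data

model = ClusteringTopicModel(feature_importance="npmi")
model.fit(corpus)
model.print_topics()

# The docstring changes above also suggest the same option is accepted when
# re-estimating components after fitting (estimate_components /
# estimate_temporal_components), without re-running the clustering itself.
```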
