Skip to content

Commit 34df185

Browse files
Added docstring for NPMI
1 parent 9dbe830 commit 34df185

1 file changed

Lines changed: 23 additions & 0 deletions

File tree

turftopic/feature_importance.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,29 @@ def npmi(
212212
doc_term_matrix: spr.csr_matrix,
213213
smoothing: int = 5,
214214
) -> np.ndarray:
215+
"""Uses normalized pointwise mutual information between
216+
clusters and terms to calculate term importance scores.
217+
218+
To avoid underestimating individual words' occurrences (overfitting),
219+
a smoothing term is added, which is mathematically equivalent
220+
to using the MAP estimate of a symmetric Dirichlet-multinomial model.
221+
222+
Parameters
223+
----------
224+
doc_topic_matrix: np.ndarray
225+
Document-topic matrix of shape (n_documents, n_topics)
226+
doc_term_matrix: spr.csr_matrix
227+
Document-term matrix of shape (n_documents, vocab_size)
228+
smoothing: int, default 5
229+
Alpha parameter of the symmetric Dirichlet-multinomial.
230+
Corresponds to assuming that each term and cluster
231+
occurred this many more times than was actually observed.
232+
233+
Returns
234+
-------
235+
ndarray of shape (n_topics, vocab_size)
236+
Term importance matrix.
237+
"""
215238
eps = np.finfo(float).eps
216239
p_w = np.squeeze(np.asarray(doc_term_matrix.sum(axis=0))) + smoothing
217240
p_w = p_w / p_w.sum()

0 commit comments

Comments
 (0)