Skip to content

Commit 7247618

Browse files
committed
do not include words that never occur in the corpus
1 parent a133446 commit 7247618

1 file changed

Lines changed: 6 additions & 5 deletions

File tree

turftopic/models/keynmf.py

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -125,11 +125,12 @@ def extract_keywords(
125125
embedding = embeddings[i].reshape(1, -1)
126126
if self.keyword_scope == 'document':
127127
mask = terms > 0
128-
if not np.any(mask):
129-
keywords.append(dict())
130-
continue
131128
else:
132-
mask = np.ones(shape=terms.shape, dtype=bool)
129+
tot_freq = document_term_matrix.sum(axis=0)
130+
mask = tot_freq != 0
131+
if not np.any(mask):
132+
keywords.append(dict())
133+
continue
133134
important_terms = np.squeeze(np.asarray(mask))
134135
word_embeddings = self.vocab_embeddings[important_terms]
135136
sim = cosine_similarity(embedding, word_embeddings)
@@ -284,7 +285,7 @@ def prepare_topic_data(
284285
except (NotFittedError, AttributeError):
285286
doc_topic_matrix = self.nmf_.fit_transform(dtm)
286287
self.components_ = self.nmf_.components_
287-
console.log("Model fiting done.")
288+
console.log("Model fitting done.")
288289
res: TopicData = {
289290
"corpus": corpus,
290291
"document_term_matrix": dtm,

0 commit comments

Comments (0)