Skip to content

Commit 59795c2

Browse files
authored
Merge pull request #22 from rbroc/multilingual-keynmf
Implement multilingual KeyNMF
2 parents b408540 + 0a2bf70 commit 59795c2

3 files changed

Lines changed: 23 additions & 8 deletions

File tree

docs/KeyNMF.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ Keywords are assigned to each document based on the cosine similarity of the doc
1919
Only the top K words with positive cosine similarity to the document are kept.
2020

2121
These keywords are then arranged into a document-term importance matrix where each column represents a keyword that was encountered in at least one document,
22-
and each row is a document.
23-
The entries in the matrix are the cosine similarities of the given keyword to the document in semantic space.
22+
and each row is a document. The entries in the matrix are the cosine similarities of the given keyword to the document in semantic space.
23+
24+
Keyword extraction can be performed by computing cosine similarities between document embeddings and embeddings of the entire vocabulary,
25+
or between document embeddings and words that occur within each document. The former scenario allows for multilingual topics.
2426

2527
### 2. Topic Discovery
2628

@@ -39,7 +41,6 @@ can be explained.
3941

4042
### Weaknesses
4143

42-
- Lack of Multilingual Capabilities: KeyNMF as it is currently implemented cannot be used in a multilingual context. Changes to the model that allow this are possible, and will likely be implemented in the future.
4344
- Lack of Nuance: Since only the top K keywords are considered and used for topic extraction, some of the nuances, especially in long texts, might get lost. We therefore recommend that you scale K with the average length of the texts you're working with. For tweets it might be worth it to scale it down to 5, while with longer documents, a larger number (let's say 50) might be advisable.
4445
- Practitioners have to choose the number of topics a priori.
4546

tests/test_integration.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
models = [
3030
GMM(5, encoder=trf),
3131
SemanticSignalSeparation(5, encoder=trf),
32-
KeyNMF(5, encoder=trf),
32+
KeyNMF(5, encoder=trf, keyword_scope='document'),
33+
KeyNMF(5, encoder=trf, keyword_scope='corpus'),
3334
ClusteringTopicModel(
3435
n_reduce_to=5,
3536
feature_importance="c-tf-idf",

turftopic/models/keynmf.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ class KeyNMF(ContextualModel):
7474
Can be used to prune or filter the vocabulary.
7575
top_n: int, default 25
7676
Number of keywords to extract for each document.
77+
keyword_scope: str, default 'document'
78+
Specifies whether keyword extraction for each document
79+
is performed on the whole vocabulary ('corpus') or only
80+
using words that are included in the document ('document').
81+
Setting this to 'corpus' allows for multilingual topics.
7782
"""
7883

7984
def __init__(
@@ -84,7 +89,10 @@ def __init__(
8489
] = "sentence-transformers/all-MiniLM-L6-v2",
8590
vectorizer: Optional[CountVectorizer] = None,
8691
top_n: int = 25,
92+
keyword_scope: str = 'document',
8793
):
94+
if keyword_scope not in ['document', 'corpus']:
95+
raise ValueError("keyword_scope must be 'document' or 'corpus'")
8896
self.n_components = n_components
8997
self.top_n = top_n
9098
self.encoder = encoder
@@ -98,6 +106,7 @@ def __init__(
98106
self.vectorizer = vectorizer
99107
self.dict_vectorizer_ = DictVectorizer()
100108
self.nmf_ = NMF(n_components)
109+
self.keyword_scope = keyword_scope
101110

102111
def extract_keywords(
103112
self,
@@ -114,11 +123,15 @@ def extract_keywords(
114123
for i in range(total):
115124
terms = document_term_matrix[i, :].todense()
116125
embedding = embeddings[i].reshape(1, -1)
117-
nonzero = terms > 0
118-
if not np.any(nonzero):
126+
if self.keyword_scope == 'document':
127+
mask = terms > 0
128+
else:
129+
tot_freq = document_term_matrix.sum(axis=0)
130+
mask = tot_freq != 0
131+
if not np.any(mask):
119132
keywords.append(dict())
120133
continue
121-
important_terms = np.squeeze(np.asarray(nonzero))
134+
important_terms = np.squeeze(np.asarray(mask))
122135
word_embeddings = self.vocab_embeddings[important_terms]
123136
sim = cosine_similarity(embedding, word_embeddings)
124137
sim = np.ravel(sim)
@@ -272,7 +285,7 @@ def prepare_topic_data(
272285
except (NotFittedError, AttributeError):
273286
doc_topic_matrix = self.nmf_.fit_transform(dtm)
274287
self.components_ = self.nmf_.components_
275-
console.log("Model fiting done.")
288+
console.log("Model fitting done.")
276289
res: TopicData = {
277290
"corpus": corpus,
278291
"document_term_matrix": dtm,

0 commit comments

Comments
 (0)