Skip to content

Commit 59795c2

Browse files
authored
Merge pull request #22 from rbroc/multilingual-keynmf
Implement multilingual KeyNMF
2 parents b408540 + 0a2bf70 commit 59795c2

3 files changed

Lines changed: 23 additions & 8 deletions

File tree

docs/KeyNMF.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,10 @@ Keywords are assigned to each document based on the cosine similarity of the doc
1919
Only the top K words with positive cosine similarity to the document are kept.
2020

2121
These keywords are then arranged into a document-term importance matrix where each column represents a keyword that was encountered in at least one document,
22-
and each row is a document.
23-
The entries in the matrix are the cosine similarities of the given keyword to the document in semantic space.
22+
and each row is a document. The entries in the matrix are the cosine similarities of the given keyword to the document in semantic space.
23+
24+
Keyword extraction can be performed by computing cosine similarities between document embeddings and embeddings of the entire vocabulary,
25+
or between document embeddings and words that occur within each document. The former scenario allows for multilingual topics.
2426

2527
### 2. Topic Discovery
2628

@@ -39,7 +41,6 @@ can be explained.
3941

4042
### Weaknesses
4143

42-
- Lack of Multilingual Capabilities: KeyNMF as it is currently implemented cannot be used in a multilingual context. Changes to the model that allow this are possible, and will likely be implemented in the future.
4344
- Lack of Nuance: Since only the top K keywords are considered and used for topic extraction, some of the nuances, especially in long texts, might get lost. We therefore recommend that you scale K with the average length of the texts you're working with. For tweets it might be worth it to scale it down to 5, while with longer documents, a larger number (let's say 50) might be advisable.
4445
- Practitioners have to choose the number of topics a priori.
4546

tests/test_integration.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@
2929
models = [
3030
GMM(5, encoder=trf),
3131
SemanticSignalSeparation(5, encoder=trf),
32-
KeyNMF(5, encoder=trf),
32+
KeyNMF(5, encoder=trf, keyword_scope='document'),
33+
KeyNMF(5, encoder=trf, keyword_scope='corpus'),
3334
ClusteringTopicModel(
3435
n_reduce_to=5,
3536
feature_importance="c-tf-idf",

turftopic/models/keynmf.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,11 @@ class KeyNMF(ContextualModel):
7474
Can be used to prune or filter the vocabulary.
7575
top_n: int, default 25
7676
Number of keywords to extract for each document.
77+
keyword_scope: str, default 'document'
78+
Specifies whether keyword extraction for each document
79+
is performed on the whole vocabulary ('corpus') or only
80+
using words that are included in the document ('document').
81+
Setting this to 'corpus' allows for multilingual topics.
7782
"""
7883

7984
def __init__(
@@ -84,7 +89,10 @@ def __init__(
8489
] = "sentence-transformers/all-MiniLM-L6-v2",
8590
vectorizer: Optional[CountVectorizer] = None,
8691
top_n: int = 25,
92+
keyword_scope: str = 'document',
8793
):
94+
if keyword_scope not in ['document', 'corpus']:
95+
raise ValueError("keyword_scope must be 'document' or 'corpus'")
8896
self.n_components = n_components
8997
self.top_n = top_n
9098
self.encoder = encoder
@@ -98,6 +106,7 @@ def __init__(
98106
self.vectorizer = vectorizer
99107
self.dict_vectorizer_ = DictVectorizer()
100108
self.nmf_ = NMF(n_components)
109+
self.keyword_scope = keyword_scope
101110

102111
def extract_keywords(
103112
self,
@@ -114,11 +123,15 @@ def extract_keywords(
114123
for i in range(total):
115124
terms = document_term_matrix[i, :].todense()
116125
embedding = embeddings[i].reshape(1, -1)
117-
nonzero = terms > 0
118-
if not np.any(nonzero):
126+
if self.keyword_scope == 'document':
127+
mask = terms > 0
128+
else:
129+
tot_freq = document_term_matrix.sum(axis=0)
130+
mask = tot_freq != 0
131+
if not np.any(mask):
119132
keywords.append(dict())
120133
continue
121-
important_terms = np.squeeze(np.asarray(nonzero))
134+
important_terms = np.squeeze(np.asarray(mask))
122135
word_embeddings = self.vocab_embeddings[important_terms]
123136
sim = cosine_similarity(embedding, word_embeddings)
124137
sim = np.ravel(sim)
@@ -272,7 +285,7 @@ def prepare_topic_data(
272285
except (NotFittedError, AttributeError):
273286
doc_topic_matrix = self.nmf_.fit_transform(dtm)
274287
self.components_ = self.nmf_.components_
275-
console.log("Model fiting done.")
288+
console.log("Model fitting done.")
276289
res: TopicData = {
277290
"corpus": corpus,
278291
"document_term_matrix": dtm,

0 commit comments

Comments
 (0)