Skip to content

Commit e282104

Browse files
Added seed phrases to KeyNMF
1 parent 5dc7c7a commit e282104

2 files changed

Lines changed: 28 additions & 7 deletions

File tree

turftopic/models/_keynmf.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def batch_extract_keywords(
120120
self,
121121
documents: list[str],
122122
embeddings: Optional[np.ndarray] = None,
123+
seed_embedding: Optional[np.ndarray] = None,
123124
) -> list[dict[str, float]]:
124125
if not len(documents):
125126
return []
@@ -142,6 +143,16 @@ def batch_extract_keywords(
142143
if len(new_terms):
143144
self._add_terms(new_terms)
144145
total = embeddings.shape[0]
146+
# Relevance based on similarity to seed embedding
147+
document_relevance = None
148+
if seed_embedding is not None:
149+
if self.metric == "cosine":
150+
document_relevance = cosine_similarity(
151+
[seed_embedding], embeddings
152+
)[0]
153+
else:
154+
document_relevance = np.dot(embeddings, seed_embedding)
155+
document_relevance[document_relevance < 0] = 0
145156
for i in range(total):
146157
terms = document_term_matrix[i, :].todense()
147158
embedding = embeddings[i].reshape(1, -1)
@@ -162,14 +173,13 @@ def batch_extract_keywords(
162173
)
163174
)
164175
if self.metric == "cosine":
165-
sim = cosine_similarity(embedding, word_embeddings).astype(
166-
np.float64
167-
)
176+
sim = cosine_similarity(embedding, word_embeddings)
168177
sim = np.ravel(sim)
169178
else:
170-
sim = np.dot(word_embeddings, embedding[0]).T.astype(
171-
np.float64
172-
)
179+
sim = np.dot(word_embeddings, embedding[0]).T
180+
# If a seed is specified, we multiply by the document's relevance
181+
if document_relevance is not None:
182+
sim = document_relevance[i] * sim
173183
kth = min(self.top_n, len(sim) - 1)
174184
top = np.argpartition(-sim, kth)[:kth]
175185
top_words = batch_vocab[important_terms][top]

turftopic/models/keynmf.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ class KeyNMF(ContextualModel, DynamicTopicModel):
4949
Random state to use so that results are exactly reproducible.
5050
metric: "cosine" or "dot", default "cosine"
5151
Similarity metric to use for keyword extraction.
52+
seed_phrase: str, default None
53+
Describes an aspect of the corpus that the model should explore.
54+
It can be a free-text query, such as
55+
"Christian Denominations: Protestantism and Catholicism"
5256
"""
5357

5458
def __init__(
@@ -61,6 +65,7 @@ def __init__(
6165
top_n: int = 25,
6266
random_state: Optional[int] = None,
6367
metric: Literal["cosine", "dot"] = "cosine",
68+
seed_phrase: Optional[str] = None,
6469
):
6570
self.random_state = random_state
6671
self.n_components = n_components
@@ -85,6 +90,10 @@ def __init__(
8590
encoder=self.encoder_,
8691
metric=self.metric,
8792
)
93+
self.seed_phrase = seed_phrase
94+
self.seed_embedding = None
95+
if self.seed_phrase is not None:
96+
self.seed_embedding = self.encoder_.encode([self.seed_phrase])[0]
8897

8998
def extract_keywords(
9099
self,
@@ -103,7 +112,9 @@ def extract_keywords(
103112
if isinstance(batch_or_document, str):
104113
batch_or_document = [batch_or_document]
105114
return self.extractor.batch_extract_keywords(
106-
batch_or_document, embeddings=embeddings
115+
batch_or_document,
116+
embeddings=embeddings,
117+
seed_embedding=self.seed_embedding,
107118
)
108119

109120
def vectorize(

0 commit comments

Comments (0)