Merge pull request #125 from x-tabdeveloping/sentiment_arc

x-tabdeveloping · web-flow · commit e707316da48d · 2026-04-01T12:18:44.000+02:00
Concept Vector Projection
diff --git a/docs/cvp.md b/docs/cvp.md
@@ -0,0 +1,79 @@
+# Concept Vector Projection
+
+Concept Vector Projection is an embedding-based method for extracting continuous sentiment (or other) scores from free-text documents.
+
+<figure>
+  <img src="../images/cvp.png" title="" style="width:1050px;padding:0px;border:none;">
+  <figcaption> Figure 1: Schematic Overview of Concept Vector Projection.<br> <i>Figure from Lyngbæk et al. (2025)</i> </figcaption>
+</figure>
+
+The method rests on the idea that one can construct a _concept vector_ by encoding positive and negative _seed phrases_ with a transformer, averaging the embeddings within each group, and then taking the difference of the two mean vectors.
+We can then project other documents' embeddings onto this concept vector by taking their dot product with it, which gives a continuous score of how strongly each document relates to the given concept.
+
+## Usage
+
+### Single Concept
+
+When projecting onto a single concept, you should specify the seeds as a tuple of positive and negative phrases.
+
+```python
+from turftopic import ConceptVectorProjection
+
+positive = [
+    "I love this product",
+    "This is absolutely lovely",
+    "My daughter is going to adore this"
+]
+negative = [
+    "This product is not at all as advertised, I'm very displeased",
+    "I hate this",
+    "What a horrible way to deal with people"
+]
+cvp = ConceptVectorProjection(seeds=(positive, negative))
+
+test_documents = ["My cute little doggy", "Few this is digusting"]
+doc_concept_matrix = cvp.transform(test_documents)
+print(doc_concept_matrix)
+```
+
+```python
+[[0.24265897]
+ [0.01709663]]
+```
+
+### Multiple Concepts
+
+When projecting documents onto multiple concepts at once, you will need to specify the seeds for each concept, as well as its name.
+Internally this is handled with an `OrderedDict`, which you can either pass yourself, or you can pass a list of `(name, seeds)` pairs and Turftopic will build it for you:
+
+```python
+import pandas as pd
+from collections import OrderedDict
+
+cuteness_seeds = (["Absolutely adorable", "I love how he dances with his little feet"], ["What a big slob of an abomination", "A suspicious old man sat next to me on the bus today"])
+bullish_seeds = (["We are going to the moon", "This stock will prove an incredible investment"], ["I will short the hell out of them", "Uber stocks drop 7% in value after down-time."])
+
+# Either specify it like this:
+seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]
+# or as an OrderedDict:
+seeds = OrderedDict([("cuteness", cuteness_seeds), ("bullish", bullish_seeds)])
+cvp = ConceptVectorProjection(seeds=seeds)
+
+test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"]
+doc_concept_matrix = cvp.transform(test_documents)
+concept_df = pd.DataFrame(doc_concept_matrix, columns=cvp.get_feature_names_out())
+print(concept_df)
+```
+
+```python
+   cuteness   bullish
+0  0.085957  0.288779
+1  0.269454  0.009495
+```
+
+## API Reference
+
+
+::: turftopic.models.cvp.ConceptVectorProjection
+
+
diff --git a/docs/images/cvp.png b/docs/images/cvp.png
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -34,6 +34,8 @@ nav:
     - Clustering Models (BERTopic & Top2Vec): clustering.md
     - Autoencoding Models (ZeroShotTM & CombinedTM): ctm.md
     - FASTopic: FASTopic.md
+  - Other Models (e.g. Sentiment Analysis):
+    - Concept Vector Projection (Continuous Sentiment Scoring): cvp.md
   - Embedding Models: encoders.md
   - Vectorizers (Term extraction): vectorizers.md
   - Topic Analysis and Naming with LLMs: analyzers.md
diff --git a/tests/test_cvp.py b/tests/test_cvp.py
@@ -0,0 +1,25 @@
+def test_cvp():
+    from turftopic import ConceptVectorProjection
+
+    cuteness_seeds = (
+        ["Absolutely adorable", "I love how he dances with his little feet"],
+        [
+            "What a big slob of an abomination",
+            "A suspicious old man sat next to me on the bus today",
+        ],
+    )
+    bullish_seeds = (
+        [
+            "We are going to the moon",
+            "This stock will prove an incredible investment",
+        ],
+        [
+            "I will short the hell out of them",
+            "Uber stocks drop 7% in value after down-time.",
+        ],
+    )
+    seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]
+    cvp = ConceptVectorProjection(seeds=seeds)
+    test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"]
+    doc_concept_matrix = cvp.transform(test_documents)
+    assert doc_concept_matrix.shape == (2, 2)
diff --git a/turftopic/__init__.py b/turftopic/__init__.py
@@ -3,6 +3,7 @@
 from turftopic.base import ContextualModel
 from turftopic.error import NotInstalled
 from turftopic.models.cluster import BERTopic, ClusteringTopicModel, Top2Vec
+from turftopic.models.cvp import ConceptVectorProjection
 from turftopic.models.decomp import S3, SemanticSignalSeparation
 from turftopic.models.fastopic import FASTopic
 from turftopic.models.gmm import GMM
@@ -34,4 +35,5 @@
     "create_concept_browser",
     "S3",
     "SensTopic",
+    "ConceptVectorProjection",
 ]
diff --git a/turftopic/encoders/utils.py b/turftopic/encoders/utils.py
@@ -18,43 +18,63 @@ def batched(iterable, n: int) -> Iterable[List[str]]:
 
 def encode_chunks(
     encoder,
-    sentences,
+    texts,
     batch_size=64,
     window_size=50,
     step_size=40,
-    return_chunks=False,
-    show_progress_bar=False,
 ):
-    chunks = []
+    """
+    Returns
+    -------
+    chunk_embeddings: list[np.ndarray]
+        Embedding matrix of chunks for each document.
+    chunk_positions: list[list[tuple[int, int]]]
+        List of start and end character index of chunks for each document.
+    """
+    chunk_positions = []
     chunk_embeddings = []
     for start_index in trange(
         0,
-        len(sentences),
+        len(texts),
         batch_size,
         desc="Encoding batches...",
-        disable=not show_progress_bar,
     ):
-        batch = sentences[start_index : start_index + batch_size]
+        batch = texts[start_index : start_index + batch_size]
         features = encoder.tokenize(batch)
         with torch.no_grad():
             output_features = encoder.forward(features)
         n_tokens = output_features["attention_mask"].sum(axis=1)
+        # Find first nonzero elements in each document
+        # The document could be padded from the left, so we have to watch out for this.
+        start_token = torch.argmax(
+            (output_features["attention_mask"] > 0).to(torch.long), axis=1
+        )
+        end_token = start_token + n_tokens
         for i_doc in range(len(batch)):
-            for chunk_start in range(0, n_tokens[i_doc], step_size):
-                chunk_end = min(chunk_start + window_size, n_tokens[i_doc])
+            _chunk_embeddings = []
+            _chunk_positions = []
+            for chunk_start in range(
+                start_token[i_doc], end_token[i_doc], step_size
+            ):
+                chunk_end = min(chunk_start + window_size, end_token[i_doc])
                 _emb = output_features["token_embeddings"][
                     i_doc, chunk_start:chunk_end, :
                 ].mean(axis=0)
-                chunk_embeddings.append(_emb)
-                if return_chunks:
-                    chunks.append(
-                        encoder.tokenizer.decode(
-                            features["input_ids"][i_doc, chunk_start:chunk_end]
-                        )
-                        .replace("[CLS]", "")
-                        .replace("[SEP]", "")
+                _chunk_embeddings.append(_emb)
+                chunk_text = (
+                    encoder.tokenizer.decode(
+                        features["input_ids"][i_doc, chunk_start:chunk_end],
+                        skip_special_tokens=True,
                     )
-    if not return_chunks:
-        chunks = None
-    chunk_embeddings = np.stack(chunk_embeddings)
-    return chunk_embeddings, chunks
+                    .replace("[CLS]", "")
+                    .replace("[SEP]", "")
+                    .strip()
+                )
+                doc_text = texts[start_index + i_doc]
+                start_char = doc_text.find(chunk_text)
+                end_char = start_char + len(chunk_text)
+                _chunk_positions.append((start_char, end_char))
+            _chunk_embeddings = np.stack(_chunk_embeddings)
+            chunk_embeddings.append(_chunk_embeddings)
+            chunk_positions.append(_chunk_positions)
+    return chunk_embeddings, chunk_positions
diff --git a/turftopic/models/cvp.py b/turftopic/models/cvp.py
@@ -0,0 +1,149 @@
+import json
+import tempfile
+from collections import OrderedDict
+from pathlib import Path
+from typing import Union
+
+import joblib
+import numpy as np
+from huggingface_hub import HfApi
+from sentence_transformers import SentenceTransformer
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from turftopic.base import Encoder
+from turftopic.encoders.multimodal import MultimodalEncoder
+from turftopic.serialization import create_readme, get_package_versions
+
+Seeds = tuple[list[str], list[str]]
+
+
+class ConceptVectorProjection(BaseEstimator, TransformerMixin):
+    """Concept Vector Projection model from [Lyngbæk et al. (2025)](https://doi.org/10.63744/nVu1Zq5gRkuD)
+    Can be used to project document embeddings onto the difference vector between the mean embeddings of positive and negative seed phrases.
+    The primary use case is sentiment analysis with continuous sentiment scores,
+    especially for languages where dedicated models are not available.
+
+    Parameters
+    ----------
+    seeds: (list[str], list[str]) or list of (str, (list[str], list[str]))
+        If you want to project to a single concept, then
+        a tuple of (list of positive terms, list of negative terms). <br>
+        If there are multiple concepts, they should be specified as (name, Seeds) tuples in a list.
+        Alternatively, seeds can be an OrderedDict with the names of the concepts being the keys,
+        and the tuples of positive and negative seeds as the values.
+    encoder: str or SentenceTransformer
+        Model to produce document representations, paraphrase-multilingual-mpnet-base-v2 is the default
+        per Lyngbæk et al. (2025).
+    """
+
+    def __init__(
+        self,
+        seeds: Union[Seeds, list[tuple[str, Seeds]], OrderedDict[str, Seeds]],
+        encoder: Union[
+            Encoder, str, MultimodalEncoder
+        ] = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
+    ):
+        self.seeds = seeds
+        if isinstance(seeds, OrderedDict):
+            self._seeds = seeds
+        elif (
+            (len(seeds) == 2)
+            and (isinstance(seeds, tuple))
+            and (isinstance(seeds[0][0], str))
+        ):
+            self._seeds = OrderedDict([("default", seeds)])
+        else:
+            self._seeds = OrderedDict(seeds)
+        self.encoder = encoder
+        if isinstance(encoder, str):
+            self.encoder_ = SentenceTransformer(encoder)
+        else:
+            self.encoder_ = encoder
+        self.classes_ = np.array([name for name in self._seeds])
+        self.concept_matrix_ = []
+        for _, (positive, negative) in self._seeds.items():
+            positive_emb = self.encoder_.encode(positive)
+            negative_emb = self.encoder_.encode(negative)
+            cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0)
+            self.concept_matrix_.append(cv / np.linalg.norm(cv))
+        self.concept_matrix_ = np.stack(self.concept_matrix_)
+
+    def get_feature_names_out(self):
+        """Returns concept names in an array."""
+        return self.classes_
+
+    def fit_transform(self, raw_documents=None, y=None, embeddings=None):
+        """Project documents onto the concept vectors.
+
+        Parameters
+        ----------
+        raw_documents: list[str] or None
+            List of documents to project to the concept vectors.
+        embeddings: ndarray of shape (n_documents, n_dimensions)
+            Document embeddings (has to be created with the same encoder as the concept vectors.)
+
+        Returns
+        -------
+        document_concept_matrix: ndarray of shape (n_documents, n_concepts)
+            Prevalence of each concept in each document.
+        """
+        if (raw_documents is None) and (embeddings is None):
+            raise ValueError(
+                "Either embeddings or raw_documents has to be passed, both are None."
+            )
+        if embeddings is None:
+            embeddings = self.encoder_.encode(raw_documents)
+        return embeddings @ self.concept_matrix_.T
+
+    def transform(self, raw_documents=None, embeddings=None):
+        """Project documents onto the concept vectors.
+
+        Parameters
+        ----------
+        raw_documents: list[str] or None
+            List of documents to project to the concept vectors.
+        embeddings: ndarray of shape (n_documents, n_dimensions)
+            Document embeddings (has to be created with the same encoder as the concept vectors.)
+
+        Returns
+        -------
+        document_concept_matrix: ndarray of shape (n_documents, n_concepts)
+            Prevalence of each concept in each document.
+        """
+        return self.fit_transform(raw_documents, embeddings=embeddings)
+
+    def to_disk(self, out_dir: Union[Path, str]):
+        """Persists model to directory on your machine.
+
+        Parameters
+        ----------
+        out_dir: Path | str
+            Directory to save the model to.
+        """
+        out_dir = Path(out_dir)
+        out_dir.mkdir(exist_ok=True)
+        package_versions = get_package_versions()
+        with out_dir.joinpath("package_versions.json").open("w") as ver_file:
+            ver_file.write(json.dumps(package_versions))
+        joblib.dump(self, out_dir.joinpath("model.joblib"))
+
+    def push_to_hub(self, repo_id: str):
+        """Uploads model to HuggingFace Hub
+
+        Parameters
+        ----------
+        repo_id: str
+            Repository to upload the model to.
+        """
+        api = HfApi()
+        api.create_repo(repo_id, exist_ok=True)
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            readme_path = Path(tmp_dir).joinpath("README.md")
+            with readme_path.open("w") as readme_file:
+                readme_file.write(create_readme(self, repo_id))
+            self.to_disk(tmp_dir)
+            api.upload_folder(
+                folder_path=tmp_dir,
+                repo_id=repo_id,
+                repo_type="model",
+            )
diff --git a/turftopic/vectorizers/phrases.py b/turftopic/vectorizers/phrases.py