
Commit e707316

Merge pull request #125 from x-tabdeveloping/sentiment_arc: Concept Vector Projection

2 parents d6db72f + 05de799
8 files changed: 380 additions & 21 deletions

docs/cvp.md

Lines changed: 79 additions & 0 deletions (new file)
# Concept Vector Projection

Concept Vector Projection is an embedding-based method for extracting continuous sentiment (or other concept) scores from free-text documents.

<figure>
<img src="../images/cvp.png" title="" style="width:1050px;padding:0px;border:none;">
<figcaption> Figure 1: Schematic Overview of Concept Vector Projection.<br> <i>Figure from Lyngbæk et al. (2025)</i> </figcaption>
</figure>

The method rests on the idea that one can construct a _concept vector_ by encoding positive and negative _seed phrases_ with a transformer and taking the difference of their mean embeddings.
We can then project other documents' embeddings onto this concept vector by taking the dot product, which yields a continuous score for how related each document is to the concept.
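The arithmetic is compact enough to sketch by hand. Below is a minimal illustration with `sentence-transformers` and `numpy`, assuming the model's default encoder; the variable names are ours and not part of the Turftopic API:

```python
import numpy as np
from sentence_transformers import SentenceTransformer

# Default encoder used by ConceptVectorProjection
encoder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

positive = ["I love this product", "This is absolutely lovely"]
negative = ["I hate this", "What a horrible way to deal with people"]

# Concept vector: difference of the mean seed embeddings, normalized to unit length
concept = encoder.encode(positive).mean(axis=0) - encoder.encode(negative).mean(axis=0)
concept = concept / np.linalg.norm(concept)

# Projection: dot product of each document embedding with the concept vector
documents = ["My cute little doggy", "This is an utter disappointment"]
scores = encoder.encode(documents) @ concept  # one continuous score per document
```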
## Usage

### Single Concept

When projecting onto a single concept, you should specify the seeds as a tuple of positive and negative phrases.
```python
from turftopic import ConceptVectorProjection

positive = [
    "I love this product",
    "This is absolutely lovely",
    "My daughter is going to adore this",
]
negative = [
    "This product is not at all as advertised, I'm very displeased",
    "I hate this",
    "What a horrible way to deal with people",
]
cvp = ConceptVectorProjection(seeds=(positive, negative))

test_documents = ["My cute little doggy", "Eww, this is disgusting"]
doc_concept_matrix = cvp.transform(test_documents)
print(doc_concept_matrix)
```

```python
[[0.24265897]
 [0.01709663]]
```
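`transform` also accepts precomputed embeddings through the `embeddings` keyword (see the API reference below); they must come from the same encoder the concept vectors were built with. A minimal sketch reusing `cvp` and `test_documents` from above:

```python
from sentence_transformers import SentenceTransformer

# Must be the same encoder the concept vectors were built with
# (this is the model's default)
encoder = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)
embeddings = encoder.encode(test_documents)
doc_concept_matrix = cvp.transform(embeddings=embeddings)
```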
### Multiple Concepts

When projecting documents onto multiple concepts at once, you will need to specify seeds for each concept, along with a name for each.
Internally this is handled with an `OrderedDict`, which you can either construct yourself or let Turftopic build for you:
```python
import pandas as pd
from collections import OrderedDict

cuteness_seeds = (
    ["Absolutely adorable", "I love how he dances with his little feet"],
    ["What a big slob of an abomination", "A suspicious old man sat next to me on the bus today"],
)
bullish_seeds = (
    ["We are going to the moon", "This stock will prove an incredible investment"],
    ["I will short the hell out of them", "Uber stocks drop 7% in value after down-time."],
)

# Either specify the seeds like this:
seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]
# or as an OrderedDict:
seeds = OrderedDict([("cuteness", cuteness_seeds), ("bullish", bullish_seeds)])
cvp = ConceptVectorProjection(seeds=seeds)

test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"]
doc_concept_matrix = cvp.transform(test_documents)
concept_df = pd.DataFrame(doc_concept_matrix, columns=cvp.get_feature_names_out())
print(concept_df)
```

```python
   cuteness   bullish
0  0.085957  0.288779
1  0.269454  0.009495
```

## API Reference

::: turftopic.models.cvp.ConceptVectorProjection

docs/images/cvp.png

93.8 KB (new binary file)
mkdocs.yml

Lines changed: 2 additions & 0 deletions

```diff
@@ -34,6 +34,8 @@ nav:
   - Clustering Models (BERTopic & Top2Vec): clustering.md
   - Autoencoding Models (ZeroShotTM & CombinedTM): ctm.md
   - FASTopic: FASTopic.md
+  - Other Models (e.g. Sentiment Analysis):
+      - Concept Vector Projection (Continuous Sentiment Scoring): cvp.md
   - Embedding Models: encoders.md
   - Vectorizers (Term extraction): vectorizers.md
   - Topic Analysis and Naming with LLMs: analyzers.md
```
tests/test_cvp.py

Lines changed: 25 additions & 0 deletions (new file)

```python
def test_cvp():
    from turftopic import ConceptVectorProjection

    cuteness_seeds = (
        ["Absolutely adorable", "I love how he dances with his little feet"],
        [
            "What a big slob of an abomination",
            "A suspicious old man sat next to me on the bus today",
        ],
    )
    bullish_seeds = (
        [
            "We are going to the moon",
            "This stock will prove an incredible investment",
        ],
        [
            "I will short the hell out of them",
            "Uber stocks drop 7% in value after down-time.",
        ],
    )
    seeds = [("cuteness", cuteness_seeds), ("bullish", bullish_seeds)]
    cvp = ConceptVectorProjection(seeds=seeds)
    test_documents = ["What an awesome investment", "Tiny beautiful kitty-cat"]
    doc_concept_matrix = cvp.transform(test_documents)
    assert doc_concept_matrix.shape == (2, 2)
```
turftopic/__init__.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -3,6 +3,7 @@
 from turftopic.base import ContextualModel
 from turftopic.error import NotInstalled
 from turftopic.models.cluster import BERTopic, ClusteringTopicModel, Top2Vec
+from turftopic.models.cvp import ConceptVectorProjection
 from turftopic.models.decomp import S3, SemanticSignalSeparation
 from turftopic.models.fastopic import FASTopic
 from turftopic.models.gmm import GMM
@@ -34,4 +35,5 @@
     "create_concept_browser",
     "S3",
     "SensTopic",
+    "ConceptVectorProjection",
 ]
```

turftopic/encoders/utils.py

Lines changed: 41 additions & 21 deletions

```diff
@@ -18,43 +18,63 @@ def batched(iterable, n: int) -> Iterable[List[str]]:
 
 def encode_chunks(
     encoder,
-    sentences,
+    texts,
     batch_size=64,
     window_size=50,
     step_size=40,
-    return_chunks=False,
-    show_progress_bar=False,
 ):
-    chunks = []
+    """
+    Returns
+    -------
+    chunk_embeddings: list[np.ndarray]
+        Embedding matrix of chunks for each document.
+    chunk_positions: list[list[tuple[int, int]]]
+        List of start and end character indices of chunks for each document.
+    """
+    chunk_positions = []
     chunk_embeddings = []
     for start_index in trange(
         0,
-        len(sentences),
+        len(texts),
         batch_size,
         desc="Encoding batches...",
-        disable=not show_progress_bar,
     ):
-        batch = sentences[start_index : start_index + batch_size]
+        batch = texts[start_index : start_index + batch_size]
         features = encoder.tokenize(batch)
         with torch.no_grad():
             output_features = encoder.forward(features)
         n_tokens = output_features["attention_mask"].sum(axis=1)
+        # Find first nonzero elements in each document.
+        # The document could be padded from the left, so we have to watch out for this.
+        start_token = torch.argmax(
+            (output_features["attention_mask"] > 0).to(torch.long), axis=1
+        )
+        end_token = start_token + n_tokens
         for i_doc in range(len(batch)):
-            for chunk_start in range(0, n_tokens[i_doc], step_size):
-                chunk_end = min(chunk_start + window_size, n_tokens[i_doc])
+            _chunk_embeddings = []
+            _chunk_positions = []
+            for chunk_start in range(
+                start_token[i_doc], end_token[i_doc], step_size
+            ):
+                chunk_end = min(chunk_start + window_size, end_token[i_doc])
                 _emb = output_features["token_embeddings"][
                     i_doc, chunk_start:chunk_end, :
                 ].mean(axis=0)
-                chunk_embeddings.append(_emb)
-                if return_chunks:
-                    chunks.append(
-                        encoder.tokenizer.decode(
-                            features["input_ids"][i_doc, chunk_start:chunk_end]
-                        )
-                        .replace("[CLS]", "")
-                        .replace("[SEP]", "")
+                _chunk_embeddings.append(_emb)
+                chunk_text = (
+                    encoder.tokenizer.decode(
+                        features["input_ids"][i_doc, chunk_start:chunk_end],
+                        skip_special_tokens=True,
                     )
-    if not return_chunks:
-        chunks = None
-    chunk_embeddings = np.stack(chunk_embeddings)
-    return chunk_embeddings, chunks
+                    .replace("[CLS]", "")
+                    .replace("[SEP]", "")
+                    .strip()
+                )
+                doc_text = texts[start_index + i_doc]
+                start_char = doc_text.find(chunk_text)
+                end_char = start_char + len(chunk_text)
+                _chunk_positions.append((start_char, end_char))
+            _chunk_embeddings = np.stack(_chunk_embeddings)
+            chunk_embeddings.append(_chunk_embeddings)
+            chunk_positions.append(_chunk_positions)
+    return chunk_embeddings, chunk_positions
```
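
The `start_token` computation is the subtle part of this change: with left-padded batches the real tokens sit at the end of each row, so chunking from index 0 would average over padding. The following small standalone sketch (ours, for illustration only) shows how the argmax over the attention mask recovers the first real token position:

```python
import torch

# Two left-padded documents: 0 = padding, 1 = real token
attention_mask = torch.tensor([
    [0, 0, 1, 1, 1, 1],  # real tokens start at index 2
    [0, 0, 0, 0, 1, 1],  # real tokens start at index 4
])
n_tokens = attention_mask.sum(dim=1)  # tensor([4, 2])
# argmax returns the index of the first maximal value,
# i.e. the first non-padding position in each row
start_token = torch.argmax((attention_mask > 0).to(torch.long), dim=1)
end_token = start_token + n_tokens
print(start_token)  # tensor([2, 4])
print(end_token)    # tensor([6, 6])
```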

turftopic/models/cvp.py

Lines changed: 149 additions & 0 deletions (new file)

```python
import json
import tempfile
from collections import OrderedDict
from pathlib import Path
from typing import Union

import joblib
import numpy as np
from huggingface_hub import HfApi
from sentence_transformers import SentenceTransformer
from sklearn.base import BaseEstimator, TransformerMixin

from turftopic.base import Encoder
from turftopic.encoders.multimodal import MultimodalEncoder
from turftopic.serialization import create_readme, get_package_versions

Seeds = tuple[list[str], list[str]]


class ConceptVectorProjection(BaseEstimator, TransformerMixin):
    """Concept Vector Projection model from [Lyngbæk et al. (2025)](https://doi.org/10.63744/nVu1Zq5gRkuD).
    Can be used to project document embeddings onto the difference vector between positive and negative seed phrases.
    The primary use case is sentiment analysis and continuous sentiment scoring,
    especially for languages where dedicated models are not available.

    Parameters
    ----------
    seeds: (list[str], list[str]) or list of (str, (list[str], list[str]))
        If you want to project onto a single concept, then
        a tuple of (list of positive terms, list of negative terms). <br>
        If there are multiple concepts, they should be specified as (name, Seeds) tuples in a list.
        Alternatively, seeds can be an OrderedDict with the names of the concepts as keys,
        and the tuples of positive and negative seeds as values.
    encoder: str or SentenceTransformer
        Model to produce document representations; paraphrase-multilingual-mpnet-base-v2 is the default,
        per Lyngbæk et al. (2025).
    """

    def __init__(
        self,
        seeds: Union[Seeds, list[tuple[str, Seeds]], OrderedDict[str, Seeds]],
        encoder: Union[
            Encoder, str, MultimodalEncoder
        ] = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    ):
        self.seeds = seeds
        if isinstance(seeds, OrderedDict):
            self._seeds = seeds
        elif (
            (len(seeds) == 2)
            and (isinstance(seeds, tuple))
            and (isinstance(seeds[0][0], str))
        ):
            self._seeds = OrderedDict([("default", seeds)])
        else:
            self._seeds = OrderedDict(seeds)
        self.encoder = encoder
        if isinstance(encoder, str):
            self.encoder_ = SentenceTransformer(encoder)
        else:
            self.encoder_ = encoder
        self.classes_ = np.array([name for name in self._seeds])
        self.concept_matrix_ = []
        for _, (positive, negative) in self._seeds.items():
            positive_emb = self.encoder_.encode(positive)
            negative_emb = self.encoder_.encode(negative)
            cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0)
            self.concept_matrix_.append(cv / np.linalg.norm(cv))
        self.concept_matrix_ = np.stack(self.concept_matrix_)

    def get_feature_names_out(self):
        """Returns concept names in an array."""
        return self.classes_

    def fit_transform(self, raw_documents=None, y=None, embeddings=None):
        """Project documents onto the concept vectors.

        Parameters
        ----------
        raw_documents: list[str] or None
            List of documents to project onto the concept vectors.
        embeddings: ndarray of shape (n_documents, n_dimensions)
            Document embeddings (have to be created with the same encoder as the concept vectors).

        Returns
        -------
        document_concept_matrix: ndarray of shape (n_documents, n_concepts)
            Prevalence of each concept in each document.
        """
        if (raw_documents is None) and (embeddings is None):
            raise ValueError(
                "Either embeddings or raw_documents has to be passed, both are None."
            )
        if embeddings is None:
            embeddings = self.encoder_.encode(raw_documents)
        return embeddings @ self.concept_matrix_.T

    def transform(self, raw_documents=None, embeddings=None):
        """Project documents onto the concept vectors.

        Parameters
        ----------
        raw_documents: list[str] or None
            List of documents to project onto the concept vectors.
        embeddings: ndarray of shape (n_documents, n_dimensions)
            Document embeddings (have to be created with the same encoder as the concept vectors).

        Returns
        -------
        document_concept_matrix: ndarray of shape (n_documents, n_concepts)
            Prevalence of each concept in each document.
        """
        return self.fit_transform(raw_documents, embeddings=embeddings)

    def to_disk(self, out_dir: Union[Path, str]):
        """Persists model to a directory on your machine.

        Parameters
        ----------
        out_dir: Path | str
            Directory to save the model to.
        """
        out_dir = Path(out_dir)
        out_dir.mkdir(exist_ok=True)
        package_versions = get_package_versions()
        with out_dir.joinpath("package_versions.json").open("w") as ver_file:
            ver_file.write(json.dumps(package_versions))
        joblib.dump(self, out_dir.joinpath("model.joblib"))

    def push_to_hub(self, repo_id: str):
        """Uploads model to the HuggingFace Hub.

        Parameters
        ----------
        repo_id: str
            Repository to upload the model to.
        """
        api = HfApi()
        api.create_repo(repo_id, exist_ok=True)
        with tempfile.TemporaryDirectory() as tmp_dir:
            readme_path = Path(tmp_dir).joinpath("README.md")
            with readme_path.open("w") as readme_file:
                readme_file.write(create_readme(self, repo_id))
            self.to_disk(tmp_dir)
            api.upload_folder(
                folder_path=tmp_dir,
                repo_id=repo_id,
                repo_type="model",
            )
```
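
Since `to_disk` serializes the whole fitted estimator with joblib, loading it back is symmetric. A minimal sketch, with a placeholder directory name:

```python
import joblib
from pathlib import Path

# "cvp_model" stands in for whatever directory was passed to to_disk()
cvp = joblib.load(Path("cvp_model") / "model.joblib")
scores = cvp.transform(["A new document to score"])
```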
