Skip to content

Commit e19f174

Browse files
Added docstrings for CVP
1 parent 0e84dc8 commit e19f174

1 file changed

Lines changed: 53 additions & 5 deletions

File tree

turftopic/models/cvp.py

Lines changed: 53 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,39 @@
88
from turftopic.base import Encoder
99
from turftopic.encoders.multimodal import MultimodalEncoder
1010

11+
Seeds = tuple[list[str], list[str]]
12+
1113

1214
class ConceptVectorProjection(BaseEstimator, TransformerMixin):
15+
"""Concept Vector Projection model from [Lyngbæk et al. (2025)](https://doi.org/10.63744/nVu1Zq5gRkuD)
16+
Projects document embeddings onto the difference vector between the positive and negative seed phrases.
17+
The primary use case is sentiment analysis with continuous sentiment scores,
18+
especially for languages where dedicated models are not available.
19+
20+
Parameters
21+
----------
22+
seeds: (list[str], list[str]), list of (str, (list[str], list[str])) or OrderedDict[str, (list[str], list[str])]
23+
If you want to project to a single concept, then
24+
a tuple of (list of negative terms, list of positive terms). <br>
25+
If there are multiple concepts, they should be specified as (name, Seeds) tuples in a list.
26+
Alternatively, seeds can be an OrderedDict with the names of the concepts being the keys,
27+
and the tuples of negative and positive seeds as the values.
28+
encoder: str or SentenceTransformer
29+
Model to produce document representations. Defaults to sentence-transformers/all-MiniLM-L6-v2;
30+
paraphrase-multilingual-mpnet-base-v2 is the encoder suggested by Lyngbæk et al. (2025).
31+
"""
32+
1333
def __init__(
1434
self,
15-
seeds: (
16-
tuple[list[str], list[str]]
17-
| list[tuple[[str, tuple[list[str], list[str]]]]]
18-
),
35+
seeds: Union[Seeds, list[tuple[str, Seeds]], OrderedDict[str, Seeds]],
1936
encoder: Union[
2037
Encoder, str, MultimodalEncoder
2138
] = "sentence-transformers/all-MiniLM-L6-v2",
2239
):
2340
self.seeds = seeds
24-
if (
41+
if isinstance(seeds, OrderedDict):
42+
self._seeds = seeds
43+
elif (
2544
(len(seeds) == 2)
2645
and (isinstance(seeds, tuple))
2746
and (isinstance(seeds[0][0], str))
@@ -44,9 +63,24 @@ def __init__(
4463
self.concept_matrix_ = np.stack(self.concept_matrix_)
4564

4665
def get_feature_names_out(self):
66+
"""Returns concept names in an array."""
4767
return self.classes_
4868

4969
def fit_transform(self, raw_documents=None, y=None, embeddings=None):
70+
"""Project documents onto the concept vectors.
71+
72+
Parameters
73+
----------
74+
raw_documents: list[str] or None
75+
List of documents to project to the concept vectors.
76+
embeddings: ndarray of shape (n_documents, n_dimensions)
77+
Document embeddings (have to be created with the same encoder as the concept vectors).
78+
79+
Returns
80+
-------
81+
document_concept_matrix: ndarray of shape (n_documents, n_concepts)
82+
Prevalence of each concept in each document.
83+
"""
5084
if (raw_documents is None) and (embeddings is None):
5185
raise ValueError(
5286
"Either embeddings or raw_documents has to be passed, both are None."
@@ -56,4 +90,18 @@ def fit_transform(self, raw_documents=None, y=None, embeddings=None):
5690
return embeddings @ self.concept_matrix_.T
5791

5892
def transform(self, raw_documents=None, embeddings=None):
93+
"""Project documents onto the concept vectors.
94+
95+
Parameters
96+
----------
97+
raw_documents: list[str] or None
98+
List of documents to project to the concept vectors.
99+
embeddings: ndarray of shape (n_documents, n_dimensions)
100+
Document embeddings (have to be created with the same encoder as the concept vectors).
101+
102+
Returns
103+
-------
104+
document_concept_matrix: ndarray of shape (n_documents, n_concepts)
105+
Prevalence of each concept in each document.
106+
"""
59107
return self.fit_transform(raw_documents, embeddings=embeddings)

0 commit comments

Comments
 (0)