88from turftopic .base import Encoder
99from turftopic .encoders .multimodal import MultimodalEncoder
1010
11+ Seeds = tuple [list [str ], list [str ]]
12+
1113
1214class ConceptVectorProjection (BaseEstimator , TransformerMixin ):
15+ """Concept Vector Projection model from [Lyngbæk et al. (2025)](https://doi.org/10.63744/nVu1Zq5gRkuD).
16+ Can be used to project document embeddings onto a difference projection vector between positive and negative seed phrases.
17+ The primary use case is sentiment analysis, and continuous sentiment scores,
18+ especially for languages where dedicated models are not available.
19+
20+ Parameters
21+ ----------
22+ seeds: (list[str], list[str]), list of (str, (list[str], list[str])) or OrderedDict[str, (list[str], list[str])]
23+ If you want to project to a single concept, then
24+ a tuple of (list of negative terms, list of positive terms). <br>
25+ If there are multiple concepts, they should be specified as (name, Seeds) tuples in a list.
26+ Alternatively, seeds can be an OrderedDict with the names of the concepts being the keys,
27+ and the tuples of negative and positive seeds as the values.
28+ encoder: str or SentenceTransformer
29+ Model to produce document representations. Defaults to sentence-transformers/all-MiniLM-L6-v2;
30+ Lyngbæk et al. (2025) use paraphrase-multilingual-mpnet-base-v2.
31+ """
32+
1333 def __init__ (
1434 self ,
15- seeds : (
16- tuple [list [str ], list [str ]]
17- | list [tuple [[str , tuple [list [str ], list [str ]]]]]
18- ),
35+ seeds : Union [Seeds , list [tuple [str , Seeds ]], OrderedDict [str , Seeds ]],
1936 encoder : Union [
2037 Encoder , str , MultimodalEncoder
2138 ] = "sentence-transformers/all-MiniLM-L6-v2" ,
2239 ):
2340 self .seeds = seeds
24- if (
41+ if isinstance (seeds , OrderedDict ):
42+ self ._seeds = seeds
43+ elif (
2544 (len (seeds ) == 2 )
2645 and (isinstance (seeds , tuple ))
2746 and (isinstance (seeds [0 ][0 ], str ))
@@ -44,9 +63,24 @@ def __init__(
4463 self .concept_matrix_ = np .stack (self .concept_matrix_ )
4564
4665 def get_feature_names_out (self ):
66+ """Returns concept names in an array."""
4767 return self .classes_
4868
4969 def fit_transform (self , raw_documents = None , y = None , embeddings = None ):
70+ """Project documents onto the concept vectors.
71+
72+ Parameters
73+ ----------
74+ raw_documents: list[str] or None
75+ List of documents to project to the concept vectors.
76+ embeddings: ndarray of shape (n_documents, n_dimensions)
77+ Document embeddings (has to be created with the same encoder as the concept vectors.)
78+
79+ Returns
80+ -------
81+ document_concept_matrix: ndarray of shape (n_documents, n_concepts)
82+ Prevalence of each concept in each document.
83+ """
5084 if (raw_documents is None ) and (embeddings is None ):
5185 raise ValueError (
5286 "Either embeddings or raw_documents has to be passed, both are None."
@@ -56,4 +90,18 @@ def fit_transform(self, raw_documents=None, y=None, embeddings=None):
5690 return embeddings @ self .concept_matrix_ .T
5791
5892 def transform (self , raw_documents = None , embeddings = None ):
93+ """Project documents onto the concept vectors.
94+
95+ Parameters
96+ ----------
97+ raw_documents: list[str] or None
98+ List of documents to project to the concept vectors.
99+ embeddings: ndarray of shape (n_documents, n_dimensions)
100+ Document embeddings (has to be created with the same encoder as the concept vectors.)
101+
102+ Returns
103+ -------
104+ document_concept_matrix: ndarray of shape (n_documents, n_concepts)
105+ Prevalence of each concept in each document.
106+ """
59107 return self .fit_transform (raw_documents , embeddings = embeddings )
0 commit comments