Skip to content

Commit 12e6e86

Browse files
Merge pull request #129 from x-tabdeveloping/multimodal-sbert
Multimodal sbert
2 parents 77eeab1 + e192079 commit 12e6e86

8 files changed

Lines changed: 28 additions & 48 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ profile = "black"

 [project]
 name = "turftopic"
-version = "0.25.0"
+version = "0.25.1"
 description = "Topic modeling with contextual representations from sentence transformers."
 authors = [
     { name = "Márton Kardos <power.up1163@gmail.com>", email = "martonkardos@cas.au.dk" }

turftopic/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def encode_documents(self, raw_documents: Iterable[str]) -> np.ndarray:
         """
         if not hasattr(self.encoder_, "encode"):
             return self.encoder.get_text_embeddings(list(raw_documents))
-        return self.encoder_.encode(raw_documents)
+        return self.encoder_.encode(list(raw_documents))

     @abstractmethod
     def fit_transform(

turftopic/late.py

Lines changed: 14 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -53,40 +53,20 @@ def _encode_tokens(
             Start and end character of each token in each document.
         """
         self.has_used_token_level = True
-        token_embeddings = []
-        offsets = []
-        for start_index in trange(
-            0,
-            len(texts),
-            batch_size,
-            desc="Encoding batches...",
-        ):
-            batch = texts[start_index : start_index + batch_size]
-            features = self.tokenize(batch)
-            with torch.no_grad():
-                output_features = self.forward(features)
-            n_tokens = output_features["attention_mask"].sum(axis=1)
-            # Find first nonzero elements in each document
-            # The document could be padded from the left, so we have to watch out for this.
-            start_token = torch.argmax(
-                (output_features["attention_mask"] > 0).to(torch.long), axis=1
-            )
-            end_token = start_token + n_tokens
-            for i_doc in range(len(batch)):
-                _token_embeddings = (
-                    output_features["token_embeddings"][
-                        i_doc, start_token[i_doc] : end_token[i_doc], :
-                    ]
-                    .float()
-                    .numpy(force=True)
-                )
-                _n = _token_embeddings.shape[0]
-                # We extract the character offsets and prune it at the maximum context length
-                _offsets = self.tokenizer(
-                    batch[i_doc], return_offsets_mapping=True, verbose=False
-                )["offset_mapping"][:_n]
-                token_embeddings.append(_token_embeddings)
-                offsets.append(_offsets)
+        token_embeddings = self.encode(
+            texts, output_value="token_embeddings", batch_size=batch_size
+        )
+        offsets = self.tokenizer(
+            texts, return_offsets_mapping=True, verbose=False
+        )["offset_mapping"]
+        offsets = [
+            offs[: len(embs)] for offs, embs in zip(offsets, token_embeddings)
+        ]
+        token_embeddings = [
+            embs.numpy(force=True)
+            for embs in token_embeddings
+            if torch.is_tensor(embs)
+        ]
         return token_embeddings, offsets

     def encode_tokens(

turftopic/models/ctm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def transform(
             Document-topic matrix.
         """
         if embeddings is None:
-            embeddings = self.encoder_.encode(raw_documents)
+            embeddings = self.encode_documents(raw_documents)
         if self.combined:
             bow = self.vectorizer.fit_transform(raw_documents)
             contextual_embeddings = np.concatenate(
@@ -219,7 +219,7 @@ def fit_transform(
         with console.status("Fitting model") as status:
             if embeddings is None:
                 status.update("Encoding documents")
-                embeddings = self.encoder_.encode(raw_documents)
+                embeddings = self.encode_documents(raw_documents)
             console.log("Documents encoded.")
             status.update("Extracting terms.")
             document_term_matrix = self.vectorizer.fit_transform(raw_documents)

turftopic/models/cvp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def __init__(
         self.classes_ = np.array([name for name in self._seeds])
         self.concept_matrix_ = []
         for _, (positive, negative) in self._seeds.items():
-            positive_emb = self.encoder_.encode(positive)
-            negative_emb = self.encoder_.encode(negative)
+            positive_emb = self.encoder_.encode(list(positive))
+            negative_emb = self.encoder_.encode(list(negative))
             cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0)
             self.concept_matrix_.append(cv / np.linalg.norm(cv))
         self.concept_matrix_ = np.stack(self.concept_matrix_)
@@ -92,7 +92,7 @@ def fit_transform(self, raw_documents=None, y=None, embeddings=None):
                 "Either embeddings or raw_documents has to be passed, both are None."
             )
         if embeddings is None:
-            embeddings = self.encoder_.encode(raw_documents)
+            embeddings = self.encoder_.encode(list(raw_documents))
         return embeddings @ self.concept_matrix_.T

     def transform(self, raw_documents=None, embeddings=None):

turftopic/models/decomp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def fit_transform(
         with console.status("Fitting model") as status:
             if self.embeddings is None:
                 status.update("Encoding documents")
-                self.embeddings = self.encoder_.encode(raw_documents)
+                self.embeddings = self.encode_documents(raw_documents)
             console.log("Documents encoded.")
             status.update("Decomposing embeddings")
             if isinstance(self.decomposition, FastICA) and (y is not None):
@@ -153,7 +153,7 @@ def fit_transform(
             vocab = self.vectorizer.fit(raw_documents).get_feature_names_out()
             console.log("Term extraction done.")
             status.update("Encoding vocabulary")
-            self.vocab_embeddings = self.encoder_.encode(vocab)
+            self.vocab_embeddings = self.encode_documents(vocab)
             if self.vocab_embeddings.shape[1] != self.embeddings.shape[1]:
                 raise ValueError(
                     NOT_MATCHING_ERROR.format(
@@ -636,7 +636,7 @@ def transform(
             Document-topic matrix.
         """
         if embeddings is None:
-            embeddings = self.encoder_.encode(raw_documents)
+            embeddings = self.encode_documents(raw_documents)
         return self.decomposition.transform(embeddings)

     def print_topics(

turftopic/models/gmm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ def fit_transform(
         with console.status("Fitting model") as status:
             if embeddings is None:
                 status.update("Encoding documents")
-                embeddings = self.encoder_.encode(raw_documents)
+                embeddings = self.encode_documents(raw_documents)
             console.log("Documents encoded.")
             self.embeddings = embeddings
             status.update("Extracting terms.")
@@ -325,7 +325,7 @@ def transform(
             Document-topic matrix.
         """
         if embeddings is None:
-            embeddings = self.encoder_.encode(raw_documents)
+            embeddings = self.encode_documents(raw_documents)
         if self.dimensionality_reduction is not None:
             embeddings = self.dimensionality_reduction.transform(embeddings)
         return self.gmm_.predict_proba(embeddings)

turftopic/models/senstopic.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def fit_transform(
         with console.status("Fitting model") as status:
             if self.embeddings is None:
                 status.update("Encoding documents")
-                self.embeddings = self.encoder_.encode(raw_documents)
+                self.embeddings = self.encode_documents(raw_documents)
             console.log("Documents encoded.")
             if self.n_components == "auto":
                 status.update("Finding the number of components.")
@@ -177,7 +177,7 @@ def fit_transform(
             console.log("Term extraction done.")
             if getattr(self, "vocab_embeddings", None) is None:
                 status.update("Encoding vocabulary")
-                self.vocab_embeddings = self.encoder_.encode(vocab)
+                self.vocab_embeddings = self.encode_documents(vocab)
             if self.vocab_embeddings.shape[1] != self.embeddings.shape[1]:
                 raise ValueError(
                     NOT_MATCHING_ERROR.format(

Comments (0)