Skip to content

Commit e76a4d8

Browse files
Replaced all raw encoder_.encode calls due to an issue in sentence-transformers
1 parent 443bf25 commit e76a4d8

7 files changed

Lines changed: 358 additions & 12 deletions

File tree

turftopic/models/_colbert.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import warnings
2+
from typing import Optional
3+
4+
import numpy as np
5+
import torch
6+
from sklearn.feature_extraction.text import CountVectorizer
7+
from sklearn.metrics.pairwise import cosine_similarity
8+
from tqdm import trange
9+
from transformers import AutoTokenizer, BertModel
10+
11+
12+
class ColBERTKeywordExtractor:
    """Extracts keywords from documents with ColBERT-style late interaction:
    every candidate term is embedded once (mean-pooled token embeddings), and
    a document's keywords are the terms whose embedding has the highest
    maximum cosine similarity (MaxSim) to any of the document's token
    embeddings.

    Parameters
    ----------
    top_n
        Number of keywords to extract per document.
    model_name
        Name of the HuggingFace BERT model used for encoding.
    vectorizer
        CountVectorizer used to discover candidate terms in the documents.
    batch_size
        Number of documents to encode at a time.
    """

    def __init__(
        self,
        top_n: int,
        model_name: str,
        vectorizer: CountVectorizer,
        batch_size: int = 32,
    ):
        self.top_n = top_n
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.vectorizer = vectorizer
        self.batch_size = batch_size
        # term -> row index into self.term_embeddings
        self.key_to_index: dict[str, int] = {}
        # One mean-pooled embedding per known term; None until terms are added.
        self.term_embeddings: Optional[np.ndarray] = None

    def encode_batch(self, sentences: list[str]) -> list[np.ndarray]:
        """Return per-token embeddings for each sentence as a list of
        (n_tokens, hidden_size) arrays, with padding positions removed."""
        with torch.no_grad():
            inputs = self.tokenizer(
                sentences, return_tensors="pt", padding=True
            )
            outputs = self.model(**inputs)
            hidden_state = outputs.last_hidden_state
        embeddings = []
        # attention_mask > 0 selects real tokens, dropping padding positions
        for h, m in zip(hidden_state, inputs["attention_mask"]):
            embeddings.append(h[m > 0].numpy())
        return embeddings

    @property
    def vocab(self) -> np.ndarray:
        """All known terms, ordered by their embedding row index."""
        res = [""] * self.n_vocab
        for key, index in self.key_to_index.items():
            res[index] = key
        return np.array(res)

    @property
    def n_vocab(self) -> int:
        """Number of terms seen so far."""
        return len(self.key_to_index)

    def _add_terms(self, new_terms: list[str]):
        """Encode new_terms and append their mean-pooled embeddings to the
        term-embedding table, registering each term in key_to_index."""
        for term in new_terms:
            self.key_to_index[term] = self.n_vocab
        term_encodings = self.encode_batch(new_terms)
        # Mean-pool token embeddings into a single vector per term.
        term_encodings = np.stack(
            [np.mean(_t, axis=0) for _t in term_encodings]
        )
        if self.term_embeddings is not None:
            self.term_embeddings = np.concatenate(
                (self.term_embeddings, term_encodings), axis=0
            )
        else:
            self.term_embeddings = term_encodings

    def batch_extract_keywords(
        self,
        documents: list[str],
        embeddings: Optional[np.ndarray] = None,
        seed_embedding: Optional[np.ndarray] = None,
        fitting: bool = True,
    ) -> list[dict[str, float]]:
        """Extract up to top_n keywords (term -> MaxSim score) per document.

        `embeddings` and `seed_embedding` are accepted for interface
        compatibility with other extractors; document embeddings are always
        recomputed with ColBERT, so a passed-in `embeddings` array is ignored
        with a warning.
        """
        if not len(documents):
            return []
        if embeddings is not None:
            warnings.warn(
                "embeddings parameter specified, but is ignored when using ColBERT."
            )
        keywords = []
        if fitting:
            document_term_matrix = self.vectorizer.fit_transform(documents)
        else:
            document_term_matrix = self.vectorizer.transform(documents)
        batch_vocab = self.vectorizer.get_feature_names_out()
        new_terms = list(set(batch_vocab) - set(self.key_to_index.keys()))
        if len(new_terms):
            self._add_terms(new_terms)
        for i in trange(
            0, len(documents), self.batch_size, desc="Extracting keywords"
        ):
            _docs = documents[i : i + self.batch_size]
            _embs = self.encode_batch(_docs)
            # BUGFIX: iterate only over the documents actually present in
            # this batch; range(self.batch_size) indexed past the end of
            # document_term_matrix/_embs on the final, smaller batch.
            for j in range(len(_docs)):
                terms = document_term_matrix[i + j, :].todense()
                mask = terms > 0
                if not np.any(mask):
                    keywords.append(dict())
                    continue
                important_terms = np.ravel(np.asarray(mask))
                word_embeddings = [
                    self.term_embeddings[self.key_to_index[term]]
                    for term in batch_vocab[important_terms]
                ]
                sim = cosine_similarity(word_embeddings, _embs[j])
                # MaxSim: each candidate term's best similarity to any token.
                maxsim = np.max(sim, axis=1)
                # BUGFIX: the original used kth = min(top_n, len(maxsim) - 1)
                # and sliced [:kth], which always dropped one candidate and
                # returned no keywords for single-term documents.
                if self.top_n >= len(maxsim):
                    top = np.argsort(-maxsim)
                else:
                    top = np.argpartition(-maxsim, self.top_n)[: self.top_n]
                top_words = batch_vocab[important_terms][top]
                # BUGFIX: filtering scores separately from words misaligned
                # the zip whenever a non-positive score was dropped; filter
                # word/score pairs together instead.
                keywords.append(
                    {
                        word: score
                        for word, score in zip(top_words, maxsim[top])
                        if score > 0
                    }
                )
        return keywords

turftopic/models/_gaussian_lda.py

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
from __future__ import division
2+
3+
import random
4+
from collections import defaultdict
5+
6+
import gensim
7+
import numpy as np
8+
from numpy import exp, linalg, log, pi
9+
from scipy.special import gamma, gammaln
10+
11+
12+
class Wishart(object):
    # Normal-Inverse-Wishart prior hyperparameters for the Gaussian topic
    # distributions: nu (degrees of freedom), kappa (mean scaling), psi
    # (scale matrix), mu (prior mean).
    # NOTE(review): this file is Python 2 (print statements, iteritems/xrange
    # used elsewhere in the module) — it will not run under Python 3.

    def __init__(self, word_vecs):
        # word_vecs: dict mapping word -> embedding vector (one row each).
        self.nu = None
        self.kappa = None
        self.psi = None

        self.set_params(word_vecs)

    def set_params(self, word_vecs):
        #turn dict of word vecs into a matrix
        word_vecs = np.vstack(word_vecs.values())

        # nu is set to the embedding dimensionality.
        self.nu = word_vecs.shape[1] #len of columns
        self.kappa = 0.01
        # NOTE(review): summing over axis=0 collapses the DxD scatter matrix
        # x.T.dot(x) down to a length-D vector; the author's own trailing
        # comment questions this.  Downstream, recalculate_topic_params adds
        # psi to a DxD covariance (broadcasting) — confirm the intended prior
        # before relying on this.
        self.psi = np.sum(word_vecs.T.dot(word_vecs), axis=0) # should this be np.sum(x.dot(x.T)))??? also changed this to x.T.dot(x)
        self.mu = np.mean(word_vecs, axis=0)
        print "psi shape", self.psi.shape
30+
31+
32+
33+
class Gauss_LDA(object):
    """Gaussian LDA via collapsed Gibbs sampling: topics are multivariate
    Gaussians over word embeddings instead of multinomials over the vocabulary.

    NOTE(review): this is Python 2 code (print statements, xrange, iteritems)
    and is visibly work-in-progress — see the flagged lines in sample().
    """

    def __init__(self, num_topics, corpus, word_vector_filepath):
        # doc_topic_CT: (n_docs, n_topics) count table, filled in init()
        self.doc_topic_CT = None
        # word -> current topic assignment (one assignment per word type,
        # not per token)
        self.word_topics = {}
        self.corpus = corpus
        self.vocab = None
        self.priors = None
        # word -> embedding vector, filled by process_wordvectors()
        self.word_vecs = {}
        self.numtopics = num_topics
        # NOTE(review): vocab is assigned twice; this set([]) wins.
        self.vocab = set([])
        # topic_id -> dict of per-topic posterior parameters
        self.topic_params = defaultdict(dict)
        self.wordvecFP = word_vector_filepath
        self.word_index = {}
        self.word_vec_size = None
        # Symmetric Dirichlet hyperparameter, the common 50/K heuristic.
        self.alpha = 50./self.numtopics

    def process_corpus(self, documents):
        """
        Takes a list of documents, and processes them
        sets vocab
        returns: None
        """
        # Replaces self.corpus with {doc_index: [tokens]} and fills self.vocab.
        temp_corpus = {}
        for index, doc in enumerate(documents):
            words = doc.split()
            temp_corpus[index] = words
            for word in words:
                self.vocab.add(word)
        self.corpus = temp_corpus
        print "Done processing corpus with {} documents".format(len(documents))

    def process_wordvectors(self, filepath):
        # Loads word2vec-format vectors and keeps those covering self.vocab
        # in self.word_vecs; words without a vector are silently skipped.
        print "Processing word-vectors, this takes a moment"

        vectors = gensim.models.Word2Vec.load_word2vec_format(fname=filepath, binary=False)
        useable_vocab = 0
        unusable_vocab = 0
        self.word_vec_size = vectors.vector_size

        # First pass only counts coverage for the report below.
        for word in self.vocab:
            try:
                vectors[word]
                useable_vocab += 1
            except KeyError: unusable_vocab += 1

        print "There are {0} words that could be convereted to word vectors in your corpus \n" \
              "There are {1} words that could NOT be converted to word vectors".format(useable_vocab, unusable_vocab)

        # self.word_vecs = np.zeros((useable_vocab, vectors.vector_size))
        index = 0
        for word in self.vocab:
            try:
                self.word_vecs[word] = vectors[word]
                index += 1
            except KeyError: continue
        print "Word-vectors for the corpus are created"

    def fit(self, iterations=1, init=True): #set hyperparams here?
        # Runs `iterations` Gibbs sweeps; init=True (re)initializes state first.
        if init == True:
            self.init()
            init = False

        print "Starting fit"
        for i in xrange(iterations):
            self.sample()
            print "{} iterations complete".format(i)

    def init(self):
        # Builds corpus/vectors/priors, randomly assigns each word type to a
        # topic, fills the doc-topic count table, and initializes topic params.
        self.process_corpus(self.corpus)
        self.process_wordvectors(self.wordvecFP)
        #setting wishhart priors
        self.priors = Wishart(self.word_vecs)
        self.doc_topic_CT = np.zeros((len(self.corpus.keys()), self.numtopics))

        self.word_topics = {word: random.choice(range(self.numtopics)) for word in self.vocab}
        # get Doc-Topic Counts
        for docID, doc in self.corpus.iteritems():
            for word in doc:
                topicID = self.word_topics[word]
                self.doc_topic_CT[docID, topicID] += 1

        # Init parameters for topic distributions
        for k in range(self.numtopics):
            self.recalculate_topic_params(k)

        print "Intialization complete"
    def sample(self, init=True):
        # One Gibbs sweep over every token.  NOTE(review): visibly unfinished —
        # see the flags below; the comment near the end says the normalization
        # scheme still needs to be ported from Util.sample.
        print "sampling started"
        # Randomly assign word to topics
        if init == False:
            self.word_topics = {word: random.choice(range(self.numtopics)) for word in self.vocab}

        for docID, doc in self.corpus.iteritems():
            for word in doc:
                #subtracting info about current word-topic assignment from doc-topic count table
                topic_id = self.word_topics[word]
                # NOTE(review): this expression's result is discarded — it was
                # almost certainly meant to be `-= doc.count(word)`, so the
                # count table is never actually decremented.
                self.doc_topic_CT[docID, topic_id] - doc.count(word)

                self.recalculate_topic_params(topic_id)
                posterior = []
                # NOTE(review): `max` shadows the builtin.
                max = 0
                for k in range(self.numtopics): #start getting the pdf's for each word-topic assignment
                    log_prob = self.draw_new_wt_assgns(word, k)
                    # Count of topic in doc
                    Nkd = self.doc_topic_CT[docID, k]
                    # NOTE(review): multiplying two log terms is suspect — the
                    # collapsed-Gibbs posterior is a product of densities, i.e.
                    # a SUM of logs: log(Nkd + alpha) + log_prob.  Confirm.
                    log_posterior = log(Nkd + self.alpha) * log_prob
                    posterior.append(log_posterior)
                    #doing this for some normalization scheme
                    if log_posterior > max: max = log_posterior

                # NOTE(review): normalized_posterior is computed but unused,
                # and multinomial below is fed the raw (log-domain,
                # unnormalized) values — pvals must be probabilities summing
                # to 1, so this will misbehave or raise.
                normalized_posterior = [exp(i-max) for i in posterior]
                print np.sum(posterior)
                print np.random.multinomial(1, pvals=posterior)
        ## need to copy the normalization scheme from Util.sample
        init = False
        return None

    def draw_new_wt_assgns(self, word, topic_id):
        # Log-density of `word`'s vector under topic `topic_id`'s posterior
        # predictive multivariate Student-t distribution.
        # Getting params for calculating PDF of T-Dist for a word
        wordvec = self.word_vecs[word]
        inv_cov = self.topic_params[topic_id]["Inverse Covariance"]
        cov_det = self.topic_params[topic_id]["Covariance Determinant"]
        Nk = self.topic_params[topic_id]["Topic Count"]
        KappaK = self.topic_params[topic_id]["Topic Kappa"]
        # NOTE(review): centered against the PRIOR mean, not the topic's own
        # posterior mean ("Topic Mean") — confirm against the reference
        # implementation quoted below.
        centered = self.word_vecs[word] - self.priors.mu
        topic_cov = self.topic_params[topic_id]["Topic Covariance"]


        # Precalculating some terms (V_di - Mu)^T * Cov^-1 * (V_di - Mu)
        LLcomp = centered.T.dot(inv_cov).dot(centered)
        d = self.word_vec_size
        nu = self.priors.nu + Nk - d + 1

        # NOTE(review): `gammaln(nu + d / 2)` differs from the Java reference
        # below, which computes logGamma((nu + D)/2) — missing parentheses?
        log_prop = gammaln(nu + d / 2) - \
                   (gammaln(nu / 2) + d/2 * (log(nu) + log(pi)) +0.5 * log(cov_det) + (nu + d) / 2 * log(1 + LLcomp / nu))

        return log_prop
        # logprob = Gamma.logGamma((nu + Data.D)/2) - \
        # (Gamma.logGamma(nu/2) + Data.D/2 * (Math.log(nu)+Math.log(Math.PI)) + 0.5 * Math.log(det) + (nu + Data.D)/2* Math.log(1+val/nu))

    def recalculate_topic_params(self, topic_id):
        # Recomputes the Normal-Inverse-Wishart posterior parameters for one
        # topic from the current assignments and caches them in topic_params.
        topic_count = np.sum(self.doc_topic_CT[:, topic_id], axis=0) # N_k

        kappa_k = self.priors.kappa + topic_count # K_k
        nu_k = self.priors.nu + topic_count # V_k

        scaled_topic_mean_K, scaled_topic_cov_K = self.get_scaled_topic_mean_cov(topic_id) # V-Bar_k and C_k

        vk_mu = scaled_topic_mean_K - self.priors.mu #V-bar_k - Mu
        print self.priors.psi
        # NOTE(review): vk_mu.T.dot(vk_mu) on a 1-D vector is a scalar inner
        # product, not the DxD outer product the NIW update calls for
        # (np.outer(vk_mu, vk_mu)) — confirm.
        psi_k = self.priors.psi + scaled_topic_cov_K + ((self.priors.kappa * topic_count) / kappa_k) * (vk_mu.T.dot(vk_mu)) # Psi_k

        topic_mean = (self.priors.kappa * self.priors.mu + topic_count * scaled_topic_mean_K) / kappa_k # Mu_k
        topic_cov = psi_k / (nu_k - self.word_vec_size + 1) # Sigma_k

        self.topic_params[topic_id]["Topic Count"] = topic_count
        self.topic_params[topic_id]["Topic Kappa"] = kappa_k
        self.topic_params[topic_id]["Topic Nu"] = nu_k
        self.topic_params[topic_id]["Topic Mean"], self.topic_params[topic_id]["Topic Covariance"] = topic_mean, topic_cov
        self.topic_params[topic_id]["Inverse Covariance"] = np.linalg.inv(topic_cov)
        self.topic_params[topic_id]["Covariance Determinant"] = np.linalg.det(topic_cov)
        self.topic_params[topic_id]["Liklihood Componant"] = None


        return topic_mean, topic_cov

    def get_scaled_topic_mean_cov(self, topic_id):
        'mean of word vecs in a topic'
        # Returns (mean, scatter) of the embeddings of all word types
        # currently assigned to topic_id.
        # get words assigned to topic_id
        word_vecs = []
        for word, topic in self.word_topics.iteritems():
            if topic == topic_id:
                word_vecs.append(self.word_vecs[word])
        print word_vecs
        # NOTE(review): raises ValueError if no word is assigned to the topic.
        word_vecs = np.vstack(word_vecs)
        print word_vecs.shape
        print np.sum(word_vecs, axis=0)
        print np.sum(self.doc_topic_CT[:, topic_id], axis=0)
        # NOTE(review): the denominator is the token-level topic count (+2)
        # while the numerator sums one vector per word TYPE — these disagree
        # when a word occurs more than once.
        mean = np.sum(word_vecs, axis=0) / (np.sum(self.doc_topic_CT[:, topic_id], axis=0) + 2) #added a small number here to stop overflow

        # mean_centered = np.sum(word_vecs, axis=0) - mean
        mean_centered = word_vecs - mean
        print self.doc_topic_CT
        print mean
        print mean_centered

        cov = mean_centered.T.dot(mean_centered)
        return mean, cov
229+
230+
if __name__ == "__main__":
    # Smoke test: two tiny four-word documents, two topics, two Gibbs sweeps.
    # NOTE(review): hard-coded absolute path to GloVe vectors — this only
    # runs on the original author's machine.
    corpus = ["apple orange mango melon", "dog cat bird rat"]
    wordvec_fileapth = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
    g = Gauss_LDA(2, corpus, wordvec_fileapth )
    g.fit(2)
    # print g.topic_params[1]["Topic Count"]

turftopic/models/ctm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def transform(
193193
Document-topic matrix.
194194
"""
195195
if embeddings is None:
196-
embeddings = self.encoder_.encode(raw_documents)
196+
embeddings = self.encode_documents(raw_documents)
197197
if self.combined:
198198
bow = self.vectorizer.fit_transform(raw_documents)
199199
contextual_embeddings = np.concatenate(
@@ -219,7 +219,7 @@ def fit_transform(
219219
with console.status("Fitting model") as status:
220220
if embeddings is None:
221221
status.update("Encoding documents")
222-
embeddings = self.encoder_.encode(raw_documents)
222+
embeddings = self.encode_documents(raw_documents)
223223
console.log("Documents encoded.")
224224
status.update("Extracting terms.")
225225
document_term_matrix = self.vectorizer.fit_transform(raw_documents)

turftopic/models/cvp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def __init__(
6262
self.classes_ = np.array([name for name in self._seeds])
6363
self.concept_matrix_ = []
6464
for _, (positive, negative) in self._seeds.items():
65-
positive_emb = self.encoder_.encode(positive)
66-
negative_emb = self.encoder_.encode(negative)
65+
positive_emb = self.encoder_.encode(list(positive))
66+
negative_emb = self.encoder_.encode(list(negative))
6767
cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0)
6868
self.concept_matrix_.append(cv / np.linalg.norm(cv))
6969
self.concept_matrix_ = np.stack(self.concept_matrix_)
@@ -92,7 +92,7 @@ def fit_transform(self, raw_documents=None, y=None, embeddings=None):
9292
"Either embeddings or raw_documents has to be passed, both are None."
9393
)
9494
if embeddings is None:
95-
embeddings = self.encoder_.encode(raw_documents)
95+
embeddings = self.encoder_.encode(list(raw_documents))
9696
return embeddings @ self.concept_matrix_.T
9797

9898
def transform(self, raw_documents=None, embeddings=None):

0 commit comments

Comments
 (0)