Skip to content

Commit e76a4d8

Browse files
Replaced all raw encoder_.encode calls due to an issue in sentence-transformers
1 parent 443bf25 commit e76a4d8

7 files changed

Lines changed: 358 additions & 12 deletions

File tree

turftopic/models/_colbert.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import warnings
2+
from typing import Optional
3+
4+
import numpy as np
5+
import torch
6+
from sklearn.feature_extraction.text import CountVectorizer
7+
from sklearn.metrics.pairwise import cosine_similarity
8+
from tqdm import trange
9+
from transformers import AutoTokenizer, BertModel
10+
11+
12+
class ColBERTKeywordExtractor:
    """Extracts keywords from documents with ColBERT-style late interaction:
    every candidate term is embedded once (mean-pooled token embeddings), and
    a document's keywords are the terms whose embedding has the highest
    maximum cosine similarity (MaxSim) to any of the document's token
    embeddings.

    Parameters
    ----------
    top_n
        Number of keywords to extract per document.
    model_name
        Name of the HuggingFace BERT model used for encoding.
    vectorizer
        CountVectorizer used to discover candidate terms in the documents.
    batch_size
        Number of documents to encode at a time.
    """

    def __init__(
        self,
        top_n: int,
        model_name: str,
        vectorizer: CountVectorizer,
        batch_size: int = 32,
    ):
        self.top_n = top_n
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.vectorizer = vectorizer
        self.batch_size = batch_size
        # term -> row index into self.term_embeddings
        self.key_to_index: dict[str, int] = {}
        # One mean-pooled embedding per known term; None until terms are added.
        self.term_embeddings: Optional[np.ndarray] = None

    def encode_batch(self, sentences: list[str]) -> list[np.ndarray]:
        """Return per-token embeddings for each sentence as a list of
        (n_tokens, hidden_size) arrays, with padding positions removed."""
        with torch.no_grad():
            inputs = self.tokenizer(
                sentences, return_tensors="pt", padding=True
            )
            outputs = self.model(**inputs)
            hidden_state = outputs.last_hidden_state
        embeddings = []
        # attention_mask > 0 selects real tokens, dropping padding positions
        for h, m in zip(hidden_state, inputs["attention_mask"]):
            embeddings.append(h[m > 0].numpy())
        return embeddings

    @property
    def vocab(self) -> np.ndarray:
        """All known terms, ordered by their embedding row index."""
        res = [""] * self.n_vocab
        for key, index in self.key_to_index.items():
            res[index] = key
        return np.array(res)

    @property
    def n_vocab(self) -> int:
        """Number of terms seen so far."""
        return len(self.key_to_index)

    def _add_terms(self, new_terms: list[str]):
        """Encode new_terms and append their mean-pooled embeddings to the
        term-embedding table, registering each term in key_to_index."""
        for term in new_terms:
            self.key_to_index[term] = self.n_vocab
        term_encodings = self.encode_batch(new_terms)
        # Mean-pool token embeddings into a single vector per term.
        term_encodings = np.stack(
            [np.mean(_t, axis=0) for _t in term_encodings]
        )
        if self.term_embeddings is not None:
            self.term_embeddings = np.concatenate(
                (self.term_embeddings, term_encodings), axis=0
            )
        else:
            self.term_embeddings = term_encodings

    def batch_extract_keywords(
        self,
        documents: list[str],
        embeddings: Optional[np.ndarray] = None,
        seed_embedding: Optional[np.ndarray] = None,
        fitting: bool = True,
    ) -> list[dict[str, float]]:
        """Extract up to top_n keywords (term -> MaxSim score) per document.

        `embeddings` and `seed_embedding` are accepted for interface
        compatibility with other extractors; document embeddings are always
        recomputed with ColBERT, so a passed-in `embeddings` array is ignored
        with a warning.
        """
        if not len(documents):
            return []
        if embeddings is not None:
            warnings.warn(
                "embeddings parameter specified, but is ignored when using ColBERT."
            )
        keywords = []
        if fitting:
            document_term_matrix = self.vectorizer.fit_transform(documents)
        else:
            document_term_matrix = self.vectorizer.transform(documents)
        batch_vocab = self.vectorizer.get_feature_names_out()
        new_terms = list(set(batch_vocab) - set(self.key_to_index.keys()))
        if len(new_terms):
            self._add_terms(new_terms)
        for i in trange(
            0, len(documents), self.batch_size, desc="Extracting keywords"
        ):
            _docs = documents[i : i + self.batch_size]
            _embs = self.encode_batch(_docs)
            # BUGFIX: iterate only over the documents actually present in
            # this batch; range(self.batch_size) indexed past the end of
            # document_term_matrix/_embs on the final, smaller batch.
            for j in range(len(_docs)):
                terms = document_term_matrix[i + j, :].todense()
                mask = terms > 0
                if not np.any(mask):
                    keywords.append(dict())
                    continue
                important_terms = np.ravel(np.asarray(mask))
                word_embeddings = [
                    self.term_embeddings[self.key_to_index[term]]
                    for term in batch_vocab[important_terms]
                ]
                sim = cosine_similarity(word_embeddings, _embs[j])
                # MaxSim: each candidate term's best similarity to any token.
                maxsim = np.max(sim, axis=1)
                # BUGFIX: the original used kth = min(top_n, len(maxsim) - 1)
                # and sliced [:kth], which always dropped one candidate and
                # returned no keywords for single-term documents.
                if self.top_n >= len(maxsim):
                    top = np.argsort(-maxsim)
                else:
                    top = np.argpartition(-maxsim, self.top_n)[: self.top_n]
                top_words = batch_vocab[important_terms][top]
                # BUGFIX: filtering scores separately from words misaligned
                # the zip whenever a non-positive score was dropped; filter
                # word/score pairs together instead.
                keywords.append(
                    {
                        word: score
                        for word, score in zip(top_words, maxsim[top])
                        if score > 0
                    }
                )
        return keywords

turftopic/models/_gaussian_lda.py

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
from __future__ import division
2+
3+
import random
4+
from collections import defaultdict
5+
6+
import gensim
7+
import numpy as np
8+
from numpy import exp, linalg, log, pi
9+
from scipy.special import gamma, gammaln
10+
11+
12+
class Wishart(object):
    # Normal-Inverse-Wishart prior hyperparameters for the Gaussian topic
    # distributions: nu (degrees of freedom), kappa (mean scaling), psi
    # (scale matrix), mu (prior mean).
    # NOTE(review): this file is Python 2 (print statements, iteritems/xrange
    # used elsewhere in the module) — it will not run under Python 3.

    def __init__(self, word_vecs):
        # word_vecs: dict mapping word -> embedding vector (one row each).
        self.nu = None
        self.kappa = None
        self.psi = None

        self.set_params(word_vecs)

    def set_params(self, word_vecs):
        #turn dict of word vecs into a matrix
        word_vecs = np.vstack(word_vecs.values())

        # nu is set to the embedding dimensionality.
        self.nu = word_vecs.shape[1] #len of columns
        self.kappa = 0.01
        # NOTE(review): summing over axis=0 collapses the DxD scatter matrix
        # x.T.dot(x) down to a length-D vector; the author's own trailing
        # comment questions this.  Downstream, recalculate_topic_params adds
        # psi to a DxD covariance (broadcasting) — confirm the intended prior
        # before relying on this.
        self.psi = np.sum(word_vecs.T.dot(word_vecs), axis=0) # should this be np.sum(x.dot(x.T)))??? also changed this to x.T.dot(x)
        self.mu = np.mean(word_vecs, axis=0)
        print "psi shape", self.psi.shape
30+
31+
32+
33+
class Gauss_LDA(object):
    """Gaussian LDA via collapsed Gibbs sampling: topics are multivariate
    Gaussians over word embeddings instead of multinomials over the vocabulary.

    NOTE(review): this is Python 2 code (print statements, xrange, iteritems)
    and is visibly work-in-progress — see the flagged lines in sample().
    """

    def __init__(self, num_topics, corpus, word_vector_filepath):
        # doc_topic_CT: (n_docs, n_topics) count table, filled in init()
        self.doc_topic_CT = None
        # word -> current topic assignment (one assignment per word type,
        # not per token)
        self.word_topics = {}
        self.corpus = corpus
        self.vocab = None
        self.priors = None
        # word -> embedding vector, filled by process_wordvectors()
        self.word_vecs = {}
        self.numtopics = num_topics
        # NOTE(review): vocab is assigned twice; this set([]) wins.
        self.vocab = set([])
        # topic_id -> dict of per-topic posterior parameters
        self.topic_params = defaultdict(dict)
        self.wordvecFP = word_vector_filepath
        self.word_index = {}
        self.word_vec_size = None
        # Symmetric Dirichlet hyperparameter, the common 50/K heuristic.
        self.alpha = 50./self.numtopics

    def process_corpus(self, documents):
        """
        Takes a list of documents, and processes them
        sets vocab
        returns: None
        """
        # Replaces self.corpus with {doc_index: [tokens]} and fills self.vocab.
        temp_corpus = {}
        for index, doc in enumerate(documents):
            words = doc.split()
            temp_corpus[index] = words
            for word in words:
                self.vocab.add(word)
        self.corpus = temp_corpus
        print "Done processing corpus with {} documents".format(len(documents))

    def process_wordvectors(self, filepath):
        # Loads word2vec-format vectors and keeps those covering self.vocab
        # in self.word_vecs; words without a vector are silently skipped.
        print "Processing word-vectors, this takes a moment"

        vectors = gensim.models.Word2Vec.load_word2vec_format(fname=filepath, binary=False)
        useable_vocab = 0
        unusable_vocab = 0
        self.word_vec_size = vectors.vector_size

        # First pass only counts coverage for the report below.
        for word in self.vocab:
            try:
                vectors[word]
                useable_vocab += 1
            except KeyError: unusable_vocab += 1

        print "There are {0} words that could be convereted to word vectors in your corpus \n" \
              "There are {1} words that could NOT be converted to word vectors".format(useable_vocab, unusable_vocab)

        # self.word_vecs = np.zeros((useable_vocab, vectors.vector_size))
        index = 0
        for word in self.vocab:
            try:
                self.word_vecs[word] = vectors[word]
                index += 1
            except KeyError: continue
        print "Word-vectors for the corpus are created"

    def fit(self, iterations=1, init=True): #set hyperparams here?
        # Runs `iterations` Gibbs sweeps; init=True (re)initializes state first.
        if init == True:
            self.init()
            init = False

        print "Starting fit"
        for i in xrange(iterations):
            self.sample()
            print "{} iterations complete".format(i)

    def init(self):
        # Builds corpus/vectors/priors, randomly assigns each word type to a
        # topic, fills the doc-topic count table, and initializes topic params.
        self.process_corpus(self.corpus)
        self.process_wordvectors(self.wordvecFP)
        #setting wishhart priors
        self.priors = Wishart(self.word_vecs)
        self.doc_topic_CT = np.zeros((len(self.corpus.keys()), self.numtopics))

        self.word_topics = {word: random.choice(range(self.numtopics)) for word in self.vocab}
        # get Doc-Topic Counts
        for docID, doc in self.corpus.iteritems():
            for word in doc:
                topicID = self.word_topics[word]
                self.doc_topic_CT[docID, topicID] += 1

        # Init parameters for topic distributions
        for k in range(self.numtopics):
            self.recalculate_topic_params(k)

        print "Intialization complete"
    def sample(self, init=True):
        # One Gibbs sweep over every token.  NOTE(review): visibly unfinished —
        # see the flags below; the comment near the end says the normalization
        # scheme still needs to be ported from Util.sample.
        print "sampling started"
        # Randomly assign word to topics
        if init == False:
            self.word_topics = {word: random.choice(range(self.numtopics)) for word in self.vocab}

        for docID, doc in self.corpus.iteritems():
            for word in doc:
                #subtracting info about current word-topic assignment from doc-topic count table
                topic_id = self.word_topics[word]
                # NOTE(review): this expression's result is discarded — it was
                # almost certainly meant to be `-= doc.count(word)`, so the
                # count table is never actually decremented.
                self.doc_topic_CT[docID, topic_id] - doc.count(word)

                self.recalculate_topic_params(topic_id)
                posterior = []
                # NOTE(review): `max` shadows the builtin.
                max = 0
                for k in range(self.numtopics): #start getting the pdf's for each word-topic assignment
                    log_prob = self.draw_new_wt_assgns(word, k)
                    # Count of topic in doc
                    Nkd = self.doc_topic_CT[docID, k]
                    # NOTE(review): multiplying two log terms is suspect — the
                    # collapsed-Gibbs posterior is a product of densities, i.e.
                    # a SUM of logs: log(Nkd + alpha) + log_prob.  Confirm.
                    log_posterior = log(Nkd + self.alpha) * log_prob
                    posterior.append(log_posterior)
                    #doing this for some normalization scheme
                    if log_posterior > max: max = log_posterior

                # NOTE(review): normalized_posterior is computed but unused,
                # and multinomial below is fed the raw (log-domain,
                # unnormalized) values — pvals must be probabilities summing
                # to 1, so this will misbehave or raise.
                normalized_posterior = [exp(i-max) for i in posterior]
                print np.sum(posterior)
                print np.random.multinomial(1, pvals=posterior)
        ## need to copy the normalization scheme from Util.sample
        init = False
        return None

    def draw_new_wt_assgns(self, word, topic_id):
        # Log-density of `word`'s vector under topic `topic_id`'s posterior
        # predictive multivariate Student-t distribution.
        # Getting params for calculating PDF of T-Dist for a word
        wordvec = self.word_vecs[word]
        inv_cov = self.topic_params[topic_id]["Inverse Covariance"]
        cov_det = self.topic_params[topic_id]["Covariance Determinant"]
        Nk = self.topic_params[topic_id]["Topic Count"]
        KappaK = self.topic_params[topic_id]["Topic Kappa"]
        # NOTE(review): centered against the PRIOR mean, not the topic's own
        # posterior mean ("Topic Mean") — confirm against the reference
        # implementation quoted below.
        centered = self.word_vecs[word] - self.priors.mu
        topic_cov = self.topic_params[topic_id]["Topic Covariance"]


        # Precalculating some terms (V_di - Mu)^T * Cov^-1 * (V_di - Mu)
        LLcomp = centered.T.dot(inv_cov).dot(centered)
        d = self.word_vec_size
        nu = self.priors.nu + Nk - d + 1

        # NOTE(review): `gammaln(nu + d / 2)` differs from the Java reference
        # below, which computes logGamma((nu + D)/2) — missing parentheses?
        log_prop = gammaln(nu + d / 2) - \
                   (gammaln(nu / 2) + d/2 * (log(nu) + log(pi)) +0.5 * log(cov_det) + (nu + d) / 2 * log(1 + LLcomp / nu))

        return log_prop
        # logprob = Gamma.logGamma((nu + Data.D)/2) - \
        # (Gamma.logGamma(nu/2) + Data.D/2 * (Math.log(nu)+Math.log(Math.PI)) + 0.5 * Math.log(det) + (nu + Data.D)/2* Math.log(1+val/nu))

    def recalculate_topic_params(self, topic_id):
        # Recomputes the Normal-Inverse-Wishart posterior parameters for one
        # topic from the current assignments and caches them in topic_params.
        topic_count = np.sum(self.doc_topic_CT[:, topic_id], axis=0) # N_k

        kappa_k = self.priors.kappa + topic_count # K_k
        nu_k = self.priors.nu + topic_count # V_k

        scaled_topic_mean_K, scaled_topic_cov_K = self.get_scaled_topic_mean_cov(topic_id) # V-Bar_k and C_k

        vk_mu = scaled_topic_mean_K - self.priors.mu #V-bar_k - Mu
        print self.priors.psi
        # NOTE(review): vk_mu.T.dot(vk_mu) on a 1-D vector is a scalar inner
        # product, not the DxD outer product the NIW update calls for
        # (np.outer(vk_mu, vk_mu)) — confirm.
        psi_k = self.priors.psi + scaled_topic_cov_K + ((self.priors.kappa * topic_count) / kappa_k) * (vk_mu.T.dot(vk_mu)) # Psi_k

        topic_mean = (self.priors.kappa * self.priors.mu + topic_count * scaled_topic_mean_K) / kappa_k # Mu_k
        topic_cov = psi_k / (nu_k - self.word_vec_size + 1) # Sigma_k

        self.topic_params[topic_id]["Topic Count"] = topic_count
        self.topic_params[topic_id]["Topic Kappa"] = kappa_k
        self.topic_params[topic_id]["Topic Nu"] = nu_k
        self.topic_params[topic_id]["Topic Mean"], self.topic_params[topic_id]["Topic Covariance"] = topic_mean, topic_cov
        self.topic_params[topic_id]["Inverse Covariance"] = np.linalg.inv(topic_cov)
        self.topic_params[topic_id]["Covariance Determinant"] = np.linalg.det(topic_cov)
        self.topic_params[topic_id]["Liklihood Componant"] = None


        return topic_mean, topic_cov

    def get_scaled_topic_mean_cov(self, topic_id):
        'mean of word vecs in a topic'
        # Returns (mean, scatter) of the embeddings of all word types
        # currently assigned to topic_id.
        # get words assigned to topic_id
        word_vecs = []
        for word, topic in self.word_topics.iteritems():
            if topic == topic_id:
                word_vecs.append(self.word_vecs[word])
        print word_vecs
        # NOTE(review): raises ValueError if no word is assigned to the topic.
        word_vecs = np.vstack(word_vecs)
        print word_vecs.shape
        print np.sum(word_vecs, axis=0)
        print np.sum(self.doc_topic_CT[:, topic_id], axis=0)
        # NOTE(review): the denominator is the token-level topic count (+2)
        # while the numerator sums one vector per word TYPE — these disagree
        # when a word occurs more than once.
        mean = np.sum(word_vecs, axis=0) / (np.sum(self.doc_topic_CT[:, topic_id], axis=0) + 2) #added a small number here to stop overflow

        # mean_centered = np.sum(word_vecs, axis=0) - mean
        mean_centered = word_vecs - mean
        print self.doc_topic_CT
        print mean
        print mean_centered

        cov = mean_centered.T.dot(mean_centered)
        return mean, cov
229+
230+
if __name__ == "__main__":
    # Smoke test: two tiny four-word documents, two topics, two Gibbs sweeps.
    # NOTE(review): hard-coded absolute path to GloVe vectors — this only
    # runs on the original author's machine.
    corpus = ["apple orange mango melon", "dog cat bird rat"]
    wordvec_fileapth = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
    g = Gauss_LDA(2, corpus, wordvec_fileapth )
    g.fit(2)
    # print g.topic_params[1]["Topic Count"]

turftopic/models/ctm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ def transform(
193193
Document-topic matrix.
194194
"""
195195
if embeddings is None:
196-
embeddings = self.encoder_.encode(raw_documents)
196+
embeddings = self.encode_documents(raw_documents)
197197
if self.combined:
198198
bow = self.vectorizer.fit_transform(raw_documents)
199199
contextual_embeddings = np.concatenate(
@@ -219,7 +219,7 @@ def fit_transform(
219219
with console.status("Fitting model") as status:
220220
if embeddings is None:
221221
status.update("Encoding documents")
222-
embeddings = self.encoder_.encode(raw_documents)
222+
embeddings = self.encode_documents(raw_documents)
223223
console.log("Documents encoded.")
224224
status.update("Extracting terms.")
225225
document_term_matrix = self.vectorizer.fit_transform(raw_documents)

turftopic/models/cvp.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def __init__(
6262
self.classes_ = np.array([name for name in self._seeds])
6363
self.concept_matrix_ = []
6464
for _, (positive, negative) in self._seeds.items():
65-
positive_emb = self.encoder_.encode(positive)
66-
negative_emb = self.encoder_.encode(negative)
65+
positive_emb = self.encoder_.encode(list(positive))
66+
negative_emb = self.encoder_.encode(list(negative))
6767
cv = np.mean(positive_emb, axis=0) - np.mean(negative_emb, axis=0)
6868
self.concept_matrix_.append(cv / np.linalg.norm(cv))
6969
self.concept_matrix_ = np.stack(self.concept_matrix_)
@@ -92,7 +92,7 @@ def fit_transform(self, raw_documents=None, y=None, embeddings=None):
9292
"Either embeddings or raw_documents has to be passed, both are None."
9393
)
9494
if embeddings is None:
95-
embeddings = self.encoder_.encode(raw_documents)
95+
embeddings = self.encoder_.encode(list(raw_documents))
9696
return embeddings @ self.concept_matrix_.T
9797

9898
def transform(self, raw_documents=None, embeddings=None):

0 commit comments

Comments
 (0)