|
| 1 | +from __future__ import division |
| 2 | + |
| 3 | +import random |
| 4 | +from collections import defaultdict |
| 5 | + |
| 6 | +import gensim |
| 7 | +import numpy as np |
| 8 | +from numpy import exp, linalg, log, pi |
| 9 | +from scipy.special import gamma, gammaln |
| 10 | + |
| 11 | + |
class Wishart(object):
    """Normal-Inverse-Wishart prior over the topic Gaussians.

    Hyperparameters (Murphy's notation):
        nu    -- degrees of freedom, set to the embedding dimensionality
        kappa -- prior pseudo-count on the mean
        psi   -- (d, d) scale matrix of the Inverse-Wishart
        mu    -- prior mean, the centroid of all word vectors
    """

    def __init__(self, word_vecs):
        """word_vecs: dict mapping word -> 1-d numpy embedding vector."""
        self.nu = None
        self.kappa = None
        self.psi = None
        self.set_params(word_vecs)

    def set_params(self, word_vecs):
        """Derive the prior hyperparameters from the corpus word vectors."""
        # Stack the dict of word vectors into an (n_words, d) matrix.
        # list(...) keeps this working when .values() is a view (Python 3).
        word_vecs = np.vstack(list(word_vecs.values()))

        self.nu = word_vecs.shape[1]  # d: length of each embedding
        self.kappa = 0.01
        # BUG FIX: psi must be the (d, d) scatter matrix. The old code
        # applied np.sum(..., axis=0) on top of X^T X, collapsing the
        # scale matrix into a 1-d vector (the original comment already
        # questioned this line).
        self.psi = word_vecs.T.dot(word_vecs)
        self.mu = np.mean(word_vecs, axis=0)
        print("psi shape {}".format(self.psi.shape))
| 30 | + |
| 31 | + |
| 32 | + |
class Gauss_LDA(object):
    """Collapsed Gibbs sampler for Gaussian LDA.

    Topics are multivariate Gaussians over word-embedding space with a
    Normal-Inverse-Wishart prior (see the Wishart class); documents mix
    topics under a symmetric Dirichlet(alpha) prior.
    """

    def __init__(self, num_topics, corpus, word_vector_filepath):
        self.doc_topic_CT = None              # (n_docs, n_topics) count table, built in init()
        self.word_topics = {}                 # word -> current topic id
        self.corpus = corpus                  # raw documents until process_corpus() replaces them
        self.priors = None                    # Wishart prior, built in init()
        self.word_vecs = {}                   # word -> embedding vector
        self.numtopics = num_topics
        self.vocab = set()
        self.topic_params = defaultdict(dict) # topic id -> dict of posterior parameters
        self.wordvecFP = word_vector_filepath
        self.word_index = {}
        self.word_vec_size = None             # embedding dimensionality, set from the vector file
        self.alpha = 50. / self.numtopics     # standard LDA heuristic for the Dirichlet prior

    def process_corpus(self, documents):
        """
        Takes a list of documents, and processes them.
        Sets self.corpus to {doc_index: [tokens]} and fills self.vocab.
        returns: None
        """
        temp_corpus = {}
        for index, doc in enumerate(documents):
            words = doc.split()
            temp_corpus[index] = words
            self.vocab.update(words)
        self.corpus = temp_corpus
        print("Done processing corpus with {} documents".format(len(documents)))

    def process_wordvectors(self, filepath):
        """Load word2vec-format vectors and keep those covering self.vocab."""
        print("Processing word-vectors, this takes a moment")
        # NOTE(review): load_word2vec_format on Word2Vec is the legacy gensim
        # API; newer gensim exposes it on KeyedVectors -- confirm the
        # installed version supports this call.
        vectors = gensim.models.Word2Vec.load_word2vec_format(fname=filepath, binary=False)
        self.word_vec_size = vectors.vector_size

        for word in self.vocab:
            try:
                self.word_vecs[word] = vectors[word]
            except KeyError:
                continue

        useable_vocab = len(self.word_vecs)
        unusable_vocab = len(self.vocab) - useable_vocab
        print("There are {0} words that could be convereted to word vectors in your corpus \n"
              "There are {1} words that could NOT be converted to word vectors".format(useable_vocab, unusable_vocab))
        print("Word-vectors for the corpus are created")

    def fit(self, iterations=1, init=True):
        """Run `iterations` Gibbs sweeps; init=True (re)builds all state first."""
        if init:
            self.init()
        print("Starting fit")
        for i in range(iterations):
            self.sample()
            print("{} iterations complete".format(i))

    def init(self):
        """Build corpus/vector state, priors, random assignments and counts."""
        self.process_corpus(self.corpus)
        self.process_wordvectors(self.wordvecFP)
        # Normal-Inverse-Wishart prior derived from the corpus vectors.
        self.priors = Wishart(self.word_vecs)
        self.doc_topic_CT = np.zeros((len(self.corpus), self.numtopics))

        # Random initial topic for every word type.
        self.word_topics = {word: random.randrange(self.numtopics) for word in self.vocab}
        # Doc-topic counts from those assignments.
        for docID, doc in self.corpus.items():
            for word in doc:
                self.doc_topic_CT[docID, self.word_topics[word]] += 1

        # Initial posterior parameters for every topic.
        for k in range(self.numtopics):
            self.recalculate_topic_params(k)
        print("Intialization complete")

    def sample(self, init=True):
        """One collapsed Gibbs sweep: resample the topic of every word token."""
        print("sampling started")
        for docID, doc in self.corpus.items():
            for word in doc:
                if word not in self.word_vecs:
                    continue  # no embedding for this word; skip (was a KeyError)
                # Remove this word's contribution from its current topic.
                topic_id = self.word_topics[word]
                # BUG FIX: the original used `-` (result discarded) so the
                # count table was never actually decremented.
                self.doc_topic_CT[docID, topic_id] -= doc.count(word)
                self.recalculate_topic_params(topic_id)

                # Log posterior for each candidate topic:
                # log p(z=k) = log(N_kd + alpha) + log t-density(v_word | topic k)
                log_posteriors = []
                for k in range(self.numtopics):
                    log_prob = self.draw_new_wt_assgns(word, k)
                    Nkd = self.doc_topic_CT[docID, k]  # count of topic k in doc
                    # BUG FIX: log terms add; the original multiplied them.
                    log_posteriors.append(log(Nkd + self.alpha) + log_prob)

                # Exp-normalize (max-shift avoids overflow) and draw a topic.
                max_lp = max(log_posteriors)
                unnorm = [exp(lp - max_lp) for lp in log_posteriors]
                total = sum(unnorm)
                pvals = [p / total for p in unnorm]
                # BUG FIX: the original passed the raw (unnormalized, possibly
                # negative) log posteriors as pvals and discarded the draw.
                new_topic = int(np.random.multinomial(1, pvals=pvals).argmax())

                # Commit the new assignment and restore the counts.
                self.word_topics[word] = new_topic
                self.doc_topic_CT[docID, new_topic] += doc.count(word)
                self.recalculate_topic_params(new_topic)
        return None

    def draw_new_wt_assgns(self, word, topic_id):
        """Log density of `word`'s vector under topic_id's Student-t posterior predictive."""
        inv_cov = self.topic_params[topic_id]["Inverse Covariance"]
        cov_det = self.topic_params[topic_id]["Covariance Determinant"]
        Nk = self.topic_params[topic_id]["Topic Count"]
        centered = self.word_vecs[word] - self.priors.mu

        # (V_di - Mu)^T * Cov^-1 * (V_di - Mu)
        LLcomp = centered.T.dot(inv_cov).dot(centered)
        d = self.word_vec_size
        nu = self.priors.nu + Nk - d + 1  # t-distribution degrees of freedom

        # BUG FIX: leading term is gammaln((nu + d) / 2); the original's
        # gammaln(nu + d / 2) mis-parenthesized it (compare the reference
        # formula: logGamma((nu + D)/2) - (...)).
        log_prop = gammaln((nu + d) / 2.) - \
            (gammaln(nu / 2.) + d / 2. * (log(nu) + log(pi))
             + 0.5 * log(cov_det) + (nu + d) / 2. * log(1 + LLcomp / nu))
        return log_prop

    def recalculate_topic_params(self, topic_id):
        """Recompute the NIW posterior parameters for one topic; returns (mean, cov)."""
        topic_count = np.sum(self.doc_topic_CT[:, topic_id])  # N_k

        kappa_k = self.priors.kappa + topic_count  # K_k
        nu_k = self.priors.nu + topic_count        # V_k

        scaled_topic_mean_K, scaled_topic_cov_K = self.get_scaled_topic_mean_cov(topic_id)  # V-Bar_k and C_k

        vk_mu = scaled_topic_mean_K - self.priors.mu  # V-bar_k - Mu
        # BUG FIX: Psi_k needs the (d, d) outer product (vbar-mu)(vbar-mu)^T;
        # vk_mu.T.dot(vk_mu) on 1-d arrays is the inner product (a scalar).
        psi_k = self.priors.psi + scaled_topic_cov_K + \
            ((self.priors.kappa * topic_count) / kappa_k) * np.outer(vk_mu, vk_mu)  # Psi_k

        topic_mean = (self.priors.kappa * self.priors.mu + topic_count * scaled_topic_mean_K) / kappa_k  # Mu_k
        topic_cov = psi_k / (nu_k - self.word_vec_size + 1)  # Sigma_k

        params = self.topic_params[topic_id]
        params["Topic Count"] = topic_count
        params["Topic Kappa"] = kappa_k
        params["Topic Nu"] = nu_k
        params["Topic Mean"] = topic_mean
        params["Topic Covariance"] = topic_cov
        params["Inverse Covariance"] = np.linalg.inv(topic_cov)
        params["Covariance Determinant"] = np.linalg.det(topic_cov)
        params["Liklihood Componant"] = None  # reserved; key spelling kept for compatibility
        return topic_mean, topic_cov

    def get_scaled_topic_mean_cov(self, topic_id):
        """Mean and scatter matrix of the word vectors currently assigned to topic_id."""
        assigned = [self.word_vecs[w] for w, t in self.word_topics.items()
                    if t == topic_id and w in self.word_vecs]  # skip OOV words (was a KeyError)
        if not assigned:
            # Empty topic: zero mean/scatter keeps the caller's formulas valid
            # (np.vstack([]) would raise).
            d = self.word_vec_size
            return np.zeros(d), np.zeros((d, d))

        word_vecs = np.vstack(assigned)
        # +2 smoothing kept from the original "to stop overflow" (divide-by-zero).
        denom = np.sum(self.doc_topic_CT[:, topic_id]) + 2
        mean = np.sum(word_vecs, axis=0) / denom

        mean_centered = word_vecs - mean
        cov = mean_centered.T.dot(mean_centered)
        return mean, cov
| 229 | + |
if __name__ == "__main__":
    # Tiny smoke test: two documents with two obvious topics (fruit / animals).
    demo_docs = ["apple orange mango melon", "dog cat bird rat"]
    vector_path = "/Users/michael/Documents/Gaussian_LDA-master/data/glove.wiki/glove.6B.50d.txt"
    model = Gauss_LDA(2, demo_docs, vector_path)
    model.fit(2)
0 commit comments