Skip to content

Commit 55b5fe5

Browse files
Added PhraseVectorizer
1 parent 124fc01 commit 55b5fe5

1 file changed

Lines changed: 82 additions & 0 deletions

File tree

turftopic/vectorizers/phrases.py

Lines changed: 82 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,82 @@
1+
import numpy as np
2+
from sklearn.base import BaseEstimator, TransformerMixin
3+
from sklearn.feature_extraction.text import CountVectorizer
4+
5+
6+
class PhraseVectorizer(BaseEstimator, TransformerMixin):
    """NPMI-score-based phrase extraction.

    Counts all n-grams up to ``max_ngram`` words with a ``CountVectorizer``,
    keeps every single-word term, and keeps a multi-word term only when the
    normalized pointwise mutual information (NPMI) between its first and
    last word exceeds ``threshold``.

    Parameters
    ----------
    max_ngram : int, default=3
        Longest n-gram (in words) that is counted as a phrase candidate.
    min_df : int or float, default=10
        Passed through to ``CountVectorizer(min_df=...)``.
    max_df : int or float, default=1.0
        Passed through to ``CountVectorizer(max_df=...)``.
    threshold : float, default=0.5
        An n-gram is kept only if its NPMI is strictly greater than this.
    stop_words : default="english"
        Passed through to ``CountVectorizer(stop_words=...)``.
    smoothing : int or float, default=5
        Additive constant applied to every count before it is turned into
        a probability.

    Attributes
    ----------
    vectorizer_ : CountVectorizer
        The fitted underlying vectorizer over all n-grams.
    indices_ : ndarray of int
        Column indices of ``vectorizer_`` that are kept (single words
        first, then the accepted n-grams).
    feature_names_out_ : ndarray
        Terms corresponding to ``indices_``.
    vocabulary_ : dict
        Mapping from kept term to its column in the returned matrix.
    """

    def __init__(
        self,
        max_ngram=3,
        min_df=10,
        max_df=1.0,
        threshold=0.5,
        stop_words="english",
        smoothing=5,
    ):
        self.stop_words = stop_words
        self.threshold = threshold
        self.max_ngram = max_ngram
        self.min_df = min_df
        self.max_df = max_df
        self.smoothing = smoothing
        # Kept so existing readers of this attribute still find it; the
        # value is refreshed from ``max_ngram`` on every fit.
        self.ngram_range = (1, max_ngram)

    def fit_transform(self, raw_documents, y=None):
        """Learn the phrase vocabulary and return the document-term matrix.

        Parameters
        ----------
        raw_documents : iterable of str
            Documents handed to ``CountVectorizer.fit_transform``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        Sparse document-term matrix restricted to the kept columns
        (``indices_``).
        """
        # Derive the range at fit time so that a later
        # ``set_params(max_ngram=...)`` is honoured instead of the value
        # frozen in ``__init__``.
        self.ngram_range = (1, self.max_ngram)
        self.vectorizer_ = CountVectorizer(
            stop_words=self.stop_words,
            min_df=self.min_df,
            max_df=self.max_df,
            ngram_range=self.ngram_range,
        )
        dtm = self.vectorizer_.fit_transform(raw_documents)
        all_vocab = self.vectorizer_.get_feature_names_out()
        # Total occurrences of each term: sum over documents (rows), i.e.
        # axis=0. Summing over axis=1 would give per-document totals, which
        # do not line up with the vocabulary.
        counts = np.ravel(dtm.sum(axis=0))
        token_count = dict(zip(all_vocab, counts))
        word_indices = [
            i
            for word, i in self.vectorizer_.vocabulary_.items()
            if len(word.split()) == 1
        ]
        # Normalizer: total number of single-word occurrences, with the
        # smoothing constant added once per distinct single word.
        n_ws = dtm[:, word_indices].sum() + len(word_indices) * self.smoothing
        ngram_indices = []
        for i, (token, n_w1w2) in enumerate(zip(all_vocab, counts)):
            _words = token.split()
            if len(_words) == 1:
                continue
            # Only the first and last word of the n-gram are scored.
            w1, w2 = _words[0], _words[-1]
            n_w1 = token_count.get(w1)
            n_w2 = token_count.get(w2)
            # Skip n-grams whose boundary words are absent from the fitted
            # vocabulary; no count is available for them.
            if (n_w1 is None) or (n_w2 is None):
                continue
            p_w1w2 = (n_w1w2 + self.smoothing) / n_ws
            p_w1 = (n_w1 + self.smoothing) / n_ws
            p_w2 = (n_w2 + self.smoothing) / n_ws
            pmi = np.log2(p_w1w2 / (p_w1 * p_w2))
            npmi = pmi / (-np.log2(p_w1w2))
            if npmi > self.threshold:
                ngram_indices.append(i)
        # Explicit integer dtype keeps the array usable as a column index
        # even when the selection is empty.
        self.indices_ = np.array(word_indices + ngram_indices, dtype=int)
        self.feature_names_out_ = all_vocab[self.indices_]
        self.vocabulary_ = dict(
            zip(self.feature_names_out_, range(len(self.feature_names_out_)))
        )
        return dtm[:, self.indices_]

    def transform(self, raw_documents):
        """Vectorize documents using the vocabulary selected during fitting.

        Requires a prior call to ``fit`` or ``fit_transform``.
        """
        dtm = self.vectorizer_.transform(raw_documents)
        return dtm[:, self.indices_]

    def fit(self, raw_documents, y=None):
        """Learn the phrase vocabulary; returns ``self``."""
        self.fit_transform(raw_documents, y)
        return self

    def get_feature_names_out(self, input_features=None):
        """Return the kept terms, in output-column order.

        ``input_features`` is accepted and ignored so that callers that
        pass it positionally (for example a scikit-learn ``Pipeline``)
        work.
        """
        return self.feature_names_out_

0 commit comments

Comments
 (0)