
Commit e5e6500

Added TokenCountVectorizer
1 parent: 8d617de

1 file changed

Lines changed: 77 additions & 0 deletions


turftopic/vectorizers/spacy.py

@@ -153,3 +153,80 @@ def lemma_tokenize(self, text: str) -> list[str]:

    def build_tokenizer(self):
        return self.lemma_tokenize


class TokenCountVectorizer(CountVectorizer):
    """Tokenizes text with SpaCy using its language-specific tokenization rules and stop-word lists.

    Parameters
    ----------
    language_code: str, default "en"
        Language code for the language you intend to use.
    remove_stop_words: bool, default True
        Indicates whether stop words should be removed.
    remove_nonalpha: bool, default True
        Indicates whether only tokens containing alphabetical characters should be kept.
    """

    def __init__(
        self,
        language_code: str = "en",
        remove_stop_words: bool = True,
        remove_nonalpha: bool = True,
        *,
        input="content",
        encoding="utf-8",
        decode_error="strict",
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        analyzer="word",
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
    ):
        self.language_code = language_code
        self.remove_stop_words = remove_stop_words
        self.remove_nonalpha = remove_nonalpha
        super().__init__(
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            stop_words=stop_words,
            token_pattern=token_pattern,
            ngram_range=ngram_range,
            analyzer=analyzer,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
        )

    def build_tokenizer(self):
        nlp = spacy.blank(self.language_code)

        def tokenize(text: str) -> list[str]:
            doc = nlp(text)
            result = []
            for tok in doc:
                if self.remove_stop_words and tok.is_stop:
                    continue
                if self.remove_nonalpha and not tok.is_alpha:
                    continue
                result.append(tok.orth_)
            return result

        return tokenize
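
For context, here is a minimal usage sketch of the new vectorizer (not part of the commit). The import path is assumed from the file path in this diff, and the example documents are made up for illustration; the rest follows from the code above, since the class subclasses scikit-learn's CountVectorizer.

# Hypothetical usage sketch, not taken from the commit itself.
from turftopic.vectorizers.spacy import TokenCountVectorizer  # import path assumed from the file path

docs = [
    "Topic models decompose a corpus into interpretable topics.",
    "This 2nd sentence contains a numeral, which remove_nonalpha=True drops.",
]

# spacy.blank(language_code) builds a tokenizer-only pipeline, so no trained
# spaCy model needs to be downloaded for the vectorizer to work.
vectorizer = TokenCountVectorizer(
    language_code="en",
    remove_stop_words=True,
    remove_nonalpha=True,
)

# Because TokenCountVectorizer subclasses scikit-learn's CountVectorizer,
# the usual fit_transform / get_feature_names_out API applies.
doc_term_matrix = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())

Since the tokenizer comes from spacy.blank, this vectorizer counts surface forms (tok.orth_) and stays lightweight, whereas the lemma-based vectorizer earlier in the file normalizes tokens to their lemmas first.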
