@@ -153,3 +153,80 @@ def lemma_tokenize(self, text: str) -> list[str]:
 
     def build_tokenizer(self):
         return self.lemma_tokenize
+
+
+class TokenCountVectorizer(CountVectorizer):
+    """Tokenizes text with spaCy using its language-specific tokenization rules and stop-word lists.
+
+    Parameters
+    ----------
+    language_code : str, default "en"
+        Language code for the language you intend to use.
+    remove_stop_words : bool, default True
+        Indicates whether stop words should be removed.
+    remove_nonalpha : bool, default True
+        Indicates whether only tokens containing alphabetical characters should be kept.
+    """
+
+    def __init__(
+        self,
+        language_code: str = "en",
+        remove_stop_words: bool = True,
+        remove_nonalpha: bool = True,
+        *,
+        input="content",
+        encoding="utf-8",
+        decode_error="strict",
+        strip_accents=None,
+        lowercase=True,
+        preprocessor=None,
+        tokenizer=None,
+        stop_words=None,
+        token_pattern=r"(?u)\b\w\w+\b",
+        ngram_range=(1, 1),
+        analyzer="word",
+        max_df=1.0,
+        min_df=1,
+        max_features=None,
+        vocabulary=None,
+        binary=False,
+        dtype=np.int64,
+    ):
+        self.language_code = language_code
+        self.remove_stop_words = remove_stop_words
+        self.remove_nonalpha = remove_nonalpha
+        super().__init__(
+            input=input,
+            encoding=encoding,
+            decode_error=decode_error,
+            strip_accents=strip_accents,
+            lowercase=lowercase,
+            preprocessor=preprocessor,
+            tokenizer=tokenizer,
+            stop_words=stop_words,
+            token_pattern=token_pattern,
+            ngram_range=ngram_range,
+            analyzer=analyzer,
+            max_df=max_df,
+            min_df=min_df,
+            max_features=max_features,
+            vocabulary=vocabulary,
+            binary=binary,
+            dtype=dtype,
+        )
+
+    def build_tokenizer(self):
+        nlp = spacy.blank(self.language_code)
+
+        def tokenize(text: str) -> list[str]:
+            doc = nlp(text)
+            result = []
+            for tok in doc:
+                if self.remove_stop_words and tok.is_stop:
+                    continue
+                if self.remove_nonalpha and not tok.is_alpha:
+                    continue
+                result.append(tok.orth_)
+            return result
+
+        return tokenize
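For reference, a minimal usage sketch, not part of the diff above: it assumes the module's existing imports (numpy as np, spacy, and sklearn's CountVectorizer) are in scope and uses two made-up sentences. Because lowercase=True, CountVectorizer lowercases the text before the spaCy tokenizer runs, and stop words and non-alphabetic tokens are dropped by default.

texts = [
    "The cats are sleeping on the warm windowsill.",
    "A dog barked at the cats in the garden!",
]

vectorizer = TokenCountVectorizer(language_code="en")
X = vectorizer.fit_transform(texts)  # sparse document-term count matrix

print(vectorizer.get_feature_names_out())  # e.g. ['barked', 'cats', 'dog', ...]
print(X.toarray())

Note the design choice in build_tokenizer: spacy.blank loads only the language's tokenizer and stop-word defaults, so no trained pipeline package needs to be downloaded, unlike the lemmatizing vectorizer above, which requires a full model.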