import unicodedata

import spacy
from spacy.attrs import ORTH


class CustomTokenizer:
    """Tokenize text with the spaCy ``ru_core_news_sm`` pipeline.

    The pipeline is loaded once per instance with every component listed in
    ``_DISABLED_COMPONENTS`` switched off, and is stored on ``self.nlp1``.
    """

    # Pipeline components passed to ``spacy.load(..., disable=...)``.
    _DISABLED_COMPONENTS = (
        "ner",
        "attribute_ruler",
        "senter",
        "parser",
        "morphologizer",
        "tok2vec",
    )

    def __init__(self):
        """Load the pipeline and register the tokenizer special cases."""
        pipeline = spacy.load(
            "ru_core_news_sm",
            disable=list(self._DISABLED_COMPONENTS),
        )

        # NOTE(review): both registrations use an empty string as the key and
        # as the ORTH value. Possibly these were placeholder tokens whose text
        # was lost before this file was saved -- confirm the intended values.
        pipeline.tokenizer.add_special_case('', [{ORTH: ""}])
        pipeline.tokenizer.add_special_case('', [{ORTH: ""}])

        # The loaded pipeline, used by ``tokenize_rus``.
        self.nlp1 = pipeline

    def tokenize_rus(self, s):
        """Normalize ``s`` and run it through the loaded pipeline.

        The text is lowercased, stripped of leading/trailing whitespace and
        passed through ``unicode_toAscii`` before being handed to
        ``self.nlp1``.

        Args:
            s: Input text.

        Returns:
            Whatever ``self.nlp1`` returns for the normalized text (a spaCy
            ``Doc`` for a standard pipeline).
        """
        cleaned = s.lower().strip()
        normalized = self.unicode_toAscii(cleaned)
        return self.nlp1(normalized)

    def unicode_toAscii(self, s):
        """Return ``s`` with all combining marks removed.

        The string is decomposed with NFD and every code point whose Unicode
        category is ``Mn`` (nonspacing mark) is dropped. Despite the name, the
        result is not restricted to ASCII: base characters are kept as they
        are.

        Note that this also affects Cyrillic letters with a canonical
        decomposition: "й" becomes "и" and "ё" becomes "е".

        Args:
            s: Input text.

        Returns:
            The decomposed text without nonspacing marks.
        """
        decomposed = unicodedata.normalize('NFD', s)
        base_chars = [
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        ]
        return ''.join(base_chars)