1234567891011121314151617181920212223 |
- from spacy.attrs import ORTH
- import spacy
- import unicodedata
class CustomTokenizer:
    """Tokenize Russian text with spaCy's ``ru_core_news_sm`` pipeline.

    Input is lower-cased, stripped of surrounding whitespace and cleaned of
    combining marks before it is handed to the spaCy pipeline. The literal
    markers ``<sos>`` and ``<eos>`` are registered as tokenizer special
    cases so they are not split into several tokens.
    """

    # NFD splits "й"/"Й" into "и"/"И" + U+0306 (combining breve). The breve
    # is category ``Mn`` and would be dropped by the mark filter, silently
    # rewriting "й" to the different letter "и" (e.g. "мой" -> "мои").
    # These pairs map the decomposed spelling back to the precomposed letter
    # so it survives the filter.
    _SHORT_I_RECOMPOSE = (
        ("\u0438\u0306", "\u0439"),  # и + breve -> й
        ("\u0418\u0306", "\u0419"),  # И + breve -> Й
    )

    def __init__(self):
        """Load the spaCy model and register the sequence-marker tokens."""
        nlp1 = spacy.load(
            "ru_core_news_sm",
            disable=[
                "ner",
                "attribute_ruler",
                "senter",
                "parser",
                "morphologizer",
                "tok2vec",
            ],
        )
        # Register the markers as special cases so the tokenizer emits each
        # one as a single token instead of splitting on "<" and ">".
        nlp1.tokenizer.add_special_case('<sos>', [{ORTH: "<sos>"}])
        nlp1.tokenizer.add_special_case('<eos>', [{ORTH: "<eos>"}])
        self.nlp1 = nlp1

    def tokenize_rus(self, s):
        """Normalize *s* and run it through the spaCy pipeline.

        Args:
            s: Raw input text.

        Returns:
            The result of calling ``self.nlp1`` on the lower-cased, stripped
            and mark-free text (a spaCy ``Doc`` for a loaded pipeline).
        """
        s = self.unicode_toAscii(s.lower().strip())
        return self.nlp1(s)

    def unicode_toAscii(self, s):
        """Return *s* with combining marks (Unicode category ``Mn``) removed.

        The text is NFD-decomposed so accents become separate code points,
        then every ``Mn`` code point is dropped ("café" -> "cafe",
        "ё" -> "е", stress accents are removed). The Cyrillic letter
        "й"/"Й" is kept intact: it is a distinct letter, not an accented
        "и", so folding it would change the word.

        Despite the name, the result is not restricted to ASCII; non-Latin
        base letters are passed through unchanged.

        Args:
            s: Text to clean.

        Returns:
            The cleaned string.
        """
        decomposed = unicodedata.normalize('NFD', s)
        # Shield й/Й before filtering: the precomposed letter is not ``Mn``.
        for split_form, letter in self._SHORT_I_RECOMPOSE:
            decomposed = decomposed.replace(split_form, letter)
        return ''.join(
            c for c in decomposed
            if unicodedata.category(c) != 'Mn'
        )
|