# tokenizer.py
from spacy.attrs import ORTH
import spacy
import unicodedata
  4. class CustomTokenizer:
  5. def __init__(self):
  6. nlp1 = spacy.load("ru_core_news_sm", disable=["ner", "attribute_ruler", "senter", "parser", "morphologizer", "tok2vec"])
  7. nlp1.tokenizer.add_special_case('<sos>', [{ORTH: "<sos>"}])
  8. nlp1.tokenizer.add_special_case('<eos>', [{ORTH: "<eos>"}])
  9. self.nlp1 = nlp1
  10. def tokenize_rus(self, s):
  11. s = self.unicode_toAscii(s.lower().strip())
  12. s = self.nlp1(s)
  13. return s
  14. def unicode_toAscii(self, s):
  15. return ''.join(
  16. c for c in unicodedata.normalize('NFD', s)
  17. if unicodedata.category(c) != 'Mn'
  18. )