sbcsgroupprober.py 4.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
# Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
  28. from .charsetgroupprober import CharSetGroupProber
  29. from .hebrewprober import HebrewProber
  30. from .langbulgarianmodel import ISO_8859_5_BULGARIAN_MODEL, WINDOWS_1251_BULGARIAN_MODEL
  31. from .langgreekmodel import ISO_8859_7_GREEK_MODEL, WINDOWS_1253_GREEK_MODEL
  32. from .langhebrewmodel import WINDOWS_1255_HEBREW_MODEL
  33. # from .langhungarianmodel import (ISO_8859_2_HUNGARIAN_MODEL,
  34. # WINDOWS_1250_HUNGARIAN_MODEL)
  35. from .langrussianmodel import (
  36. IBM855_RUSSIAN_MODEL,
  37. IBM866_RUSSIAN_MODEL,
  38. ISO_8859_5_RUSSIAN_MODEL,
  39. KOI8_R_RUSSIAN_MODEL,
  40. MACCYRILLIC_RUSSIAN_MODEL,
  41. WINDOWS_1251_RUSSIAN_MODEL,
  42. )
  43. from .langthaimodel import TIS_620_THAI_MODEL
  44. from .langturkishmodel import ISO_8859_9_TURKISH_MODEL
  45. from .sbcharsetprober import SingleByteCharSetProber
  46. class SBCSGroupProber(CharSetGroupProber):
  47. def __init__(self) -> None:
  48. super().__init__()
  49. hebrew_prober = HebrewProber()
  50. logical_hebrew_prober = SingleByteCharSetProber(
  51. WINDOWS_1255_HEBREW_MODEL, is_reversed=False, name_prober=hebrew_prober
  52. )
  53. # TODO: See if using ISO-8859-8 Hebrew model works better here, since
  54. # it's actually the visual one
  55. visual_hebrew_prober = SingleByteCharSetProber(
  56. WINDOWS_1255_HEBREW_MODEL, is_reversed=True, name_prober=hebrew_prober
  57. )
  58. hebrew_prober.set_model_probers(logical_hebrew_prober, visual_hebrew_prober)
  59. # TODO: ORDER MATTERS HERE. I changed the order vs what was in master
  60. # and several tests failed that did not before. Some thought
  61. # should be put into the ordering, and we should consider making
  62. # order not matter here, because that is very counter-intuitive.
  63. self.probers = [
  64. SingleByteCharSetProber(WINDOWS_1251_RUSSIAN_MODEL),
  65. SingleByteCharSetProber(KOI8_R_RUSSIAN_MODEL),
  66. SingleByteCharSetProber(ISO_8859_5_RUSSIAN_MODEL),
  67. SingleByteCharSetProber(MACCYRILLIC_RUSSIAN_MODEL),
  68. SingleByteCharSetProber(IBM866_RUSSIAN_MODEL),
  69. SingleByteCharSetProber(IBM855_RUSSIAN_MODEL),
  70. SingleByteCharSetProber(ISO_8859_7_GREEK_MODEL),
  71. SingleByteCharSetProber(WINDOWS_1253_GREEK_MODEL),
  72. SingleByteCharSetProber(ISO_8859_5_BULGARIAN_MODEL),
  73. SingleByteCharSetProber(WINDOWS_1251_BULGARIAN_MODEL),
  74. # TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
  75. # after we retrain model.
  76. # SingleByteCharSetProber(ISO_8859_2_HUNGARIAN_MODEL),
  77. # SingleByteCharSetProber(WINDOWS_1250_HUNGARIAN_MODEL),
  78. SingleByteCharSetProber(TIS_620_THAI_MODEL),
  79. SingleByteCharSetProber(ISO_8859_9_TURKISH_MODEL),
  80. hebrew_prober,
  81. logical_hebrew_prober,
  82. visual_hebrew_prober,
  83. ]
  84. self.reset()