# charsetgroupprober.py
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Communicator client code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from typing import List, Optional, Union

from .charsetprober import CharSetProber
from .enums import LanguageFilter, ProbingState

  30. class CharSetGroupProber(CharSetProber):
  31. def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None:
  32. super().__init__(lang_filter=lang_filter)
  33. self._active_num = 0
  34. self.probers: List[CharSetProber] = []
  35. self._best_guess_prober: Optional[CharSetProber] = None
  36. def reset(self) -> None:
  37. super().reset()
  38. self._active_num = 0
  39. for prober in self.probers:
  40. prober.reset()
  41. prober.active = True
  42. self._active_num += 1
  43. self._best_guess_prober = None
  44. @property
  45. def charset_name(self) -> Optional[str]:
  46. if not self._best_guess_prober:
  47. self.get_confidence()
  48. if not self._best_guess_prober:
  49. return None
  50. return self._best_guess_prober.charset_name
  51. @property
  52. def language(self) -> Optional[str]:
  53. if not self._best_guess_prober:
  54. self.get_confidence()
  55. if not self._best_guess_prober:
  56. return None
  57. return self._best_guess_prober.language
  58. def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState:
  59. for prober in self.probers:
  60. if not prober.active:
  61. continue
  62. state = prober.feed(byte_str)
  63. if not state:
  64. continue
  65. if state == ProbingState.FOUND_IT:
  66. self._best_guess_prober = prober
  67. self._state = ProbingState.FOUND_IT
  68. return self.state
  69. if state == ProbingState.NOT_ME:
  70. prober.active = False
  71. self._active_num -= 1
  72. if self._active_num <= 0:
  73. self._state = ProbingState.NOT_ME
  74. return self.state
  75. return self.state
  76. def get_confidence(self) -> float:
  77. state = self.state
  78. if state == ProbingState.FOUND_IT:
  79. return 0.99
  80. if state == ProbingState.NOT_ME:
  81. return 0.01
  82. best_conf = 0.0
  83. self._best_guess_prober = None
  84. for prober in self.probers:
  85. if not prober.active:
  86. self.logger.debug("%s not active", prober.charset_name)
  87. continue
  88. conf = prober.get_confidence()
  89. self.logger.debug(
  90. "%s %s confidence = %s", prober.charset_name, prober.language, conf
  91. )
  92. if best_conf < conf:
  93. best_conf = conf
  94. self._best_guess_prober = prober
  95. if not self._best_guess_prober:
  96. return 0.0
  97. return best_conf