# unicode.py (11 KB)
  1. # unicode.py
  2. import sys
  3. from itertools import filterfalse
  4. from typing import List, Tuple, Union
  5. class _lazyclassproperty:
  6. def __init__(self, fn):
  7. self.fn = fn
  8. self.__doc__ = fn.__doc__
  9. self.__name__ = fn.__name__
  10. def __get__(self, obj, cls):
  11. if cls is None:
  12. cls = type(obj)
  13. if not hasattr(cls, "_intern") or any(
  14. cls._intern is getattr(superclass, "_intern", [])
  15. for superclass in cls.__mro__[1:]
  16. ):
  17. cls._intern = {}
  18. attrname = self.fn.__name__
  19. if attrname not in cls._intern:
  20. cls._intern[attrname] = self.fn(cls)
  21. return cls._intern[attrname]
  22. UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
  23. class unicode_set:
  24. """
  25. A set of Unicode characters, for language-specific strings for
  26. ``alphas``, ``nums``, ``alphanums``, and ``printables``.
  27. A unicode_set is defined by a list of ranges in the Unicode character
  28. set, in a class attribute ``_ranges``. Ranges can be specified using
  29. 2-tuples or a 1-tuple, such as::
  30. _ranges = [
  31. (0x0020, 0x007e),
  32. (0x00a0, 0x00ff),
  33. (0x0100,),
  34. ]
  35. Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).
  36. A unicode set can also be defined using multiple inheritance of other unicode sets::
  37. class CJK(Chinese, Japanese, Korean):
  38. pass
  39. """
  40. _ranges: UnicodeRangeList = []
  41. @_lazyclassproperty
  42. def _chars_for_ranges(cls):
  43. ret = []
  44. for cc in cls.__mro__:
  45. if cc is unicode_set:
  46. break
  47. for rr in getattr(cc, "_ranges", ()):
  48. ret.extend(range(rr[0], rr[-1] + 1))
  49. return [chr(c) for c in sorted(set(ret))]
  50. @_lazyclassproperty
  51. def printables(cls):
  52. "all non-whitespace characters in this range"
  53. return "".join(filterfalse(str.isspace, cls._chars_for_ranges))
  54. @_lazyclassproperty
  55. def alphas(cls):
  56. "all alphabetic characters in this range"
  57. return "".join(filter(str.isalpha, cls._chars_for_ranges))
  58. @_lazyclassproperty
  59. def nums(cls):
  60. "all numeric digit characters in this range"
  61. return "".join(filter(str.isdigit, cls._chars_for_ranges))
  62. @_lazyclassproperty
  63. def alphanums(cls):
  64. "all alphanumeric characters in this range"
  65. return cls.alphas + cls.nums
  66. @_lazyclassproperty
  67. def identchars(cls):
  68. "all characters in this range that are valid identifier characters, plus underscore '_'"
  69. return "".join(
  70. sorted(
  71. set(
  72. "".join(filter(str.isidentifier, cls._chars_for_ranges))
  73. + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
  74. + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
  75. + "_"
  76. )
  77. )
  78. )
  79. @_lazyclassproperty
  80. def identbodychars(cls):
  81. """
  82. all characters in this range that are valid identifier body characters,
  83. plus the digits 0-9
  84. """
  85. return "".join(
  86. sorted(
  87. set(
  88. cls.identchars
  89. + "0123456789"
  90. + "".join(
  91. [c for c in cls._chars_for_ranges if ("_" + c).isidentifier()]
  92. )
  93. )
  94. )
  95. )
  96. class pyparsing_unicode(unicode_set):
  97. """
  98. A namespace class for defining common language unicode_sets.
  99. """
  100. # fmt: off
  101. # define ranges in language character sets
  102. _ranges: UnicodeRangeList = [
  103. (0x0020, sys.maxunicode),
  104. ]
  105. class BasicMultilingualPlane(unicode_set):
  106. "Unicode set for the Basic Multilingual Plane"
  107. _ranges: UnicodeRangeList = [
  108. (0x0020, 0xFFFF),
  109. ]
  110. class Latin1(unicode_set):
  111. "Unicode set for Latin-1 Unicode Character Range"
  112. _ranges: UnicodeRangeList = [
  113. (0x0020, 0x007E),
  114. (0x00A0, 0x00FF),
  115. ]
  116. class LatinA(unicode_set):
  117. "Unicode set for Latin-A Unicode Character Range"
  118. _ranges: UnicodeRangeList = [
  119. (0x0100, 0x017F),
  120. ]
  121. class LatinB(unicode_set):
  122. "Unicode set for Latin-B Unicode Character Range"
  123. _ranges: UnicodeRangeList = [
  124. (0x0180, 0x024F),
  125. ]
  126. class Greek(unicode_set):
  127. "Unicode set for Greek Unicode Character Ranges"
  128. _ranges: UnicodeRangeList = [
  129. (0x0342, 0x0345),
  130. (0x0370, 0x0377),
  131. (0x037A, 0x037F),
  132. (0x0384, 0x038A),
  133. (0x038C,),
  134. (0x038E, 0x03A1),
  135. (0x03A3, 0x03E1),
  136. (0x03F0, 0x03FF),
  137. (0x1D26, 0x1D2A),
  138. (0x1D5E,),
  139. (0x1D60,),
  140. (0x1D66, 0x1D6A),
  141. (0x1F00, 0x1F15),
  142. (0x1F18, 0x1F1D),
  143. (0x1F20, 0x1F45),
  144. (0x1F48, 0x1F4D),
  145. (0x1F50, 0x1F57),
  146. (0x1F59,),
  147. (0x1F5B,),
  148. (0x1F5D,),
  149. (0x1F5F, 0x1F7D),
  150. (0x1F80, 0x1FB4),
  151. (0x1FB6, 0x1FC4),
  152. (0x1FC6, 0x1FD3),
  153. (0x1FD6, 0x1FDB),
  154. (0x1FDD, 0x1FEF),
  155. (0x1FF2, 0x1FF4),
  156. (0x1FF6, 0x1FFE),
  157. (0x2129,),
  158. (0x2719, 0x271A),
  159. (0xAB65,),
  160. (0x10140, 0x1018D),
  161. (0x101A0,),
  162. (0x1D200, 0x1D245),
  163. (0x1F7A1, 0x1F7A7),
  164. ]
  165. class Cyrillic(unicode_set):
  166. "Unicode set for Cyrillic Unicode Character Range"
  167. _ranges: UnicodeRangeList = [
  168. (0x0400, 0x052F),
  169. (0x1C80, 0x1C88),
  170. (0x1D2B,),
  171. (0x1D78,),
  172. (0x2DE0, 0x2DFF),
  173. (0xA640, 0xA672),
  174. (0xA674, 0xA69F),
  175. (0xFE2E, 0xFE2F),
  176. ]
  177. class Chinese(unicode_set):
  178. "Unicode set for Chinese Unicode Character Range"
  179. _ranges: UnicodeRangeList = [
  180. (0x2E80, 0x2E99),
  181. (0x2E9B, 0x2EF3),
  182. (0x31C0, 0x31E3),
  183. (0x3400, 0x4DB5),
  184. (0x4E00, 0x9FEF),
  185. (0xA700, 0xA707),
  186. (0xF900, 0xFA6D),
  187. (0xFA70, 0xFAD9),
  188. (0x16FE2, 0x16FE3),
  189. (0x1F210, 0x1F212),
  190. (0x1F214, 0x1F23B),
  191. (0x1F240, 0x1F248),
  192. (0x20000, 0x2A6D6),
  193. (0x2A700, 0x2B734),
  194. (0x2B740, 0x2B81D),
  195. (0x2B820, 0x2CEA1),
  196. (0x2CEB0, 0x2EBE0),
  197. (0x2F800, 0x2FA1D),
  198. ]
  199. class Japanese(unicode_set):
  200. "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"
  201. _ranges: UnicodeRangeList = []
  202. class Kanji(unicode_set):
  203. "Unicode set for Kanji Unicode Character Range"
  204. _ranges: UnicodeRangeList = [
  205. (0x4E00, 0x9FBF),
  206. (0x3000, 0x303F),
  207. ]
  208. class Hiragana(unicode_set):
  209. "Unicode set for Hiragana Unicode Character Range"
  210. _ranges: UnicodeRangeList = [
  211. (0x3041, 0x3096),
  212. (0x3099, 0x30A0),
  213. (0x30FC,),
  214. (0xFF70,),
  215. (0x1B001,),
  216. (0x1B150, 0x1B152),
  217. (0x1F200,),
  218. ]
  219. class Katakana(unicode_set):
  220. "Unicode set for Katakana Unicode Character Range"
  221. _ranges: UnicodeRangeList = [
  222. (0x3099, 0x309C),
  223. (0x30A0, 0x30FF),
  224. (0x31F0, 0x31FF),
  225. (0x32D0, 0x32FE),
  226. (0xFF65, 0xFF9F),
  227. (0x1B000,),
  228. (0x1B164, 0x1B167),
  229. (0x1F201, 0x1F202),
  230. (0x1F213,),
  231. ]
  232. class Hangul(unicode_set):
  233. "Unicode set for Hangul (Korean) Unicode Character Range"
  234. _ranges: UnicodeRangeList = [
  235. (0x1100, 0x11FF),
  236. (0x302E, 0x302F),
  237. (0x3131, 0x318E),
  238. (0x3200, 0x321C),
  239. (0x3260, 0x327B),
  240. (0x327E,),
  241. (0xA960, 0xA97C),
  242. (0xAC00, 0xD7A3),
  243. (0xD7B0, 0xD7C6),
  244. (0xD7CB, 0xD7FB),
  245. (0xFFA0, 0xFFBE),
  246. (0xFFC2, 0xFFC7),
  247. (0xFFCA, 0xFFCF),
  248. (0xFFD2, 0xFFD7),
  249. (0xFFDA, 0xFFDC),
  250. ]
  251. Korean = Hangul
  252. class CJK(Chinese, Japanese, Hangul):
  253. "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"
  254. class Thai(unicode_set):
  255. "Unicode set for Thai Unicode Character Range"
  256. _ranges: UnicodeRangeList = [
  257. (0x0E01, 0x0E3A),
  258. (0x0E3F, 0x0E5B)
  259. ]
  260. class Arabic(unicode_set):
  261. "Unicode set for Arabic Unicode Character Range"
  262. _ranges: UnicodeRangeList = [
  263. (0x0600, 0x061B),
  264. (0x061E, 0x06FF),
  265. (0x0700, 0x077F),
  266. ]
  267. class Hebrew(unicode_set):
  268. "Unicode set for Hebrew Unicode Character Range"
  269. _ranges: UnicodeRangeList = [
  270. (0x0591, 0x05C7),
  271. (0x05D0, 0x05EA),
  272. (0x05EF, 0x05F4),
  273. (0xFB1D, 0xFB36),
  274. (0xFB38, 0xFB3C),
  275. (0xFB3E,),
  276. (0xFB40, 0xFB41),
  277. (0xFB43, 0xFB44),
  278. (0xFB46, 0xFB4F),
  279. ]
  280. class Devanagari(unicode_set):
  281. "Unicode set for Devanagari Unicode Character Range"
  282. _ranges: UnicodeRangeList = [
  283. (0x0900, 0x097F),
  284. (0xA8E0, 0xA8FF)
  285. ]
  286. # fmt: on
  287. pyparsing_unicode.Japanese._ranges = (
  288. pyparsing_unicode.Japanese.Kanji._ranges
  289. + pyparsing_unicode.Japanese.Hiragana._ranges
  290. + pyparsing_unicode.Japanese.Katakana._ranges
  291. )
  292. pyparsing_unicode.BMP = pyparsing_unicode.BasicMultilingualPlane
  293. # add language identifiers using language Unicode
  294. pyparsing_unicode.العربية = pyparsing_unicode.Arabic
  295. pyparsing_unicode.中文 = pyparsing_unicode.Chinese
  296. pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
  297. pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
  298. pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
  299. pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
  300. pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
  301. pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
  302. pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
  303. pyparsing_unicode.한국어 = pyparsing_unicode.Korean
  304. pyparsing_unicode.ไทย = pyparsing_unicode.Thai
  305. pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari