  1. """
  2. pygments.lexer
  3. ~~~~~~~~~~~~~~
  4. Base lexer classes.
  5. :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
  6. :license: BSD, see LICENSE for details.
  7. """
  8. import re
  9. import sys
  10. import time
  11. from pip._vendor.pygments.filter import apply_filters, Filter
  12. from pip._vendor.pygments.filters import get_filter_by_name
  13. from pip._vendor.pygments.token import Error, Text, Other, Whitespace, _TokenType
  14. from pip._vendor.pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
  15. make_analysator, Future, guess_decode
  16. from pip._vendor.pygments.regexopt import regex_opt
  17. __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
  18. 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
  19. 'default', 'words', 'line_re']
  20. line_re = re.compile('.*?\n')
  21. _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
  22. (b'\xff\xfe\0\0', 'utf-32'),
  23. (b'\0\0\xfe\xff', 'utf-32be'),
  24. (b'\xff\xfe', 'utf-16'),
  25. (b'\xfe\xff', 'utf-16be')]
  26. _default_analyse = staticmethod(lambda x: 0.0)
  27. class LexerMeta(type):
  28. """
  29. This metaclass automagically converts ``analyse_text`` methods into
  30. static methods which always return float values.
  31. """
  32. def __new__(mcs, name, bases, d):
  33. if 'analyse_text' in d:
  34. d['analyse_text'] = make_analysator(d['analyse_text'])
  35. return type.__new__(mcs, name, bases, d)
  36. class Lexer(metaclass=LexerMeta):
  37. """
  38. Lexer for a specific language.
  39. Basic options recognized:
  40. ``stripnl``
  41. Strip leading and trailing newlines from the input (default: True).
  42. ``stripall``
  43. Strip all leading and trailing whitespace from the input
  44. (default: False).
  45. ``ensurenl``
  46. Make sure that the input ends with a newline (default: True). This
  47. is required for some lexers that consume input linewise.
  48. .. versionadded:: 1.3
  49. ``tabsize``
  50. If given and greater than 0, expand tabs in the input (default: 0).
  51. ``encoding``
  52. If given, must be an encoding name. This encoding will be used to
  53. convert the input string to Unicode, if it is not already a Unicode
  54. string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
  55. Latin1 detection. Can also be ``'chardet'`` to use the chardet
  56. library, if it is installed.
  57. ``inencoding``
  58. Overrides the ``encoding`` if given.
  59. """
  60. #: Name of the lexer
  61. name = None
  62. #: URL of the language specification/definition
  63. url = None
  64. #: Shortcuts for the lexer
  65. aliases = []
  66. #: File name globs
  67. filenames = []
  68. #: Secondary file name globs
  69. alias_filenames = []
  70. #: MIME types
  71. mimetypes = []
  72. #: Priority, should multiple lexers match and no content is provided
  73. priority = 0
  74. def __init__(self, **options):
  75. self.options = options
  76. self.stripnl = get_bool_opt(options, 'stripnl', True)
  77. self.stripall = get_bool_opt(options, 'stripall', False)
  78. self.ensurenl = get_bool_opt(options, 'ensurenl', True)
  79. self.tabsize = get_int_opt(options, 'tabsize', 0)
  80. self.encoding = options.get('encoding', 'guess')
  81. self.encoding = options.get('inencoding') or self.encoding
  82. self.filters = []
  83. for filter_ in get_list_opt(options, 'filters', ()):
  84. self.add_filter(filter_)
  85. def __repr__(self):
  86. if self.options:
  87. return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
  88. self.options)
  89. else:
  90. return '<pygments.lexers.%s>' % self.__class__.__name__
  91. def add_filter(self, filter_, **options):
  92. """
  93. Add a new stream filter to this lexer.
  94. """
  95. if not isinstance(filter_, Filter):
  96. filter_ = get_filter_by_name(filter_, **options)
  97. self.filters.append(filter_)
  98. def analyse_text(text):
  99. """
  100. Has to return a float between ``0`` and ``1`` that indicates
  101. if a lexer wants to highlight this text. Used by ``guess_lexer``.
  102. If this method returns ``0`` it won't highlight it in any case, if
  103. it returns ``1`` highlighting with this lexer is guaranteed.
  104. The `LexerMeta` metaclass automatically wraps this function so
  105. that it works like a static method (no ``self`` or ``cls``
  106. parameter) and the return value is automatically converted to
  107. `float`. If the return value is an object that is boolean `False`
  108. it's the same as if the return values was ``0.0``.
  109. """
  110. def get_tokens(self, text, unfiltered=False):
  111. """
  112. Return an iterable of (tokentype, value) pairs generated from
  113. `text`. If `unfiltered` is set to `True`, the filtering mechanism
  114. is bypassed even if filters are defined.
  115. Also preprocess the text, i.e. expand tabs and strip it if
  116. wanted and applies registered filters.
  117. """
  118. if not isinstance(text, str):
  119. if self.encoding == 'guess':
  120. text, _ = guess_decode(text)
  121. elif self.encoding == 'chardet':
  122. try:
  123. from pip._vendor import chardet
  124. except ImportError as e:
  125. raise ImportError('To enable chardet encoding guessing, '
  126. 'please install the chardet library '
  127. 'from http://chardet.feedparser.org/') from e
  128. # check for BOM first
  129. decoded = None
  130. for bom, encoding in _encoding_map:
  131. if text.startswith(bom):
  132. decoded = text[len(bom):].decode(encoding, 'replace')
  133. break
  134. # no BOM found, so use chardet
  135. if decoded is None:
  136. enc = chardet.detect(text[:1024]) # Guess using first 1KB
  137. decoded = text.decode(enc.get('encoding') or 'utf-8',
  138. 'replace')
  139. text = decoded
  140. else:
  141. text = text.decode(self.encoding)
  142. if text.startswith('\ufeff'):
  143. text = text[len('\ufeff'):]
  144. else:
  145. if text.startswith('\ufeff'):
  146. text = text[len('\ufeff'):]
  147. # text now *is* a unicode string
  148. text = text.replace('\r\n', '\n')
  149. text = text.replace('\r', '\n')
  150. if self.stripall:
  151. text = text.strip()
  152. elif self.stripnl:
  153. text = text.strip('\n')
  154. if self.tabsize > 0:
  155. text = text.expandtabs(self.tabsize)
  156. if self.ensurenl and not text.endswith('\n'):
  157. text += '\n'
  158. def streamer():
  159. for _, t, v in self.get_tokens_unprocessed(text):
  160. yield t, v
  161. stream = streamer()
  162. if not unfiltered:
  163. stream = apply_filters(stream, self.filters, self)
  164. return stream
  165. def get_tokens_unprocessed(self, text):
  166. """
  167. Return an iterable of (index, tokentype, value) pairs where "index"
  168. is the starting position of the token within the input text.
  169. In subclasses, implement this method as a generator to
  170. maximize effectiveness.
  171. """
  172. raise NotImplementedError
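

# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the Pygments API; class name and behaviour
# are hypothetical): a minimal concrete subclass showing the
# get_tokens_unprocessed() contract -- yield (index, tokentype, value) triples
# whose values concatenate back to the input text.

class _ExampleWordLexer(Lexer):
    """Toy lexer: runs of non-space characters are Text, spaces are Whitespace."""
    name = 'ExampleWord'
    aliases = ['example-word']

    def get_tokens_unprocessed(self, text):
        for match in re.finditer(r'\s+|\S+', text):
            token = Whitespace if match.group().isspace() else Text
            yield match.start(), token, match.group()

# Usage (sketch): list(_ExampleWordLexer().get_tokens('hello  world\n'))
# returns (tokentype, value) pairs after the preprocessing done in get_tokens().
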
class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
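

# Illustrative sketch (hypothetical class names, shown as a comment to avoid
# importing concrete lexers into this base module): a template lexer is
# usually built by subclassing DelegatingLexer and handing it the root and
# language lexers, e.g.
#
#     class MyTemplateLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super().__init__(SomeRootLexer, SomeTemplateLexer, **options)
#
# The language lexer must emit ``Other`` tokens for the stretches that belong
# to the root language; those stretches are then re-lexed with the root lexer.
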
# ------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):  # pylint: disable=invalid-name
    """
    Indicates that a state should include rules from another state.
    """
    pass


class _inherit:
    """
    Indicates that a state should inherit from its superclass.
    """
    def __repr__(self):
        return 'inherit'

inherit = _inherit()  # pylint: disable=invalid-name


class combined(tuple):  # pylint: disable=invalid-name
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch:
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                data = match.group(i + 1)
                if data is not None:
                    if ctx:
                        ctx.pos = match.start(i + 1)
                    for item in action(lexer,
                                       _PseudoMatch(match.start(i + 1), data), ctx):
                        if item:
                            yield item
        if ctx:
            ctx.pos = match.end()
    return callback
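

# Illustrative sketch (hypothetical rule, not used by this module): bygroups()
# is placed in a RegexLexer ``tokens`` rule whose regex has capture groups;
# each group is emitted with the corresponding action, and ``None`` skips a
# group. Name and Operator are standard pygments.token types, imported here
# only for the sketch.

from pip._vendor.pygments.token import Name, Operator  # for the sketch below

_BYGROUPS_RULE_EXAMPLE = (
    r'(\w+)(\s*)(=)',                               # three capture groups ...
    bygroups(Name.Variable, Whitespace, Operator),  # ... three actions
)
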
class _This:
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """

this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
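

# Illustrative sketch (hypothetical rule): using() builds a callback that
# re-lexes the matched text with another lexer; using(this) re-lexes it with
# the calling lexer's own class. Combined with bygroups(), only the captured
# part is re-lexed.

_USING_RULE_EXAMPLE = (
    r'\$\{([^}]*)\}',       # e.g. an interpolation like ${ ... }
    bygroups(using(this)),  # re-lex the captured content with this same lexer
)
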
class default:
    """
    Indicates a state or state action (e.g. #pop) to apply.
    For example, default('#pop') is equivalent to ('', Token, '#pop').
    Note that state tuples may be used as well.

    .. versionadded:: 2.0
    """
    def __init__(self, state):
        self.state = state


class words(Future):
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        self.words = words
        self.prefix = prefix
        self.suffix = suffix

    def get(self):
        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
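

# Illustrative sketch (hypothetical rule): words() defers regex construction
# until the lexer's token definitions are compiled; regex_opt() then builds a
# single optimized alternation for the whole word list. Keyword is a standard
# pygments.token type, imported here only for the sketch.

from pip._vendor.pygments.token import Keyword  # for the sketch below

_WORDS_RULE_EXAMPLE = (
    words(('if', 'elif', 'else', 'while', 'for'), prefix=r'\b', suffix=r'\b'),
    Keyword,
)
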
class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_regex(cls, regex, rflags, state):
        """Preprocess the regular expression component of a token definition."""
        if isinstance(regex, Future):
            regex = regex.get()
        return re.compile(regex, rflags).match

    def _process_token(cls, token):
        """Preprocess the token component of a token definition."""
        assert type(token) is _TokenType or callable(token), \
            'token type must be simple type or callable, not %r' % (token,)
        return token

    def _process_new_state(cls, new_state, unprocessed, processed):
        """Preprocess the state transition action of a token definition."""
        if isinstance(new_state, str):
            # an existing state
            if new_state == '#pop':
                return -1
            elif new_state in unprocessed:
                return (new_state,)
            elif new_state == '#push':
                return new_state
            elif new_state[:5] == '#pop:':
                return -int(new_state[5:])
            else:
                assert False, 'unknown new state %r' % new_state
        elif isinstance(new_state, combined):
            # combine a new state from existing ones
            tmp_state = '_tmp_%d' % cls._tmpname
            cls._tmpname += 1
            itokens = []
            for istate in new_state:
                assert istate != new_state, 'circular state ref %r' % istate
                itokens.extend(cls._process_state(unprocessed,
                                                  processed, istate))
            processed[tmp_state] = itokens
            return (tmp_state,)
        elif isinstance(new_state, tuple):
            # push more than one state
            for istate in new_state:
                assert (istate in unprocessed or
                        istate in ('#pop', '#push')), \
                    'unknown new state ' + istate
            return new_state
        else:
            assert False, 'unknown new state def %r' % new_state

    def _process_state(cls, unprocessed, processed, state):
        """Preprocess a single state definition."""
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # should be processed already, but may not in the case of:
                # 1. the state has no counterpart in any parent
                # 2. the state includes more than one 'inherit'
                continue
            if isinstance(tdef, default):
                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
                tokens.append((re.compile('').match, None, new_state))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
            else:
                new_state = cls._process_new_state(tdef[2],
                                                   unprocessed, processed)

            tokens.append((rex, token, new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        """Preprocess a dictionary of token definitions."""
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs):
            cls._process_state(tokendefs, processed, state)
        return processed

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single tokendef
        dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes to
        inherit definitions from a superclass, it can use the special value
        "inherit", which will cause the superclass' state definition to be
        included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    # N.b. because this is assigned by reference, sufficiently
                    # deep hierarchies are processed incrementally (e.g. for
                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
                    # will not see any inherits in B).
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    # N.b. this is the index in items (that is, the superclass
                    # copy), so offset required when storing below.
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: At all times there is a stack of states. Initially, the stack contains
    #: a single state 'root'. The top of the stack is called "the current state".
    #:
    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: ``new_state`` can be omitted to signify no state transition.
    #: If ``new_state`` is a string, it is pushed on the stack. This ensures
    #: the new current state is ``new_state``.
    #: If ``new_state`` is a tuple of strings, all of those strings are pushed
    #: on the stack and the current state will be the last element of the list.
    #: ``new_state`` can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again. Note that if you push while in a combined state, the combined
    #: state itself is pushed, and not only the state in which the rule is
    #: defined.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(statestack) > 1:
                                        statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop, but keep at least one state on the stack
                            # (random code leading to unexpected pops should
                            # not allow exceptions)
                            if abs(new_state) >= len(statestack):
                                del statestack[1:]
                            else:
                                del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                # We are here only if all state tokens have been considered
                # and there was not a match on any of them.
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Whitespace, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break
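

# ----------------------------------------------------------------------------
# Illustrative sketch (hypothetical lexers, not registered anywhere): a small
# RegexLexer showing the ``tokens`` table with a state push, '#pop' and
# default(), plus a subclass demonstrating ``inherit``. The extra token types
# imported here are standard pygments.token names used only for the sketch.

from pip._vendor.pygments.token import Comment, Keyword, Name, Operator, String

class _ExampleIniLikeLexer(RegexLexer):
    """Toy lexer for an INI-like syntax: comments, sections, key=value lines."""
    name = 'ExampleIniLike'
    aliases = ['example-ini-like']

    tokens = {
        'root': [
            (r'\s+', Whitespace),
            (r'[;#].*', Comment.Single),
            (r'\[[^\]\n]+\]', Keyword.Namespace),
            (r'[^=\n]+(?==)', Name.Attribute),
            (r'=', Operator, 'value'),       # push the 'value' state
        ],
        'value': [
            (r'[^\n]+', String),
            (r'\n', Whitespace, '#pop'),     # leave 'value' at end of line
            default('#pop'),                 # empty value at EOF: just pop
        ],
    }

class _ExampleIniLikeWithStringsLexer(_ExampleIniLikeLexer):
    """Toy subclass: adds a rule for double-quoted values, then keeps the
    rest of the parent's 'value' state via ``inherit``."""
    name = 'ExampleIniLikeWithStrings'

    tokens = {
        'value': [
            (r'"[^"\n]*"', String.Double),
            inherit,                         # splice in the parent's rules here
        ],
    }

# Usage (sketch):
#     for index, tokentype, value in _ExampleIniLikeLexer().get_tokens_unprocessed('a = 1\n'):
#         ...
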
class LexerContext:
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)


class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                            # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    if len(ctx.stack) > 1:
                                        ctx.stack.pop()
                                elif state == '#push':
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # see RegexLexer for why this check is made
                            if abs(new_state) >= len(ctx.stack):
                                del ctx.stack[1:]
                            else:
                                del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = next(insertions)
    except StopIteration:
        # no insertions
        yield from tokens
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            if tmpval:
                yield realpos, t, tmpval
                realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = next(insertions)
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        if oldi < len(v):
            yield realpos, t, v[oldi:]
            realpos += len(v) - oldi

    # leftover tokens
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = next(insertions)
        except StopIteration:
            insleft = False
            break  # not strictly necessary
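

# Illustrative sketch (hypothetical helper, not part of the API): how
# DelegatingLexer-style merging uses do_insertions(). The insertion index is
# a position in the text that produced the outer token stream; the inserted
# tokens are spliced in at that point and token positions are recomputed.

def _do_insertions_example():
    outer = [(0, Text, 'AAAA')]       # stream lexed from the text 'AAAA'
    inserted = [(0, Error, '!!')]     # tokens to splice in at index 2
    return list(do_insertions([(2, inserted)], iter(outer)))
    # -> [(0, Text, 'AA'), (2, Error, '!!'), (4, Text, 'AA')]
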
class ProfilingRegexLexerMeta(RegexLexerMeta):
    """Metaclass for ProfilingRegexLexer, collects regex timing info."""

    def _process_regex(cls, regex, rflags, state):
        if isinstance(regex, words):
            rex = regex_opt(regex.words, prefix=regex.prefix,
                            suffix=regex.suffix)
        else:
            rex = regex
        compiled = re.compile(rex, rflags)

        def match_func(text, pos, endpos=sys.maxsize):
            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
            t0 = time.time()
            res = compiled.match(text, pos, endpos)
            t1 = time.time()
            info[0] += 1
            info[1] += t1 - t0
            return res
        return match_func


class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
    """Drop-in replacement for RegexLexer that does profiling of its regexes."""

    _prof_data = []
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],
                      reverse=True)
        sum_total = sum(x[3] for x in data)

        print()
        print('Profiling result for %s lexing %d chars in %.3f ms' %
              (self.__class__.__name__, len(text), sum_total))
        print('=' * 110)
        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
        print('-' * 110)
        for d in data:
            print('%-20s %-65s %5d %8.4f %8.4f' % d)
        print('=' * 110)
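

# Illustrative sketch (comment only, hypothetical lexer names): to profile an
# existing RegexLexer subclass, swap its base class for ProfilingRegexLexer;
# fully consuming get_tokens()/get_tokens_unprocessed() then prints the
# per-regex timing table formatted above, e.g.
#
#     class MyLexerProfiled(ProfilingRegexLexer):
#         tokens = MyLexer.tokens
#
#     for _ in MyLexerProfiled().get_tokens(source_code):
#         pass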