Diffstat (limited to 'scripts/debug_lexer.py')
-rwxr-xr-x | scripts/debug_lexer.py | 306 |
1 file changed, 306 insertions, 0 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
new file mode 100755
index 0000000..6323d9c
--- /dev/null
+++ b/scripts/debug_lexer.py
@@ -0,0 +1,306 @@
+#!/usr/bin/python
+"""
+    Lexing error finder
+    ~~~~~~~~~~~~~~~~~~~
+
+    For the source files given on the command line, display
+    the text where Error tokens are being generated, along
+    with some context.
+
+    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import os
+import sys
+import struct
+
+# always prefer Pygments from source if it exists
+srcpath = os.path.join(os.path.dirname(__file__), '..')
+if os.path.isdir(os.path.join(srcpath, 'pygments')):
+    sys.path.insert(0, srcpath)
+
+
+from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
+    ProfilingRegexLexer, ProfilingRegexLexerMeta
+from pygments.lexers import get_lexer_by_name, find_lexer_class, \
+    find_lexer_class_for_filename, guess_lexer
+from pygments.token import Error, Text, _TokenType
+from pygments.cmdline import _parse_options
+
+
+class DebuggingRegexLexer(ExtendedRegexLexer):
+    """Make the state stack, position and current match instance attributes."""
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Split ``text`` into (tokentype, text) pairs.
+
+        ``stack`` is the initial stack (default: ``['root']``)
+        """
+        tokendefs = self._tokens
+        self.ctx = ctx = LexerContext(text, 0)
+        ctx.stack = list(stack)
+        statetokens = tokendefs[ctx.stack[-1]]
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                self.m = m = rexmatch(text, ctx.pos, ctx.end)
+                if m:
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield ctx.pos, action, m.group()
+                            ctx.pos = m.end()
+                        else:
+                            if not isinstance(self, ExtendedRegexLexer):
+                                yield from action(self, m)
+                                ctx.pos = m.end()
+                            else:
+                                yield from action(self, m, ctx)
+                                if not new_state:
+                                    # altered the state stack?
+                                    statetokens = tokendefs[ctx.stack[-1]]
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    ctx.stack.pop()
+                                elif state == '#push':
+                                    ctx.stack.append(ctx.stack[-1])
+                                else:
+                                    ctx.stack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop
+                            del ctx.stack[new_state:]
+                        elif new_state == '#push':
+                            ctx.stack.append(ctx.stack[-1])
+                        else:
+                            assert False, 'wrong state def: %r' % new_state
+                        statetokens = tokendefs[ctx.stack[-1]]
+                    break
+            else:
+                try:
+                    if ctx.pos >= ctx.end:
+                        break
+                    if text[ctx.pos] == '\n':
+                        # at EOL, reset state to 'root'
+                        ctx.stack = ['root']
+                        statetokens = tokendefs['root']
+                        yield ctx.pos, Text, '\n'
+                        ctx.pos += 1
+                        continue
+                    yield ctx.pos, Error, text[ctx.pos]
+                    ctx.pos += 1
+                except IndexError:
+                    break
+
+
+def decode_atheris(bstr):
+    """Decode a byte string into a Unicode string using the algorithm
+    of Google's Atheris fuzzer library, which aims to produce a wide
+    range of possible Unicode inputs.
+
+    Corresponds to ConsumeUnicodeImpl() with filter_surrogates=false in
+    https://github.com/google/atheris/blob/master/fuzzed_data_provider.cc
+    """
+    if len(bstr) < 2:
+        return ''
+    # The first byte only selects whether the rest is decoded as ASCII,
+    # "utf-16" or "utf-32"
+    spec, bstr = bstr[0], bstr[1:]
+    if spec & 1:  # pure ASCII
+        return ''.join(chr(ch & 0x7f) for ch in bstr)
+    elif spec & 2:  # UTF-16
+        bstr = bstr if len(bstr) % 2 == 0 else bstr[:-1]
+        return bstr.decode('utf16')
+
+    # else UTF-32
+    def valid_codepoint(ch):
+        ch &= 0x1fffff
+        if ch & 0x100000:
+            ch &= ~0x0f0000
+        return chr(ch)
+
+    chars = struct.unpack('%dI%dx' % divmod(len(bstr), 4), bstr)
+    return ''.join(map(valid_codepoint, chars))
+
+
+def main(fn, lexer=None, options={}):
+    if fn == '-':
+        text = sys.stdin.read()
+    else:
+        with open(fn, 'rb') as fp:
+            text = fp.read()
+        if decode_strategy == 'latin1':
+            try:
+                text = text.decode('utf8')
+            except UnicodeError:
+                print('Warning: non-UTF8 input, using latin1')
+                text = text.decode('latin1')
+        elif decode_strategy == 'utf8-ignore':
+            try:
+                text = text.decode('utf8')
+            except UnicodeError:
+                print('Warning: ignoring non-UTF8 bytes in input')
+                text = text.decode('utf8', 'ignore')
+        elif decode_strategy == 'atheris':
+            text = decode_atheris(text)
+
+    text = text.strip('\n') + '\n'
+
+    if lexer is not None:
+        lxcls = get_lexer_by_name(lexer).__class__
+    elif guess:
+        lxcls = guess_lexer(text).__class__
+        print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
+                                           lxcls.__name__))
+    else:
+        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
+        if lxcls is None:
+            name, rest = fn.split('_', 1)
+            lxcls = find_lexer_class(name)
+            if lxcls is None:
+                raise AssertionError('no lexer found for file %r' % fn)
+            print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
+                                               lxcls.__name__))
+    debug_lexer = False
+    # if profile:
+    #     # does not work for e.g. ExtendedRegexLexers
+    #     if lxcls.__bases__ == (RegexLexer,):
+    #         # yes we can! (change the metaclass)
+    #         lxcls.__class__ = ProfilingRegexLexerMeta
+    #         lxcls.__bases__ = (ProfilingRegexLexer,)
+    #         lxcls._prof_sort_index = profsort
+    # else:
+    #     if lxcls.__bases__ == (RegexLexer,):
+    #         lxcls.__bases__ = (DebuggingRegexLexer,)
+    #         debug_lexer = True
+    #     elif lxcls.__bases__ == (DebuggingRegexLexer,):
+    #         # already debugged before
+    #         debug_lexer = True
+    #     else:
+    #         # HACK: ExtendedRegexLexer subclasses will only partially work here.
+    #         lxcls.__bases__ = (DebuggingRegexLexer,)
+    #         debug_lexer = True
+
+    lx = lxcls(**options)
+    lno = 1
+    tokens = []
+    states = []
+
+    def show_token(tok, state):
+        reprs = list(map(repr, tok))
+        print('   ' + reprs[1] + ' ' + ' ' * (29 - len(reprs[1])) + reprs[0], end=' ')
+        if debug_lexer:
+            print(' ' + ' ' * (29 - len(reprs[0])) +
+                  (' : '.join(state) if state else ''), end=' ')
+        print()
+
+    for type, val in lx.get_tokens(text):
+        lno += val.count('\n')
+        if type == Error and not ignerror:
+            print('Error parsing', fn, 'on line', lno)
+            if not showall:
+                print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
+                for i in range(max(len(tokens) - num, 0), len(tokens)):
+                    if debug_lexer:
+                        show_token(tokens[i], states[i])
+                    else:
+                        show_token(tokens[i], None)
+            print('Error token:')
+            vlen = len(repr(val))
+            print('   ' + repr(val), end=' ')
+            if debug_lexer and hasattr(lx, 'ctx'):
+                print(' ' * (60 - vlen) + ' : '.join(lx.ctx.stack), end=' ')
+            print()
+            print()
+            return 1
+        tokens.append((type, val))
+        if debug_lexer:
+            if hasattr(lx, 'ctx'):
+                states.append(lx.ctx.stack[:])
+            else:
+                states.append(None)
+        if showall:
+            show_token((type, val), states[-1] if debug_lexer else None)
+    return 0
+
+
+def print_help():
+    print('''\
+Pygments development helper to quickly debug lexers.
+
+    scripts/debug_lexer.py [options] file ...
+
+Give one or more filenames to lex them and display possible error tokens
+and/or profiling info.  Files are assumed to be encoded in UTF-8.
+
+Selecting lexer and options:
+
+    -l NAME         use lexer named NAME (default is to guess from
+                    the given filenames)
+    -g              guess lexer from content
+    -u              if input is non-utf8, use "ignore" handler instead
+                    of using latin1 encoding
+    -U              use Atheris fuzzer's method of converting
+                    byte input to Unicode
+    -O OPTIONSTR    use lexer options parsed from OPTIONSTR
+
+Debugging lexing errors:
+
+    -n N            show the last N tokens on error
+    -a              always show all lexed tokens (default is only
+                    to show them when an error occurs)
+    -e              do not stop on error tokens
+
+Profiling:
+
+    -p              use the ProfilingRegexLexer to profile regexes
+                    instead of the debugging lexer
+    -s N            sort profiling output by column N (default is
+                    column 4, the time per call)
+''')
+
+
+num = 10
+showall = False
+ignerror = False
+lexer = None
+options = {}
+profile = False
+profsort = 4
+guess = False
+decode_strategy = 'latin1'
+
+if __name__ == '__main__':
+    import getopt
+    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hguU')
+    for opt, val in opts:
+        if opt == '-n':
+            num = int(val)
+        elif opt == '-a':
+            showall = True
+        elif opt == '-e':
+            ignerror = True
+        elif opt == '-l':
+            lexer = val
+        elif opt == '-p':
+            profile = True
+        elif opt == '-s':
+            profsort = int(val)
+        elif opt == '-O':
+            options = _parse_options([val])
+        elif opt == '-g':
+            guess = True
+        elif opt == '-u':
+            decode_strategy = 'utf8-ignore'
+        elif opt == '-U':
+            decode_strategy = 'atheris'
+        elif opt == '-h':
+            print_help()
+            sys.exit(0)
+    ret = 0
+    if not args:
+        print_help()
+    for f in args:
+        ret += main(f, lexer, options)
+    sys.exit(bool(ret))
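
The trickiest part of decode_atheris() above is the UTF-32 branch: arbitrary
32-bit values must be folded into the range that chr() accepts. A minimal
sketch of just that folding step, with made-up sample values:

    # Fold an arbitrary integer into a valid Unicode code point, as in
    # decode_atheris()'s valid_codepoint() helper: mask to 21 bits, then
    # clear bits 16-19 when the value would land above U+10FFFF.
    def fold(ch):
        ch &= 0x1fffff           # keep the low 21 bits
        if ch & 0x100000:        # would be plane 16 or higher
            ch &= ~0x0f0000      # fold back down into plane 16
        return chr(ch)

    assert fold(0x41) == 'A'                 # small values pass through
    assert ord(fold(0x1fffff)) == 0x10ffff   # largest input folds to U+10FFFF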
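
Stripped of lexer selection, decoding strategies and state-stack tracking,
the core loop of main() reduces to a few lines of public Pygments API. A
minimal sketch (the 'python' lexer and sample input are arbitrary choices;
whether a given snippet actually yields an Error token depends on the lexer):

    from pygments.lexers import get_lexer_by_name
    from pygments.token import Error

    lexer = get_lexer_by_name('python')
    text = 'x = 1 ?\n'    # '?' is not Python syntax
    lno = 1
    for ttype, value in lexer.get_tokens(text):
        if ttype is Error:
            print('Error token %r on line %d' % (value, lno))
        lno += value.count('\n')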