Diffstat (limited to 'scripts/debug_lexer.py')
-rwxr-xr-x | scripts/debug_lexer.py | 306 |
1 file changed, 306 insertions, 0 deletions
diff --git a/scripts/debug_lexer.py b/scripts/debug_lexer.py
new file mode 100755
index 0000000..6323d9c
--- /dev/null
+++ b/scripts/debug_lexer.py
@@ -0,0 +1,306 @@
+#!/usr/bin/python
+"""
+    Lexing error finder
+    ~~~~~~~~~~~~~~~~~~~
+
+    For the source files given on the command line, display
+    the text where Error tokens are being generated, along
+    with some context.
+
+    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import os
+import sys
+import struct
+
+# always prefer Pygments from source if it exists
+srcpath = os.path.join(os.path.dirname(__file__), '..')
+if os.path.isdir(os.path.join(srcpath, 'pygments')):
+    sys.path.insert(0, srcpath)
+
+
+from pygments.lexer import RegexLexer, ExtendedRegexLexer, LexerContext, \
+    ProfilingRegexLexer, ProfilingRegexLexerMeta
+from pygments.lexers import get_lexer_by_name, find_lexer_class, \
+    find_lexer_class_for_filename, guess_lexer
+from pygments.token import Error, Text, _TokenType
+from pygments.cmdline import _parse_options
+
+
+class DebuggingRegexLexer(ExtendedRegexLexer):
+    """Make the state stack, position and current match instance attributes."""
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        """
+        Split ``text`` into (tokentype, text) pairs.
+
+        ``stack`` is the initial stack (default: ``['root']``)
+        """
+        tokendefs = self._tokens
+        self.ctx = ctx = LexerContext(text, 0)
+        ctx.stack = list(stack)
+        statetokens = tokendefs[ctx.stack[-1]]
+        while 1:
+            for rexmatch, action, new_state in statetokens:
+                self.m = m = rexmatch(text, ctx.pos, ctx.end)
+                if m:
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield ctx.pos, action, m.group()
+                            ctx.pos = m.end()
+                        else:
+                            if not isinstance(self, ExtendedRegexLexer):
+                                yield from action(self, m)
+                                ctx.pos = m.end()
+                            else:
+                                yield from action(self, m, ctx)
+                                if not new_state:
+                                    # altered the state stack?
+                                    statetokens = tokendefs[ctx.stack[-1]]
+                    if new_state is not None:
+                        # state transition
+                        if isinstance(new_state, tuple):
+                            for state in new_state:
+                                if state == '#pop':
+                                    ctx.stack.pop()
+                                elif state == '#push':
+                                    ctx.stack.append(ctx.stack[-1])
+                                else:
+                                    ctx.stack.append(state)
+                        elif isinstance(new_state, int):
+                            # pop
+                            del ctx.stack[new_state:]
+                        elif new_state == '#push':
+                            ctx.stack.append(ctx.stack[-1])
+                        else:
+                            assert False, 'wrong state def: %r' % new_state
+                        statetokens = tokendefs[ctx.stack[-1]]
+                    break
+            else:
+                try:
+                    if ctx.pos >= ctx.end:
+                        break
+                    if text[ctx.pos] == '\n':
+                        # at EOL, reset state to 'root'
+                        ctx.stack = ['root']
+                        statetokens = tokendefs['root']
+                        yield ctx.pos, Text, '\n'
+                        ctx.pos += 1
+                        continue
+                    yield ctx.pos, Error, text[ctx.pos]
+                    ctx.pos += 1
+                except IndexError:
+                    break
+
+
+def decode_atheris(bstr):
+    """Decode a byte string into a Unicode string using the algorithm
+    of Google's Atheris fuzzer library, which aims to produce a wide
+    range of possible Unicode inputs.
+
+    Corresponds to ConsumeUnicodeImpl() with filter_surrogates=false in
+    https://github.com/google/atheris/blob/master/fuzzed_data_provider.cc
+    """
+    if len(bstr) < 2:
+        return ''
+    # The first byte only selects whether the rest is decoded as ASCII,
+    # "utf-16" or "utf-32"
+    spec, bstr = bstr[0], bstr[1:]
+    if spec & 1:  # pure ASCII
+        return ''.join(chr(ch & 0x7f) for ch in bstr)
+    elif spec & 2:  # UTF-16
+        bstr = bstr if len(bstr) % 2 == 0 else bstr[:-1]
+        return bstr.decode('utf16')
+
+    # else UTF-32
+    def valid_codepoint(ch):
+        ch &= 0x1fffff
+        if ch & 0x100000:
+            ch &= ~0x0f0000
+        return chr(ch)
+
+    chars = struct.unpack('%dI%dx' % divmod(len(bstr), 4), bstr)
+    return ''.join(map(valid_codepoint, chars))
+
+
+def main(fn, lexer=None, options={}):
+    if fn == '-':
+        text = sys.stdin.read()
+    else:
+        with open(fn, 'rb') as fp:
+            text = fp.read()
+        if decode_strategy == 'latin1':
+            try:
+                text = text.decode('utf8')
+            except UnicodeError:
+                print('Warning: non-UTF8 input, using latin1')
+                text = text.decode('latin1')
+        elif decode_strategy == 'utf8-ignore':
+            try:
+                text = text.decode('utf8')
+            except UnicodeError:
+                print('Warning: ignoring non-UTF8 bytes in input')
+                text = text.decode('utf8', 'ignore')
+        elif decode_strategy == 'atheris':
+            text = decode_atheris(text)
+
+    text = text.strip('\n') + '\n'
+
+    if lexer is not None:
+        lxcls = get_lexer_by_name(lexer).__class__
+    elif guess:
+        lxcls = guess_lexer(text).__class__
+        print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
+                                           lxcls.__name__))
+    else:
+        lxcls = find_lexer_class_for_filename(os.path.basename(fn))
+        if lxcls is None:
+            name, rest = fn.split('_', 1)
+            lxcls = find_lexer_class(name)
+            if lxcls is None:
+                raise AssertionError('no lexer found for file %r' % fn)
+            print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
+                                               lxcls.__name__))
+    debug_lexer = False
+    # if profile:
+    #     # does not work for e.g. ExtendedRegexLexers
+    #     if lxcls.__bases__ == (RegexLexer,):
+    #         # yes we can! (change the metaclass)
+    #         lxcls.__class__ = ProfilingRegexLexerMeta
+    #         lxcls.__bases__ = (ProfilingRegexLexer,)
+    #         lxcls._prof_sort_index = profsort
+    # else:
+    #     if lxcls.__bases__ == (RegexLexer,):
+    #         lxcls.__bases__ = (DebuggingRegexLexer,)
+    #         debug_lexer = True
+    #     elif lxcls.__bases__ == (DebuggingRegexLexer,):
+    #         # already debugged before
+    #         debug_lexer = True
+    #     else:
+    #         # HACK: ExtendedRegexLexer subclasses will only partially work here.
+    #         lxcls.__bases__ = (DebuggingRegexLexer,)
+    #         debug_lexer = True
+
+    lx = lxcls(**options)
+    lno = 1
+    tokens = []
+    states = []
+
+    def show_token(tok, state):
+        reprs = list(map(repr, tok))
+        print('   ' + reprs[1] + ' ' + ' ' * (29 - len(reprs[1])) + reprs[0], end=' ')
+        if debug_lexer:
+            print(' ' + ' ' * (29 - len(reprs[0])) +
+                  (' : '.join(state) if state else ''), end=' ')
+        print()
+
+    for type, val in lx.get_tokens(text):
+        lno += val.count('\n')
+        if type == Error and not ignerror:
+            print('Error parsing', fn, 'on line', lno)
+            if not showall:
+                print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
+                for i in range(max(len(tokens) - num, 0), len(tokens)):
+                    if debug_lexer:
+                        show_token(tokens[i], states[i])
+                    else:
+                        show_token(tokens[i], None)
+            print('Error token:')
+            vlen = len(repr(val))
+            print('   ' + repr(val), end=' ')
+            if debug_lexer and hasattr(lx, 'ctx'):
+                print(' ' * (60 - vlen) + ' : '.join(lx.ctx.stack), end=' ')
+            print()
+            print()
+            return 1
+        tokens.append((type, val))
+        if debug_lexer:
+            if hasattr(lx, 'ctx'):
+                states.append(lx.ctx.stack[:])
+            else:
+                states.append(None)
+        if showall:
+            show_token((type, val), states[-1] if debug_lexer else None)
+    return 0
+
+
+def print_help():
+    print('''\
+Pygments development helper to quickly debug lexers.
+
+    scripts/debug_lexer.py [options] file ...
+
+Give one or more filenames to lex them and display possible error tokens
+and/or profiling info.  Files are assumed to be encoded in UTF-8.
+
+Selecting lexer and options:
+
+    -l NAME         use lexer named NAME (default is to guess from
+                    the given filenames)
+    -g              guess lexer from content
+    -u              if input is non-utf8, use "ignore" handler instead
+                    of using latin1 encoding
+    -U              use Atheris fuzzer's method of converting
+                    byte input to Unicode
+    -O OPTIONSTR    use lexer options parsed from OPTIONSTR
+
+Debugging lexing errors:
+
+    -n N            show the last N tokens on error
+    -a              always show all lexed tokens (default is only
+                    to show them when an error occurs)
+    -e              do not stop on error tokens
+
+Profiling:
+
+    -p              use the ProfilingRegexLexer to profile regexes
+                    instead of the debugging lexer
+    -s N            sort profiling output by column N (default is
+                    column 4, the time per call)
+''')
+
+
+num = 10
+showall = False
+ignerror = False
+lexer = None
+options = {}
+profile = False
+profsort = 4
+guess = False
+decode_strategy = 'latin1'
+
+if __name__ == '__main__':
+    import getopt
+    opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hguU')
+    for opt, val in opts:
+        if opt == '-n':
+            num = int(val)
+        elif opt == '-a':
+            showall = True
+        elif opt == '-e':
+            ignerror = True
+        elif opt == '-l':
+            lexer = val
+        elif opt == '-p':
+            profile = True
+        elif opt == '-s':
+            profsort = int(val)
+        elif opt == '-O':
+            options = _parse_options([val])
+        elif opt == '-g':
+            guess = True
+        elif opt == '-u':
+            decode_strategy = 'utf8-ignore'
+        elif opt == '-U':
+            decode_strategy = 'atheris'
+        elif opt == '-h':
+            print_help()
+            sys.exit(0)
+    ret = 0
+    if not args:
+        print_help()
+    for f in args:
+        ret += main(f, lexer, options)
+    sys.exit(bool(ret))
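
The trickiest part of decode_atheris() above is the UTF-32 branch: arbitrary
32-bit values must be folded into the range that chr() accepts. A minimal
sketch of just that folding step, with made-up sample values:

    # Fold an arbitrary integer into a valid Unicode code point, as in
    # decode_atheris()'s valid_codepoint() helper: mask to 21 bits, then
    # clear bits 16-19 when the value would land above U+10FFFF.
    def fold(ch):
        ch &= 0x1fffff           # keep the low 21 bits
        if ch & 0x100000:        # would be plane 16 or higher
            ch &= ~0x0f0000      # fold back down into plane 16
        return chr(ch)

    assert fold(0x41) == 'A'                 # small values pass through
    assert ord(fold(0x1fffff)) == 0x10ffff   # largest input folds to U+10FFFF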
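
Stripped of lexer selection, decoding strategies and state-stack tracking,
the core loop of main() reduces to a few lines of public Pygments API. A
minimal sketch (the 'python' lexer and sample input are arbitrary choices;
whether a given snippet actually yields an Error token depends on the lexer):

    from pygments.lexers import get_lexer_by_name
    from pygments.token import Error

    lexer = get_lexer_by_name('python')
    text = 'x = 1 ?\n'    # '?' is not Python syntax
    lno = 1
    for ttype, value in lexer.get_tokens(text):
        if ttype is Error:
            print('Error token %r on line %d' % (value, lno))
        lno += value.count('\n')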