Adding upstream version 2.14.0+dfsg. (refs: upstream/2.14.0+dfsg, upstream)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 11:33:32 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 11:33:32 +0000
commit: 1f403ad2197fc7442409f434ee574f3e6b46fb73 (patch)
tree: 0299c6dd11d5edfa918a29b6456bc1875f1d288c /scripts/check_repeated_token.py
parent: Initial commit. (diff)
download: pygments-1f403ad2197fc7442409f434ee574f3e6b46fb73.tar.xz
pygments-1f403ad2197fc7442409f434ee574f3e6b46fb73.zip
1 files changed, 77 insertions, 0 deletions
diff --git a/scripts/check_repeated_token.py b/scripts/check_repeated_token.py
new file mode 100755
index 0000000..1636281
--- /dev/null
+++ b/scripts/check_repeated_token.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+"""
+    Checker for repeated tokens
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    Helper script to find suspicious lexers which produce the same token
+    repeatedly, i.e. for example:
+
+    .. code::
+
+      'd'           Text
+      'a'           Text
+      't'           Text
+      'a'           Text
+      'b'           Text
+      'a'           Text
+      's'           Text
+      'e'           Text
+
+    This script has two test modes: Check for tokens repeating more often than
+    a given threshold, and exclude anything but single-character tokens.
+    Repeated single-character tokens are quite problematic as they result in
+    bloated output and are usually an indication that someone is missing
+    a + or * in the regex.
+
+    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+import argparse
+import sys
+
+from utility import unpack_output_file, process_output_files
+
+
+def check_file(path, threshold, single_only):
+    """Return False, after printing ``path:line``, once the same token repeats
+    more than *threshold* times in a row, else True.  With *single_only*, any
+    value longer than one character ends the current run and is not counted."""
+    current_token = ''
+    current_token_repeat_count = 1
+    for value, token, linenumber in unpack_output_file(path):
+        if single_only and len(value) > 1:
+            # Forget the run's token too, so the next token starts a fresh run.
+            current_token = ''
+            current_token_repeat_count = 1
+            continue
+        if token != current_token:
+            current_token = token
+            current_token_repeat_count = 1
+        else:
+            current_token_repeat_count += 1
+        if current_token_repeat_count > threshold:
+            print(f'{path}:{linenumber}')
+            return False
+    return True
+
+
+def main(args):
+    """Run check_file over the outputs in args.TEST_ROOT; return exit status."""
+    def run_check(path):
+        return check_file(path, args.threshold, args.single)
+
+    # Any positive result from process_output_files maps to exit status 1.
+    return 1 if process_output_files(args.TEST_ROOT, run_check) > 0 else 0
+
+
+if __name__ == '__main__':
+    # Command-line entry point: the process exits with main()'s return value.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('TEST_ROOT',
+                        help='Root directory containing the tests')
+    parser.add_argument('-t', '--threshold', type=int, default=5,
+                        help='Warn if a token repeats itself more often than '
+                             'this number.')
+    parser.add_argument('-s', '--single', action='store_true', default=False,
+                        help='Only look at tokens matching a single character')
+    sys.exit(main(parser.parse_args()))
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 11:33:32 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-04 11:33:32 +0000
commit	1f403ad2197fc7442409f434ee574f3e6b46fb73 (patch)
tree	0299c6dd11d5edfa918a29b6456bc1875f1d288c /scripts/check_repeated_token.py
parent	Initial commit. (diff)
download	pygments-1f403ad2197fc7442409f434ee574f3e6b46fb73.tar.xz pygments-1f403ad2197fc7442409f434ee574f3e6b46fb73.zip