diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 11:33:32 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 11:33:32 +0000 |
commit | 1f403ad2197fc7442409f434ee574f3e6b46fb73 (patch) | |
tree | 0299c6dd11d5edfa918a29b6456bc1875f1d288c /scripts/check_repeated_token.py | |
parent | Initial commit. (diff) | |
download | pygments-1f403ad2197fc7442409f434ee574f3e6b46fb73.tar.xz pygments-1f403ad2197fc7442409f434ee574f3e6b46fb73.zip |
Adding upstream version 2.14.0+dfsg. (refs: upstream/2.14.0+dfsg, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'scripts/check_repeated_token.py')
-rwxr-xr-x | scripts/check_repeated_token.py | 77 |
1 files changed, 77 insertions, 0 deletions
#!/usr/bin/env python
"""
    Checker for repeated tokens
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~

    Helper script to find suspicious lexers which produce the same token
    repeatedly, i.e. for example:

    .. code::

        'd'           Text
        'a'           Text
        't'           Text
        'a'           Text
        'b'           Text
        'a'           Text
        's'           Text
        'e'           Text

    This script has two test modes: Check for tokens repeating more often than
    a given threshold, and exclude anything but single-character tokens.
    Repeated single-character tokens are quite problematic as they result in
    bloated output and are usually an indication that someone is missing
    a + or * in the regex.

    :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import argparse
import sys

from utility import unpack_output_file, process_output_files


def check_file(path, threshold, single_only):
    """Check one output file for an over-long run of the same token.

    Iterates over the ``(value, token, linenumber)`` triples produced by
    ``unpack_output_file(path)`` and counts how many consecutive entries
    carry the same ``token``.

    :param path: Path handed to ``unpack_output_file``; also used as the
        prefix of the printed ``path:linenumber`` report.
    :param threshold: Maximum tolerated run length. The check fails as soon
        as a run becomes strictly longer than this.
    :param single_only: If true, only entries whose ``value`` has length 1
        take part in a run; any longer value ends the current run.
    :return: ``False`` (after printing the offending location) on the first
        run that exceeds ``threshold``, otherwise ``True``.
    """
    current_token = ''
    current_token_repeat_count = 1

    for value, token, linenumber in unpack_output_file(path):
        if single_only and len(value) > 1:
            # A multi-character value interrupts the run. The tracked token
            # itself must be forgotten (not just the loop variable), so that
            # the next single-character token starts a fresh run at 1 even
            # if it has the same type as the run that was just interrupted.
            current_token = ''
            current_token_repeat_count = 1
            continue

        if token != current_token:
            current_token = token
            current_token_repeat_count = 1
        else:
            current_token_repeat_count += 1

        if current_token_repeat_count > threshold:
            print(f'{path}:{linenumber}')
            return False

    return True


def main(args):
    """Run the repeated-token check below ``args.TEST_ROOT``.

    :param args: Parsed command line namespace providing ``TEST_ROOT``,
        ``threshold`` and ``single``.
    :return: Process exit status: ``1`` if ``process_output_files`` reports
        a value greater than zero, ``0`` otherwise.
    """
    def check_file_callback(path):
        # Bind the command line options so the callback only needs a path.
        return check_file(path, args.threshold, args.single)

    # NOTE(review): the result is presumably the number of files for which
    # the callback returned False -- confirm against utility.py.
    if process_output_files(args.TEST_ROOT, check_file_callback) > 0:
        return 1
    return 0


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('TEST_ROOT',
                        help='Root directory containing the tests')
    parser.add_argument('-t', '--threshold', type=int, default=5,
                        help='Warn if a token repeats itself more often then '
                             'this number.')
    parser.add_argument('-s', '--single', action='store_true', default=False,
                        help='Only look at tokens matching a single character')
    args = parser.parse_args()
    sys.exit(main(args))