diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-09-19 04:14:26 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-09-19 04:14:26 +0000 |
commit | c4e8a3222648fcf22ca207f1815ebbf7cd144eeb (patch) | |
tree | 93d5c6aa93d9987680dd1adad5685e2ad698f223 /tools/check_spelling.py | |
parent | Adding upstream version 4.2.6. (diff) | |
download | wireshark-c4e8a3222648fcf22ca207f1815ebbf7cd144eeb.tar.xz wireshark-c4e8a3222648fcf22ca207f1815ebbf7cd144eeb.zip |
Adding upstream version 4.4.0. (refs: upstream/4.4.0, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tools/check_spelling.py')
-rwxr-xr-x | tools/check_spelling.py | 179 |
1 files changed, 153 insertions, 26 deletions
diff --git a/tools/check_spelling.py b/tools/check_spelling.py index 7e319081..be0bbf82 100755 --- a/tools/check_spelling.py +++ b/tools/check_spelling.py @@ -11,10 +11,18 @@ import re import subprocess import argparse import signal +import glob + +from spellchecker import SpellChecker from collections import Counter +from html.parser import HTMLParser +import urllib.request # Looks for spelling errors among strings found in source or documentation files. -# N.B. To run this script, you should install pyspellchecker (not spellchecker) using pip. +# N.B., +# - To run this script, you should install pyspellchecker (not spellchecker) using pip. +# - Because of colouring, you may want to pipe into less -R + # TODO: check structured doxygen comments? @@ -44,12 +52,12 @@ signal.signal(signal.SIGINT, signal_handler) # Create spellchecker, and augment with some Wireshark words. -from spellchecker import SpellChecker # Set up our dict with words from text file. spell = SpellChecker() spell.word_frequency.load_text_file('./tools/wireshark_words.txt') + # Track words that were not found. missing_words = [] @@ -67,7 +75,8 @@ class File: self.values = [] filename, extension = os.path.splitext(file) - self.code_file = extension in {'.c', '.cpp'} + # TODO: add '.lua'? Would also need to check string and comment formats... + self.code_file = extension in {'.c', '.cpp', '.h' } with open(file, 'r', encoding="utf8") as f: @@ -124,7 +133,6 @@ class File: def checkMultiWordsRecursive(self, word): length = len(word) - #print('word=', word) if length < 4: return False @@ -159,6 +167,12 @@ class File: v = str(v) + # Sometimes parentheses used to show optional letters, so don't leave space + #if re.compile(r"^[\S]*\(").search(v): + # v = v.replace('(', '') + #if re.compile(r"\S\)").search(v): + # v = v.replace(')', '') + # Ignore includes. 
if v.endswith('.h'): continue @@ -191,17 +205,19 @@ class File: v = v.replace('?', ' ') v = v.replace('=', ' ') v = v.replace('*', ' ') + v = v.replace('%u', '') + v = v.replace('%d', '') + v = v.replace('%s', '') v = v.replace('%', ' ') v = v.replace('#', ' ') v = v.replace('&', ' ') v = v.replace('@', ' ') v = v.replace('$', ' ') + v = v.replace('^', ' ') v = v.replace('®', '') v = v.replace("'", ' ') v = v.replace('"', ' ') - v = v.replace('%u', '') - v = v.replace('%d', '') - v = v.replace('%s', '') + v = v.replace('~', ' ') # Split into words. value_words = v.split() @@ -225,11 +241,14 @@ class File: if word.endswith("s’"): word = word[:-2] + if self.numberPlusUnits(word): continue if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word): - print(self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC, + # Highlight words that appeared in Wikipedia list. + print(bcolors.BOLD if word in wiki_db else '', + self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC, ' -> ', '?') # TODO: this can be interesting, but takes too long! @@ -261,9 +280,24 @@ def removeContractions(code_string): def removeComments(code_string): code_string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "" , code_string) # C-style comment # Avoid matching // where it is allowed, e.g., https://www... or file:///... 
- code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\"\s\s)(?<!file:/)//.*?\n" ) ,"" , code_string) # C++-style comment + code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\")(?<!\"\s\s)(?<!file:/)(?<!\,\s)//.*?\n" ) ,"" , code_string) # C++-style comment return code_string +def getCommentWords(code_string): + words = [] + + # C++ comments + matches = re.finditer(r'//\s(.*?)\n', code_string) + for m in matches: + words += m.group(1).split() + + # C comments + matches = re.finditer(r'/\*(.*?)\*/', code_string) + for m in matches: + words += m.group(1).split() + + return words + def removeSingleQuotes(code_string): code_string = code_string.replace('\\\\', " ") # Separate at \\ code_string = code_string.replace('\"\\\\\"', "") @@ -287,7 +321,7 @@ def removeHexSpecifiers(code_string): # Create a File object that knows about all of the strings in the given file. -def findStrings(filename): +def findStrings(filename, check_comments=False): with open(filename, 'r', encoding="utf8") as f: contents = f.read() @@ -302,7 +336,14 @@ def findStrings(filename): # What we check depends upon file type. if file.code_file: + # May want to check comments for selected dissectors + if check_comments: + comment_words = getCommentWords(contents) + for w in comment_words: + file.add(w) + contents = removeComments(contents) + # Code so only checking strings. 
matches = re.finditer(r'\"([^\"]*)\"', contents) for m in matches: @@ -328,6 +369,9 @@ def isGeneratedFile(filename): if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'): return True + if filename.endswith('packet-woww.c'): + return True + # Open file f_read = open(os.path.join(filename), 'r', encoding="utf8") for line_no,line in enumerate(f_read): @@ -358,7 +402,8 @@ def isAppropriateFile(filename): file, extension = os.path.splitext(filename) if filename.find('CMake') != -1: return False - return extension in { '.adoc', '.c', '.cpp', '.pod', '.nsi', '.txt'} or file.endswith('README') + # TODO: add , '.lua' ? + return extension in { '.adoc', '.c', '.cpp', '.pod', '.txt' } or file.endswith('README') def findFilesInFolder(folder, recursive=True): @@ -382,13 +427,13 @@ def findFilesInFolder(folder, recursive=True): # Check the given file. -def checkFile(filename): +def checkFile(filename, check_comments=False): # Check file exists - e.g. may have been deleted in a recent commit. 
if not os.path.exists(filename): print(filename, 'does not exist!') return - file = findStrings(filename) + file = findStrings(filename, check_comments) file.spellCheck() @@ -401,17 +446,82 @@ def checkFile(filename): parser = argparse.ArgumentParser(description='Check spellings in specified files') parser.add_argument('--file', action='append', help='specify individual file to test') -parser.add_argument('--folder', action='store', default='', +parser.add_argument('--folder', action='append', help='specify folder to test') +parser.add_argument('--glob', action='append', + help='specify glob to test - should give in "quotes"') parser.add_argument('--no-recurse', action='store_true', default='', - help='do not recurse inside chosen folder') + help='do not recurse inside chosen folder(s)') parser.add_argument('--commits', action='store', help='last N commits to check') parser.add_argument('--open', action='store_true', help='check open files') +parser.add_argument('--comments', action='store_true', + help='check comments in source files') +parser.add_argument('--no-wikipedia', action='store_true', + help='skip checking known bad words from wikipedia - can be slow') + args = parser.parse_args() +class TypoSourceDocumentParser(HTMLParser): + def __init__(self): + super().__init__() + self.capturing = False + self.content = '' + + def handle_starttag(self, tag, attrs): + if tag == 'pre': + self.capturing = True + + def handle_endtag(self, tag): + if tag == 'pre': + self.capturing = False + + def handle_data(self, data): + if self.capturing: + self.content += data + + +# Fetch some common mispellings from wikipedia so we will definitely flag them. 
+wiki_db = dict() +if not args.no_wikipedia: + print('Fetching Wikipedia\'s list of common misspellings.') + req_headers = { 'User-Agent': 'Wireshark check-wikipedia-typos' } + req = urllib.request.Request('https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines', headers=req_headers) + try: + response = urllib.request.urlopen(req) + content = response.read() + content = content.decode('UTF-8', 'replace') + + # Extract the "<pre>...</pre>" part of the document. + parser = TypoSourceDocumentParser() + parser.feed(content) + content = parser.content.strip() + + wiki_db = dict(line.lower().split('->', maxsplit=1) for line in content.splitlines()) + del wiki_db['cmo'] # All false positives. + del wiki_db['ect'] # Too many false positives. + del wiki_db['thru'] # We'll let that one thru. ;-) + del wiki_db['sargeant'] # All false positives. + + # Remove each word from dict + removed = 0 + for word in wiki_db: + try: + if should_exit: + exit(1) + spell.word_frequency.remove_words([word]) + #print('Removed', word) + removed += 1 + except Exception: + pass + + print('Removed', removed, 'known bad words') + except Exception: + print('Failed to fetch and/or parse Wikipedia mispellings!') + + # Get files from wherever command-line args indicate. files = [] @@ -423,14 +533,15 @@ if args.file: exit(1) else: files.append(f) -elif args.commits: +if args.commits: # Get files affected by specified number of commits. command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits] files = [f.decode('utf-8') for f in subprocess.check_output(command).splitlines()] # Filter files files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files)) -elif args.open: + +if args.open: # Unstaged changes. command = ['git', 'diff', '--name-only'] files = [f.decode('utf-8') @@ -444,26 +555,42 @@ elif args.open: # Filter files. 
files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged)) for f in files_staged: - if not f in files: + if f not in files: files.append(f) -else: - # By default, scan dissectors directory - folder = os.path.join('epan', 'dissectors') - # But overwrite with any folder entry. - if args.folder: - folder = args.folder + +if args.glob: + # Add specified file(s) + for g in args.glob: + for f in glob.glob(g): + if not os.path.isfile(f): + print('Chosen file', f, 'does not exist.') + exit(1) + else: + files.append(f) + +if args.folder: + for folder in args.folder: if not os.path.isdir(folder): print('Folder', folder, 'not found!') exit(1) + # Find files from folder. + print('Looking for files in', folder) + files += findFilesInFolder(folder, not args.no_recurse) + +# By default, scan dissector files. +if not args.file and not args.open and not args.commits and not args.glob and not args.folder: + # By default, scan dissectors directory + folder = os.path.join('epan', 'dissectors') # Find files from folder. print('Looking for files in', folder) files = findFilesInFolder(folder, not args.no_recurse) + # If scanning a subset of files, list them here. print('Examining:') -if args.file or args.folder or args.commits or args.open: +if args.file or args.folder or args.commits or args.open or args.glob: if files: print(' '.join(files), '\n') else: @@ -475,7 +602,7 @@ else: # Now check the chosen files. for f in files: # Check this file. - checkFile(f) + checkFile(f, check_comments=args.comments) # But get out if control-C has been pressed. if should_exit: exit(1) |