diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 20:34:10 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-10 20:34:10 +0000 |
commit | e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc (patch) | |
tree | 68cb5ef9081156392f1dd62a00c6ccc1451b93df /tools/check_spelling.py | |
parent | Initial commit. (diff) | |
download | wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.tar.xz wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.zip |
Adding upstream version 4.2.2.upstream/4.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tools/check_spelling.py')
-rwxr-xr-x | tools/check_spelling.py | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/tools/check_spelling.py b/tools/check_spelling.py new file mode 100755 index 00000000..7e319081 --- /dev/null +++ b/tools/check_spelling.py @@ -0,0 +1,493 @@ +#!/usr/bin/env python3 +# Wireshark - Network traffic analyzer +# By Gerald Combs <gerald@wireshark.org> +# Copyright 1998 Gerald Combs +# +# SPDX-License-Identifier: GPL-2.0-or-later + +import os +import sys +import re +import subprocess +import argparse +import signal +from collections import Counter + +# Looks for spelling errors among strings found in source or documentation files. +# N.B. To run this script, you should install pyspellchecker (not spellchecker) using pip. + +# TODO: check structured doxygen comments? + +# For text colouring/highlighting. +class bcolors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + ADDED = '\033[45m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + +# Try to exit soon after Ctrl-C is pressed. +should_exit = False + +def signal_handler(sig, frame): + global should_exit + should_exit = True + print('You pressed Ctrl+C - exiting') + +signal.signal(signal.SIGINT, signal_handler) + + + +# Create spellchecker, and augment with some Wireshark words. +from spellchecker import SpellChecker +# Set up our dict with words from text file. +spell = SpellChecker() +spell.word_frequency.load_text_file('./tools/wireshark_words.txt') + + +# Track words that were not found. +missing_words = [] + + +# Split camelCase string into separate words. +def camelCaseSplit(identifier): + matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier) + return [m.group(0) for m in matches] + + +# A File object contains all of the strings to be checked for a given file. +class File: + def __init__(self, file): + self.file = file + self.values = [] + + filename, extension = os.path.splitext(file) + self.code_file = extension in {'.c', '.cpp'} + + + with open(file, 'r', encoding="utf8") as f: + contents = f.read() + + if self.code_file: + # Remove comments so as not to trip up RE. + contents = removeComments(contents) + + # Find protocol name and add to dict. + # N.B. doesn't work when a variable is used instead of a literal for the protocol name... + matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents) + for m in matches: + protocol = m.group(3) + # Add to dict. + spell.word_frequency.load_words([protocol]) + spell.known([protocol]) + print('Protocol is: ' + bcolors.BOLD + protocol + bcolors.ENDC) + + # Add a string found in this file. + def add(self, value): + self.values.append(value.encode('utf-8') if sys.platform.startswith('win') else value) + + # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ? + def checkMultiWords(self, word): + if len(word) < 6: + return False + + # Don't consider if mixed cases. + if not (word.islower() or word.isupper()): + # But make an exception if only the fist letter is uppercase.. + if not word == (word[0].upper() + word[1:]): + return False + + # Try splitting into 2 words recognised at various points. + # Allow 3-letter words. + length = len(word) + for idx in range(3, length-3): + word1 = word[0:idx] + word2 = word[idx:] + + if not spell.unknown([word1, word2]): + return True + + return self.checkMultiWordsRecursive(word) + + # If word before 'id' is recognised, accept word. + def wordBeforeId(self, word): + if word.lower().endswith('id'): + if not spell.unknown([word[0:len(word)-2]]): + return True + else: + return False + + def checkMultiWordsRecursive(self, word): + length = len(word) + #print('word=', word) + if length < 4: + return False + + for idx in range(4, length+1): + w = word[0:idx] + if not spell.unknown([w]): + if idx == len(word): + return True + else: + if self.checkMultiWordsRecursive(word[idx:]): + return True + + return False + + def numberPlusUnits(self, word): + m = re.search(r'^([0-9]+)([a-zA-Z]+)$', word) + if m: + if m.group(2).lower() in { "bit", "bits", "gb", "kbps", "gig", "mb", "th", "mhz", "v", "hz", "k", + "mbps", "m", "g", "ms", "nd", "nds", "rd", "kb", "kbit", "ghz", + "khz", "km", "ms", "usec", "sec", "gbe", "ns", "ksps", "qam", "mm" }: + return True + return False + + + # Check the spelling of all the words we have found + def spellCheck(self): + + num_values = len(self.values) + for value_index,v in enumerate(self.values): + if should_exit: + exit(1) + + v = str(v) + + # Ignore includes. + if v.endswith('.h'): + continue + + # Store original (as want to include for context in error report). + original = str(v) + + # Replace most punctuation with spaces, and eliminate common format specifiers. + v = v.replace('.', ' ') + v = v.replace(',', ' ') + v = v.replace('`', ' ') + v = v.replace(':', ' ') + v = v.replace(';', ' ') + v = v.replace('"', ' ') + v = v.replace('\\', ' ') + v = v.replace('+', ' ') + v = v.replace('|', ' ') + v = v.replace('(', ' ') + v = v.replace(')', ' ') + v = v.replace('[', ' ') + v = v.replace(']', ' ') + v = v.replace('{', ' ') + v = v.replace('}', ' ') + v = v.replace('<', ' ') + v = v.replace('>', ' ') + v = v.replace('_', ' ') + v = v.replace('-', ' ') + v = v.replace('/', ' ') + v = v.replace('!', ' ') + v = v.replace('?', ' ') + v = v.replace('=', ' ') + v = v.replace('*', ' ') + v = v.replace('%', ' ') + v = v.replace('#', ' ') + v = v.replace('&', ' ') + v = v.replace('@', ' ') + v = v.replace('$', ' ') + v = v.replace('®', '') + v = v.replace("'", ' ') + v = v.replace('"', ' ') + v = v.replace('%u', '') + v = v.replace('%d', '') + v = v.replace('%s', '') + + # Split into words. + value_words = v.split() + # Further split up any camelCase words. + words = [] + for w in value_words: + words += camelCaseSplit(w) + + # Check each word within this string in turn. + for word in words: + # Strip trailing digits from word. + word = word.rstrip('1234567890') + + # Quote marks found in some of the docs... + word = word.replace('“', '') + word = word.replace('”', '') + + # Single and collective possession + if word.endswith("’s"): + word = word[:-2] + if word.endswith("s’"): + word = word[:-2] + + if self.numberPlusUnits(word): + continue + + if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word): + print(self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC, + ' -> ', '?') + + # TODO: this can be interesting, but takes too long! + # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC + global missing_words + missing_words.append(word) + +def removeWhitespaceControl(code_string): + code_string = code_string.replace('\\n', ' ') + code_string = code_string.replace('\\r', ' ') + code_string = code_string.replace('\\t', ' ') + return code_string + +# Remove any contractions from the given string. +def removeContractions(code_string): + contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll", + "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve", + "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s", + "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s", + "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s", + "we’ll", "asciidoctor’s", "protocol’s", "microsoft’s", "wasn’t" ] + for c in contractions: + code_string = code_string.replace(c, "") + code_string = code_string.replace(c.capitalize(), "") + code_string = code_string.replace(c.replace('’', "'"), "") + code_string = code_string.replace(c.capitalize().replace('’', "'"), "") + return code_string + +def removeComments(code_string): + code_string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "" , code_string) # C-style comment + # Avoid matching // where it is allowed, e.g., https://www... or file:///... + code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\"\s\s)(?<!file:/)//.*?\n" ) ,"" , code_string) # C++-style comment + return code_string + +def removeSingleQuotes(code_string): + code_string = code_string.replace('\\\\', " ") # Separate at \\ + code_string = code_string.replace('\"\\\\\"', "") + code_string = code_string.replace("\\\"", " ") + code_string = code_string.replace("'\"'", "") + code_string = code_string.replace('…', ' ') + return code_string + +def removeHexSpecifiers(code_string): + # Find all hex numbers + + looking = True + while looking: + m = re.search(r'(0x[0-9a-fA-F]*)', code_string) + if m: + code_string = code_string.replace(m.group(0), "") + else: + looking = False + + return code_string + + +# Create a File object that knows about all of the strings in the given file. +def findStrings(filename): + with open(filename, 'r', encoding="utf8") as f: + contents = f.read() + + # Remove comments & embedded quotes so as not to trip up RE. + contents = removeContractions(contents) + contents = removeWhitespaceControl(contents) + contents = removeSingleQuotes(contents) + contents = removeHexSpecifiers(contents) + + # Create file object. + file = File(filename) + + # What we check depends upon file type. + if file.code_file: + contents = removeComments(contents) + # Code so only checking strings. + matches = re.finditer(r'\"([^\"]*)\"', contents) + for m in matches: + file.add(m.group(1)) + else: + # A documentation file, so examine all words. + for w in contents.split(): + file.add(w) + + return file + + +# Test for whether the given file was automatically generated. +def isGeneratedFile(filename): + # Check file exists - e.g. may have been deleted in a recent commit. + if not os.path.exists(filename): + return False + + if not filename.endswith('.c'): + return False + + # This file is generated, but notice is further in than want to check for all files + if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'): + return True + + # Open file + f_read = open(os.path.join(filename), 'r', encoding="utf8") + for line_no,line in enumerate(f_read): + # The comment to say that its generated is near the top, so give up once + # get a few lines down. + if line_no > 10: + f_read.close() + return False + if (line.find('Generated automatically') != -1 or + line.find('Autogenerated from') != -1 or + line.find('is autogenerated') != -1 or + line.find('automatically generated by Pidl') != -1 or + line.find('Created by: The Qt Meta Object Compiler') != -1 or + line.find('This file was generated') != -1 or + line.find('This filter was automatically generated') != -1 or + line.find('This file is auto generated, do not edit!') != -1 or + line.find('this file is automatically generated') != -1): + + f_read.close() + return True + + # OK, looks like a hand-written file! + f_read.close() + return False + + +def isAppropriateFile(filename): + file, extension = os.path.splitext(filename) + if filename.find('CMake') != -1: + return False + return extension in { '.adoc', '.c', '.cpp', '.pod', '.nsi', '.txt'} or file.endswith('README') + + +def findFilesInFolder(folder, recursive=True): + files_to_check = [] + + if recursive: + for root, subfolders, files in os.walk(folder): + for f in files: + if should_exit: + return + f = os.path.join(root, f) + if isAppropriateFile(f) and not isGeneratedFile(f): + files_to_check.append(f) + else: + for f in sorted(os.listdir(folder)): + f = os.path.join(folder, f) + if isAppropriateFile(f) and not isGeneratedFile(f): + files_to_check.append(f) + + return files_to_check + + +# Check the given file. +def checkFile(filename): + # Check file exists - e.g. may have been deleted in a recent commit. + if not os.path.exists(filename): + print(filename, 'does not exist!') + return + + file = findStrings(filename) + file.spellCheck() + + + +################################################################# +# Main logic. + +# command-line args. Controls which files should be checked. +# If no args given, will just scan epan/dissectors folder. +parser = argparse.ArgumentParser(description='Check spellings in specified files') +parser.add_argument('--file', action='append', + help='specify individual file to test') +parser.add_argument('--folder', action='store', default='', + help='specify folder to test') +parser.add_argument('--no-recurse', action='store_true', default='', + help='do not recurse inside chosen folder') +parser.add_argument('--commits', action='store', + help='last N commits to check') +parser.add_argument('--open', action='store_true', + help='check open files') + +args = parser.parse_args() + + +# Get files from wherever command-line args indicate. +files = [] +if args.file: + # Add specified file(s) + for f in args.file: + if not os.path.isfile(f): + print('Chosen file', f, 'does not exist.') + exit(1) + else: + files.append(f) +elif args.commits: + # Get files affected by specified number of commits. + command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits] + files = [f.decode('utf-8') + for f in subprocess.check_output(command).splitlines()] + # Filter files + files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files)) +elif args.open: + # Unstaged changes. + command = ['git', 'diff', '--name-only'] + files = [f.decode('utf-8') + for f in subprocess.check_output(command).splitlines()] + # Filter files. + files = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files)) + # Staged changes. + command = ['git', 'diff', '--staged', '--name-only'] + files_staged = [f.decode('utf-8') + for f in subprocess.check_output(command).splitlines()] + # Filter files. + files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged)) + for f in files_staged: + if not f in files: + files.append(f) +else: + # By default, scan dissectors directory + folder = os.path.join('epan', 'dissectors') + # But overwrite with any folder entry. + if args.folder: + folder = args.folder + if not os.path.isdir(folder): + print('Folder', folder, 'not found!') + exit(1) + + # Find files from folder. + print('Looking for files in', folder) + files = findFilesInFolder(folder, not args.no_recurse) + + +# If scanning a subset of files, list them here. +print('Examining:') +if args.file or args.folder or args.commits or args.open: + if files: + print(' '.join(files), '\n') + else: + print('No files to check.\n') +else: + print('All dissector modules\n') + + +# Now check the chosen files. +for f in files: + # Check this file. + checkFile(f) + # But get out if control-C has been pressed. + if should_exit: + exit(1) + + + +# Show the most commonly not-recognised words. +print('') +counter = Counter(missing_words).most_common(100) +if len(counter) > 0: + for c in counter: + print(c[0], ':', c[1]) + +# Show error count. +print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n') |