From e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc Mon Sep 17 00:00:00 2001
From: Daniel Baumann <daniel.baumann@progress-linux.org>
Date: Wed, 10 Apr 2024 22:34:10 +0200
Subject: Adding upstream version 4.2.2.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
---
 tools/check_spelling.py | 493 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 493 insertions(+)
 create mode 100755 tools/check_spelling.py

(limited to 'tools/check_spelling.py')

diff --git a/tools/check_spelling.py b/tools/check_spelling.py
new file mode 100755
index 0000000..7e31908
--- /dev/null
+++ b/tools/check_spelling.py
@@ -0,0 +1,493 @@
+#!/usr/bin/env python3
+# Wireshark - Network traffic analyzer
+# By Gerald Combs <gerald@wireshark.org>
+# Copyright 1998 Gerald Combs
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+import os
+import sys
+import re
+import subprocess
+import argparse
+import signal
+from collections import Counter
+
+# Looks for spelling errors among strings found in source or documentation files.
+# N.B. To run this script, you should install pyspellchecker (not spellchecker) using pip.
+
+# TODO: check structured doxygen comments?
+
+# For text colouring/highlighting.
+class bcolors:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    ADDED = '\033[45m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+# Try to exit soon after Ctrl-C is pressed.
+should_exit = False
+
+def signal_handler(sig, frame):
+    global should_exit
+    should_exit = True
+    print('You pressed Ctrl+C - exiting')
+
+signal.signal(signal.SIGINT, signal_handler)
+
+
+
+# Create spellchecker, and augment with some Wireshark words.
+from spellchecker import SpellChecker
+# Set up our dict with words from text file.
+spell = SpellChecker()
+spell.word_frequency.load_text_file('./tools/wireshark_words.txt')
+
+
+# Track words that were not found.
+missing_words = []
+
+
+# Split camelCase string into separate words.
+def camelCaseSplit(identifier):
+    matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
+    return [m.group(0) for m in matches]
+
+
+# A File object contains all of the strings to be checked for a given file.
+class File:
+    def __init__(self, file):
+        self.file = file
+        self.values = []
+
+        filename, extension = os.path.splitext(file)
+        self.code_file = extension in {'.c', '.cpp'}
+
+
+        with open(file, 'r', encoding="utf8") as f:
+            contents = f.read()
+
+            if self.code_file:
+                # Remove comments so as not to trip up RE.
+                contents = removeComments(contents)
+
+            # Find protocol name and add to dict.
+            # N.B. doesn't work when a variable is used instead of a literal for the protocol name...
+            matches = re.finditer(r'proto_register_protocol\s*\([\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\",[\n\r\s]*\"(.*)\"', contents)
+            for m in matches:
+                protocol = m.group(3)
+                # Add to dict.
+                spell.word_frequency.load_words([protocol])
+                spell.known([protocol])
+                print('Protocol is: ' + bcolors.BOLD +  protocol + bcolors.ENDC)
+
+    # Add a string found in this file.
+    def add(self, value):
+        self.values.append(value.encode('utf-8') if sys.platform.startswith('win') else value)
+
+    # Whole word is not recognised, but is it 2 words concatenated (without camelcase) ?
+    def checkMultiWords(self, word):
+        if len(word) < 6:
+            return False
+
+        # Don't consider if mixed cases.
+        if not (word.islower() or word.isupper()):
+            # But make an exception if only the fist letter is uppercase..
+            if not word == (word[0].upper() + word[1:]):
+                return False
+
+        # Try splitting into 2 words recognised at various points.
+        # Allow 3-letter words.
+        length = len(word)
+        for idx in range(3, length-3):
+            word1 = word[0:idx]
+            word2 = word[idx:]
+
+            if not spell.unknown([word1, word2]):
+                return True
+
+        return self.checkMultiWordsRecursive(word)
+
+    # If word before 'id' is recognised, accept word.
+    def wordBeforeId(self, word):
+        if word.lower().endswith('id'):
+            if not spell.unknown([word[0:len(word)-2]]):
+                return True
+            else:
+                return False
+
+    def checkMultiWordsRecursive(self, word):
+        length = len(word)
+        #print('word=', word)
+        if length < 4:
+            return False
+
+        for idx in range(4, length+1):
+            w = word[0:idx]
+            if not spell.unknown([w]):
+                if idx == len(word):
+                    return True
+                else:
+                    if self.checkMultiWordsRecursive(word[idx:]):
+                        return True
+
+        return False
+
+    def numberPlusUnits(self, word):
+        m = re.search(r'^([0-9]+)([a-zA-Z]+)$', word)
+        if m:
+            if m.group(2).lower() in { "bit", "bits", "gb", "kbps", "gig", "mb", "th", "mhz", "v", "hz", "k",
+                                       "mbps", "m", "g", "ms", "nd", "nds", "rd", "kb", "kbit", "ghz",
+                                       "khz", "km", "ms", "usec", "sec", "gbe", "ns", "ksps", "qam", "mm" }:
+                return True
+        return False
+
+
+    # Check the spelling of all the words we have found
+    def spellCheck(self):
+
+        num_values = len(self.values)
+        for value_index,v in enumerate(self.values):
+            if should_exit:
+                exit(1)
+
+            v = str(v)
+
+            # Ignore includes.
+            if v.endswith('.h'):
+                continue
+
+            # Store original (as want to include for context in error report).
+            original = str(v)
+
+            # Replace most punctuation with spaces, and eliminate common format specifiers.
+            v = v.replace('.', ' ')
+            v = v.replace(',', ' ')
+            v = v.replace('`', ' ')
+            v = v.replace(':', ' ')
+            v = v.replace(';', ' ')
+            v = v.replace('"', ' ')
+            v = v.replace('\\', ' ')
+            v = v.replace('+', ' ')
+            v = v.replace('|', ' ')
+            v = v.replace('(', ' ')
+            v = v.replace(')', ' ')
+            v = v.replace('[', ' ')
+            v = v.replace(']', ' ')
+            v = v.replace('{', ' ')
+            v = v.replace('}', ' ')
+            v = v.replace('<', ' ')
+            v = v.replace('>', ' ')
+            v = v.replace('_', ' ')
+            v = v.replace('-', ' ')
+            v = v.replace('/', ' ')
+            v = v.replace('!', ' ')
+            v = v.replace('?', ' ')
+            v = v.replace('=', ' ')
+            v = v.replace('*', ' ')
+            v = v.replace('%', ' ')
+            v = v.replace('#', ' ')
+            v = v.replace('&', ' ')
+            v = v.replace('@', ' ')
+            v = v.replace('$', ' ')
+            v = v.replace('®', '')
+            v = v.replace("'", ' ')
+            v = v.replace('"', ' ')
+            v = v.replace('%u', '')
+            v = v.replace('%d', '')
+            v = v.replace('%s', '')
+
+            # Split into words.
+            value_words = v.split()
+            # Further split up any camelCase words.
+            words = []
+            for w in value_words:
+                words +=  camelCaseSplit(w)
+
+            # Check each word within this string in turn.
+            for word in words:
+                # Strip trailing digits from word.
+                word = word.rstrip('1234567890')
+
+                # Quote marks found in some of the docs...
+                word = word.replace('“', '')
+                word = word.replace('”', '')
+
+                # Single and collective possession
+                if word.endswith("’s"):
+                    word = word[:-2]
+                if word.endswith("s’"):
+                    word = word[:-2]
+
+                if self.numberPlusUnits(word):
+                    continue
+
+                if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word):
+                    print(self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
+                          ' -> ', '?')
+
+                    # TODO: this can be interesting, but takes too long!
+                    # bcolors.OKGREEN + spell.correction(word) + bcolors.ENDC
+                    global missing_words
+                    missing_words.append(word)
+
+def removeWhitespaceControl(code_string):
+    code_string = code_string.replace('\\n', ' ')
+    code_string = code_string.replace('\\r', ' ')
+    code_string = code_string.replace('\\t', ' ')
+    return code_string
+
+# Remove any contractions from the given string.
+def removeContractions(code_string):
+    contractions = [ "wireshark’s", "don’t", "let’s", "isn’t", "won’t", "user’s", "hasn’t", "you’re", "o’clock", "you’ll",
+                     "you’d", "developer’s", "doesn’t", "what’s", "let’s", "haven’t", "can’t", "you’ve",
+                     "shouldn’t", "didn’t", "wouldn’t", "aren’t", "there’s", "packet’s", "couldn’t", "world’s",
+                     "needn’t", "graph’s", "table’s", "parent’s", "entity’s", "server’s", "node’s",
+                     "querier’s", "sender’s", "receiver’s", "computer’s", "frame’s", "vendor’s", "system’s",
+                     "we’ll", "asciidoctor’s", "protocol’s", "microsoft’s", "wasn’t" ]
+    for c in contractions:
+        code_string = code_string.replace(c, "")
+        code_string = code_string.replace(c.capitalize(), "")
+        code_string = code_string.replace(c.replace('’', "'"), "")
+        code_string = code_string.replace(c.capitalize().replace('’', "'"), "")
+    return code_string
+
+def removeComments(code_string):
+    code_string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "" , code_string) # C-style comment
+    # Avoid matching // where it is allowed, e.g.,  https://www... or file:///...
+    code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\"\s\s)(?<!file:/)//.*?\n" ) ,"" , code_string)             # C++-style comment
+    return code_string
+
+def removeSingleQuotes(code_string):
+    code_string = code_string.replace('\\\\', " ")        # Separate at \\
+    code_string = code_string.replace('\"\\\\\"', "")
+    code_string = code_string.replace("\\\"", " ")
+    code_string = code_string.replace("'\"'", "")
+    code_string = code_string.replace('…', ' ')
+    return code_string
+
+def removeHexSpecifiers(code_string):
+    # Find all hex numbers
+
+    looking = True
+    while looking:
+        m = re.search(r'(0x[0-9a-fA-F]*)', code_string)
+        if m:
+            code_string = code_string.replace(m.group(0), "")
+        else:
+            looking = False
+
+    return code_string
+
+
+# Create a File object that knows about all of the strings in the given file.
+def findStrings(filename):
+    with open(filename, 'r', encoding="utf8") as f:
+        contents = f.read()
+
+        # Remove comments & embedded quotes so as not to trip up RE.
+        contents = removeContractions(contents)
+        contents = removeWhitespaceControl(contents)
+        contents = removeSingleQuotes(contents)
+        contents = removeHexSpecifiers(contents)
+
+        # Create file object.
+        file = File(filename)
+
+        # What we check depends upon file type.
+        if file.code_file:
+            contents = removeComments(contents)
+            # Code so only checking strings.
+            matches = re.finditer(r'\"([^\"]*)\"', contents)
+            for m in matches:
+                file.add(m.group(1))
+        else:
+            # A documentation file, so examine all words.
+            for w in contents.split():
+                file.add(w)
+
+        return file
+
+
+# Test for whether the given file was automatically generated.
+def isGeneratedFile(filename):
+    # Check file exists - e.g. may have been deleted in a recent commit.
+    if not os.path.exists(filename):
+        return False
+
+    if not filename.endswith('.c'):
+        return False
+
+    # This file is generated, but notice is further in than want to check for all files
+    if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'):
+        return True
+
+    # Open file
+    f_read = open(os.path.join(filename), 'r', encoding="utf8")
+    for line_no,line in enumerate(f_read):
+        # The comment to say that its generated is near the top, so give up once
+        # get a few lines down.
+        if line_no > 10:
+            f_read.close()
+            return False
+        if (line.find('Generated automatically') != -1 or
+            line.find('Autogenerated from') != -1 or
+            line.find('is autogenerated') != -1 or
+            line.find('automatically generated by Pidl') != -1 or
+            line.find('Created by: The Qt Meta Object Compiler') != -1 or
+            line.find('This file was generated') != -1 or
+            line.find('This filter was automatically generated') != -1 or
+            line.find('This file is auto generated, do not edit!') != -1 or
+            line.find('this file is automatically generated') != -1):
+
+            f_read.close()
+            return True
+
+    # OK, looks like a hand-written file!
+    f_read.close()
+    return False
+
+
+def isAppropriateFile(filename):
+    file, extension = os.path.splitext(filename)
+    if filename.find('CMake') != -1:
+        return False
+    return extension in { '.adoc', '.c', '.cpp', '.pod', '.nsi', '.txt'} or file.endswith('README')
+
+
+def findFilesInFolder(folder, recursive=True):
+    files_to_check = []
+
+    if recursive:
+        for root, subfolders, files in os.walk(folder):
+            for f in files:
+                if should_exit:
+                    return
+                f = os.path.join(root, f)
+                if isAppropriateFile(f) and not isGeneratedFile(f):
+                    files_to_check.append(f)
+    else:
+        for f in sorted(os.listdir(folder)):
+            f = os.path.join(folder, f)
+            if isAppropriateFile(f) and not isGeneratedFile(f):
+                files_to_check.append(f)
+
+    return files_to_check
+
+
+# Check the given file.
+def checkFile(filename):
+    # Check file exists - e.g. may have been deleted in a recent commit.
+    if not os.path.exists(filename):
+        print(filename, 'does not exist!')
+        return
+
+    file = findStrings(filename)
+    file.spellCheck()
+
+
+
+#################################################################
+# Main logic.
+
+# command-line args.  Controls which files should be checked.
+# If no args given, will just scan epan/dissectors folder.
+parser = argparse.ArgumentParser(description='Check spellings in specified files')
+parser.add_argument('--file', action='append',
+                    help='specify individual file to test')
+parser.add_argument('--folder', action='store', default='',
+                    help='specify folder to test')
+parser.add_argument('--no-recurse', action='store_true', default='',
+                    help='do not recurse inside chosen folder')
+parser.add_argument('--commits', action='store',
+                    help='last N commits to check')
+parser.add_argument('--open', action='store_true',
+                    help='check open files')
+
+args = parser.parse_args()
+
+
+# Get files from wherever command-line args indicate.
+files = []
+if args.file:
+    # Add specified file(s)
+    for f in args.file:
+        if not os.path.isfile(f):
+            print('Chosen file', f, 'does not exist.')
+            exit(1)
+        else:
+            files.append(f)
+elif args.commits:
+    # Get files affected by specified number of commits.
+    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
+    files = [f.decode('utf-8')
+             for f in subprocess.check_output(command).splitlines()]
+    # Filter files
+    files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))
+elif args.open:
+    # Unstaged changes.
+    command = ['git', 'diff', '--name-only']
+    files = [f.decode('utf-8')
+             for f in subprocess.check_output(command).splitlines()]
+    # Filter files.
+    files = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files))
+    # Staged changes.
+    command = ['git', 'diff', '--staged', '--name-only']
+    files_staged = [f.decode('utf-8')
+                    for f in subprocess.check_output(command).splitlines()]
+    # Filter files.
+    files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
+    for f in files_staged:
+        if not f in files:
+            files.append(f)
+else:
+    # By default, scan dissectors directory
+    folder = os.path.join('epan', 'dissectors')
+    # But overwrite with any folder entry.
+    if args.folder:
+        folder = args.folder
+        if not os.path.isdir(folder):
+            print('Folder', folder, 'not found!')
+            exit(1)
+
+    # Find files from folder.
+    print('Looking for files in', folder)
+    files = findFilesInFolder(folder, not args.no_recurse)
+
+
+# If scanning a subset of files, list them here.
+print('Examining:')
+if args.file or args.folder or args.commits or args.open:
+    if files:
+        print(' '.join(files), '\n')
+    else:
+        print('No files to check.\n')
+else:
+    print('All dissector modules\n')
+
+
+# Now check the chosen files.
+for f in files:
+    # Check this file.
+    checkFile(f)
+    # But get out if control-C has been pressed.
+    if should_exit:
+        exit(1)
+
+
+
+# Show the most commonly not-recognised words.
+print('')
+counter = Counter(missing_words).most_common(100)
+if len(counter) > 0:
+    for c in counter:
+        print(c[0], ':', c[1])
+
+# Show error count.
+print('\n' + bcolors.BOLD + str(len(missing_words)) + ' issues found' + bcolors.ENDC + '\n')
-- 
cgit v1.2.3