Diffstat (limited to 'tools/check_spelling.py')
-rwxr-xr-x  tools/check_spelling.py | 179
1 file changed, 153 insertions(+), 26 deletions(-)
diff --git a/tools/check_spelling.py b/tools/check_spelling.py
index 7e319081..be0bbf82 100755
--- a/tools/check_spelling.py
+++ b/tools/check_spelling.py
@@ -11,10 +11,18 @@ import re
import subprocess
import argparse
import signal
+import glob
+
+from spellchecker import SpellChecker
from collections import Counter
+from html.parser import HTMLParser
+import urllib.request
# Looks for spelling errors among strings found in source or documentation files.
-# N.B. To run this script, you should install pyspellchecker (not spellchecker) using pip.
+# N.B.:
+# - To run this script, you should install pyspellchecker (not spellchecker) using pip.
+# - Because of colouring, you may want to pipe the output into less -R
+
# TODO: check structured doxygen comments?
@@ -44,12 +52,12 @@ signal.signal(signal.SIGINT, signal_handler)
# Create spellchecker, and augment with some Wireshark words.
-from spellchecker import SpellChecker
# Set up our dict with words from text file.
spell = SpellChecker()
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')
+
# Track words that were not found.
missing_words = []
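For reference, a minimal sketch of the pyspellchecker calls the script builds on (standalone snippet; the example words are made up):

```python
from spellchecker import SpellChecker

spell = SpellChecker()
# Augment the stock dictionary with project-specific vocabulary.
spell.word_frequency.load_text_file('./tools/wireshark_words.txt')

# unknown() returns the subset of the given words not in the dictionary.
print(spell.unknown(['network', 'netwrok']))   # -> {'netwrok'}

# remove_words() deletes entries so they are flagged again; it can raise
# for absent words, which is why the script wraps it in try/except below.
spell.word_frequency.remove_words(['thru'])
```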
@@ -67,7 +75,8 @@ class File:
self.values = []
filename, extension = os.path.splitext(file)
- self.code_file = extension in {'.c', '.cpp'}
+ # TODO: add '.lua'? Would also need to check string and comment formats...
+ self.code_file = extension in {'.c', '.cpp', '.h'}
with open(file, 'r', encoding="utf8") as f:
@@ -124,7 +133,6 @@ class File:
def checkMultiWordsRecursive(self, word):
length = len(word)
- #print('word=', word)
if length < 4:
return False
@@ -159,6 +167,12 @@ class File:
v = str(v)
+ # Sometimes parentheses are used to show optional letters, so don't leave a space
+ #if re.compile(r"^[\S]*\(").search(v):
+ # v = v.replace('(', '')
+ #if re.compile(r"\S\)").search(v):
+ # v = v.replace(')', '')
+
# Ignore includes.
if v.endswith('.h'):
continue
@@ -191,17 +205,19 @@ class File:
v = v.replace('?', ' ')
v = v.replace('=', ' ')
v = v.replace('*', ' ')
+ v = v.replace('%u', '')
+ v = v.replace('%d', '')
+ v = v.replace('%s', '')
v = v.replace('%', ' ')
v = v.replace('#', ' ')
v = v.replace('&', ' ')
v = v.replace('@', ' ')
v = v.replace('$', ' ')
+ v = v.replace('^', ' ')
v = v.replace('®', '')
v = v.replace("'", ' ')
v = v.replace('"', ' ')
- v = v.replace('%u', '')
- v = v.replace('%d', '')
- v = v.replace('%s', '')
+ v = v.replace('~', ' ')
# Split into words.
value_words = v.split()
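The reordering above is deliberate: whole format specifiers such as %u must be dropped before the lone '%' is turned into a space, or stray one-letter tokens leak into the word list. A standalone illustration (made-up string):

```python
v = 'Sent %u bytes to %s'
v = v.replace('%u', '').replace('%s', '')  # strip whole specifiers first
v = v.replace('%', ' ')
print(v.split())   # -> ['Sent', 'bytes', 'to']

# The old order ('%' first) left stray letters behind:
print('Sent %u bytes to %s'.replace('%', ' ').split())
# -> ['Sent', 'u', 'bytes', 'to', 's']
```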
@@ -225,11 +241,14 @@ class File:
if word.endswith("s’"):
word = word[:-2]
+
if self.numberPlusUnits(word):
continue
if len(word) > 4 and spell.unknown([word]) and not self.checkMultiWords(word) and not self.wordBeforeId(word):
- print(self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
+ # Highlight words that appeared in the Wikipedia misspellings list.
+ print(bcolors.BOLD if word in wiki_db else '',
+ self.file, value_index, '/', num_values, '"' + original + '"', bcolors.FAIL + word + bcolors.ENDC,
' -> ', '?')
# TODO: this can be interesting, but takes too long!
@@ -261,9 +280,24 @@ def removeContractions(code_string):
def removeComments(code_string):
code_string = re.sub(re.compile(r"/\*.*?\*/", re.DOTALL), "" , code_string) # C-style comment
# Avoid matching // where it is allowed, e.g., https://www... or file:///...
- code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\"\s\s)(?<!file:/)//.*?\n" ) ,"" , code_string) # C++-style comment
+ code_string = re.sub(re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\")(?<!\"\s\s)(?<!file:/)(?<!\,\s)//.*?\n" ) ,"" , code_string) # C++-style comment
return code_string
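To see what the lookbehinds buy, here is a standalone sketch of the C++-comment substitution (example strings are made up, and the duplicated (?<!\") lookbehind is dropped here as redundant): an ordinary // comment is stripped, while the // inside a quoted URL survives because it is preceded by ':' or '/':

```python
import re

cpp_comment = re.compile(r"(?<!:)(?<!/)(?<!\")(?<!\"\s\s)(?<!file:/)(?<!\,\s)//.*?\n")

code = 'int x; // a counter\nchar *url = "https://example.com";\n'
print(re.sub(cpp_comment, '', code))
# -> 'int x; char *url = "https://example.com";\n'
```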
+def getCommentWords(code_string):
+ words = []
+
+ # C++ comments
+ matches = re.finditer(r'//\s(.*?)\n', code_string)
+ for m in matches:
+ words += m.group(1).split()
+
+ # C comments
+ matches = re.finditer(r'/\*(.*?)\*/', code_string)
+ for m in matches:
+ words += m.group(1).split()
+
+ return words
+
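A quick check of getCommentWords on a toy snippet (hypothetical input):

```python
snippet = 'int n; // count of pakets\n/* a frgament */\n'
print(getCommentWords(snippet))
# -> ['count', 'of', 'pakets', 'a', 'frgament']
```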
def removeSingleQuotes(code_string):
code_string = code_string.replace('\\\\', " ") # Separate at \\
code_string = code_string.replace('\"\\\\\"', "")
@@ -287,7 +321,7 @@ def removeHexSpecifiers(code_string):
# Create a File object that knows about all of the strings in the given file.
-def findStrings(filename):
+def findStrings(filename, check_comments=False):
with open(filename, 'r', encoding="utf8") as f:
contents = f.read()
@@ -302,7 +336,14 @@ def findStrings(filename):
# What we check depends upon file type.
if file.code_file:
+ # May want to check comments for selected dissectors
+ if check_comments:
+ comment_words = getCommentWords(contents)
+ for w in comment_words:
+ file.add(w)
+
contents = removeComments(contents)
+
# Code so only checking strings.
matches = re.finditer(r'\"([^\"]*)\"', contents)
for m in matches:
@@ -328,6 +369,9 @@ def isGeneratedFile(filename):
if filename.endswith('pci-ids.c') or filename.endswith('services-data.c') or filename.endswith('manuf-data.c'):
return True
+ if filename.endswith('packet-woww.c'):
+ return True
+
# Open file
f_read = open(os.path.join(filename), 'r', encoding="utf8")
for line_no,line in enumerate(f_read):
@@ -358,7 +402,8 @@ def isAppropriateFile(filename):
file, extension = os.path.splitext(filename)
if filename.find('CMake') != -1:
return False
- return extension in { '.adoc', '.c', '.cpp', '.pod', '.nsi', '.txt'} or file.endswith('README')
+ # TODO: add '.lua'?
+ return extension in { '.adoc', '.c', '.cpp', '.pod', '.txt' } or file.endswith('README')
def findFilesInFolder(folder, recursive=True):
@@ -382,13 +427,13 @@ def findFilesInFolder(folder, recursive=True):
# Check the given file.
-def checkFile(filename):
+def checkFile(filename, check_comments=False):
# Check file exists - e.g. may have been deleted in a recent commit.
if not os.path.exists(filename):
print(filename, 'does not exist!')
return
- file = findStrings(filename)
+ file = findStrings(filename, check_comments)
file.spellCheck()
@@ -401,17 +446,82 @@ def checkFile(filename):
parser = argparse.ArgumentParser(description='Check spellings in specified files')
parser.add_argument('--file', action='append',
help='specify individual file to test')
-parser.add_argument('--folder', action='store', default='',
+parser.add_argument('--folder', action='append',
help='specify folder to test')
+parser.add_argument('--glob', action='append',
+ help='specify glob to test - should be given in "quotes"')
parser.add_argument('--no-recurse', action='store_true', default='',
- help='do not recurse inside chosen folder')
+ help='do not recurse inside chosen folder(s)')
parser.add_argument('--commits', action='store',
help='last N commits to check')
parser.add_argument('--open', action='store_true',
help='check open files')
+parser.add_argument('--comments', action='store_true',
+ help='check comments in source files')
+parser.add_argument('--no-wikipedia', action='store_true',
+ help='skip checking known bad words from Wikipedia - can be slow')
+
args = parser.parse_args()
+class TypoSourceDocumentParser(HTMLParser):
+ def __init__(self):
+ super().__init__()
+ self.capturing = False
+ self.content = ''
+
+ def handle_starttag(self, tag, attrs):
+ if tag == 'pre':
+ self.capturing = True
+
+ def handle_endtag(self, tag):
+ if tag == 'pre':
+ self.capturing = False
+
+ def handle_data(self, data):
+ if self.capturing:
+ self.content += data
+
+
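The parser keeps only the text between <pre> tags, which is where the machine-readable list lives on that Wikipedia page. A minimal usage sketch (made-up input):

```python
p = TypoSourceDocumentParser()
p.feed('<html><body><pre>teh->the\nrecieve->receive</pre></body></html>')
print(repr(p.content))   # -> 'teh->the\nrecieve->receive'
```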
+# Fetch some common misspellings from Wikipedia so we will definitely flag them.
+wiki_db = dict()
+if not args.no_wikipedia:
+ print('Fetching Wikipedia\'s list of common misspellings.')
+ req_headers = { 'User-Agent': 'Wireshark check-wikipedia-typos' }
+ req = urllib.request.Request('https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines', headers=req_headers)
+ try:
+ response = urllib.request.urlopen(req)
+ content = response.read()
+ content = content.decode('UTF-8', 'replace')
+
+ # Extract the "<pre>...</pre>" part of the document.
+ parser = TypoSourceDocumentParser()
+ parser.feed(content)
+ content = parser.content.strip()
+
+ wiki_db = dict(line.lower().split('->', maxsplit=1) for line in content.splitlines())
+ del wiki_db['cmo'] # All false positives.
+ del wiki_db['ect'] # Too many false positives.
+ del wiki_db['thru'] # We'll let that one thru. ;-)
+ del wiki_db['sargeant'] # All false positives.
+
+ # Remove each word from the spellchecker's dictionary, so it will be flagged.
+ removed = 0
+ for word in wiki_db:
+ try:
+ if should_exit:
+ exit(1)
+ spell.word_frequency.remove_words([word])
+ #print('Removed', word)
+ removed += 1
+ except Exception:
+ pass
+
+ print('Removed', removed, 'known bad words')
+ except Exception:
+ print('Failed to fetch and/or parse Wikipedia misspellings!')
+
+
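Each line of the fetched list has the form 'misspelling->correction', hence the split on '->' with maxsplit=1. A standalone sketch with a made-up excerpt:

```python
sample = 'abandonned->abandoned\nbeleive->believe, belief'
db = dict(line.lower().split('->', maxsplit=1) for line in sample.splitlines())
print(db)   # -> {'abandonned': 'abandoned', 'beleive': 'believe, belief'}
```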
# Get files from wherever command-line args indicate.
files = []
@@ -423,14 +533,15 @@ if args.file:
exit(1)
else:
files.append(f)
-elif args.commits:
+if args.commits:
# Get files affected by specified number of commits.
command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
files = [f.decode('utf-8')
for f in subprocess.check_output(command).splitlines()]
# Filter files
files = list(filter(lambda f : os.path.exists(f) and isAppropriateFile(f) and not isGeneratedFile(f), files))
-elif args.open:
+
+if args.open:
# Unstaged changes.
command = ['git', 'diff', '--name-only']
files = [f.decode('utf-8')
@@ -444,26 +555,42 @@ elif args.open:
# Filter files.
files_staged = list(filter(lambda f : isAppropriateFile(f) and not isGeneratedFile(f), files_staged))
for f in files_staged:
- if not f in files:
+ if f not in files:
files.append(f)
-else:
- # By default, scan dissectors directory
- folder = os.path.join('epan', 'dissectors')
- # But overwrite with any folder entry.
- if args.folder:
- folder = args.folder
+
+if args.glob:
+ # Add files matching specified glob(s).
+ for g in args.glob:
+ for f in glob.glob(g):
+ if not os.path.isfile(f):
+ print('Chosen file', f, 'does not exist.')
+ exit(1)
+ else:
+ files.append(f)
+
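Because an unquoted pattern would be expanded by the shell before the script sees it, --glob values should be quoted; internally each pattern just goes through glob.glob. A standalone sketch (hypothetical pattern):

```python
import glob

# e.g. ./tools/check_spelling.py --glob 'epan/dissectors/packet-a*.c'
# internally resolves patterns like this:
print(glob.glob('epan/dissectors/packet-a*.c'))   # -> list of matching paths
```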
+if args.folder:
+ for folder in args.folder:
if not os.path.isdir(folder):
print('Folder', folder, 'not found!')
exit(1)
+ # Find files from folder.
+ print('Looking for files in', folder)
+ files += findFilesInFolder(folder, not args.no_recurse)
+
+# By default, scan dissector files.
+if not args.file and not args.open and not args.commits and not args.glob and not args.folder:
+ # By default, scan dissectors directory
+ folder = os.path.join('epan', 'dissectors')
# Find files from folder.
print('Looking for files in', folder)
files = findFilesInFolder(folder, not args.no_recurse)
+
# If scanning a subset of files, list them here.
print('Examining:')
-if args.file or args.folder or args.commits or args.open:
+if args.file or args.folder or args.commits or args.open or args.glob:
if files:
print(' '.join(files), '\n')
else:
@@ -475,7 +602,7 @@ else:
# Now check the chosen files.
for f in files:
# Check this file.
- checkFile(f)
+ checkFile(f, check_comments=args.comments)
# But get out if control-C has been pressed.
if should_exit:
exit(1)