1 files changed, 402 insertions, 0 deletions
diff --git a/bin/find-german-comments b/bin/find-german-comments
new file mode 100755
index 000000000..bb76941c1
--- /dev/null
+++ b/bin/find-german-comments
@@ -0,0 +1,402 @@
+#!/usr/bin/env python3
+########################################################################
+#
+#  Copyright (c) 2010 Jonas Jensen, Miklos Vajna
+#
+#  Permission is hereby granted, free of charge, to any person
+#  obtaining a copy of this software and associated documentation
+#  files (the "Software"), to deal in the Software without
+#  restriction, including without limitation the rights to use,
+#  copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following
+#  conditions:
+#
+#  The above copyright notice and this permission notice shall be
+#  included in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+#  OTHER DEALINGS IN THE SOFTWARE.
+#
+########################################################################
+
+
+import sys
+import re
+import subprocess
+import os
+import argparse
+import string
+
+class Parser:
+    """
+    This parser extracts comments from source files, tries to guess
+    their language and then prints out the German ones.
+    """
+    def __init__(self):
+        """Parses the command line, starts text_cat and runs the whole scan."""
+        # characters stripped from both ends of every comment fragment
+        self.strip = string.punctuation + " \n"
+        parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
+        parser.add_argument("-f", "--filenames-only", action="store_true", help="Only print the filenames of files containing German comments")
+        parser.add_argument("-v", "--verbose", action="store_true", help="Turn on verbose mode (print only positives progress to stderr)")
+        parser.add_argument("-l", "--line-numbers", action="store_true", help="Prints the filenames and line numbers only.")
+        parser.add_argument("-L", "--line-numbers-pos", action="store_true", help="Prints the filenames and line numbers only (if positive).")
+        parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
+            help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
+        parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
+        # Parse before spawning text_cat: argparse exits on -h or bad usage,
+        # and no helper process should be started (or even needed) for that.
+        self.args = parser.parse_args()
+        self.text_cat = self.start_text_cat()
+        self.check_source_files(self.args.directory)
+
+    def get_comments(self, filename):
+        """
+        Extracts the source code comments.
+
+        Generator yielding (line number, text) pairs. A run of //-only lines
+        is merged into one comment and a multiline /* */ comment is yielded
+        as one item, the pieces joined by a newline plus four spaces. The
+        number is that of the line completing the comment: the closing line
+        of a /* */ block, or the first line after a // run.
+        """
+        if self.args.verbose:
+            print("processing file '%s'...\n" % filename)
+        # errors="replace": one undecodable byte must not abort the whole scan
+        with open(filename, errors="replace") as sock:
+            # add an empty line to trigger the output of collected oneliner
+            # comment group
+            lines = sock.readlines() + ["\n"]
+
+        in_comment = False
+        buf = []
+        for count, i in enumerate(lines, 1):
+            if "//" in i and not in_comment:
+                # if there is only whitespace or punctuation before the //
+                # mark, append this line to the previous //-style comment:
+                # longer comments give more reliable output
+                if not re.sub("(.*)//.*", r"\1", i).strip(self.strip):
+                    s = re.sub(".*// ?", "", i).strip(self.strip)
+                    if s:
+                        buf.append(s)
+                else:
+                    # otherwise it's an independent //-style comment in the next line
+                    yield (count, "\n    ".join(buf))
+                    buf = [re.sub(".*// ?", "", i.strip(self.strip))]
+            elif "//" not in i and not in_comment and buf:
+                # first normal line after a // block
+                yield (count, "\n    ".join(buf))
+                buf = []
+            elif "/*" in i and "*/" not in i and not in_comment:
+                # start of a real multiline comment
+                in_comment = True
+                s = re.sub(r".*/\*+", "", i.strip(self.strip))
+                if s:
+                    buf.append(s.strip(self.strip))
+            elif in_comment and "*/" not in i:
+                # in multiline comment
+                s = re.sub(r"^( |\|)*\*?", "", i).strip(self.strip)
+                if s:
+                    buf.append(s)
+            elif "*/" in i and in_comment:
+                # end of multiline comment
+                in_comment = False
+                s = re.sub(r"\*+/.*", "", i.strip(self.strip))
+                if s:
+                    buf.append(s)
+                yield (count, "\n    ".join(buf))
+                buf = []
+            elif "/*" in i and "*/" in i:
+                # c-style oneliner comment
+                yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
+
+    def start_text_cat(self):
+        """Starts the text_cat helper found next to this script and returns its Popen object."""
+        base = os.path.dirname(os.path.abspath(sys.argv[0]))
+        # Run the child from our directory via cwd= instead of chdir()ing this
+        # process there and back, which left us stranded whenever Popen raised.
+        return subprocess.Popen([os.path.join(base, "text_cat", "text_cat"), "-s", "-d", "text_cat/LM"],
+            stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd=base)
+
+    def get_lang(self, s):
+        """ Sends 's' as one line to text_cat and returns its answer line,
+        stripped, as bytes: 'german' or 'english' or 'german or english'.
+        When unsure, just don't warn: there are strings where the result
+        cannot be determined reliably, like '#110680#' """
+
+        pipe_in, pipe_out = self.text_cat.stdin, self.text_cat.stdout
+        pipe_in.write(s.encode('utf-8') + b"\n")
+        pipe_in.flush()
+        return pipe_out.readline().strip()
+
+    def is_german(self, s):
+        """
+        Tells whether text_cat classifies the string as unambiguously German.
+        Strings shorter than 32 characters or with fewer than 4 words are
+        never reported, because they are too short for reliable recognition.
+        """
+        # get_lang() sends one line per request, so flatten the text first
+        flat = s.replace('\n', ' ')
+        long_enough = len(flat) >= 32 and len(flat.split()) >= 4
+        return long_enough and self.get_lang(flat) == b"german"
+
+    def check_file(self, path):
+        """
+        checks each comment in a file
+
+        'path' is opened as given, so it must be valid from the current
+        directory. The findings are printed in the format chosen on the
+        command line; nothing is returned:
+
+        -l       a "N positives -- <valid>" summary for every file, then the
+                 line numbers of the German comments if there are more
+                 than --threshold of them
+        -L       only a "N positives" summary, and only above the threshold
+        -f       the file name alone, if at least one comment is German
+        default  one "path:line: text" entry per German comment
+
+        If both -l and -L are given, both summaries are printed.
+        """
+        def tab_calc(path):
+            """Returns ceil((40 - len(path)) / 4) tabs, but never fewer than one."""
+            START = 40 #Default of 10 tabs
+            if len(path) >= START:
+                return 1
+            # integer ceiling division: '/' produced a float here, which the
+            # caller then had to truncate before multiplying a string
+            return (START - len(path) + 3) // 4
+
+        if self.args.line_numbers or self.args.line_numbers_pos:
+            # continuation rows of line numbers are indented by ten tabs
+            TABS = "\t"*10
+            path_linenums = [linenum for linenum, s in self.get_comments(path)
+                             if self.is_german(s)]
+            # strictly more hits than --threshold are needed to report a file
+            valid = len(path_linenums) > int(self.args.threshold)
+            if self.args.line_numbers:
+                print("%s ... %s positives -- %s\n" % (path, len(path_linenums), valid))
+            if not valid:
+                return
+
+            if self.args.line_numbers_pos:
+                print("%s ... %s positives\n" % (path, len(path_linenums)))
+                return
+
+            numbers = [str(n) for n in path_linenums]
+            if len(path) + (len(numbers)*4) > 75:
+                # too wide for one line: the path goes on its own line,
+                # followed by the line numbers in rows of ten
+                print("%s:\n" % path)
+                for start in range(0, len(numbers), 10):
+                    print("%s%s" % (TABS, ",".join(numbers[start:start + 10])))
+            else:
+                print("%s:%s%s" % (path, "\t"*tab_calc(path), ",".join(numbers)))
+
+        elif not self.args.filenames_only:
+            for linenum, s in self.get_comments(path):
+                if self.is_german(s):
+                    print("%s:%s: %s" % (path, linenum, s))
+        else:
+            # Make sure we print each filename only once: the first German
+            # comment settles it, so the remaining ones are not classified
+            if any(self.is_german(s) for _, s in self.get_comments(path)):
+                print(path)
+
+    def first_elem(self, path):
+        """
+        Returns the root directory in our repo of a given path, so we can check against the whitelist.
+        For a file without a directory part this is ''; for an absolute path it is the filesystem root.
+        """
+        lastElem = os.path.dirname(path)
+        while True:
+            nextElem = os.path.split(lastElem)[0]
+            # '' ends a relative path; an unchanged head means we reached the
+            # root of an absolute one, which used to loop forever here
+            if nextElem == '' or nextElem == lastElem:
+                return lastElem
+            lastElem = nextElem
+
+    def check_source_files(self, directory):
+        """
+        checks each _tracked_ file in a directory recursively
+
+        With 'directory' at the top of a git checkout the whitelist applies:
+        1 skips a top-level directory, 0 scans it, unlisted ones abort (exit 1).
+        """
+        # top-level project directory -> use whitelist.
+        globalscan = os.path.exists(directory + "/.git/config")
+
+        # Change into the given dir, so "git ls-tree" does work.
+        os.chdir(directory)
+
+        # 'grep -E' is the portable spelling of the obsolescent 'egrep'
+        with os.popen(r"git ls-tree -r HEAD --name-only |grep -E '\.(c|cc|cpp|cxx|h|hxx|mm)$'") as sock:
+            lines = sock.readlines()
+
+        # Helps to speedup a global scan
+        directory_whitelist = {
+            "ure" : 1,
+            "ios" : 1,
+            "bean" : 1,
+            "apple_remote" : 1,
+            "UnoControls" : 1,
+            "accessibility" : 1,
+            "android" : 1,
+            "animations" : 1,
+            "avmedia" : 1,
+            "basctl" : 1,
+            "basegfx" : 1,
+            "basic" : 1,
+            "binaryurp" : 1,
+            "bridges" : 1,
+            "canvas" : 1,
+            "chart2" : 1,
+            "cli_ure" : 1,
+            "codemaker" : 1,
+            "comphelper" : 1,
+            "compilerplugins" : 1,
+            "configmgr" : 1,
+            "connectivity" : 1,
+            "cppcanvas" : 1,
+            "cppu" : 1,
+            "cppuhelper" : 1,
+            "cpputools" : 1,
+            "cui" : 1,
+            "dbaccess" : 1,
+            "desktop" : 1,
+            "drawinglayer" : 1,
+            "dtrans" : 1,
+            "editeng" : 1,
+            "embeddedobj" : 1,
+            "embedserv" : 1,
+            "eventattacher" : 1,
+            "extensions" : 1,
+            "external" : 1,
+            "filter" : 1,
+            "forms" : 1,
+            "formula" : 1,
+            "fpicker" : 1,
+            "framework" : 1,
+            "helpcompiler" : 1,
+            "hwpfilter" : 1,
+            "i18npool" : 1,
+            "i18nlangtag" : 1,
+            "i18nutil" : 1,
+            "idl" : 1,
+            "idlc" : 1,
+            "include" : 1,
+            "io" : 1,
+            "javaunohelper" : 1,
+            "jvmaccess" : 1,
+            "jvmfwk" : 1,
+            "jurt" : 1,
+            "l10ntools" : 1,
+            "libreofficekit" : 1,
+            "lingucomponent" : 1,
+            "linguistic" : 1,
+            "lotuswordpro" : 1,
+            "mysqlc" : 1,
+            "o3tl" : 1,
+            "odk" : 1,
+            "officecfg" : 1,
+            "onlineupdate" : 1,
+            "opencl" : 1,
+            "oox" : 1,
+            "package" : 1,
+            "postprocess" : 1,
+            "pyuno" : 1,
+            "registry" : 1,
+            "remotebridges" : 1,
+            "reportdesign" : 1,
+            "rsc" : 1,
+            "sal" : 1,
+            "salhelper" : 1,
+            "sax" : 1,
+            "sc" : 1,
+            "scaddins" : 1,
+            "sccomp" : 1,
+            "scripting" : 1,
+            "sd" : 1,
+            "sdext" : 1,
+            "sfx2" : 1,
+            "shell" : 1,
+            "setup_native" : 1,
+            "sot" : 1,
+            "slideshow" : 1,
+            "smoketest" : 1,
+            "solenv" : 1,
+            "soltools" : 1,
+            "starmath" : 1,
+            "stoc" : 1,
+            "store" : 1,
+            "svgio" : 1,
+            "svl" : 1,
+            "svtools" : 1,
+            "svx" : 1,
+            "sw" : 1,
+            "test" : 1,
+            "testtools" : 1,
+            "toolkit" : 1,
+            "tools" : 1,
+            "touch" : 1,
+            "ucb" : 1,
+            "ucbhelper" : 1,
+            "unodevtools" : 1,
+            "unotest" : 1,
+            "unoidl" : 1,
+            "unotools" : 1,
+            "unoxml" : 1,
+            "uui" : 1,
+            "vbahelper" : 1,
+            "vcl" : 1,
+            "winaccessibility" : 1,
+            "writerfilter" : 1,
+            "writerperfect" : 1,
+            "xmlhelp" : 1,
+            "xmloff" : 1,
+            "xmlreader" : 1,
+            "xmlsecurity" : 1,
+            "xmlscript" : 1,
+        }
+
+        if globalscan:
+            print("Scanning all files globally:")
+        elif directory == '.':
+            print("Scanning all files in our current directory:")
+        else:
+            print("Scanning all files in", directory + ":")
+
+        num_checked = 0
+        for path in lines:
+            baseDir = self.first_elem(path)
+            # If we have a globalscan use the whitelist.
+            if globalscan:
+                if baseDir not in directory_whitelist:
+                    sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
+                    sys.exit(1)
+                elif directory_whitelist[baseDir] == 0:
+                    self.check_file(path.strip())
+                    num_checked += 1
+                elif directory_whitelist[baseDir] == 1:
+                    sys.stderr.write("Skipping whitelisted directory %s\n" % baseDir)
+                    # 2 = already reported, so the message appears once per directory
+                    directory_whitelist[baseDir] = 2
+            else:
+                self.check_file(path.strip())
+                num_checked += 1
+
+        print("Scanned %s files\n" % num_checked)
+
+try:
+    Parser()  # constructing the parser parses argv and performs the scan
+except KeyboardInterrupt:
+    print("Interrupted!")  # Ctrl-C is an expected way to stop, so leave with status 0
+    raise SystemExit(0)
+
+# vim:set shiftwidth=4 softtabstop=4 expandtab: