summaryrefslogtreecommitdiffstats
path: root/bin/find-german-comments
diff options
context:
space:
mode:
Diffstat (limited to '')
-rwxr-xr-xbin/find-german-comments402
1 files changed, 402 insertions, 0 deletions
diff --git a/bin/find-german-comments b/bin/find-german-comments
new file mode 100755
index 000000000..bb76941c1
--- /dev/null
+++ b/bin/find-german-comments
@@ -0,0 +1,402 @@
+#!/usr/bin/env python3
+########################################################################
+#
+# Copyright (c) 2010 Jonas Jensen, Miklos Vajna
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+########################################################################
+
+
+import sys
+import re
+import subprocess
+import os
+import argparse
+import string
+
+class Parser:
+ """
+ This parser extracts comments from source files, tries to guess
+ their language and then prints out the German ones.
+ """
+ def __init__(self):
+ self.strip = string.punctuation + " \n"
+ self.text_cat = self.start_text_cat()
+ parser = argparse.ArgumentParser(description='Searches for German comments in cxx/hxx source files inside a given root directory recursively.')
+ parser.add_argument("-f", "--filenames-only", action="store_true",
+ help="Only print the filenames of files containing German comments")
+ parser.add_argument("-v", "--verbose", action="store_true",
+ help="Turn on verbose mode (print only positives progress to stderr)")
+ parser.add_argument("-l", "--line-numbers", action="store_true",
+ help="Prints the filenames and line numbers only.")
+ parser.add_argument("-L", "--line-numbers-pos", action="store_true",
+ help="Prints the filenames and line numbers only (if positive).")
+ parser.add_argument("-t", "--threshold", action="store", default=0, type=int,
+ help="When used with '--line-numbers', only bothers outputting comment info if there are more than X number of flagged comments. Useful for weeding out false positives.")
+ parser.add_argument("directory", nargs='?', default='.', type=str, help='Give a directory to search in')
+ self.args = parser.parse_args()
+ self.check_source_files(self.args.directory)
+
+ def get_comments(self, filename):
+ """
+ Extracts the source code comments.
+ """
+ linenum = 0
+ if self.args.verbose:
+ print("processing file '%s'...\n" % filename)
+ sock = open(filename)
+ # add an empty line to trigger the output of collected oneliner
+ # comment group
+ lines = sock.readlines() + ["\n"]
+ sock.close()
+
+ in_comment = False
+ buf = []
+ count = 1
+ for i in lines:
+ if "//" in i and not in_comment:
+ # if we find a new //-style comment, then we
+ # just append it to a previous one if: there is
+ # only whitespace before the // mark that is
+ # necessary to make comments longer, giving
+ # more reliable output
+ if not len(re.sub("(.*)//.*", r"\1", i).strip(self.strip)):
+ s = re.sub(".*// ?", "", i).strip(self.strip)
+ if len(s):
+ buf.append(s)
+ else:
+ # otherwise it's an independent //-style comment in the next line
+ yield (count, "\n ".join(buf))
+ buf = [re.sub(".*// ?", "", i.strip(self.strip))]
+ elif "//" not in i and not in_comment and len(buf) > 0:
+ # first normal line after a // block
+ yield (count, "\n ".join(buf))
+ buf = []
+ elif "/*" in i and "*/" not in i and not in_comment:
+ # start of a real multiline comment
+ in_comment = True
+ linenum = count
+ s = re.sub(".*/\*+", "", i.strip(self.strip))
+ if len(s):
+ buf.append(s.strip(self.strip))
+ elif in_comment and not "*/" in i:
+ # in multiline comment
+ s = re.sub("^( |\|)*\*?", "", i)
+ if len(s.strip(self.strip)):
+ buf.append(s.strip(self.strip))
+ elif "*/" in i and in_comment:
+ # end of multiline comment
+ in_comment = False
+ s = re.sub(r"\*+/.*", "", i.strip(self.strip))
+ if len(s):
+ buf.append(s)
+ yield (count, "\n ".join(buf))
+ buf = []
+ elif "/*" in i and "*/" in i:
+ # c-style oneliner comment
+ yield (count, re.sub(".*/\*(.*)\*/.*", r"\1", i).strip(self.strip))
+ count += 1
+
+ def start_text_cat(self):
+ cwd = os.getcwd()
+ # change to our directory
+ os.chdir(os.path.split(os.path.abspath(sys.argv[0]))[0])
+ sock = subprocess.Popen(["text_cat/text_cat", "-s", "-d", "text_cat/LM"], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ os.chdir(cwd)
+ return sock
+
+ def get_lang(self, s):
+ """ the output is 'german' or 'english' or 'german or english'. When
+ unsure, just don't warn, there are strings where you just can't
+ determine the results reliably, like '#110680#' """
+
+ self.text_cat.stdin.write(bytes(s, 'utf-8'))
+ self.text_cat.stdin.write(bytes("\n", 'utf-8'))
+ self.text_cat.stdin.flush()
+ lang = self.text_cat.stdout.readline().strip()
+ return lang
+
+ def is_german(self, s):
+ """
+ determines if a string is German or not
+ """
+ # for short strings we can't do reliable recognition, so skip
+ # short strings and less than 4 words
+ s = s.replace('\n', ' ')
+ if len(s) < 32 or len(s.split()) < 4:
+ return False
+ return self.get_lang(s) == b"german"
+
+ def check_file(self, path):
+ """
+ checks each comment in a file
+ """
+ def tab_calc(path):
+ START = 40 #Default of 10 tabs
+ if len(path) >= START:
+ return 1
+ diff = START - len(path)
+ if diff % 4 is not 0:
+ padding = 1
+ else:
+ padding = 0
+ return (diff/4)+padding
+
+ if self.args.line_numbers or self.args.line_numbers_pos:
+ TABS = "\t"*10
+ path_linenums = []
+ for linenum, s in self.get_comments(path):
+ if self.is_german(s):
+ path_linenums.append(linenum)
+ valid = len(path_linenums) > int(self.args.threshold)
+ if self.args.line_numbers:
+ print("%s ... %s positives -- %s\n" % (path, str(len(path_linenums)), str(valid)))
+ if valid:
+ if self.args.line_numbers_pos:
+ print("%s ... %s positives\n" % (path, str(len(path_linenums))))
+ return
+ if len(path) + (len(path_linenums)*4) > 75:
+ print("%s:\n" % path)
+ while path_linenums:
+ i = 0
+ numline = []
+ while i < 10:
+ try:
+ numline.append(path_linenums[0])
+ path_linenums.remove(path_linenums[0])
+ except IndexError:
+ i = 10
+ i += 1
+ numline = [str(i) for i in numline]
+ print("%s%s" % (TABS, ",".join(numline)))
+ else:
+ if self.args.line_numbers:
+ path_linenums = [str(i) for i in path_linenums]
+ print("%s:%s%s" % (path, "\t"*int(tab_calc(path)), ",".join(path_linenums)))
+
+ elif not self.args.filenames_only:
+ for linenum, s in self.get_comments(path):
+ if self.is_german(s):
+ print("%s:%s: %s" % (path, linenum, s))
+ else:
+ fnames = set([])
+ for linenum, s in self.get_comments(path):
+ if self.is_german(s):
+ # Make sure we print each filename only once
+ fnames.add(path)
+ # Print the filenames
+ for f in fnames:
+ print(f)
+
+ def first_elem(self, path):
+ """
+ Returns the root directory in our repo of a given path, so we can check against the whitelist.
+ """
+ lastElem = os.path.dirname(path)
+ done = False
+ while not done:
+ nextElem = os.path.split(lastElem)[0]
+ if nextElem is not '':
+ lastElem = nextElem
+ else:
+ done = True
+ return lastElem
+
+ def check_source_files(self, directory):
+ """
+ checks each _tracked_ file in a directory recursively
+ """
+
+ # top-level project directory -> use whitelist.
+ globalscan = False
+ if os.path.exists(directory + "/.git/config"):
+ globalscan = True
+
+ # Change into the given dir, so "git ls-tree" does work.
+ os.chdir(directory)
+
+ sock = os.popen(r"git ls-tree -r HEAD --name-only |egrep '\.(c|cc|cpp|cxx|h|hxx|mm)$'")
+ lines = sock.readlines()
+ sock.close()
+
+ # Helps to speedup a global scan
+ directory_whitelist = {
+ "ure" : 1,
+ "ios" : 1,
+ "bean" : 1,
+ "apple_remote" : 1,
+ "UnoControls" : 1,
+ "accessibility" : 1,
+ "android" : 1,
+ "animations" : 1,
+ "avmedia" : 1,
+ "basctl" : 1,
+ "basegfx" : 1,
+ "basic" : 1,
+ "binaryurp" : 1,
+ "bridges" : 1,
+ "canvas" : 1,
+ "chart2" : 1,
+ "cli_ure" : 1,
+ "codemaker" : 1,
+ "comphelper" : 1,
+ "compilerplugins" : 1,
+ "configmgr" : 1,
+ "connectivity" : 1,
+ "cppcanvas" : 1,
+ "cppu" : 1,
+ "cppuhelper" : 1,
+ "cpputools" : 1,
+ "cui" : 1,
+ "dbaccess" : 1,
+ "desktop" : 1,
+ "drawinglayer" : 1,
+ "dtrans" : 1,
+ "editeng" : 1,
+ "embeddedobj" : 1,
+ "embedserv" : 1,
+ "eventattacher" : 1,
+ "extensions" : 1,
+ "external" : 1,
+ "filter" : 1,
+ "forms" : 1,
+ "formula" : 1,
+ "fpicker" : 1,
+ "framework" : 1,
+ "helpcompiler" : 1,
+ "hwpfilter" : 1,
+ "i18npool" : 1,
+ "i18nlangtag" : 1,
+ "i18nutil" : 1,
+ "idl" : 1,
+ "idlc" : 1,
+ "include" : 1,
+ "io" : 1,
+ "javaunohelper" : 1,
+ "jvmaccess" : 1,
+ "jvmfwk" : 1,
+ "jurt" : 1,
+ "l10ntools" : 1,
+ "libreofficekit" : 1,
+ "lingucomponent" : 1,
+ "linguistic" : 1,
+ "lotuswordpro" : 1,
+ "mysqlc" : 1,
+ "o3tl" : 1,
+ "odk" : 1,
+ "officecfg" : 1,
+ "onlineupdate" : 1,
+ "opencl" : 1,
+ "oox" : 1,
+ "package" : 1,
+ "postprocess" : 1,
+ "pyuno" : 1,
+ "registry" : 1,
+ "remotebridges" : 1,
+ "reportdesign" : 1,
+ "rsc" : 1,
+ "sal" : 1,
+ "salhelper" : 1,
+ "sax" : 1,
+ "sc" : 1,
+ "scaddins" : 1,
+ "sccomp" : 1,
+ "scripting" : 1,
+ "sd" : 1,
+ "sdext" : 1,
+ "sfx2" : 1,
+ "shell" : 1,
+ "setup_native" : 1,
+ "sot" : 1,
+ "slideshow" : 1,
+ "smoketest" : 1,
+ "solenv" : 1,
+ "soltools" : 1,
+ "starmath" : 1,
+ "stoc" : 1,
+ "store" : 1,
+ "svgio" : 1,
+ "svl" : 1,
+ "svtools" : 1,
+ "svx" : 1,
+ "sw" : 1,
+ "test" : 1,
+ "testtools" : 1,
+ "toolkit" : 1,
+ "tools" : 1,
+ "touch" : 1,
+ "ucb" : 1,
+ "ucbhelper" : 1,
+ "unodevtools" : 1,
+ "unotest" : 1,
+ "unoidl" : 1,
+ "unotools" : 1,
+ "unoxml" : 1,
+ "uui" : 1,
+ "vbahelper" : 1,
+ "vcl" : 1,
+ "winaccessibility" : 1,
+ "writerfilter" : 1,
+ "writerperfect" : 1,
+ "xmlhelp" : 1,
+ "xmloff" : 1,
+ "xmlreader" : 1,
+ "xmlsecurity" : 1,
+ "xmlscript" : 1,
+ }
+
+ if globalscan:
+ print("Scanning all files globally:")
+ elif directory == '.':
+ print("Scanning all files in our current directory:")
+ else:
+ print("Scanning all files in", directory + ":")
+
+ num_checked = 0
+
+ for path in lines:
+ baseDir = self.first_elem(path)
+ # If we have a globalscan use the whitelist.
+ if globalscan:
+ if not baseDir in directory_whitelist:
+ sys.stderr.write("\n - Error: Missing path %s -\n\n" % baseDir)
+ sys.exit(1)
+ elif directory_whitelist[baseDir] is 0:
+ self.check_file(path.strip())
+ num_checked = num_checked + 1
+ elif directory_whitelist[baseDir] is 1:
+ sys.stderr.write("Skipping whitelisted directory %s\n" % baseDir)
+ directory_whitelist[baseDir] = 2
+ elif not globalscan:
+ self.check_file(path.strip())
+ num_checked = num_checked + 1
+
+ print("Scanned %s files\n" % num_checked)
+
+try:
+ Parser()
+except KeyboardInterrupt:
+ print("Interrupted!")
+ sys.exit(0)
+
+# vim:set shiftwidth=4 softtabstop=4 expandtab: