diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
commit | ed5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch) | |
tree | 7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /bin/find-can-be-private-symbols.py | |
parent | Initial commit. (diff) | |
download | libreoffice-upstream.tar.xz libreoffice-upstream.zip |
Adding upstream version 4:7.4.7. (refs: upstream/4%7.4.7, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'bin/find-can-be-private-symbols.py')
-rwxr-xr-x | bin/find-can-be-private-symbols.py | 222 |
1 files changed, 222 insertions, 0 deletions
#!/usr/bin/python3
#
# Find exported symbols that can be made non-exported.
#
# Noting that (a) parsing these commands is a pain, the output is quite irregular and (b) I'm fumbling in the
# dark here, trying to guess what exactly constitutes an "import" vs an "export" of a symbol, linux linking
# is rather complex.
#
# Most of the runtime is spent in nm/objdump/c++filt; symbols are demangled in batches to keep it bearable.
#
# The standalone function analysis is reasonably reliable, but the class/method analysis is less so
# (something to do with destructor thunks not showing up in my results?)
#
# Also, the class/method analysis will not catch problems like
#    'dynamic_cast from 'Foo' with hidden type visibility to 'Bar' with default type visibility'
# but loplugin:dyncastvisibility will do that for you
#
# Must be run from the top of the build tree: it scans ./instdir and ./workdir and writes its results
# to bin/find-can-be-private-symbols.functions.results
#

import subprocess
import sys
import re

# Matches `nm -D` lines for symbols defined in the text section, something like:
#     0000000000036ed0 T flash_component_getFactory
NM_EXPORT_REGEX = re.compile(r'^[0-9a-fA-F]+ T ')

# c++filt decorates thunks with these prefixes; they are stripped so that a thunk and the
# method it forwards to compare equal.
THUNK_PREFIXES = ("non-virtual thunk to ", "virtual thunk to ")

# Filter out most of the noise.
# No idea where these are coming from, but not our code.
# ("_Z" catches names that c++filt was unable to demangle.)
FOREIGN_PREFIXES = (
    "CERT_", "DER_", "FORM_", "FPDF", "HASH_", "Hunspell_", "LL_", "LP_", "LU", "MIP", "MPS",
    "NSS", "NSC_", "PK11", "PL_", "PQ", "PBE_", "PORT_", "PRP_", "PR_", "PT_", "QS_", "REPORT_",
    "RSA_", "SEC", "SGN", "SOS", "SSL_", "VFY_", "_PR_", "ber_", "bfp_", "ldap_", "ne_", "opj_",
    "pg_", "pq", "presolve_", "sqlite3_", "libepubgen::", "lucene::", "Hunspell::", "sk_", "_Z",
)

# Entry points that are looked up dynamically at runtime, so nothing links against them.
DYNAMIC_SUFFIXES = ("get_implementation", "component_getFactory")
DYNAMIC_NAMES = frozenset((
    "CreateDialogFactory",
    "CreateUnoWrapper",
    "CreateWindow",
    "ExportDOC",
    "ExportPPT",
    "ExportRTF",
    "GetSaveWarningOfMSVBAStorage_ww8",
    "GetSpecialCharsForEdit",
))
DYNAMIC_PREFIXES = (
    "Import", "Java_com_sun_star_", "TestImport", "getAllCalendars_", "getAllCurrencies_",
    "getAllFormats", "getBreakIteratorRules_", "getCollationOptions_",
    "getCollatorImplementation_", "getContinuousNumberingLevels_",
    "getDateAcceptancePatterns_", "getForbiddenCharacters_", "getIndexAlgorithm_",
    "getLCInfo_", "getLocaleItem_", "getOutlineNumberingLevels_", "getReservedWords_",
    "getSTC_", "getSearchOptions_", "getTransliterations_", "getUnicodeScripts_", "lok_",
)

# UDK API
UDK_PREFIXES = ("osl_", "rtl_", "typelib_", "typereg_", "uno_")

RESULTS_FILE = "bin/find-can-be-private-symbols.functions.results"

# NOTE: an earlier, disabled heuristic additionally collected every identifier found by
#     git grep -oh -P '\b\w\w\w+\b' -- '*.h*' | sort -u
# and dropped results whose function name (the text before the first "(") did not occur in
# that set, on the theory that such symbols are not ours. It is not implemented here.


def parse_nm_exports(lines):
    """Return the set of exported symbol names found in `nm -D` output lines.

    Only lines matching NM_EXPORT_REGEX (an address followed by type "T") are used;
    the symbol is the third space-separated field.
    """
    symbols = set()
    for raw in lines:
        line = raw.strip()
        if NM_EXPORT_REGEX.match(line):
            symbols.add(line.split(" ")[2].strip())
    return symbols


def parse_objdump_imports(lines):
    """Return the set of undefined (imported) symbol names from `objdump -T` output lines.

    We are looking for lines something like:
        0000000000000000  DF *UND*  0000000000000000  _ZN16FilterConfigItem10WriteInt32ERKN3rtl8OUStringEi
    The symbol is the last space-separated field of such a line.
    """
    symbols = set()
    # ignore the four lines of header bumf
    for raw in lines[4:]:
        line = raw.strip()
        if "*UND*" not in line:
            continue
        symbols.add(line.split(" ")[-1].strip())
    return symbols


def parse_nm_undefined(lines):
    """Return the set of undefined (imported) symbol names from `nm -D` output lines.

    We are looking for lines something like:
                         U sal_detail_deinitialize
    """
    symbols = set()
    for raw in lines:
        tokens = raw.split()
        if len(tokens) >= 2 and tokens[0] == "U":
            symbols.add(tokens[1])
    return symbols


def strip_thunk_prefix(name):
    """Return `name` without a leading "[non-]virtual thunk to " decoration, if it has one."""
    for prefix in THUNK_PREFIXES:
        if name.startswith(prefix):
            return name[len(prefix):]
    return name


def is_noise(sym):
    """Return True if `sym` must not be reported: foreign code, dynamically loaded, or UDK API."""
    return (sym.startswith(FOREIGN_PREFIXES)
            or sym.endswith(DYNAMIC_SUFFIXES)
            or sym in DYNAMIC_NAMES
            or sym.startswith(DYNAMIC_PREFIXES)
            or sym.startswith(UDK_PREFIXES))


def find_files(roots, pattern):
    """Return the paths (as bytes) of all files below each of `roots` whose name matches `pattern`.

    `find` is invoked without a shell, so the glob in `pattern` reaches `find` unexpanded.
    Each root is searched independently; a failure in one does not skip the others.
    """
    paths = []
    for root in roots:
        proc = subprocess.run(["find", root, "-name", pattern], stdout=subprocess.PIPE)
        for raw in proc.stdout.split(b"\n"):
            path = raw.strip()
            if path:
                paths.append(path)
    return paths


def run_lines(command):
    """Run `command` (an argv list), wait for it, and return its stdout as a list of text lines.

    The exit status is deliberately ignored: a file the tool cannot parse simply contributes
    no symbols.
    """
    proc = subprocess.run(command, stdout=subprocess.PIPE)
    return proc.stdout.decode("utf-8").split("\n")


def demangle(symbols, done, total, batch_size=256):
    """Demangle `symbols` with c++filt and return the set of readable names, thunk prefixes removed.

    We have to symbolize before comparing because sometimes (due to thunks) two
    different encoded names symbolize to the same method/func name.

    `done` is the number of symbols already processed by earlier calls and `total` the overall
    number of symbols, so that the printed percentage is continuous across calls.
    Symbols are passed to c++filt as arguments, `batch_size` at a time; it prints exactly one
    line per argument.

    Raises subprocess.CalledProcessError if c++filt fails and RuntimeError if its output does
    not line up with the input.
    """
    names = set()
    ordered = sorted(symbols)
    for start in range(0, len(ordered), batch_size):
        batch = ordered[start:start + batch_size]
        output = subprocess.check_output(["c++filt"] + batch).decode("utf-8")
        lines = output.split("\n")
        if lines and lines[-1] == "":
            lines.pop()  # artefact of the final newline
        if len(lines) != len(batch):
            raise RuntimeError("c++filt returned " + str(len(lines)) + " names for "
                               + str(len(batch)) + " symbols")
        for line in lines:
            names.add(strip_thunk_prefix(line.strip()))
        done += len(batch)
        print(str(int(done * 100 / total)) + "%")
    return names


def main():
    """Collect exported/imported symbols of the build and write the unused exports to RESULTS_FILE."""
    exported_symbols1 = set()  # encoded
    imported_symbols1 = set()  # encoded

    # find all our shared libs
    for sharedlib in find_files(["./instdir", "./workdir/LinkTarget/CppunitTest"], "*.so"):
        # look for exported symbols
        exported_symbols1 |= parse_nm_exports(run_lines(["nm", "-D", sharedlib]))
        # look for imported symbols
        imported_symbols1 |= parse_objdump_imports(run_lines(["objdump", "-T", sharedlib]))

    # look for imported symbols in executables
    for executable in find_files(["./instdir"], "*.bin"):
        imported_symbols1 |= parse_nm_undefined(run_lines(["nm", "-D", executable]))

    progress_max_len = len(imported_symbols1) + len(exported_symbols1)
    imported_symbols2 = demangle(imported_symbols1, 0, progress_max_len)
    exported_symbols2 = demangle(exported_symbols1, len(imported_symbols1), progress_max_len)

    unused_exports = exported_symbols2 - imported_symbols2
    print("exported = " + str(len(exported_symbols2)))
    print("imported = " + str(len(imported_symbols2)))
    print("unused_exports = " + str(len(unused_exports)))

    with open(RESULTS_FILE, "wt") as f:
        for sym in sorted(unused_exports):
            if is_noise(sym):
                continue
            f.write(sym + "\n")


if __name__ == "__main__":
    main()