author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-10 20:34:10 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-10 20:34:10 +0000
commit     e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc (patch)
tree       68cb5ef9081156392f1dd62a00c6ccc1451b93df /tools/check_dissector_urls.py
parent     Initial commit. (diff)
download   wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.tar.xz
           wireshark-e4ba6dbc3f1e76890b22773807ea37fe8fa2b1bc.zip
Adding upstream version 4.2.2. (tag: upstream/4.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'tools/check_dissector_urls.py')
-rwxr-xr-x  tools/check_dissector_urls.py  291
1 file changed, 291 insertions, 0 deletions
diff --git a/tools/check_dissector_urls.py b/tools/check_dissector_urls.py
new file mode 100755
index 0000000..373d88b
--- /dev/null
+++ b/tools/check_dissector_urls.py
@@ -0,0 +1,291 @@
+#!/usr/bin/env python3
+# Wireshark - Network traffic analyzer
+# By Gerald Combs <gerald@wireshark.org>
+# Copyright 1998 Gerald Combs
+#
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+import argparse
+import aiohttp
+import asyncio
+import os
+import re
+import shutil
+import signal
+import subprocess
+
+# This utility scans the dissector code for URLs, then attempts to
+# fetch the links.  The results are written to stdout and, at the end
+# of the run, also to these files:
+# - URLs that couldn't be loaded are written to failures.txt
+# - working URLs are written to successes.txt
+# - any previous failures.txt is also copied to failures_last_run.txt
+#
+# N.B. preferred form of RFC link is e.g., https://tools.ietf.org/html/rfc4349
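+#
+# Typical invocations (illustrative; the options are the ones defined by the
+# argparse setup further down, and a bare --file name assumes the script is
+# run from the top-level source directory):
+#   tools/check_dissector_urls.py                        # scan all dissectors
+#   tools/check_dissector_urls.py --file packet-tcp.c    # check one dissector
+#   tools/check_dissector_urls.py --commits 3 --verbose  # files from last 3 commits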
+
+
+# TODO:
+# - option to write back to dissector file when there is a failure?
+# - optionally parse previous/recent successes.txt and avoid fetching them again?
+# - make sure URLs are really within comments in code?
+# - use urllib.parse or similar to better check URLs?
+# - improve regex to allow '+' in URL (like confluence uses) - see the sketch below
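+#
+# For that last item, one possible widening of the pattern used in
+# find_links_in_file() would be to add '+' to the character class
+# (a sketch only, not wired in here):
+#     r'https?://(?:[a-zA-Z0-9./_?&=+-]+|%[0-9a-fA-F]{2})+'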
+
+# Try to exit soon after Ctrl-C is pressed.
+should_exit = False
+
+
+def signal_handler(sig, frame):
+    global should_exit
+    should_exit = True
+    print('You pressed Ctrl+C - exiting')
+    try:
+        tasks = asyncio.all_tasks()
+    except RuntimeError:
+        # We haven't started the async link checking yet, so we can exit directly.
+        exit(1)
+    # Ignore further SIGINTs while we're cancelling the running tasks.
+    signal.signal(signal.SIGINT, signal.SIG_IGN)
+    for t in tasks:
+        t.cancel()
+
+signal.signal(signal.SIGINT, signal_handler)
+
+
+class FailedLookup:
+
+    def __init__(self):
+        # Fake values that will be queried (in place of an aiohttp response)
+        self.status = 0
+        self.headers = {}
+        self.headers['content-type'] = '<NONE>'
+
+    def __str__(self):
+        s = ('FailedLookup: status=' + str(self.status) +
+             ' content-type=' + self.headers['content-type'])
+        return s
+
+
+# Dictionary from url -> result
+cached_lookups = {}
+
+
+class Link(object):
+
+    def __init__(self, file, line_number, url):
+        self.file = file
+        self.line_number = line_number
+        self.url = url
+        self.tested = False
+        self.r = None
+        self.success = False
+
+    def __str__(self):
+        epan_idx = self.file.find('epan')
+        if epan_idx == -1:
+            filename = self.file
+        else:
+            filename = self.file[epan_idx:]
+        s = ('SUCCESS ' if self.success else 'FAILED ') + \
+            filename + ':' + str(self.line_number) + ' ' + self.url
+        if self.r:
+            if self.r.status:
+                s += " status-code=" + str(self.r.status)
+            if 'content-type' in self.r.headers:
+                s += (' content-type="' +
+                      self.r.headers['content-type'] + '"')
+        else:
+            s += ' <No response received>'
+        return s
+
+    def validate(self):
+        global cached_lookups
+        global should_exit
+        if should_exit:
+            return
+        self.tested = True
+        if self.url in cached_lookups:
+            self.r = cached_lookups[self.url]
+        else:
+            self.r = FailedLookup()
+
+        # Only 2xx responses count as success.
+        self.success = 200 <= self.r.status < 300
+
+        if (args.verbose or not self.success) and not should_exit:
+            print(self)
+
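+# links: one Link per URL occurrence found; files: the dissector files examined;
+# all_urls: de-duplicated set of URLs that will actually be fetched.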
+links = []
+files = []
+all_urls = set()
+
+def find_links_in_file(filename):
+    with open(filename, 'r', encoding="utf8") as f:
+        for line_number, line in enumerate(f, start=1):
+            # TODO: not matching
+            # https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol
+            urls = re.findall(
+                r'https?://(?:[a-zA-Z0-9./_?&=-]+|%[0-9a-fA-F]{2})+', line)
+
+            for url in urls:
+                # Lop off any trailing chars that are not part of it
+                url = url.rstrip(").',")
+
+                # A URL must have a period somewhere
+                if '.' not in url:
+                    continue
+                global links, all_urls
+                links.append(Link(filename, line_number, url))
+                all_urls.add(url)
+
+
+# Scan the given folder for links to test.
+def find_links_in_folder(folder):
+    # Look at files in sorted order, to give some idea of how far through it
+    # is.
+    for filename in sorted(os.listdir(folder)):
+        if filename.endswith('.c'):
+            global links
+            find_links_in_file(os.path.join(folder, filename))
+
+
+async def populate_cache(sem, session, url):
+    global cached_lookups
+    if should_exit:
+        return
+    async with sem:
+        try:
+            async with session.get(url) as r:
+                cached_lookups[url] = r
+                if args.verbose:
+                    print('checking ', url, ': success', sep='')
+
+        except (asyncio.CancelledError, ValueError, ConnectionError, Exception):
+            cached_lookups[url] = FailedLookup()
+            if args.verbose:
+                print('checking ', url, ': failed', sep='')
+
+
+async def check_all_links(links):
+    sem = asyncio.Semaphore(50)
+    timeout = aiohttp.ClientTimeout(total=25)
+    connector = aiohttp.TCPConnector(limit=30)
+    # Present browser-like headers, presumably so that sites which filter on
+    # User-Agent don't reject the requests.
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
+               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
+    async with aiohttp.ClientSession(connector=connector, headers=headers, timeout=timeout) as session:
+        tasks = [populate_cache(sem, session, u) for u in all_urls]
+        try:
+            await asyncio.gather(*tasks)
+        except asyncio.CancelledError:
+            await session.close()
+
+    for l in links:
+        l.validate()
+
+
+#################################################################
+# Main logic.
+
+# Command-line args control which dissector files should be scanned.
+# If no args are given, just scan the epan/dissectors folder.
+parser = argparse.ArgumentParser(description='Check URL links in dissectors')
+parser.add_argument('--file', action='append',
+                    help='specify individual dissector file to test')
+parser.add_argument('--commits', action='store',
+                    help='last N commits to check')
+parser.add_argument('--open', action='store_true',
+                    help='check open files')
+parser.add_argument('--verbose', action='store_true',
+                    help='when enabled, show more output')
+
+args = parser.parse_args()
+
+
+def is_dissector_file(filename):
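+    # e.g. 'epan/dissectors/packet-tcp.c' matches, 'epan/prefs.c' does not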
+    p = re.compile(r'epan/dissectors/packet-.*\.c')
+    return p.match(filename)
+
+
+# Get files from wherever command-line args indicate.
+if args.file:
+    # Add specified file(s)
+    for f in args.file:
+        if not f.startswith('epan'):
+            f = os.path.join('epan', 'dissectors', f)
+        if not os.path.isfile(f):
+            print('Chosen file', f, 'does not exist.')
+            exit(1)
+        else:
+            files.append(f)
+            find_links_in_file(f)
+elif args.commits:
+    # Get files affected by specified number of commits.
+    command = ['git', 'diff', '--name-only', 'HEAD~' + args.commits]
+    files = [f.decode('utf-8')
+             for f in subprocess.check_output(command).splitlines()]
+    # Fetch links from files (dissector files only)
+    files = list(filter(is_dissector_file, files))
+    for f in files:
+        find_links_in_file(f)
+elif args.open:
+    # Unstaged changes.
+    command = ['git', 'diff', '--name-only']
+    files = [f.decode('utf-8')
+             for f in subprocess.check_output(command).splitlines()]
+    files = list(filter(is_dissector_file, files))
+    # Staged changes.
+    command = ['git', 'diff', '--staged', '--name-only']
+    files_staged = [f.decode('utf-8')
+                    for f in subprocess.check_output(command).splitlines()]
+    files_staged = list(filter(is_dissector_file, files_staged))
+    for f in files:
+        find_links_in_file(f)
+    for f in files_staged:
+        if f not in files:
+            find_links_in_file(f)
+            files.append(f)
+else:
+    # Find links from dissector folder.
+    find_links_in_folder(os.path.join(os.path.dirname(
+        __file__), '..', 'epan', 'dissectors'))
+
+
+# If scanning a subset of files, list them here.
+print('Examining:')
+if args.file or args.commits or args.open:
+    if files:
+        print(' '.join(files), '\n')
+    else:
+        print('No files to check.\n')
+else:
+    print('All dissector modules\n')
+
+asyncio.run(check_all_links(links))
+
+# Write failures to a file. Back up any previous first though.
+if os.path.exists('failures.txt'):
+    shutil.copyfile('failures.txt', 'failures_last_run.txt')
+with open('failures.txt', 'w') as f_f:
+    for l in links:
+        if l.tested and not l.success:
+            f_f.write(str(l) + '\n')
+# And successes
+with open('successes.txt', 'w') as f_s:
+    for l in links:
+        if l.tested and l.success:
+            f_s.write(str(l) + '\n')
+
+
+# Count and show overall stats.
+passed, failed = 0, 0
+for l in links:
+    if l.tested:
+        if l.success:
+            passed += 1
+        else:
+            failed += 1
+
+print('--------------------------------------------------------------------------------------------------')
+print(len(links), 'links checked:', passed, 'passed,', failed, 'failed')