author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-15 05:54:39 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-15 05:54:39 +0000
commit     267c6f2ac71f92999e969232431ba04678e7437e (patch)
tree       358c9467650e1d0a1d7227a21dac2e3d08b622b2 /bin/crashreportScraper.py
parent     Initial commit. (diff)
download   libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.tar.xz
           libreoffice-267c6f2ac71f92999e969232431ba04678e7437e.zip
Adding upstream version 4:24.2.0. (tag: upstream/4%24.2.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'bin/crashreportScraper.py')
-rwxr-xr-x  bin/crashreportScraper.py  221
1 file changed, 221 insertions(+), 0 deletions(-)
diff --git a/bin/crashreportScraper.py b/bin/crashreportScraper.py
new file mode 100755
index 0000000000..876570d3a0
--- /dev/null
+++ b/bin/crashreportScraper.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+# Use this script to retrieve information from https://crashreport.libreoffice.org
+# about a specific version of LibreOffice
+# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
+
+import argparse
+import requests
+from bs4 import BeautifulSoup
+import sys
+import os
+import math
+from datetime import datetime
+import urllib.parse
+
+def convert_str_to_date(value):
+    value = value.replace('.', '')
+    value = value.replace('March', 'Mar')
+    value = value.replace('April', 'Apr')
+    value = value.replace('June', 'Jun')
+    value = value.replace('July', 'Jul')
+    value = value.replace('Sept', 'Sep')
+    # reset the time leaving the date
+    value = ", ".join(value.split(", ")[:-1])
+    return datetime.strptime(value, '%b %d, %Y')
+
+def parse_version_url(url):
+    crashReports = {}
+
+    try:
+        html_text = requests.get(url, timeout=200).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+    except requests.exceptions.Timeout:
+        print("Timeout requesting " + url)
+        sys.exit(1)
+
+    table = soup.find("table", {"id": "data-table"}).tbody
+    for tr in table.find_all("tr"):
+        td_list = tr.find_all("td")
+        crashName = td_list[0].a.text.strip()
+        crashNumber = int(td_list[1].text.strip())
+        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
+        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
+        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]
+
+    return crashReports
+
+def parse_reports_and_get_most_recent_report_from_last_page(url):
+    try:
+        html_text = requests.get(url, timeout=200).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+    except requests.exceptions.Timeout:
+        print("Timeout")
+        raise
+
+    count = 0
+    try:
+        os_tab = soup.find("table", {"id": "os_tab"}).tbody
+    except AttributeError:
+        print("os_tab not found")
+        raise
+
+    tr_list = os_tab.find_all("tr")
+    for tr in tr_list:
+        td_list = tr.find_all("td")
+        count += int(td_list[1].text.strip())
+
+    # There are 50 reports on each page.
+    # Go to the last page based on the total count to get a recent report
+    last_page = math.ceil( count / 50 )
+
+    if last_page > 1:
+        url = url + "?page=" + str(last_page)
+        try:
+            html_text = requests.get(url, timeout=200).text
+            soup = BeautifulSoup(html_text, 'html.parser')
+        except requests.exceptions.Timeout:
+            print("Timeout")
+            raise
+
+    reports = soup.find("div", {"id": "reports"}).tbody
+    ID, currentID = "", ""
+    version, currentVersion = "", ""
+    OS, currentOS = "", ""
+
+    tr_list = reports.find_all("tr")
+    for tr in tr_list:
+        td_list = tr.find_all("td")
+
+        currentID = td_list[0].a.text.strip()
+        currentVersion = td_list[2].text.strip().split(': ')[1]
+        currentOS = td_list[3].text.strip()
+
+        # get most recent version
+        # symbols on linux are not very informative generally
+        if currentOS == "windows" and currentVersion > version:
+            version = currentVersion
+            ID = currentID
+            OS = currentOS
+
+    if not version:
+        version = currentVersion
+
+    if not ID:
+        ID = currentID
+
+    if not OS:
+        OS = currentOS
+
+    return count, ID, version, OS
+
+def parse_details_and_get_info(url, gitRepo):
+    try:
+        html_text = requests.get(url, timeout=200).text
+        soup = BeautifulSoup(html_text, 'html.parser')
+    except requests.exceptions.Timeout:
+        print("Timeout")
+        raise
+
+    details = soup.find("div", {"id": "details"}).tbody
+    tr_list = details.find_all("tr")
+    reason = tr_list[8].td.text.strip()
+
+    stack = ""
+    codeLine = ""
+
+    count = 0
+    frames = soup.find("div", {"id": "frames"}).tbody
+    for tr in frames.find_all("tr"):
+        td_list = tr.find_all("td")
+        source = td_list[3].text.strip()
+        if source and count <= 10:
+            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
+            stack += source + "\n"
+            count += 1
+
+            codeFile = source.split(":")[0]
+            codeNumber = source.split(":")[1]
+            try:
+                with open(os.path.join(gitRepo, codeFile)) as f:
+                    lines = f.readlines()
+                    for index, line in enumerate(lines):
+                        if index + 1 == int(codeNumber):
+                            codeLine += line.strip().replace("\"", "'") + "\n"
+            except FileNotFoundError:
+                codeLine += "\n"
+                continue
+
+    if stack:
+        #multiline
+        stack = "\"" + stack + "\""
+
+    if codeLine:
+        #multiline
+        codeLine = "\"" + codeLine + "\""
+
+    metadata = soup.find("div", {"id": "metadata"}).tbody
+    tr_list = metadata.find_all("tr")
+    unoCommands = ""
+    for tr in tr_list:
+        if tr.th.text.strip() == "Last-4-Uno-Commands":
+            unoCommands = tr.td.text.strip()
+
+    return reason, stack, codeLine, unoCommands
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--version', action='store', dest="version", required=True)
+    parser.add_argument('--repository', action="store", dest="repository", required=True)
+
+    args = parser.parse_args()
+
+    crashes = parse_version_url(
+            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")
+
+    print(str(len(crashes)) + " crash reports in version " + args.version)
+
+    crashesInFile = []
+    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
+    print("Using " + fileName)
+
+    bInsertHeader = False
+    if os.path.exists(fileName):
+        with open(fileName, "r") as f:
+            lines = f.readlines()
+            for line in lines:
+                crashesInFile.append(line.split("\t")[0])
+    else:
+        bInsertHeader = True
+
+    with open(fileName, "a") as f:
+        if bInsertHeader:
+            line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
+                "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
+            f.write(line)
+            f.flush()
+
+        for k, lDate in crashes.items():
+            if k not in crashesInFile:
+                print("Parsing " + k)
+                try:
+                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
+                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
+                    crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
+                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
+                    ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
+                    line = '\t'.join([k, str(ratio), str(crashCount), lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
+                        crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
+                    f.write(line)
+                    f.flush()
+                except (requests.exceptions.Timeout, AttributeError):
+                    continue
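A note on the date handling: convert_str_to_date() expects the timestamps as rendered by crashreport.libreoffice.org, strips the abbreviation dots and long month names, and drops the trailing time-of-day component before parsing. A minimal sketch of the round trip (the sample input string is an assumption inferred from the replacements the function performs, not taken from the site):

    >>> convert_str_to_date("Sept. 5, 2021, 3:14 p.m.")
    datetime.datetime(2021, 9, 5, 0, 0)
    # '.' removal and 'Sept' -> 'Sep' leave "Sep 5, 2021" once the
    # time component is split off, matching the '%b %d, %Y' pattern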
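One caveat in parse_reports_and_get_most_recent_report_from_last_page(): "currentVersion > version" compares version strings lexicographically, so "7.2.10.1" sorts before "7.2.9.3" and the newest report can be missed once a component reaches two digits. A minimal sketch of a numeric comparison, assuming purely numeric dotted versions (the helper name is hypothetical, not part of the script):

    def version_key(v):
        # "7.2.10.1" -> (7, 2, 10, 1): components compare as integers
        return tuple(int(part) for part in v.split("."))

    assert version_key("7.2.10.1") > version_key("7.2.9.3")
    assert not ("7.2.10.1" > "7.2.9.3")  # string comparison gets this wrong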
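The script appends one row per signature to a tab-separated crashes_<version>.csv file, with Ratio computed as crashes per day over the signature's reporting window: e.g. 120 crashes first seen 24/01/01 and last seen 24/01/30 give round(120 / 30, 2) = 4.0. A minimal sketch of reading the output back with the standard csv module ("crashes_7_2_0_4.csv" is a hypothetical file name produced by --version 7.2.0.4, and clean parsing of the quoted multiline Stack/Code Lines fields is an assumption):

    import csv

    with open("crashes_7_2_0_4.csv", newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            # header row first, then one row per crash signature
            name, ratio, count = row[0], row[1], row[2]
            print(name, ratio, count)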