diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
commit | ed5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch) | |
tree | 7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /bin/get-bugzilla-attachments-by-mimetype | |
parent | Initial commit. (diff) | |
download | libreoffice-upstream/4%7.4.7.tar.xz libreoffice-upstream/4%7.4.7.zip |
Adding upstream version 4:7.4.7.upstream/4%7.4.7upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'bin/get-bugzilla-attachments-by-mimetype')
-rwxr-xr-x | bin/get-bugzilla-attachments-by-mimetype | 430 |
1 files changed, 430 insertions, 0 deletions
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
new file mode 100755
index 000000000..609e6683a
--- /dev/null
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -0,0 +1,430 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs with bug-ids as names with
# prefixes to indicate which bug-tracker, e.g.
#
# fdo-bugid-X.suffix
# rhbz-bugid-X.suffix
# moz-bugid-X.suffix
#
# where X is the n'th attachment of that type in the bug
#
# The results are stored in the current directory, categorized by the
# extension of the downloaded file. When a file already exists, it is assumed
# it is already downloaded by a previous run, and up-to-date.
+ +from __future__ import print_function +import feedparser +import base64 +import datetime +import glob +import re +import os, os.path +import stat +import sys +import threading +try: + import queue +except: + import Queue as queue +try: + from urllib.request import urlopen +except: + from urllib import urlopen +try: + import xmlrpc.client as xmlrpclib +except: + import xmlrpclib +from xml.dom import minidom +from xml.sax.saxutils import escape +from attachment_mimetypes import mimetypes + +def urlopen_retry(url): + maxretries = 3 + for i in range(maxretries + 1): + try: + return urlopen(url) + except IOError as e: + print("caught IOError: " + str(e)) + if maxretries == i: + raise + print("retrying...") + +def get_from_bug_url_via_xml(url, mimetype, prefix, suffix): + id = url.rsplit('=', 2)[1] + print("id is " + prefix + id + " " + suffix) + print("parsing " + id) + sock = urlopen_retry(url+"&ctype=xml") + dom = minidom.parse(sock) + sock.close() + attachmentid=0 + for attachment in dom.getElementsByTagName('attachment'): + attachmentid += 1 + print(" mimetype is", end=' ') + for node in attachment.childNodes: + if node.nodeName == 'type': + # check if attachment is deleted + if not node.firstChild: + print('deleted attachment, skipping') + continue + + print(node.firstChild.nodeValue, end=' ') + if node.firstChild.nodeValue.lower() != mimetype.lower(): + print('skipping') + break + elif node.nodeName == 'data': + # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml) + if not node.firstChild: + print('deleted attachment, skipping') + continue + + download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' 
+ suffix + if os.path.isfile(download): + print("assuming " + download + " is up to date") + continue + + # prevent re-downloading FDO attachments from TDF + if prefix == "tdf" and int(id) < 88776: + fdodownload = download.replace("tdf", "fdo") + if os.path.isfile(fdodownload): + print("assuming FDO " + fdodownload + " is up to date") + continue + + print('downloading as ' + download) + tmpfile = download + ".tmp" + f = open(tmpfile, 'wb') + f.write(base64.b64decode(node.firstChild.nodeValue)) + f.close() + os.rename(tmpfile, download) + break + +def get_novell_bug_via_xml(url, mimetype, prefix, suffix): + id = url.rsplit('=', 2)[1] + print("id is " + prefix + id + " " + suffix) + print("parsing " + id) + sock = urlopen_retry(url+"&ctype=xml") + dom = minidom.parse(sock) + sock.close() + attachmentid=0 + for comment in dom.getElementsByTagName('thetext'): + commentText = comment.firstChild.nodeValue + match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText) + if not match: + continue + + attachmentid += 1 + + download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' 
+ suffix + if os.path.isfile(download): + print("assuming " + download + " is up to date") + continue + + realAttachmentId = match.group(1) + handle = urlopen_retry(novellattach + realAttachmentId) + if not handle: + print("attachment %s is not accessible" % realAttachmentId) + continue + print(" mimetype is", end=' ') + + info = handle.info() + if info.get_content_type: + remoteMime = info.get_content_type() + else: + remoteMime = info.gettype() + print(remoteMime, end=' ') + if remoteMime != mimetype: + print("skipping") + continue + + print('downloading as ' + download) + tmpfile = download + ".tmp" + f = open(tmpfile, 'wb') + f.write(handle.read()) + f.close() + os.rename(tmpfile, download) + +def create_query(mimetype): + query = dict() + query['query_format']='advanced' + query['field0-0-0']='attachments.mimetype' + query['type0-0-0']='equals' + query['value0-0-0']=mimetype + return query + +def get_downloaded_files(prefix, suffix): + return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix))) + +def get_file_bz_ids(files, prefix): + return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files]) + +def get_changed_date(files): + newest = max([os.stat(f)[stat.ST_MTIME] for f in files]) + # Subtract a day to avoid timezone differences. The worst thing that + # can happen is that we are going to process more bugs than necessary. 
+ return datetime.date.fromtimestamp(newest - 24 * 60 * 60) + +def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix): + try: + os.mkdir(suffix) + except: + pass + + def process(query, full, have=[]): + try: + proxy = xmlrpclib.ServerProxy(rpcurl) + result = proxy.Bug.search(query) + bugs = result['bugs'] + print(str(len(bugs)) + ' bugs to process') + + if full: + available = set([str(bug['id']) for bug in bugs]) + # we already have files from all available bugs + if available.difference(set(have)) == set(): + print("assuming all downloaded files are up to date") + return + + for bug in bugs: + url = showurl + str(bug['id']) + get_from_bug_url_via_xml(url, mimetype, prefix, suffix) + except xmlrpclib.Fault as err: + print("A fault occurred") + print("Fault code: %s" % err.faultCode) + print(err.faultString) + + query = create_query(mimetype) + query['column_list']='bug_id' + + files = get_downloaded_files(prefix, suffix) + + if files != []: + print('looking for updated bugs having %s attachment(s)' % mimetype) + query_changed = query.copy() + query_changed['field0-1-0'] = 'days_elapsed' + query_changed['type0-1-0'] = 'lessthaneq' + query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days) + process(query_changed, False) + + print('looking for all bugs having %s attachment(s)' % mimetype) + process(query, True, get_file_bz_ids(files, prefix)) + +def get_through_rss_query(queryurl, mimetype, prefix, suffix): + try: + os.mkdir(suffix) + except: + pass + + #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla + #get_novell_bug_via_xml function is a workaround for that situation + get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml + + def process(query, full, have=[]): + url = queryurl + '?' 
+ '&'.join(['='.join(kv) for kv in query.items()]) + print('url is ' + url) + d = feedparser.parse(url) + print(str(len(d['entries'])) + ' bugs to process') + + entries = [] + for entry in d['entries']: + bugid = entry['id'].split('=')[-1] + entries.append(entry) + + if full: + available = set([str(entry['id'].split('=')[-1]) for entry in entries]) + # we already have files from all available bugs + if available.difference(set(have)) == set(): + print("assuming all downloaded files are up to date") + return + + for entry in entries: + try: + get_bug_function(entry['id'], mimetype, prefix, suffix) + except KeyboardInterrupt: + raise # Ctrl+C should work + except: + print(entry['id'] + " failed: " + str(sys.exc_info()[0])) + pass + + query = create_query(escape(mimetype.replace("+","%2B"))) + query['ctype'] = 'rss' + + files = get_downloaded_files(prefix, suffix) + + if files != []: + print('looking for updated bugs having %s attachment(s)' % mimetype) + query_changed = query.copy() + query_changed['field0-1-0'] = 'delta_ts' + query_changed['type0-1-0'] = 'greaterthaneq' + query_changed['value0-1-0'] = get_changed_date(files).isoformat() + process(query_changed, False) + + print('looking for all bugs having %s attachment(s)' % mimetype) + process(query, True, get_file_bz_ids(files, prefix)) + +#since searching bugs having attachments with specific mimetypes is not available in launchpad API +#we're iterating over all bugs of the most interesting source packages +launchpad_pkgs = ( + "abiword", + "calibre", + "calligra", + "gnumeric", + "inkscape", + "koffice", + "libabw", + "libcdr", + "libe-book", + "libetonyek", + "libfreehand", + "libmspub", + "libmwaw", + "liborcus", + "libpagemaker", + "libreoffice", + "libvisio", + "libwpd", + "libwpg", + "libwps", + "openoffice.org", + "python-uniconvertor", + "scribus", + "sk1", + "unoconv", +) + +def get_launchpad_bugs(prefix): + #launchpadlib python module is required to download launchpad attachments + from 
launchpadlib.launchpad import Launchpad + + launchpad = Launchpad.login_anonymously("attachmentdownload", "production") + ubuntu = launchpad.distributions["ubuntu"] + + for pkg in launchpad_pkgs: + srcpkg = ubuntu.getSourcePackage(name=pkg) + pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"]) + + for bugtask in pkgbugs: + bug = bugtask.bug + id = str(bug.id) + print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50]) + attachmentid = 0 + for attachment in bug.attachments: + attachmentid += 1 + handle = attachment.data.open() + if not handle.content_type in mimetypes: + #print "skipping" + continue + + suffix = mimetypes[handle.content_type] + if not os.path.isdir(suffix): + try: + os.mkdir(suffix) + except: + pass + + download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix + + if os.path.isfile(download): + print("assuming " + id + " is up to date") + break + + print('mimetype is ' + handle.content_type + ' downloading as ' + download) + + tmpfile = download + ".tmp" + f = open(tmpfile, "wb") + f.write(handle.read()) + f.close() + os.rename(tmpfile, download) + +rss_bugzillas = ( + ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword + ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ), + ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ), + ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric + ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra + ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ), + ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ), + # It seems something has changed and it is no longer possible to + # download any files from there. 
+ # NOTE: I am leaving it in the list, commented out, just so someone + # does not add it back immediately .-) + # 'novell': 'https://bugzilla.novell.com/buglist.cgi', +# note: running this script against bz.apache.org apparently causes one's IP +# to be banned or something; you won't get new files in any case... +# ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ), + ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ), +) + +redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi' +redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id=' + +#Novell Bugzilla requires users to log in, in order to get details of the bugs such as attachment bodies etc. +#As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually +#python-bugzilla claims that it supports Novell bugzilla login but it's not working right now and novell bugzilla login +#system is a nightmare +novellattach = 'https://bugzilla.novell.com/attachment.cgi?id=' + +class manage_threads(threading.Thread): + def run(self): + #print(threading.current_thread().get_ident()) + while 1: + # Try to receive a job from queue + try: + # Get job from queue + # Use job parameters to call our query + # Then let the queue know we are done with this job + (uri, mimetype, prefix, extension) = jobs.get(True,6) + try: + get_through_rss_query(uri, mimetype, prefix, extension) + finally: + jobs.task_done() + except KeyboardInterrupt: + raise # Ctrl+C should work + except queue.Empty: + break + +def generate_multi_threading(): + + # Initialize threads + for i in range(max_threads): + manage_threads().start() + + for (prefix, uri) in rss_bugzillas: + + # Create a job for every mimetype for a bugzilla + for (mimetype,extension) in mimetypes.items(): + # It seems that bugzilla has problems returning that many results + # (10000 results is probably a limit set somewhere) so we always + # end processing the complete list. 
+ if mimetype == 'text/html' and prefix == 'moz': + continue + + jobs.put([uri, mimetype, prefix, extension], block=True) + print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix) + + # Continue when all mimetypes are done for a bugzilla + print("STARTED all bugtracker " + prefix) + + jobs.join() + +max_threads = 20 # Number of threads to create, (1 = without multi-threading) +jobs = queue.Queue() + +generate_multi_threading() + +for (mimetype,extension) in mimetypes.items(): + get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension) + +try: + get_launchpad_bugs("lp") +except ImportError: + print("launchpadlib unavailable, skipping Ubuntu tracker") + +# vim:set shiftwidth=4 softtabstop=4 expandtab: |