diff options
Diffstat (limited to 'bin/get-forum-attachments.py')
-rwxr-xr-x | bin/get-forum-attachments.py | 210 |
1 files changed, 210 insertions, 0 deletions
#!/usr/bin/env python3

# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Download attachments from a fixed list of office-suite forums.

Two modes are supported:

* default: walk the attachment ids of every forum listed in ``forums`` in
  parallel and store each attachment whose sniffed MIME type is a key of
  ``attachment_mimetypes.mimetypes`` under ``<outdir>/<suffix>/``;
* ``--get-file forum-<forum>-<id>.<suffix>``: re-download one attachment
  into ``<outdir>``.
"""

import argparse
import configparser
import hashlib
import magic
import os
import requests
import shutil
import sys
import tempfile

from bs4 import BeautifulSoup
from attachment_mimetypes import mimetypes
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# forum : [url, doLogin, startIndex]
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    # Others
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    'mso-en4': ["https://www.excelforum.com", True, 5100],
}

# Exclusive upper bound of the attachment ids that are probed.
MAX_ATTACHMENT_ID = 999999

# Number of consecutive HTML answers after which a forum is considered
# to have no more attachments.
MAX_CONSECUTIVE_INVALID = 200


def get_attachment_query(forum):
    """Return the URL path + query prefix used to fetch an attachment by id.

    Forum keys starting with "mso" use ``/attachment.php?attachmentid=``;
    every other forum uses ``/download/file.php?id=``.
    """
    if forum.startswith("mso"):
        return "/attachment.php?attachmentid="
    return "/download/file.php?id="


def createSession():
    """Return a ``requests.Session`` with a retrying adapter on http/https.

    The adapter uses ``Retry(connect=3, backoff_factor=0.5)``.
    """
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def login(session, url, configFile):
    """Log ``session`` in to the forum at ``url``.

    The user name and password are read from the ``[login]`` section of
    ``configFile`` and posted to ``<url>/login.php?do=login`` using the
    ``vb_login_*`` form fields (presumably a vBulletin login form).

    Returns True when the response has status 200 and one of its ``<p>``
    elements contains a known "logged in" message, False otherwise.
    """
    config = configparser.ConfigParser()

    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')

    # Both md5 fields carry the same digest, so compute it only once.
    passwordHash = hashlib.md5(password.encode()).hexdigest()
    resp = session.post(url + '/login.php?do=login', {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': passwordHash,
        'vb_login_md5password_utf': passwordHash,
        'cookieuser': 1,
        'do': 'login',
        's': '',
        'securitytoken': 'guest'
    })

    if resp.status_code != 200:
        return False

    successMessages = ('Thank you for logging in', 'Danke für Ihre Anmeldung')
    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all("p"):
        text = p.get_text()
        if any(message in text for message in successMessages):
            return True

    return False


def get_attachments_from_url(forum, config, args):
    """Download every recognised attachment of one forum.

    forum  -- key of the forum in ``forums``; also used in the file names.
    config -- the ``[url, doLogin, startIndex]`` entry of that forum.
    args   -- parsed command line; ``args.outdir`` and ``args.config`` are
              used.

    The id of the last stored attachment is kept in
    ``<outdir>/<forum>.index`` so that a later run resumes after it.
    """
    url, doLogin, startIndex = config[0], config[1], config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(args.outdir, forum + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            savedIndex = f.readline().strip()
        try:
            startIndex = int(savedIndex) + 1
        except ValueError:
            # An empty or damaged index file must not abort the whole forum;
            # fall back to the configured start index.
            print("Ignoring invalid index file " + indexFile)

    session = createSession()

    if doLogin and not login(session, url, args.config):
        print("Can't log in to " + url)
        return

    query = get_attachment_query(forum)
    invalidCount = 0
    for i in range(startIndex, MAX_ATTACHMENT_ID):
        fileUrl = url + query + str(i)

        h = session.head(fileUrl)
        # The header may be absent; treat that as "not HTML" and let the
        # MIME sniffing below decide instead of failing on None.
        content_type = h.headers.get('content-type') or ''
        if "html" in content_type:
            # Let's assume this is an invalid file link
            invalidCount += 1

            # Let's assume, if we get MAX_CONSECUTIVE_INVALID invalid files
            # in a row, that there are no more files
            if invalidCount == MAX_CONSECUTIVE_INVALID:
                print("No more attachments found in " + url)
                break
            continue

        invalidCount = 0

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is read again through its name below, so the
            # buffered data has to reach the disk first.
            tmp.flush()

            mimetype = magic.from_file(tmp.name, mime=True)
            if mimetype not in mimetypes:
                continue

            suffix = mimetypes[mimetype]
            suffixDir = os.path.join(args.outdir, suffix)
            os.makedirs(suffixDir, exist_ok=True)

            download = os.path.join(
                suffixDir, "forum-" + forum + '-' + str(i) + '.' + suffix)

            print("Downloading as " + download)
            shutil.copy(tmp.name, download)

            # Save the index
            # NOTE(review): placement reconstructed from a flattened diff;
            # the index is written after each stored attachment - confirm.
            with open(indexFile, 'w') as f:
                f.write(str(i))


def _get_single_file(args):
    """Download the single attachment named by ``args.fileName``.

    The name has the form ``forum-<forum>-<id>[.<suffix>]`` where
    ``<forum>`` is a key of ``forums`` (keys such as ``mso-en`` contribute
    one extra dash). Exits with status 1 on a malformed name, an unknown
    forum or a failed login.
    """
    fileNameSplit = args.fileName.split("-")
    if fileNameSplit[0] != "forum" or len(fileNameSplit) not in (3, 4):
        print("Incorrect file name")
        sys.exit(1)

    forum = fileNameSplit[1]
    fileId = fileNameSplit[2]
    if forum == "mso":
        # "mso-*" forum keys contain a dash, so the id is the fourth part.
        if len(fileNameSplit) != 4:
            print("Incorrect file name")
            sys.exit(1)
        forum += "-" + fileNameSplit[2]
        fileId = fileNameSplit[3]

    if forum not in forums:
        print("Unknown forum " + forum)
        sys.exit(1)

    url = forums[forum][0]
    fileUrl = url + get_attachment_query(forum) + fileId.split(".")[0]

    session = createSession()

    doLogin = forums[forum][1]
    if doLogin and not login(session, url, args.config):
        print("Can't log in to " + url)
        sys.exit(1)

    r = session.get(fileUrl, allow_redirects=True)
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(r.content)
        # Make sure the copy below sees the complete content.
        tmp.flush()

        download = os.path.join(args.outdir, args.fileName)

        print("Downloading " + fileUrl + " as " + download)
        shutil.copy(tmp.name, download)


def _get_all_forums(args):
    """Run ``get_attachments_from_url`` for every forum on a thread pool.

    The pool size comes from the ``PARALLELISM`` environment variable.
    """
    processes = []
    # by default, 10 at a time seems to work fine
    with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
        for forum, config in forums.items():
            processes.append(executor.submit(get_attachments_from_url, forum, config, args))

        for task in as_completed(processes):
            # result() re-raises any exception of the worker.
            result = task.result()
            if result:
                print(result)


def main():
    """Parse the command line, validate the paths and run the chosen mode."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)
    parser.add_argument('--get-file', action="store", dest="fileName", required=False)

    args = parser.parse_args()

    if not os.path.exists(args.outdir) or os.path.isfile(args.outdir):
        print("Outdir folder doesn't exists")
        sys.exit(1)
    elif not os.path.exists(args.config) or not os.path.isfile(args.config):
        print("Config file doesn't exists")
        sys.exit(1)

    if not args.fileName:
        _get_all_forums(args)
    else:
        _get_single_file(args)


if __name__ == '__main__':
    main()