diff options
Diffstat (limited to '')
-rwxr-xr-x | bin/get-forum-attachments.py | 167 |
1 files changed, 167 insertions, 0 deletions
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py new file mode 100755 index 000000000..4e46befef --- /dev/null +++ b/bin/get-forum-attachments.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import argparse +import configparser +import hashlib +import magic +import os +import requests +import shutil +import sys +import tempfile + +from bs4 import BeautifulSoup +from attachment_mimetypes import mimetypes +from concurrent.futures import ThreadPoolExecutor, as_completed +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +forums = { + # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages + 'en': "https://forum.openoffice.org/en/forum", + 'es': "https://forum.openoffice.org/es/forum", + 'fr': "https://forum.openoffice.org/fr/forum", + 'hu': "https://forum.openoffice.org/hu/forum", + 'it': "https://forum.openoffice.org/it/forum", + 'ja': "https://forum.openoffice.org/ja/forum", + 'nl': "https://forum.openoffice.org/nl/forum", + 'pl': "https://forum.openoffice.org/pl/forum", + 'vi': "https://forum.openoffice.org/vi/forum", + 'tr': "https://forum.libreoffice.org.tr", + 'de': "https://www.openoffice-forum.de", + 'de2': "https://www.libreoffice-forum.de", + 'de3': "https://de.openoffice.info", + # Others + 'mso-en': "https://www.msofficeforums.com", + 'mso-de': "https://www.ms-office-forum.net/forum", +} + +def do_login(session, url, configFile): + config = configparser.ConfigParser() + + config.read(configFile) + username = config.get('login', 'username') + password = config.get('login', 'password') + resp = session.post(url + '/login.php?do=login', { + 'vb_login_username': username, + 'vb_login_password': '', + 'vb_login_md5password': hashlib.md5(password.encode()).hexdigest(), + 'vb_login_md5password_utf': hashlib.md5(password.encode()).hexdigest(), + 'cookieuser': 1, + 'do': 'login', + 's': '', + 'securitytoken': 'guest' + }) + + if resp.status_code != 200: + return False + + soup = BeautifulSoup(resp.content, 'lxml') + for p in soup.find_all("p"): + if 'Thank you for logging in' in p.get_text(): + return True + elif 'Danke für Ihre Anmeldung' in p.get_text(): + return True + + return False + +def get_attachments_from_url(lang, url, pathes): + + print("Checking " + url) + + startIndex = 0 + + # Keep the index and resume from there + indexFile = os.path.join(pathes.outdir, lang + ".index") + if os.path.isfile(indexFile): + with open(indexFile) as f: + startIndex = int(f.readline().rstrip()) + 1 + else: + if lang == 'hu': + startIndex = 1300 + + session = requests.Session() + retry = Retry(connect=3, backoff_factor=0.5) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + + if lang.startswith("mso"): + if not do_login(session, url, pathes.config): + print("Can't log in to " + url) + return + + invalidCount = 0 + for i in range(startIndex, 999999): + if lang.startswith("mso"): + fileUrl = url + "/attachment.php?attachmentid=" + str(i) + else: + fileUrl = url + "/download/file.php?id=" + str(i) + + h = session.head(fileUrl) + header = h.headers + content_type = header.get('content-type') + if "html" in content_type: + # Let's assume this is an invalid file link + invalidCount += 1 + + # Let's assume, if we get 100 invalid files, that there are no more files + if invalidCount == 100: + print("No more attachments found in " + url) + break + else: + invalidCount = 0 + + r = session.get(fileUrl, allow_redirects=True) + with tempfile.NamedTemporaryFile() as tmp: + tmp.write(r.content) + mimetype = magic.from_file(tmp.name, mime=True) + if mimetype in mimetypes: + suffix = mimetypes[mimetype] + suffixDir = os.path.join(pathes.outdir, suffix) + try: + os.mkdir(suffixDir) + except: + pass + + download = os.path.join(suffixDir, + "forum-" + lang + '-' + str(i) + '.' + suffix) + + print("Downloading as " + download) + shutil.copy(tmp.name, download) + + # Save the index + with open(indexFile, 'w') as f: + f.write(str(i)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--outdir', action='store', dest="outdir", required=True) + parser.add_argument('--config', action="store", dest="config", required=True) + + pathes = parser.parse_args() + + if not os.path.exists(pathes.outdir) or os.path.isfile(pathes.outdir): + print("Outdir folder doesn't exists") + sys.exit(1) + elif not os.path.exists(pathes.config) or not os.path.isfile(pathes.config): + print("Config file doesn't exists") + sys.exit(1) + + processes = [] + # 10 at a time seems to work fine + with ThreadPoolExecutor(max_workers=10) as executor: + for lang, url in forums.items(): + processes.append(executor.submit(get_attachments_from_url, lang, url, pathes)) + + for task in as_completed(processes): + result = task.result() + if result: + print(result) |