summaryrefslogtreecommitdiffstats
path: root/bin/get-forum-attachments.py
diff options
context:
space:
mode:
Diffstat (limited to 'bin/get-forum-attachments.py')
-rwxr-xr-xbin/get-forum-attachments.py210
1 files changed, 210 insertions, 0 deletions
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py
new file mode 100755
index 0000000000..74827fff92
--- /dev/null
+++ b/bin/get-forum-attachments.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import argparse
+import configparser
+import hashlib
+import magic
+import os
+import requests
+import shutil
+import sys
+import tempfile
+
+from bs4 import BeautifulSoup
+from attachment_mimetypes import mimetypes
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+
+forums = {
+ # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
+ 'en': ["https://forum.openoffice.org/en/forum", False, 0],
+ 'es': ["https://forum.openoffice.org/es/forum", False, 0],
+ 'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
+ 'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
+ 'it': ["https://forum.openoffice.org/it/forum", False, 0],
+ 'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
+ 'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
+ 'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
+ 'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
+ 'tr': ["https://forum.libreoffice.org.tr", False, 0],
+ 'de': ["https://www.openoffice-forum.de", False, 0],
+ 'de2': ["https://www.libreoffice-forum.de", False, 0],
+ 'de3': ["https://de.openoffice.info", False, 0],
+ # Others
+ 'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
+ 'mso-en': ["https://www.msofficeforums.com", True, 0],
+ 'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
+ 'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
+ 'mso-en4': ["https://www.excelforum.com", True, 5100],
+ # forum : [url, doLogin, startIndex]
+}
+
+def get_attachment_query(forum):
+ if forum.startswith("mso"):
+ return "/attachment.php?attachmentid="
+ else:
+ return "/download/file.php?id="
+
+def createSession():
+ session = requests.Session()
+ retry = Retry(connect=3, backoff_factor=0.5)
+ adapter = HTTPAdapter(max_retries=retry)
+ session.mount('http://', adapter)
+ session.mount('https://', adapter)
+ return session
+
+def login(session, url, configFile):
+ config = configparser.ConfigParser()
+
+ config.read(configFile)
+ username = config.get('login', 'username')
+ password = config.get('login', 'password')
+ resp = session.post(url + '/login.php?do=login', {
+ 'vb_login_username': username,
+ 'vb_login_password': '',
+ 'vb_login_md5password': hashlib.md5(password.encode()).hexdigest(),
+ 'vb_login_md5password_utf': hashlib.md5(password.encode()).hexdigest(),
+ 'cookieuser': 1,
+ 'do': 'login',
+ 's': '',
+ 'securitytoken': 'guest'
+ })
+
+ if resp.status_code != 200:
+ return False
+
+ soup = BeautifulSoup(resp.content, 'lxml')
+ for p in soup.find_all("p"):
+ if 'Thank you for logging in' in p.get_text():
+ return True
+ elif 'Danke für Ihre Anmeldung' in p.get_text():
+ return True
+
+ return False
+
+def get_attachments_from_url(forum, config, args):
+ url = config[0]
+ doLogin = config[1]
+ startIndex = config[2]
+
+ print("Checking " + url)
+
+ # Keep the index and resume from there
+ indexFile = os.path.join(args.outdir, forum + ".index")
+ if os.path.isfile(indexFile):
+ with open(indexFile) as f:
+ startIndex = int(f.readline().rstrip()) + 1
+
+ session = createSession()
+
+ if doLogin:
+ if not login(session, url, args.config):
+ print("Can't log in to " + url)
+ return
+
+ invalidCount = 0
+ for i in range(startIndex, 999999):
+ fileUrl = url + get_attachment_query(forum) + str(i)
+
+ h = session.head(fileUrl)
+ header = h.headers
+ content_type = header.get('content-type')
+ if "html" in content_type:
+ # Let's assume this is an invalid file link
+ invalidCount += 1
+
+ # Let's assume, if we get 200 invalid files, that there are no more files
+ if invalidCount == 200:
+ print("No more attachments found in " + url)
+ break
+ else:
+ invalidCount = 0
+
+ r = session.get(fileUrl, allow_redirects=True)
+ with tempfile.NamedTemporaryFile() as tmp:
+ tmp.write(r.content)
+ mimetype = magic.from_file(tmp.name, mime=True)
+ if mimetype in mimetypes:
+ suffix = mimetypes[mimetype]
+ suffixDir = os.path.join(args.outdir, suffix)
+ try:
+ os.mkdir(suffixDir)
+ except:
+ pass
+
+ download = os.path.join(suffixDir,
+ "forum-" + forum + '-' + str(i) + '.' + suffix)
+
+ print("Downloading as " + download)
+ shutil.copy(tmp.name, download)
+
+ # Save the index
+ with open(indexFile, 'w') as f:
+ f.write(str(i))
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--outdir', action='store', dest="outdir", required=True)
+ parser.add_argument('--config', action="store", dest="config", required=True)
+ parser.add_argument('--get-file', action="store", dest="fileName", required=False)
+
+ args = parser.parse_args()
+
+ if not os.path.exists(args.outdir) or os.path.isfile(args.outdir):
+ print("Outdir folder doesn't exists")
+ sys.exit(1)
+ elif not os.path.exists(args.config) or not os.path.isfile(args.config):
+ print("Config file doesn't exists")
+ sys.exit(1)
+
+ if not args.fileName:
+ processes = []
+ # by default, 10 at a time seems to work fine
+ with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
+ for forum, config in forums.items():
+ processes.append(executor.submit(get_attachments_from_url, forum, config, args))
+
+ for task in as_completed(processes):
+ result = task.result()
+ if result:
+ print(result)
+ else:
+ fileNameSplit = args.fileName.split("-")
+ if fileNameSplit[0] != "forum" or (len(fileNameSplit) != 3 and len(fileNameSplit) != 4):
+ print("Incorrect file name")
+ sys.exit(1)
+
+ forum = fileNameSplit[1]
+ fileId = fileNameSplit[2]
+ if fileNameSplit[1] == "mso":
+ forum += "-" + fileNameSplit[2]
+ fileId = fileNameSplit[3]
+
+ url = forums[forum][0]
+ fileUrl = url + get_attachment_query(forum) + fileId.split(".")[0]
+
+ session = createSession()
+
+ doLogin = forums[forum][1]
+ if doLogin:
+ if not login(session, url, args.config):
+ print("Can't log in to " + url)
+ sys.exit(1)
+
+ r = session.get(fileUrl, allow_redirects=True)
+ with tempfile.NamedTemporaryFile() as tmp:
+ tmp.write(r.content)
+
+ download = os.path.join(args.outdir, args.fileName)
+
+ print("Downloading " + fileUrl + " as " + download)
+ shutil.copy(tmp.name, download)
+