120 lines
4 KiB
Python
Executable file
120 lines
4 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
|
|
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
"""
|
|
This script gathers all the required PDFs listed in the test_manifest.json
file found in the Mozilla pdf.js repo, copying or downloading each one.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import pathlib
|
|
import shutil
|
|
|
|
import requests
|
|
from redo import retriable
|
|
|
|
|
|
def log(msg):
    """Print a message to stdout, prefixed with this script's name.

    :param msg: Message to print; any object is rendered with ``str()``.
    """
    # An f-string is used rather than ``"%s" % msg`` because the ``%``
    # operator unpacks tuple arguments: a 2-tuple would raise TypeError and
    # a 1-tuple would be silently unwrapped.
    print(f"fetch-talos-pdf: {msg}")
|
|
|
|
|
|
@retriable(attempts=7, sleeptime=5, sleepscale=2)
def fetch_file(url, filepath):
    """Download a file from the given url to a given file.

    The body is streamed to a temporary ``<name>.part`` file next to the
    destination and only renamed onto ``filepath`` once the download has
    completed, so a failed attempt never leaves a truncated file at the
    final location.

    :param str url: URL to download file from.
    :param Path filepath: Location to output the downloaded file
        (includes the name of the file).
    :raises requests.HTTPError: If the server answers with an error status.
    """
    size = 4096
    # Seconds to wait for the server to connect/send data; without a
    # timeout a stalled connection would block forever and the retry
    # decorator would never get a chance to run again.
    timeout = 60
    partial_path = filepath.with_name(filepath.name + ".part")

    try:
        # Using the response as a context manager releases the connection
        # even when the body is not fully consumed (stream=True).
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()

            with partial_path.open("wb") as fd:
                for chunk in r.iter_content(size):
                    fd.write(chunk)

        # Move the complete download into place in a single step.
        partial_path.replace(filepath)
    finally:
        # After a successful rename the partial file no longer exists; on
        # failure, remove whatever was written so far.
        if partial_path.exists():
            partial_path.unlink()
|
|
|
|
|
|
def fetch_talos_pdf_link(pdf_path, output_file):
    """Download the PDF referenced by a link file.

    The file at ``pdf_path`` is read as text and its contents, stripped of
    surrounding whitespace, are used as the URL to download from.

    :param Path pdf_path: Path to the file holding the download URL.
    :param Path output_file: Destination (including the file name) for the
        downloaded PDF.
    """
    link_contents = pdf_path.read_text()
    url = link_contents.strip()

    log(f"Downloading from PDF link: {url}")
    fetch_file(url, output_file)
|
|
|
|
|
|
def gather_talos_pdf(test_folder, pdf_info, output_dir):
    """Place a single manifest PDF into the output directory.

    Entries whose ``"talos"`` key is falsy are ignored. A PDF already
    present in the output directory is left untouched; otherwise it is
    either downloaded (when the entry has a truthy ``"link"`` key, using
    the ``<file>.link`` file in the test folder) or copied from the test
    folder.

    :param Path test_folder: The test folder that the pdfs can be found in.
    :param dict pdf_info: Entry describing the pdf being gathered, as found
        in the test/test_manifest.json file from the pdf.js repo.
    :param Path output_dir: The directory to move/download the PDF to.
    """
    wanted_for_talos = pdf_info.get("talos", True)
    if not wanted_for_talos:
        return

    pdf_file = pdf_info["file"]
    # Only the base name is kept, so every PDF lands directly in output_dir.
    destination = pathlib.Path(output_dir) / pathlib.Path(pdf_file).name

    log(f"Gathering PDF {pdf_file}...")

    if destination.exists():
        log(f"{pdf_file} already exists in output location")
        return

    if pdf_info.get("link", False):
        link_file = pathlib.Path(test_folder, pdf_file + ".link")
        fetch_talos_pdf_link(link_file, destination)
        return

    log(f"Copying PDF to output location {destination}")
    local_pdf = pathlib.Path(test_folder, pdf_file)
    shutil.copy(local_pdf, destination)
|
|
|
|
|
|
def gather_talos_pdfs(pdf_js_repo, output_dir):
    """Gather all pdfs to be used in the talos pdfpaint test.

    Reads test/test_manifest.json from the pdf.js repo and gathers every
    entry into the output directory; entries flagged as links are
    downloaded rather than copied. The manifest itself is copied alongside
    the PDFs.

    :param Path pdf_js_repo: Path to the Mozilla Github pdf.js repo.
    :param Path output_dir: Output directory for the PDFs.
    """
    manifest_path = (
        pathlib.Path(pdf_js_repo) / "test" / "test_manifest.json"
    ).resolve()
    tests_dir = manifest_path.parent

    # Gather every PDF named in the manifest into the output directory
    with manifest_path.open() as manifest_file:
        entries = json.load(manifest_file)
    for entry in entries:
        gather_talos_pdf(tests_dir, entry, output_dir)

    # The manifest carries the names of the tests, so ship it with the PDFs
    manifest_copy = pathlib.Path(output_dir) / manifest_path.name
    shutil.copy(manifest_path, manifest_copy)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: both the location of the pdf.js checkout and the
    # destination directory are supplied through environment variables.
    fetches_root = os.environ.get("MOZ_FETCHES_DIR")
    if not fetches_root:
        raise Exception(
            "MOZ_FETCHES_DIR is not set to the path containing the pdf.js repo"
        )

    repo_path = pathlib.Path(fetches_root) / "pdf.js"
    if not repo_path.exists():
        raise Exception("Can't find the pdf.js repository in MOZ_FETCHES_DIR")

    destination = os.environ.get("OUTPUT_DIR")
    if not destination:
        raise Exception("OUTPUT_DIR is not set for the file output")

    # Create the output directory (and any missing parents) before gathering
    destination_path = pathlib.Path(destination)
    destination_path.mkdir(parents=True, exist_ok=True)
    gather_talos_pdfs(repo_path, destination_path)
|