diff options
Diffstat (limited to 'src/arrow/dev/release/download_rc_binaries.py')
-rwxr-xr-x | src/arrow/dev/release/download_rc_binaries.py | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/src/arrow/dev/release/download_rc_binaries.py b/src/arrow/dev/release/download_rc_binaries.py new file mode 100755 index 000000000..3e3d0f7d3 --- /dev/null +++ b/src/arrow/dev/release/download_rc_binaries.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import re

import argparse
import concurrent.futures as cf
import functools
import os
import subprocess
import urllib.request


ARTIFACTORY_ROOT = "https://apache.jfrog.io/artifactory/arrow"
DEFAULT_PARALLEL_DOWNLOADS = 8


class Artifactory:
    """Lists and downloads files stored below ``ARTIFACTORY_ROOT``."""

    def get_file_list(self, prefix):
        """
        Recursively list every file below ``prefix``.

        Each directory's HTML index page is fetched and its ``<a href>``
        entries are collected; entries ending in '/' are treated as
        sub-directories and visited in turn.

        Parameters
        ----------
        prefix : str
            Directory path relative to ``ARTIFACTORY_ROOT``. A trailing '/'
            is appended when missing.

        Returns
        -------
        files : List[str]
            File paths relative to ``ARTIFACTORY_ROOT``.
        """
        def traverse(directory, files, directories):
            url = f'{ARTIFACTORY_ROOT}/{directory}'
            # Close the HTTP response deterministically instead of leaving
            # it to the garbage collector.
            with urllib.request.urlopen(url) as response:
                listing = response.read().decode()
            paths = re.findall('<a href="(.+?)"', listing)
            for path in paths:
                # Skip the link back to the parent directory.
                if path == '../':
                    continue
                resolved_path = f'{directory}{path}'
                if path.endswith('/'):
                    directories.append(resolved_path)
                else:
                    files.append(resolved_path)

        files = []
        if not prefix.endswith('/'):
            prefix += '/'
        # Explicit stack of directories still to visit.
        directories = [prefix]
        while directories:
            directory = directories.pop()
            traverse(directory, files, directories)
        return files

    def download_files(self, files, dest=None, num_parallel=None,
                       re_match=None):
        """
        Download files from Artifactory, optionally in parallel.

        Each file is written with ``curl --output``, so a file that already
        exists at the destination is overwritten. No checksum verification
        is performed.

        Parameters
        ----------
        files : List[str]
            File paths relative to ``ARTIFACTORY_ROOT``, e.g. the result of
            ``get_file_list``
        dest : str, default None
            Defaults to current working directory
        num_parallel : int, default 8
            Number of files to download in parallel. If set to None, uses
            default. A value of 1 downloads sequentially in this process.
        re_match : str, default None
            Regular expression; only paths for which ``re.match`` succeeds
            (i.e. matching from the start of the path) are downloaded.
        """
        if dest is None:
            dest = os.getcwd()
        if num_parallel is None:
            num_parallel = DEFAULT_PARALLEL_DOWNLOADS

        if re_match is not None:
            regex = re.compile(re_match)
            files = [x for x in files if regex.match(x)]

        # Nothing to fetch: avoid spinning up a process pool for no work.
        if not files:
            return

        if num_parallel == 1:
            for path in files:
                self._download_file(dest, path)
        else:
            parallel_map_terminate_early(
                functools.partial(self._download_file, dest),
                files,
                num_parallel
            )

    def _download_file(self, dest, path):
        """
        Download a single file with curl.

        The remote directory layout is mirrored below ``dest``; missing
        directories are created.

        Parameters
        ----------
        dest : str
            Root output directory
        path : str
            File path relative to ``ARTIFACTORY_ROOT``

        Raises
        ------
        Exception
            If curl exits with a non-zero status.
        """
        base, filename = os.path.split(path)

        dest_dir = os.path.join(dest, base)
        os.makedirs(dest_dir, exist_ok=True)

        dest_path = os.path.join(dest_dir, filename)

        print("Downloading {} to {}".format(path, dest_path))

        url = f'{ARTIFACTORY_ROOT}/{path}'

        cmd = [
            'curl', '--fail', '--location', '--retry', '5',
            '--output', dest_path, url
        ]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        if proc.returncode != 0:
            # Decode so the message shows curl's output as readable text
            # rather than the repr of a bytes object.
            stdout = proc.stdout.decode(errors='replace')
            stderr = proc.stderr.decode(errors='replace')
            raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}"
                            .format(path, stdout, stderr))


def parallel_map_terminate_early(f, iterable, num_parallel):
    """
    Call ``f`` on every item of ``iterable`` using a process pool.

    As soon as one call raises, cancellation is requested for all other
    tasks and that exception is re-raised. Tasks that are already running
    cannot be cancelled and are waited for when the pool shuts down.

    Parameters
    ----------
    f : callable
        Function of one argument; must be picklable since it is sent to
        worker processes
    iterable : iterable
        Arguments to pass to ``f``, one per task
    num_parallel : int
        Number of worker processes
    """
    with cf.ProcessPoolExecutor(num_parallel) as pool:
        tasks = [pool.submit(f, v) for v in iterable]

        for task in cf.as_completed(tasks):
            error = task.exception()
            if error is not None:
                # Distinct name: do not shadow the outer loop variable.
                for pending in tasks:
                    pending.cancel()
                raise error


ARROW_REPOSITORY_PACKAGE_TYPES = ['centos', 'debian', 'ubuntu']
ARROW_STANDALONE_PACKAGE_TYPES = ['nuget', 'python']
ARROW_PACKAGE_TYPES = \
    ARROW_REPOSITORY_PACKAGE_TYPES + \
    ARROW_STANDALONE_PACKAGE_TYPES


def download_rc_binaries(version, rc_number, re_match=None, dest=None,
                         num_parallel=None, target_package_type=None):
    """
    Download the binaries of a release candidate from Artifactory.

    Parameters
    ----------
    version : str
        The version number, e.g. '6.0.0'
    rc_number : int
        The release candidate number
    re_match : str, default None
        Regular expression forwarded to ``Artifactory.download_files`` to
        restrict which files are downloaded
    dest : str, default None
        Output directory; defaults to current working directory
    num_parallel : int, default None
        Number of parallel downloads; None uses the default
    target_package_type : str, default None
        Download only this package type. If not set, every type in
        ``ARROW_PACKAGE_TYPES`` is downloaded.
    """
    artifactory = Artifactory()

    version_string = '{}-rc{}'.format(version, rc_number)
    if target_package_type:
        package_types = [target_package_type]
    else:
        package_types = ARROW_PACKAGE_TYPES

    # Loop-invariant: compiled once instead of once per package type.
    version_pattern = re.compile(r'\d+\.\d+\.\d+')

    def is_old_release(path):
        # A path naming some x.y.z version other than the requested one
        # belongs to another release; paths with no version are kept.
        match = version_pattern.search(path)
        if not match:
            return False
        return match[0] != version

    for package_type in package_types:
        is_repository = package_type in ARROW_REPOSITORY_PACKAGE_TYPES
        if is_repository:
            # Repository package types are listed from '<type>-rc' as a
            # whole, so the listing has to be filtered by version below.
            prefix = f'{package_type}-rc'
        else:
            prefix = f'{package_type}-rc/{version_string}'
        files = artifactory.get_file_list(prefix)
        if is_repository:
            files = [x for x in files if not is_old_release(x)]
        artifactory.download_files(files, re_match=re_match, dest=dest,
                                   num_parallel=num_parallel)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Download release candidate binaries'
    )
    parser.add_argument('version', type=str, help='The version number')
    parser.add_argument('rc_number', type=int,
                        help='The release candidate number, e.g. 0, 1, etc')
    parser.add_argument('-e', '--regexp', type=str, default=None,
                        help=('Regular expression to match on file names '
                              'to only download certain files'))
    parser.add_argument('--dest', type=str, default=os.getcwd(),
                        help='The output folder for the downloaded files')
    parser.add_argument('--num_parallel', type=int,
                        default=DEFAULT_PARALLEL_DOWNLOADS,
                        help='The number of concurrent downloads to do')
    parser.add_argument('--package_type', type=str, default=None,
                        help='The package type to be downloaded')
    args = parser.parse_args()

    download_rc_binaries(args.version, args.rc_number, dest=args.dest,
                         re_match=args.regexp, num_parallel=args.num_parallel,
                         target_package_type=args.package_type)