This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
#!/usr/bin/env python

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""Download Apache Arrow release candidate binaries from Bintray.

ARROW-5831: [Release] Python program to download binary artifacts in
parallel, with abort/resume support.

This script only uses the Python standard library and curl. It runs 8
downloads in parallel by default. Bintray reports a sha256 checksum for
every file, so a local file whose checksum already matches is not
downloaded again; an interrupted run can therefore simply be restarted.

It replaces the ``bintray`` / ``download_bintray_files`` shell helpers of
``dev/release/verify-release-candidate.sh``, which now calls::

    python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER \
        --dest=${download_dir}
"""

import sys

# NOTE: keep this file free of Python-3-only *syntax* (f-strings,
# annotations) so that a Python 2 interpreter gets as far as the guard
# below and reports a helpful message instead of a SyntaxError.
try:
    import argparse
    import concurrent.futures as cf
    import functools
    import hashlib
    import json
    import os
    import subprocess
    import urllib.request
except ImportError:
    if sys.version_info.major < 3:
        raise Exception("Please use Python 3 to run this script")
    raise


BINTRAY_API_ROOT = "https://bintray.com/api/v1"
BINTRAY_DL_ROOT = "https://dl.bintray.com"
BINTRAY_REPO = 'apache/arrow'
DEFAULT_PARALLEL_DOWNLOADS = 8

# Read size used when hashing local files (1 MiB).
_HASH_CHUNK_SIZE = 1024 * 1024


def _url_join(*parts):
    """Join URL components with exactly one '/' between each of them.

    ``os.path.join`` is not suitable for URLs: its separator is platform
    dependent and a component starting with '/' discards everything
    before it.
    """
    return '/'.join(part.strip('/') for part in parts)


def _sha256_of_file(path):
    """Return the hex sha256 digest of the file at ``path``.

    The file is read in fixed-size chunks so that large release artifacts
    are never loaded into memory in one piece.
    """
    digest = hashlib.sha256()
    with open(path, 'rb') as f:
        read_chunk = functools.partial(f.read, _HASH_CHUNK_SIZE)
        for chunk in iter(read_chunk, b''):
            digest.update(chunk)
    return digest.hexdigest()


class Bintray:
    """Minimal client for listing and downloading files of a Bintray repo.

    Parameters
    ----------
    repo : str, default BINTRAY_REPO
        Repository in ``<subject>/<repo>`` form
    """

    def __init__(self, repo=BINTRAY_REPO):
        self.repo = repo

    def _file_list_url(self, package, version):
        # REST endpoint listing the files of one package version
        return _url_join(BINTRAY_API_ROOT, 'packages', self.repo, package,
                         'versions', version, 'files')

    def _download_url(self, relpath):
        # Direct download location of a file inside the repository
        return _url_join(BINTRAY_DL_ROOT, self.repo, relpath)

    def get_file_list(self, package, version):
        """
        Fetch the file listing of a package version from the Bintray API.

        Parameters
        ----------
        package : str
            Bintray package name, e.g. ``'python-rc'``
        version : str
            Bintray version name, e.g. ``'0.15.0-rc1'``

        Returns
        -------
        files : object
            The decoded JSON response. ``download_files`` expects a list
            of dicts that each have a ``'path'`` and a ``'sha256'`` entry.
        """
        url = self._file_list_url(package, version)
        # Use the response as a context manager so the connection is
        # closed even when reading fails
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        return json.loads(payload)

    def download_files(self, files, dest=None, num_parallel=None):
        """
        Download files from Bintray in parallel. If file already exists, will
        overwrite if the checksum does not match what Bintray says it should be

        Parameters
        ----------
        files : List[Dict]
            File listing from Bintray
        dest : str, default None
            Defaults to current working directory
        num_parallel : int, default 8
            Number of files to download in parallel. If set to None, uses
            default
        """
        if dest is None:
            dest = os.getcwd()
        if num_parallel is None:
            num_parallel = DEFAULT_PARALLEL_DOWNLOADS

        if num_parallel == 1:
            # Sequential path: no executor, failures surface immediately
            for info in files:
                self._download_file(dest, info)
        else:
            parallel_map_terminate_early(
                functools.partial(self._download_file, dest),
                files,
                num_parallel
            )

    def _download_file(self, dest, info):
        """
        Download one file with curl unless a matching local copy exists.

        Parameters
        ----------
        dest : str
            Root output directory; the file is stored below it at its
            repository-relative path
        info : Dict
            One entry of the Bintray file listing; ``info['path']`` and
            ``info['sha256']`` are used

        Raises
        ------
        Exception
            If curl exits with a non-zero return code
        """
        relpath = info['path']

        base, filename = os.path.split(relpath)

        dest_dir = os.path.join(dest, base)
        os.makedirs(dest_dir, exist_ok=True)

        dest_path = os.path.join(dest_dir, filename)

        if os.path.exists(dest_path):
            # This check is what makes an interrupted run resumable:
            # complete files are skipped, partial ones are replaced
            if _sha256_of_file(dest_path) == info['sha256']:
                print('Local file {} sha256 matches, skipping'
                      .format(dest_path))
                return
            print('Local file sha256 does not match, overwriting')

        print("Downloading {} to {}".format(relpath, dest_path))

        cmd = [
            'curl', '--fail', '--location',
            '--output', dest_path, self._download_url(relpath)
        ]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if proc.returncode != 0:
            # Decode so the message shows curl's text, not a bytes repr
            raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}"
                            .format(relpath,
                                    stdout.decode('utf-8', 'replace'),
                                    stderr.decode('utf-8', 'replace')))


def parallel_map_terminate_early(f, iterable, num_parallel):
    """
    Call ``f(v)`` for every ``v`` in ``iterable`` using a pool of workers
    and stop scheduling further work as soon as one call fails.

    A thread pool is used because the work done here only waits on a curl
    subprocess; threads also mean ``f`` does not have to be picklable.

    Parameters
    ----------
    f : callable
        Function of one argument; its return value is ignored
    iterable : iterable
        Arguments to map ``f`` over
    num_parallel : int
        Maximum number of concurrent calls

    Raises
    ------
    Exception
        The exception of the first failed call that is observed. Calls
        that have not started yet are cancelled; calls that are already
        running are waited for before the exception propagates.
    """
    with cf.ThreadPoolExecutor(num_parallel) as pool:
        tasks = [pool.submit(f, v) for v in iterable]

        for finished in cf.as_completed(tasks):
            error = finished.exception()
            if error is not None:
                for pending in tasks:
                    pending.cancel()
                raise error


ARROW_PACKAGE_TYPES = ['centos', 'debian', 'python', 'ubuntu']


def download_rc_binaries(version, rc_number, dest=None, num_parallel=None):
    """
    Download the binaries of every Arrow package type for one release
    candidate.

    Parameters
    ----------
    version : str
        The version number, e.g. ``'0.15.0'``
    rc_number : int
        The release candidate number, e.g. 0, 1, etc
    dest : str, default None
        Output folder. Defaults to current working directory
    num_parallel : int, default None
        Number of concurrent downloads. If None, uses
        DEFAULT_PARALLEL_DOWNLOADS
    """
    bintray = Bintray()

    version_string = '{}-rc{}'.format(version, rc_number)
    for package_type in ARROW_PACKAGE_TYPES:
        files = bintray.get_file_list('{}-rc'.format(package_type),
                                      version_string)
        bintray.download_files(files, dest=dest, num_parallel=num_parallel)


def main():
    """Parse the command line and download the requested binaries."""
    parser = argparse.ArgumentParser(
        description='Download release candidate binaries'
    )
    parser.add_argument('version', type=str, help='The version number')
    parser.add_argument('rc_number', type=int,
                        help='The release candidate number, e.g. 0, 1, etc')
    parser.add_argument('--dest', type=str, default=os.getcwd(),
                        help='The output folder for the downloaded files')
    parser.add_argument('--num_parallel', type=int,
                        default=DEFAULT_PARALLEL_DOWNLOADS,
                        help='The number of concurrent downloads to do')
    args = parser.parse_args()

    download_rc_binaries(args.version, args.rc_number, dest=args.dest,
                         num_parallel=args.num_parallel)


if __name__ == '__main__':
    main()
-} - -download_bintray_files() { - local target=$1 - - local version_name=${VERSION}-rc${RC_NUMBER} - - local file - bintray \ - GET /packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files | \ - jq -r ".[].path" | \ - while read file; do - mkdir -p "$(dirname ${file})" - curl \ - --fail \ - --location \ - --output ${file} \ - https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file} - done -} - test_binary() { local download_dir=binaries mkdir -p ${download_dir} - pushd ${download_dir} - # takes longer on slow network - for target in centos debian python ubuntu; do - download_bintray_files ${target} - done + python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir} + + pushd ${download_dir} # verify the signature and the checksums of each artifact find . -name '*.asc' | while read sigfile; do