This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 2ca4edf1224eb44a32942b6aa288be34f845a4b3
Author: Wes McKinney <wesm+...@apache.org>
AuthorDate: Tue Oct 1 11:34:50 2019 -0500

    ARROW-5831: [Release] Add Python program to download binary artifacts in parallel, allow abort/resume
    
    This script only uses the Python standard library and curl. It does 8 downloads in parallel by default. Since Bintray returns sha256 checksums we compute these on any local files and do not re-download files, so that interrupted downloads can be resumed.
    
    Closes #5550 from wesm/parallel-rc-binary-verification and squashes the following commits:
    
    ff207e670 <Wes McKinney> More robust python3 checking
    1d78b9f41 <Wes McKinney> Add Python-based parallel bintray artifact download script that can resume
    
    Authored-by: Wes McKinney <wesm+...@apache.org>
    Signed-off-by: Wes McKinney <wesm+...@apache.org>
---
 dev/release/download_rc_binaries.py     | 161 ++++++++++++++++++++++++++++++++
 dev/release/verify-release-candidate.sh |  42 +--------
 2 files changed, 164 insertions(+), 39 deletions(-)

diff --git a/dev/release/download_rc_binaries.py b/dev/release/download_rc_binaries.py
new file mode 100644
index 0000000..01ab2c4
--- /dev/null
+++ b/dev/release/download_rc_binaries.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+try:
+    import argparse
+    import concurrent.futures as cf
+    import functools
+    import hashlib
+    import json
+    import os
+    import subprocess
+    import urllib.request
+except ImportError:
+    if sys.version_info.major < 3:
+        raise Exception("Please use Python 3 to run this script")
+    raise
+
+
+BINTRAY_API_ROOT = "https://bintray.com/api/v1"  # base of file-listing URLs
+BINTRAY_DL_ROOT = "https://dl.bintray.com"  # base of file download URLs
+BINTRAY_REPO = 'apache/arrow'  # default repository path used in URLs
+DEFAULT_PARALLEL_DOWNLOADS = 8  # used when num_parallel is None
+
+
+class Bintray:
+    """Lists and downloads the files published under one Bintray repo."""
+
+    def __init__(self, repo=BINTRAY_REPO):
+        self.repo = repo
+
+    def get_file_list(self, package, version):
+        """Return the decoded JSON file listing of a package version."""
+        # URLs always use '/', whereas os.path.join depends on the host OS
+        url = '/'.join([BINTRAY_API_ROOT, 'packages', self.repo, package,
+                        'versions', version, 'files'])
+        with urllib.request.urlopen(url) as response:
+            return json.loads(response.read())
+
+    def download_files(self, files, dest=None, num_parallel=None):
+        """
+        Download files from Bintray in parallel. If file already exists, will
+        overwrite if the checksum does not match what Bintray says it should be
+
+        Parameters
+        ----------
+        files : List[Dict]
+            File listing from Bintray
+        dest : str, default None
+            Defaults to current working directory
+        num_parallel : int, default 8
+            Number of files to download in parallel. If set to None, uses
+            default
+        """
+        if dest is None:
+            dest = os.getcwd()
+        if num_parallel is None:
+            num_parallel = DEFAULT_PARALLEL_DOWNLOADS
+
+        if num_parallel == 1:
+            for info in files:
+                self._download_file(dest, info)
+        else:
+            parallel_map_terminate_early(
+                functools.partial(self._download_file, dest),
+                files,
+                num_parallel
+            )
+
+    def _download_file(self, dest, info):
+        """Fetch one listing entry with curl unless a matching copy exists."""
+        relpath = info['path']
+        base, filename = os.path.split(relpath)
+        dest_dir = os.path.join(dest, base)
+        os.makedirs(dest_dir, exist_ok=True)
+        dest_path = os.path.join(dest_dir, filename)
+
+        if os.path.exists(dest_path):
+            # Hash in chunks so a large artifact is never fully in memory
+            digest = hashlib.sha256()
+            with open(dest_path, 'rb') as f:
+                for chunk in iter(lambda: f.read(1 << 20), b''):
+                    digest.update(chunk)
+            if digest.hexdigest() == info['sha256']:
+                print('Local file {} sha256 matches, skipping'
+                      .format(dest_path))
+                return
+            print('Local file sha256 does not match, overwriting')
+
+        print("Downloading {} to {}".format(relpath, dest_path))
+        bintray_abspath = '/'.join([BINTRAY_DL_ROOT, self.repo, relpath])
+        # --fail turns HTTP errors into a nonzero exit, which is raised below
+        cmd = ['curl', '--fail', '--location',
+               '--output', dest_path, bintray_abspath]
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE)
+        stdout, stderr = proc.communicate()
+        if proc.returncode != 0:
+            raise Exception("Downloading {} failed\nstdout: {}\nstderr: {}"
+                            .format(relpath, stdout, stderr))
+
+
+def parallel_map_terminate_early(f, iterable, num_parallel):
+    """Apply f to each item in worker processes; re-raise the first error."""
+    with cf.ProcessPoolExecutor(num_parallel) as pool:
+        futures = [pool.submit(f, item) for item in iterable]
+        for finished in cf.as_completed(futures):
+            error = finished.exception()
+            if error is None:
+                continue
+            # Cancel whatever has not started before propagating the error
+            for future in futures:
+                future.cancel()
+            raise error
+
+
+ARROW_PACKAGE_TYPES = ['centos', 'debian', 'python', 'ubuntu']
+
+
+def download_rc_binaries(version, rc_number, dest=None, num_parallel=None):
+    """Download the '<type>-rc' package files of every Arrow package type."""
+    rc_version = '{}-rc{}'.format(version, rc_number)
+    client = Bintray()
+    for package_type in ARROW_PACKAGE_TYPES:
+        package = '{}-rc'.format(package_type)
+        listing = client.get_file_list(package, rc_version)
+        client.download_files(listing, dest=dest, num_parallel=num_parallel)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Download release candidate binaries')
+    parser.add_argument('version', type=str, help='The version number')
+    parser.add_argument('rc_number', type=int,
+                        help='The release candidate number, e.g. 0, 1, etc')
+    parser.add_argument('--dest', type=str, default=os.getcwd(),
+                        help='The output folder for the downloaded files')
+    # Reuse the module constant so CLI and library defaults cannot drift
+    parser.add_argument('--num_parallel', type=int,
+                        default=DEFAULT_PARALLEL_DOWNLOADS,
+                        help='The number of concurrent downloads to do')
+    args = parser.parse_args()
+    download_rc_binaries(args.version, args.rc_number, dest=args.dest,
+                         num_parallel=args.num_parallel)
diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh
index b1a5997..d09d136 100755
--- a/dev/release/verify-release-candidate.sh
+++ b/dev/release/verify-release-candidate.sh
@@ -99,49 +99,13 @@ fetch_archive() {
   shasum -a 512 -c ${dist_name}.tar.gz.sha512
 }
 
-bintray() {
-  local command=$1
-  shift
-  local path=$1
-  shift
-  local url=https://bintray.com/api/v1${path}
-  echo "${command} ${url}" 1>&2
-  curl \
-    --fail \
-    --request ${command} \
-    ${url} \
-    "$@" | \
-      jq .
-}
-
-download_bintray_files() {
-  local target=$1
-
-  local version_name=${VERSION}-rc${RC_NUMBER}
-
-  local file
-  bintray \
-    GET /packages/${BINTRAY_REPOSITORY}/${target}-rc/versions/${version_name}/files | \
-      jq -r ".[].path" | \
-      while read file; do
-    mkdir -p "$(dirname ${file})"
-    curl \
-      --fail \
-      --location \
-      --output ${file} \
-      https://dl.bintray.com/${BINTRAY_REPOSITORY}/${file}
-  done
-}
-
 test_binary() {
   local download_dir=binaries
   mkdir -p ${download_dir}
-  pushd ${download_dir}
 
-  # takes longer on slow network
-  for target in centos debian python ubuntu; do
-    download_bintray_files ${target}
-  done
+  python3 $SOURCE_DIR/download_rc_binaries.py $VERSION $RC_NUMBER --dest=${download_dir}
+
+  pushd ${download_dir}
 
   # verify the signature and the checksums of each artifact
   find . -name '*.asc' | while read sigfile; do

Reply via email to