Repository: incubator-impala
Updated Branches:
  refs/heads/master 9e7fb830f -> 4a79c9e7e
IMPALA-5181: Extract PYPI metadata from a webpage

There were some build failures caused by a failure to download a JSON file
containing package metadata from PYPI, so we switch to downloading this
metadata from a PYPI mirror. To do that, we need to be able to extract the
data from a web page, because PYPI mirrors do not always provide a JSON
interface. This patch implements a regex-based HTML parser for that purpose.
We also increase the number of download attempts and randomly vary the amount
of time between attempts. (Short illustrative sketches of both changes follow
the diff below.)

Testing:
- Tested locally against PYPI and a PYPI mirror.
- Ran a private build that passed (which used a PYPI mirror).

Change-Id: If3845a0d5f568d4352e3cc4883596736974fd7de
Reviewed-on: http://gerrit.cloudera.org:8080/6579
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4a79c9e7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4a79c9e7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4a79c9e7

Branch: refs/heads/master
Commit: 4a79c9e7e3928f919b5fb60bab4145ba886d6252
Parents: 9e7fb83
Author: Taras Bobrovytsky <[email protected]>
Authored: Thu Mar 30 13:08:21 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Apr 8 00:19:08 2017 +0000

----------------------------------------------------------------------
 infra/python/deps/pip_download.py | 90 +++++++++++++++++++++-------------
 1 file changed, 57 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4a79c9e7/infra/python/deps/pip_download.py
----------------------------------------------------------------------
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index 85def64..bd54d30 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -24,18 +24,20 @@
 import json
 import os
 import os.path
+import re
 import sys
 from hashlib import md5
+from random import randint
 from time import sleep
 from urllib import urlopen, URLopener
 
-NUM_TRIES = 3
+NUM_DOWNLOAD_ATTEMPTS = 8
 
-PYPI_MIRROR = os.environ.get("PYPI_MIRROR", "https://pypi.python.org")
+PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
 
 # The requirement files that list all of the required packages and versions.
 REQUIREMENTS_FILES = ['requirements.txt', 'compiled-requirements.txt',
-    'kudu-requirements.txt']
+                      'kudu-requirements.txt']
 
 def check_md5sum(filename, expected_md5):
   actual_md5 = md5(open(filename).read()).hexdigest()
@@ -45,47 +47,69 @@ def retry(func):
   '''Retry decorator.'''
 
   def wrapper(*args, **kwargs):
-    for _ in xrange(NUM_TRIES):
+    for try_num in xrange(NUM_DOWNLOAD_ATTEMPTS):
+      if try_num > 0:
+        sleep_len = randint(5, 10 * 2 ** try_num)
+        print 'Sleeping for {0} seconds before retrying'.format(sleep_len)
+        sleep(sleep_len)
       try:
         result = func(*args, **kwargs)
-        if result: return result
+        if result:
+          return result
       except Exception as e:
         print e
-      sleep(5)
-    print "Download failed after several attempts."
+    print 'Download failed after several attempts.'
     sys.exit(1)
 
   return wrapper
 
+def get_package_info(pkg_name, pkg_version):
+  '''Returns the file name, path and md5 digest of the package.'''
+  # We store the matching result in the candidates list instead of returning right away
+  # to sort them and return the first value in alphabetical order. This ensures that the
+  # same result is always returned even if the ordering changed on the server.
+  candidates = []
+  url = '{0}/simple/{1}/'.format(PYPI_MIRROR, pkg_name)
+  print 'Getting package info from {0}'.format(url)
+  # The web page should be in PEP 503 format (https://www.python.org/dev/peps/pep-0503/).
+  # We parse the page with regex instead of an html parser because that requires
+  # downloading an extra package before running this script. Since the HTML is guaranteed
+  # to be formatted according to PEP 503, this is acceptable.
+  pkg_info = urlopen(url).read()
+  # We assume that the URL includes a hash and the hash function is md5. This not strictly
+  # required by PEP 503.
+  regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'
+  for match in re.finditer(regex, pkg_info):
+    path = match.group(1)
+    md5_digest = match.group(2)
+    file_name = match.group(3)
+    # Make sure that we consider only non Wheel archives, because those are not supported.
+    if (file_name.endswith('-{0}.tar.gz'.format(pkg_version)) or
+        file_name.endswith('-{0}.tar.bz2'.format(pkg_version)) or
+        file_name.endswith('-{0}.zip'.format(pkg_version))):
+      candidates.append((file_name, path, md5_digest))
+  if not candidates:
+    print 'Could not find archive to download for {0} {1}'.format(pkg_name, pkg_version)
+    return (None, None, None)
+  return sorted(candidates)[0]
+
 @retry
 def download_package(pkg_name, pkg_version):
-  '''Download the required package. Sometimes the download can be flaky, so we use the
-  retry decorator.'''
-  pkg_type = 'sdist' # Don't download wheel archives for now
-  # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
-  # from pypi.python.org.
-  pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())
-
+  file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
+  if not file_name:
+    return False
+  if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
+    print 'File with matching md5sum already exists, skipping {0}'.format(file_name)
+    return True
   downloader = URLopener()
-  for pkg in pkg_info['releases'][pkg_version]:
-    if pkg['packagetype'] == pkg_type:
-      filename = pkg['filename']
-      expected_md5 = pkg['md5_digest']
-      if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
-        print "File with matching md5sum already exists, skipping %s" % filename
-        return True
-      pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
-      print "Downloading %s from %s" % (filename, pkg_url)
-      downloader.retrieve(pkg_url, filename)
-      actual_md5 = md5(open(filename).read()).hexdigest()
-      if check_md5sum(filename, expected_md5):
-        return True
-      else:
-        print "MD5 mismatch in file %s." % filename
-        return False
-  print "Could not find archive to download for %s %s %s" % (
-    pkg_name, pkg_version, pkg_type)
-  sys.exit(1)
+  pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
+  print 'Downloading {0} from {1}'.format(file_name, pkg_url)
+  downloader.retrieve(pkg_url, file_name)
+  if check_md5sum(file_name, expected_md5):
+    return True
+  else:
+    print 'MD5 mismatch in file {0}.'.format(file_name)
+    return False
 
 def main():
   if len(sys.argv) > 1:
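
The sketch below (not part of the patch) illustrates what the new regex in
get_package_info() pulls out of a PEP 503 "simple" index page. The anchor tag,
package name, path and md5 digest are invented for illustration; only the regex
itself is taken from the patch. Like the script, the sketch is Python 2.

import re

# Invented example of one entry on a PEP 503 simple index page, e.g.
# <PYPI_MIRROR>/simple/foo/. Real pages contain one <a> tag per release file.
SAMPLE_PAGE = ('<a href="../../packages/aa/bb/foo-1.2.3.tar.gz'
               '#md5=0123456789abcdef0123456789abcdef">foo-1.2.3.tar.gz</a>')

# The regex added by the patch: group 1 is the path relative to <PYPI_MIRROR>/packages/,
# group 2 is the md5 digest taken from the URL fragment, group 3 is the link text
# (the file name).
regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'

for match in re.finditer(regex, SAMPLE_PAGE):
  print 'path:      {0}'.format(match.group(1))  # aa/bb/foo-1.2.3.tar.gz
  print 'md5:       {0}'.format(match.group(2))  # 0123456789abcdef0123456789abcdef
  print 'file name: {0}'.format(match.group(3))  # foo-1.2.3.tar.gz

The file name is then filtered by version and extension (only .tar.gz, .tar.bz2 and
.zip sdists, no wheels) and the candidates are sorted so that repeated runs pick the
same archive even if the mirror reorders its page. Since both the /simple/ and
/packages/ URLs are built from PYPI_MIRROR, pointing the script at a mirror should
only require exporting that variable, e.g. PYPI_MIRROR=https://my-mirror.example.com
(a made-up URL), assuming the script is otherwise invoked as before.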
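
Similarly, a small sketch of the retry schedule: the loop below only prints the
randint bounds that the patched retry decorator uses, so the exact sleep lengths are
random but their upper bound doubles with every attempt. The attempt numbering is
illustrative.

NUM_DOWNLOAD_ATTEMPTS = 8  # value set by the patch

# Before every attempt after the first, the new decorator sleeps for
# randint(5, 10 * 2 ** try_num) seconds (both bounds inclusive).
for try_num in xrange(NUM_DOWNLOAD_ATTEMPTS):
  if try_num > 0:
    upper_bound = 10 * 2 ** try_num
    print 'attempt {0}: sleep between 5 and {1} seconds'.format(try_num + 1, upper_bound)

This gives upper bounds of 20, 40, 80, 160, 320, 640 and 1280 seconds for attempts 2
through 8, presumably so that concurrent builds do not all retry in lockstep.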
