Repository: incubator-impala
Updated Branches:
  refs/heads/master 9e7fb830f -> 4a79c9e7e
IMPALA-5181: Extract PYPI metadata from a webpage

There were some build failures caused by a failure to download a JSON file
containing package metadata from PYPI, so we switch to downloading this
metadata from a PYPI mirror. To do that, we need to be able to extract the
data from a web page, because PYPI mirrors do not always provide a JSON
interface. This patch implements a regex-based HTML parser for that purpose.
We also increase the number of download attempts and randomly vary the amount
of time between attempts. (Short illustrative sketches of both changes follow
the diff below.)

Testing:
- Tested locally against PYPI and a PYPI mirror.
- Ran a private build that passed (which used a PYPI mirror).

Change-Id: If3845a0d5f568d4352e3cc4883596736974fd7de
Reviewed-on: http://gerrit.cloudera.org:8080/6579
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4a79c9e7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4a79c9e7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4a79c9e7

Branch: refs/heads/master
Commit: 4a79c9e7e3928f919b5fb60bab4145ba886d6252
Parents: 9e7fb83
Author: Taras Bobrovytsky <[email protected]>
Authored: Thu Mar 30 13:08:21 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Apr 8 00:19:08 2017 +0000

----------------------------------------------------------------------
 infra/python/deps/pip_download.py | 90 +++++++++++++++++++++-------------
 1 file changed, 57 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4a79c9e7/infra/python/deps/pip_download.py
----------------------------------------------------------------------
diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py
index 85def64..bd54d30 100755
--- a/infra/python/deps/pip_download.py
+++ b/infra/python/deps/pip_download.py
@@ -24,18 +24,20 @@
 import json
 import os
 import os.path
+import re
 import sys
 from hashlib import md5
+from random import randint
 from time import sleep
 from urllib import urlopen, URLopener
 
-NUM_TRIES = 3
+NUM_DOWNLOAD_ATTEMPTS = 8
 
-PYPI_MIRROR = os.environ.get("PYPI_MIRROR", "https://pypi.python.org")
+PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
 
 # The requirement files that list all of the required packages and versions.
 REQUIREMENTS_FILES = ['requirements.txt', 'compiled-requirements.txt',
-    'kudu-requirements.txt']
+                      'kudu-requirements.txt']
 
 def check_md5sum(filename, expected_md5):
   actual_md5 = md5(open(filename).read()).hexdigest()
@@ -45,47 +47,69 @@ def retry(func):
   '''Retry decorator.'''
 
   def wrapper(*args, **kwargs):
-    for _ in xrange(NUM_TRIES):
+    for try_num in xrange(NUM_DOWNLOAD_ATTEMPTS):
+      if try_num > 0:
+        sleep_len = randint(5, 10 * 2 ** try_num)
+        print 'Sleeping for {0} seconds before retrying'.format(sleep_len)
+        sleep(sleep_len)
       try:
         result = func(*args, **kwargs)
-        if result: return result
+        if result:
+          return result
       except Exception as e:
         print e
-      sleep(5)
-    print "Download failed after several attempts."
+    print 'Download failed after several attempts.'
     sys.exit(1)
 
   return wrapper
 
+def get_package_info(pkg_name, pkg_version):
+  '''Returns the file name, path and md5 digest of the package.'''
+  # We store the matching result in the candidates list instead of returning right away
+  # to sort them and return the first value in alphabetical order. This ensures that the
+  # same result is always returned even if the ordering changed on the server.
+  candidates = []
+  url = '{0}/simple/{1}/'.format(PYPI_MIRROR, pkg_name)
+  print 'Getting package info from {0}'.format(url)
+  # The web page should be in PEP 503 format (https://www.python.org/dev/peps/pep-0503/).
+  # We parse the page with regex instead of an html parser because that requires
+  # downloading an extra package before running this script. Since the HTML is guaranteed
+  # to be formatted according to PEP 503, this is acceptable.
+  pkg_info = urlopen(url).read()
+  # We assume that the URL includes a hash and the hash function is md5. This not strictly
+  # required by PEP 503.
+  regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'
+  for match in re.finditer(regex, pkg_info):
+    path = match.group(1)
+    md5_digest = match.group(2)
+    file_name = match.group(3)
+    # Make sure that we consider only non Wheel archives, because those are not supported.
+    if (file_name.endswith('-{0}.tar.gz'.format(pkg_version)) or
+        file_name.endswith('-{0}.tar.bz2'.format(pkg_version)) or
+        file_name.endswith('-{0}.zip'.format(pkg_version))):
+      candidates.append((file_name, path, md5_digest))
+  if not candidates:
+    print 'Could not find archive to download for {0} {1}'.format(pkg_name, pkg_version)
+    return (None, None, None)
+  return sorted(candidates)[0]
+
 @retry
 def download_package(pkg_name, pkg_version):
-  '''Download the required package. Sometimes the download can be flaky, so we use the
-  retry decorator.'''
-  pkg_type = 'sdist' # Don't download wheel archives for now
-  # This JSON endpoint is not provided by PyPI mirrors so we always need to get this
-  # from pypi.python.org.
-  pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read())
-
+  file_name, path, expected_md5 = get_package_info(pkg_name, pkg_version)
+  if not file_name:
+    return False
+  if os.path.isfile(file_name) and check_md5sum(file_name, expected_md5):
+    print 'File with matching md5sum already exists, skipping {0}'.format(file_name)
+    return True
   downloader = URLopener()
-  for pkg in pkg_info['releases'][pkg_version]:
-    if pkg['packagetype'] == pkg_type:
-      filename = pkg['filename']
-      expected_md5 = pkg['md5_digest']
-      if os.path.isfile(filename) and check_md5sum(filename, expected_md5):
-        print "File with matching md5sum already exists, skipping %s" % filename
-        return True
-      pkg_url = "{0}/packages/{1}".format(PYPI_MIRROR, pkg['path'])
-      print "Downloading %s from %s" % (filename, pkg_url)
-      downloader.retrieve(pkg_url, filename)
-      actual_md5 = md5(open(filename).read()).hexdigest()
-      if check_md5sum(filename, expected_md5):
-        return True
-      else:
-        print "MD5 mismatch in file %s." % filename
-        return False
-  print "Could not find archive to download for %s %s %s" % (
-    pkg_name, pkg_version, pkg_type)
-  sys.exit(1)
+  pkg_url = '{0}/packages/{1}'.format(PYPI_MIRROR, path)
+  print 'Downloading {0} from {1}'.format(file_name, pkg_url)
+  downloader.retrieve(pkg_url, file_name)
+  if check_md5sum(file_name, expected_md5):
+    return True
+  else:
+    print 'MD5 mismatch in file {0}.'.format(file_name)
+    return False
 
 def main():
   if len(sys.argv) > 1:
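
The sketch below (not part of the patch) illustrates what the new regex in
get_package_info() pulls out of a PEP 503 "simple" index page. The anchor tag,
package name, path and md5 digest are invented for illustration; only the regex
itself is taken from the patch. Like the script, the sketch is Python 2.

import re

# Invented example of one entry on a PEP 503 simple index page, e.g.
# <PYPI_MIRROR>/simple/foo/. Real pages contain one <a> tag per release file.
SAMPLE_PAGE = ('<a href="../../packages/aa/bb/foo-1.2.3.tar.gz'
               '#md5=0123456789abcdef0123456789abcdef">foo-1.2.3.tar.gz</a>')

# The regex added by the patch: group 1 is the path relative to <PYPI_MIRROR>/packages/,
# group 2 is the md5 digest taken from the URL fragment, group 3 is the link text
# (the file name).
regex = r'<a href=\".*?packages/(.*?)#md5=(.*?)\".*?>(.*?)<\/a>'

for match in re.finditer(regex, SAMPLE_PAGE):
  print 'path:      {0}'.format(match.group(1))  # aa/bb/foo-1.2.3.tar.gz
  print 'md5:       {0}'.format(match.group(2))  # 0123456789abcdef0123456789abcdef
  print 'file name: {0}'.format(match.group(3))  # foo-1.2.3.tar.gz

The file name is then filtered by version and extension (only .tar.gz, .tar.bz2 and
.zip sdists, no wheels) and the candidates are sorted so that repeated runs pick the
same archive even if the mirror reorders its page. Since both the /simple/ and
/packages/ URLs are built from PYPI_MIRROR, pointing the script at a mirror should
only require exporting that variable, e.g. PYPI_MIRROR=https://my-mirror.example.com
(a made-up URL), assuming the script is otherwise invoked as before.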
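
Similarly, a small sketch of the retry schedule: the loop below only prints the
randint bounds that the patched retry decorator uses, so the exact sleep lengths are
random but their upper bound doubles with every attempt. The attempt numbering is
illustrative.

NUM_DOWNLOAD_ATTEMPTS = 8  # value set by the patch

# Before every attempt after the first, the new decorator sleeps for
# randint(5, 10 * 2 ** try_num) seconds (both bounds inclusive).
for try_num in xrange(NUM_DOWNLOAD_ATTEMPTS):
  if try_num > 0:
    upper_bound = 10 * 2 ** try_num
    print 'attempt {0}: sleep between 5 and {1} seconds'.format(try_num + 1, upper_bound)

This gives upper bounds of 20, 40, 80, 160, 320, 640 and 1280 seconds for attempts 2
through 8, presumably so that concurrent builds do not all retry in lockstep.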
