IMPALA-3778: Fix ASF packaging build The tarballs in IMPALA_HOME/infra/python/deps and the thirdparty directory have been removed from the ASF repository. All Python dependencies and CDH components must now be downloaded as part of every build. This caused the ASF packaging build to fail. Before this patch, we used the system pip to download the Python dependencies, which caused flakiness and inconsistency across different operating systems. This patch fixes the problem by using our own script (which requires Python 2.6+ to be installed on the system) to download all the files in requirements.txt.
Also replaced all whl and zip Python packages with tar.gz to make it consistent with the ASF build. Change-Id: Ibe5a743096cda2059bd330805d324983f6730e19 Reviewed-on: http://gerrit.cloudera.org:8080/3647 Reviewed-by: Jim Apple <[email protected]> Tested-by: Taras Bobrovytsky <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/baf8fe20 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/baf8fe20 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/baf8fe20 Branch: refs/heads/master Commit: baf8fe202c1e212bbed3c73a6a63017eceb4a180 Parents: 7441032 Author: Taras Bobrovytsky <[email protected]> Authored: Sat Jul 9 02:07:17 2016 +0000 Committer: Taras Bobrovytsky <[email protected]> Committed: Thu Jul 14 19:04:45 2016 +0000 ---------------------------------------------------------------------- bin/impala-config.sh | 48 ++++++----- infra/python/deps/download_requirements | 28 ++---- infra/python/deps/find_py26.py | 41 +++++++++ infra/python/deps/pip_download.py | 124 ++++++++++++++++++++------- infra/python/deps/requirements.txt | 2 + 5 files changed, 170 insertions(+), 73 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/baf8fe20/bin/impala-config.sh ---------------------------------------------------------------------- diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 9db640a..481d2cb 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -66,9 +66,14 @@ fi # If enabled, debug symbols are added to cross-compiled IR. : ${ENABLE_IMPALA_IR_DEBUG_INFO=false} +if [ -d $IMPALA_HOME/thirdparty ]; then + NO_THIRDPARTY=false +else + NO_THIRDPARTY=true +fi # If true, download and use the CDH components from S3 instead of the ones # in $IMPALA_HOME/thirdparty. 
-: ${DOWNLOAD_CDH_COMPONENTS=false} +: ${DOWNLOAD_CDH_COMPONENTS=$NO_THIRDPARTY} export IMPALA_TOOLCHAIN export SKIP_TOOLCHAIN_BOOTSTRAP @@ -426,26 +431,27 @@ alias gerrit-verify-merge="${IMPALA_AUX_TEST_HOME}/jenkins/gerrit-verify-merge.s # A marker in the environment to prove that we really did source this file export IMPALA_CONFIG_SOURCED=1 -echo "IMPALA_HOME = $IMPALA_HOME" -echo "HADOOP_HOME = $HADOOP_HOME" -echo "HADOOP_CONF_DIR = $HADOOP_CONF_DIR" -echo "MINI_DFS_BASE_DATA_DIR = $MINI_DFS_BASE_DATA_DIR" -echo "HIVE_HOME = $HIVE_HOME" -echo "HIVE_CONF_DIR = $HIVE_CONF_DIR" -echo "HBASE_HOME = $HBASE_HOME" -echo "HBASE_CONF_DIR = $HBASE_CONF_DIR" -echo "MINIKDC_HOME = $MINIKDC_HOME" -echo "THRIFT_HOME = $THRIFT_HOME" -echo "HADOOP_LZO = $HADOOP_LZO" -echo "IMPALA_LZO = $IMPALA_LZO" -echo "CLASSPATH = $CLASSPATH" -echo "LIBHDFS_OPTS = $LIBHDFS_OPTS" -echo "PYTHONPATH = $PYTHONPATH" -echo "JAVA_HOME = $JAVA_HOME" -echo "LD_LIBRARY_PATH = $LD_LIBRARY_PATH" -echo "LD_PRELOAD = $LD_PRELOAD" -echo "POSTGRES_JDBC_DRIVER = $POSTGRES_JDBC_DRIVER" -echo "IMPALA_TOOLCHAIN = $IMPALA_TOOLCHAIN" +echo "IMPALA_HOME = $IMPALA_HOME" +echo "HADOOP_HOME = $HADOOP_HOME" +echo "HADOOP_CONF_DIR = $HADOOP_CONF_DIR" +echo "MINI_DFS_BASE_DATA_DIR = $MINI_DFS_BASE_DATA_DIR" +echo "HIVE_HOME = $HIVE_HOME" +echo "HIVE_CONF_DIR = $HIVE_CONF_DIR" +echo "HBASE_HOME = $HBASE_HOME" +echo "HBASE_CONF_DIR = $HBASE_CONF_DIR" +echo "MINIKDC_HOME = $MINIKDC_HOME" +echo "THRIFT_HOME = $THRIFT_HOME" +echo "HADOOP_LZO = $HADOOP_LZO" +echo "IMPALA_LZO = $IMPALA_LZO" +echo "CLASSPATH = $CLASSPATH" +echo "LIBHDFS_OPTS = $LIBHDFS_OPTS" +echo "PYTHONPATH = $PYTHONPATH" +echo "JAVA_HOME = $JAVA_HOME" +echo "LD_LIBRARY_PATH = $LD_LIBRARY_PATH" +echo "LD_PRELOAD = $LD_PRELOAD" +echo "POSTGRES_JDBC_DRIVER = $POSTGRES_JDBC_DRIVER" +echo "IMPALA_TOOLCHAIN = $IMPALA_TOOLCHAIN" +echo "DOWNLOAD_CDH_COMPONENTS = $DOWNLOAD_CDH_COMPONENTS" # Kerberos things. 
If the cluster exists and is kerberized, source # the required environment. This is required for any hadoop tool to http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/baf8fe20/infra/python/deps/download_requirements ---------------------------------------------------------------------- diff --git a/infra/python/deps/download_requirements b/infra/python/deps/download_requirements index eee7a1b..e054653 100755 --- a/infra/python/deps/download_requirements +++ b/infra/python/deps/download_requirements @@ -2,31 +2,15 @@ set -euo pipefail -# Prefer the virtualenv pip and python since this is what will actually be used during the -# installation and it may be a different version than the system default. -VIRTUAL_ENV_PIP="$IMPALA_HOME"/infra/python/env/bin/pip -if [[ -e "$VIRTUAL_ENV_PIP" ]]; then - PIP="$VIRTUAL_ENV_PIP" - # Assume python is also available in the virtualenv. - PYTHON="$IMPALA_HOME"/infra/python/env/bin/python -else - PIP=pip - PYTHON=python -fi - DIR=$(dirname "$0") -# Download but don't install all packages listed in requirements.txt. -# Don't use pip to download Impyla because its setup.py requires a newer version -# of setuptools than is available on some systems we support (e.g. CentOS 6). -"$PIP" -q install --download "$DIR" -r <(grep -v "impyla" "$DIR"/requirements.txt) -# Directly download required packages not listed in requirements.txt. +pushd $DIR +PY26=$(./find_py26.py) +# Directly download packages listed in requirements.txt, but don't install them. +$PY26 pip_download.py # For virtualenv, other scripts rely on the .tar.gz package (not a .whl package). +$PY26 pip_download.py virtualenv 13.1.0 # kudu-python is downloaded separately because pip install attempts to execute a # setup.py subcommand for kudu-python that can fail even if the download succeeds. 
-pushd $DIR -$PYTHON ./pip_download.py virtualenv 13.1.0 -$PYTHON ./pip_download.py kudu-python 0.1.1 -IMPYLA_VERSION=$(grep impyla ./requirements.txt | grep -o '[0-9.]*') -$PYTHON ./pip_download.py impyla ${IMPYLA_VERSION} +$PY26 pip_download.py kudu-python 0.1.1 popd http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/baf8fe20/infra/python/deps/find_py26.py ---------------------------------------------------------------------- diff --git a/infra/python/deps/find_py26.py b/infra/python/deps/find_py26.py new file mode 100755 index 0000000..2ba5d23 --- /dev/null +++ b/infra/python/deps/find_py26.py @@ -0,0 +1,41 @@ +#!/usr/bin/python +# Copyright (c) 2015 Cloudera, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script finds Python 2.6 or higher on the system and outputs the +# system command to stdout. The script exits with a nonzero exit code if +# Python 2.6+ is not present. 
+ +import os +import subprocess +import sys +import textwrap + +def detect_python_cmd(): + '''Returns the system command that provides python 2.6 or greater.''' + paths = os.getenv("PATH").split(os.path.pathsep) + for cmd in ("python", "python27", "python2.7", "python-27", "python-2.7", "python26", + "python2.6", "python-26", "python-2.6"): + for path in paths: + cmd_path = os.path.join(path, cmd) + if not os.path.exists(cmd_path) or not os.access(cmd_path, os.X_OK): + continue + exit = subprocess.call([cmd_path, "-c", textwrap.dedent(""" + import sys + sys.exit(int(sys.version_info[:2] < (2, 6)))""")]) + if exit == 0: + return cmd_path + raise Exception("Could not find minimum required python version 2.6") + +print detect_python_cmd() http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/baf8fe20/infra/python/deps/pip_download.py ---------------------------------------------------------------------- diff --git a/infra/python/deps/pip_download.py b/infra/python/deps/pip_download.py index b1e1fa7..a3268b6 100755 --- a/infra/python/deps/pip_download.py +++ b/infra/python/deps/pip_download.py @@ -1,42 +1,106 @@ #!/usr/bin/python +# Copyright (c) 2015 Cloudera, Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Implement the basic 'pip download' functionality in a way that gives us more control # over which archive type is downloaded and what post-download steps are executed. -import hashlib +# This script requires Python 2.6+. 
+ import json import os.path -from urllib import urlopen, URLopener import sys +from hashlib import md5 +from time import sleep +from urllib import urlopen, URLopener -pkg_name = sys.argv[1] -pkg_version = sys.argv[2] -pkg_type = 'sdist' # Don't download wheel archives for now -pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read()) +NUM_TRIES = 3 def check_md5sum(filename, expected_md5): - expected_md5 = pkg['md5_digest'] - actual_md5 = hashlib.md5(open(filename).read()).hexdigest() + actual_md5 = md5(open(filename).read()).hexdigest() return actual_md5 == expected_md5 -found = False -downloader = URLopener() -for pkg in pkg_info['releases'][pkg_version]: - if pkg['packagetype'] == pkg_type: - filename = pkg['filename'] - expected_md5 = pkg['md5_digest'] - print "Downloading %s from %s " % (filename, pkg['url']) - if os.path.isfile(filename) and check_md5sum(filename, expected_md5): - print "File with matching md5sum already exists, skipping download." - found = True - break - downloader.retrieve(pkg['url'], filename) - actual_md5 = hashlib.md5(open(filename).read()).hexdigest() - if not check_md5sum(filename, expected_md5): - print "MD5 mismatch in file %s." % filename - sys.exit(1) - found = True - break - -if not found: - print "Could not find archive to download for %s %s %s" % (pkg_name, pkg_version, - pkg_type) +def retry(func): + '''Retry decorator.''' + + def wrapper(*args, **kwargs): + for _ in xrange(NUM_TRIES): + try: + result = func(*args, **kwargs) + if result: return result + except Exception as e: + print e + sleep(5) + print "Download failed after several attempts." + sys.exit(1) + + return wrapper + +@retry +def download_package(pkg_name, pkg_version): + '''Download the required package. 
Sometimes the download can be flaky, so we use the + retry decorator.''' + pkg_type = 'sdist' # Don't download wheel archives for now + pkg_info = json.loads(urlopen('https://pypi.python.org/pypi/%s/json' % pkg_name).read()) + + downloader = URLopener() + for pkg in pkg_info['releases'][pkg_version]: + if pkg['packagetype'] == pkg_type: + filename = pkg['filename'] + expected_md5 = pkg['md5_digest'] + if os.path.isfile(filename) and check_md5sum(filename, expected_md5): + print "File with matching md5sum already exists, skipping download." + return True + print "Downloading %s from %s " % (filename, pkg['url']) + downloader.retrieve(pkg['url'], filename) + actual_md5 = md5(open(filename).read()).hexdigest() + if check_md5sum(filename, expected_md5): + return True + else: + print "MD5 mismatch in file %s." % filename + return False + print "Could not find archive to download for %s %s %s" % ( + pkg_name, pkg_version, pkg_type) sys.exit(1) + +def main(): + if len(sys.argv) > 1: + _, pkg_name, pkg_version = sys.argv + download_package(pkg_name, pkg_version) + else: + # If the package name and version are not specified in the command line arguments, + # download the packages that in requirements.txt. + f = open("requirements.txt", 'r') + try: + # requirements.txt follows the standard pip grammar. + for line in f: + # A hash symbol ("#") represents a comment that should be ignored. + hash_index = line.find('#') + if hash_index != -1: + line = line[:hash_index] + # A semi colon (";") specifies some additional condition for when the package + # should be installed (for example a specific OS). We can ignore this and download + # the package anyways because the installation script(bootstrap_virtualenv.py) can + # take it into account. 
+ semi_colon_index = line.find(';') + if semi_colon_index != -1: + line = line[:semi_colon_index] + l = line.strip() + if len(l) > 0: + pkg_name, pkg_version = l.split('==') + download_package(pkg_name.strip(), pkg_version.strip()) + finally: + f.close() + +if __name__ == '__main__': + main() http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/baf8fe20/infra/python/deps/requirements.txt ---------------------------------------------------------------------- diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt index b3ebfb1..77e4be9 100644 --- a/infra/python/deps/requirements.txt +++ b/infra/python/deps/requirements.txt @@ -19,6 +19,7 @@ cm-api == 10.0.0 readline == 6.2.4.1; sys_platform == 'darwin' Fabric == 1.10.2 paramiko == 1.15.2 + ecdsa == 0.13 pycrypto == 2.6.1 Flask == 0.10.1 Jinja2 == 2.8 @@ -73,3 +74,4 @@ texttable == 0.8.3 # For dev purposes, not used in scripting. Version 1.2.1 is the latest that supports 2.6. ipython == 1.2.1 + apipkg == 1.4
