IMPALA-6148: Specifying thirdparty deps as URLs If the environment variable $IMPALA_<NAME>_URL is configured in impala-config-branch.sh or impala-config-local, for a thirdparty dependency, use that to download it instead of the s3://native-toolchain bucket. This makes testing against arbitrary versions of the dependencies easier.
I did a little bit of refactoring while here, creating a small class for a Package to handle reading the environment variables. I also changed bootstrap_toolchain.py to use Python logging, which cleans up the output during the multi-threaded downloading. I tested this by both with customized URLs and by running the regular build (pre-review-test, without most of the slow test suites). Change-Id: I4628d86022d4bd8b762313f7056d76416a58b422 Reviewed-on: http://gerrit.cloudera.org:8080/8456 Reviewed-by: Joe McDonnell <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2212a889 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2212a889 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2212a889 Branch: refs/heads/master Commit: 2212a8897e66e1a5c548132d09a8e33e275a0e3b Parents: 5ab07f0 Author: Philip Zeyliger <[email protected]> Authored: Thu Nov 2 15:43:10 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Nov 10 02:42:16 2017 +0000 ---------------------------------------------------------------------- bin/bootstrap_toolchain.py | 150 +++++++++++++++++++++++++--------------- bin/impala-config.sh | 45 ++++++++++++ 2 files changed, 138 insertions(+), 57 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2212a889/bin/bootstrap_toolchain.py ---------------------------------------------------------------------- diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py index bea3a99..8494c6c 100755 --- a/bin/bootstrap_toolchain.py +++ b/bin/bootstrap_toolchain.py @@ -25,9 +25,16 @@ # the CDH components (i.e. Hadoop, Hive, HBase and Sentry) into # CDH_COMPONENTS_HOME. # +# By default, packages are downloaded from an S3 bucket named native-toolchain. +# The exact URL is based on IMPALA_<PACKAGE>_VERSION environment variables +# (configured in impala-config.sh) as well as the OS version being built on. +# The URL can be overridden with an IMPALA_<PACKAGE>_URL environment variable +# set in impala-config-{local,branch}.sh. +# # The script is called as follows without any additional parameters: # # python bootstrap_toolchain.py +import logging import os import random import re @@ -57,6 +64,28 @@ OS_MAPPING = { "ubuntu16.04" : "ec2-package-ubuntu-16-04", } +class Package(object): + """ + Represents a package to be downloaded. A version, if not specified + explicitly, is retrieved from the environment variable IMPALA_<NAME>_VERSION. + URLs are retrieved from IMPALA_<NAME>_URL, but are optional. + """ + def __init__(self, name, version=None, url=None): + self.name = name + self.version = version + self.url = url + package_env_name = name.replace("-", "_").upper() + if self.version is None: + version_env_var = "IMPALA_{0}_VERSION".format(package_env_name) + + self.version = os.environ.get(version_env_var) + if not self.version: + raise Exception("Could not find version for {0} in environment var {1}".format( + name, version_env_var)) + if self.url is None: + url_env_var = "IMPALA_{0}_URL".format(package_env_name) + self.url = os.environ.get(url_env_var) + def try_get_platform_release_label(): """Gets the right package label from the OS version. Return None if not found.""" try: @@ -64,24 +93,35 @@ def try_get_platform_release_label(): except: return None +# Cache "lsb_release -irs" to avoid excessive logging from sh, and +# to shave a little bit of time. +lsb_release_cache = None + def get_platform_release_label(release=None): """Gets the right package label from the OS version. Raise exception if not found. 'release' can be provided to override the underlying OS version. """ + global lsb_release_cache if not release: - release = "".join(map(lambda x: x.lower(), sh.lsb_release("-irs").split())) + if lsb_release_cache: + release = lsb_release_cache + else: + release = "".join(map(lambda x: x.lower(), sh.lsb_release("-irs").split())) + lsb_release_cache = release for k, v in OS_MAPPING.iteritems(): if re.search(k, release): return v raise Exception("Could not find package label for OS version: {0}.".format(release)) - def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber): - print "URL {0}".format(download_path) + if not download_path.endswith("/" + file_name): + raise Exception("URL {0} does not match with expected file_name {1}" + .format(download_path, file_name)) NUM_ATTEMPTS = 3 for attempt in range(1, NUM_ATTEMPTS + 1): - print "Downloading {0} to {1} (attempt {2})".format(file_name, destination, attempt) + logging.info("Downloading {0} to {1}/{2} (attempt {3})".format( + download_path, destination, file_name, attempt)) # --no-clobber avoids downloading the file if a file with the name already exists try: sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber) @@ -89,24 +129,27 @@ def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobb except Exception, e: if attempt == NUM_ATTEMPTS: raise - print "Download failed; retrying after sleep: " + str(e) + logging.error("Download failed; retrying after sleep: " + str(e)) time.sleep(10 + random.random() * 5) # Sleep between 10 and 15 seconds. - print "Extracting {0}".format(file_name) + logging.info("Extracting {0}".format(file_name)) sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination) sh.rm(os.path.join(destination, file_name)) -def download_package(destination, product, version, compiler, platform_release=None): - remove_existing_package(destination, product, version) +def download_package(destination, package, compiler, platform_release=None): + remove_existing_package(destination, package.name, package.version) toolchain_build_id = os.environ["IMPALA_TOOLCHAIN_BUILD_ID"] label = get_platform_release_label(release=platform_release) - format_params = {'product': product, 'version': version, 'compiler': compiler, - 'label': label, 'toolchain_build_id': toolchain_build_id} + format_params = {'product': package.name, 'version': package.version, + 'compiler': compiler, 'label': label, 'toolchain_build_id': toolchain_build_id} file_name = "{product}-{version}-{compiler}-{label}.tar.gz".format(**format_params) format_params['file_name'] = file_name - url_path = "/{toolchain_build_id}/{product}/{version}-{compiler}/{file_name}".format( - **format_params) - download_path = HOST + url_path + if package.url is None: + url_path = "/{toolchain_build_id}/{product}/{version}-{compiler}/{file_name}".format( + **format_params) + download_path = HOST + url_path + else: + download_path = package.url wget_and_unpack_package(download_path, file_name, destination, True) @@ -122,14 +165,13 @@ def bootstrap(toolchain_root, packages): compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"]) def handle_package(p): - pkg_name, pkg_version = unpack_name_and_version(p) - if check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler): + if check_for_existing_package(toolchain_root, p.name, p.version, compiler): return - if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true": - download_package(toolchain_root, pkg_name, pkg_version, compiler) + if p.name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true": + download_package(toolchain_root, p, compiler) else: - build_kudu_stub(toolchain_root, pkg_version, compiler) - write_version_file(toolchain_root, pkg_name, pkg_version, compiler, + build_kudu_stub(toolchain_root, p.version, compiler) + write_version_file(toolchain_root, p.name, p.version, compiler, get_platform_release_label()) execute_many(handle_package, packages) @@ -156,18 +198,18 @@ def version_file_path(toolchain_root, pkg_name, pkg_version): def check_custom_toolchain(toolchain_root, packages): missing = [] for p in packages: - pkg_name, pkg_version = unpack_name_and_version(p) - pkg_dir = package_directory(toolchain_root, pkg_name, pkg_version) + pkg_dir = package_directory(toolchain_root, p.name, p.version) if not os.path.isdir(pkg_dir): missing.append((p, pkg_dir)) if missing: - print("The following packages are not in their expected locations.") + msg = "The following packages are not in their expected locations.\n" for p, pkg_dir in missing: - print(" %s (expected directory %s to exist)" % (p, pkg_dir)) - print("Pre-built toolchain archives not available for your platform.") - print("Clone and build native toolchain from source using this repository:") - print(" https://github.com/cloudera/native-toolchain") + msg += " %s (expected directory %s to exist)\n" % (p, pkg_dir) + msg += "Pre-built toolchain archives not available for your platform.\n" + msg += "Clone and build native toolchain from source using this repository:\n" + msg += " https://github.com/cloudera/native-toolchain\n" + logging.error(msg) raise Exception("Toolchain bootstrap failed: required packages were missing") def check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler): @@ -190,27 +232,13 @@ def write_version_file(toolchain_root, pkg_name, pkg_version, compiler, label): def remove_existing_package(toolchain_root, pkg_name, pkg_version): dir_path = package_directory(toolchain_root, pkg_name, pkg_version) if os.path.exists(dir_path): - print "Removing existing package directory {0}".format(dir_path) + logging.info("Removing existing package directory {0}".format(dir_path)) shutil.rmtree(dir_path) -def unpack_name_and_version(package): - """A package definition is either a string where the version is fetched from the - environment or a tuple where the package name and the package version are fully - specified. - """ - if isinstance(package, basestring): - env_var = "IMPALA_{0}_VERSION".format(package).replace("-", "_").upper() - try: - return package, os.environ[env_var] - except KeyError: - raise Exception("Could not find version for {0} in environment var {1}".format( - package, env_var)) - return package[0], package[1] - def build_kudu_stub(toolchain_root, kudu_version, compiler): # When Kudu isn't supported, the CentOS 7 package will be downloaded and the client # lib will be replaced with a stubbed client. - download_package(toolchain_root, "kudu", kudu_version, compiler, + download_package(toolchain_root, Package("kudu", kudu_version), compiler, platform_release="centos7") # Find the client lib files in the extracted dir. There may be several files with @@ -330,11 +358,11 @@ def execute_many(f, args): def download_cdh_components(toolchain_root, cdh_components): """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found.""" - cdh_components_home = os.getenv("CDH_COMPONENTS_HOME") + cdh_components_home = os.environ.get("CDH_COMPONENTS_HOME") if not cdh_components_home: - print("Impala environment not set up correctly, make sure " + logging.error("Impala environment not set up correctly, make sure " "$CDH_COMPONENTS_HOME is present.") - return + sys.exit(1) # Create the directory where CDH components live if necessary. if not os.path.exists(cdh_components_home): @@ -343,16 +371,18 @@ def download_cdh_components(toolchain_root, cdh_components): # The URL prefix of where CDH components live in S3. download_path_prefix = HOST + "/cdh_components/" - def download(component): - pkg_name, pkg_version = unpack_name_and_version(component) - pkg_directory = package_directory(cdh_components_home, pkg_name, pkg_version) + pkg_directory = package_directory(cdh_components_home, component.name, + component.version) if os.path.isdir(pkg_directory): return # Download the package if it doesn't exist - file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version) - download_path = download_path_prefix + file_name + file_name = "{0}-{1}.tar.gz".format(component.name, component.version) + if component.url is None: + download_path = download_path_prefix + file_name + else: + download_path = component.url wget_and_unpack_package(download_path, file_name, cdh_components_home, False) execute_many(download, cdh_components) @@ -366,15 +396,20 @@ if __name__ == "__main__": the CDH components (i.e. hadoop, hbase, hive, llama, llama-minikidc and sentry) into the directory specified by $CDH_COMPONENTS_HOME. """ - if not os.getenv("IMPALA_HOME"): - print("Impala environment not set up correctly, make sure " + logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(threadName)s %(levelname)s: %(message)s') + # 'sh' module logs at every execution, which is too noisy + logging.getLogger("sh").setLevel(logging.WARNING) + + if not os.environ.get("IMPALA_HOME"): + logging.error("Impala environment not set up correctly, make sure " "impala-config.sh is sourced.") sys.exit(1) # Create the destination directory if necessary - toolchain_root = os.getenv("IMPALA_TOOLCHAIN") + toolchain_root = os.environ.get("IMPALA_TOOLCHAIN") if not toolchain_root: - print("Impala environment not set up correctly, make sure " + logging.error("Impala environment not set up correctly, make sure " "$IMPALA_TOOLCHAIN is present.") sys.exit(1) @@ -383,14 +418,15 @@ if __name__ == "__main__": # LLVM and Kudu are the largest packages. Sort them first so that # their download starts as soon as possible. - packages = ["llvm", ("llvm", "3.9.1-asserts"), "kudu", + packages = map(Package, ["llvm", "kudu", "avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil", "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "libev", "lz4", "openldap", "openssl", "protobuf", - "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"] + "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"]) + packages.insert(0, Package("llvm", "3.9.1-asserts")) bootstrap(toolchain_root, packages) # Download the CDH components if necessary. if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true": - cdh_components = ["hadoop", "hbase", "hive", "llama-minikdc", "sentry"] + cdh_components = map(Package, ["hadoop", "hbase", "hive", "llama-minikdc", "sentry"]) download_cdh_components(toolchain_root, cdh_components) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2212a889/bin/impala-config.sh ---------------------------------------------------------------------- diff --git a/bin/impala-config.sh b/bin/impala-config.sh index eb04be0..0fd976b 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -76,51 +76,90 @@ export IMPALA_TOOLCHAIN_BUILD_ID=474-6c406b4a88 # Versions of toolchain dependencies. # ----------------------------------- export IMPALA_AVRO_VERSION=1.7.4-p4 +unset IMPALA_AVRO_URL export IMPALA_BINUTILS_VERSION=2.26.1 +unset IMPALA_BINUTILS_URL export IMPALA_BOOST_VERSION=1.57.0-p3 +unset IMPALA_BOOST_URL export IMPALA_BREAKPAD_VERSION=1b704857f1e78a864e6942e613457e55f1aecb60-p3 +unset IMPALA_BREAKPAD_URL export IMPALA_BZIP2_VERSION=1.0.6-p2 +unset IMPALA_BZIP2_URL export IMPALA_CMAKE_VERSION=3.8.2-p1 +unset IMPALA_CMAKE_URL export IMPALA_CRCUTIL_VERSION=440ba7babeff77ffad992df3a10c767f184e946e-p1 +unset IMPALA_CRCUTIL_URL export IMPALA_CYRUS_SASL_VERSION=2.1.23 +unset IMPALA_CYRUS_SASL_URL export IMPALA_FLATBUFFERS_VERSION=1.6.0 +unset IMPALA_FLATBUFFERS_URL export IMPALA_GCC_VERSION=4.9.2 +unset IMPALA_GCC_URL export IMPALA_GFLAGS_VERSION=2.2.0-p1 +unset IMPALA_GFLAGS_URL export IMPALA_GLOG_VERSION=0.3.4-p2 +unset IMPALA_GLOG_URL export IMPALA_GPERFTOOLS_VERSION=2.5 +unset IMPALA_GPERFTOOLS_URL export IMPALA_GTEST_VERSION=1.6.0 +unset IMPALA_GTEST_URL export IMPALA_LIBEV_VERSION=4.20 +unset IMPALA_LIBEV_URL export IMPALA_LLVM_VERSION=3.9.1 +unset IMPALA_LLVM_URL export IMPALA_LLVM_ASAN_VERSION=3.9.1 +unset IMPALA_LLVM_ASAN_URL + # Debug builds should use the release+asserts build to get additional coverage. # Don't use the LLVM debug build because the binaries are too large to distribute. export IMPALA_LLVM_DEBUG_VERSION=3.9.1-asserts +unset IMPALA_LLVM_DEBUG_URL export IMPALA_LZ4_VERSION=1.7.5 +unset IMPALA_LZ4_URL export IMPALA_OPENLDAP_VERSION=2.4.25 +unset IMPALA_OPENLDAP_URL export IMPALA_OPENSSL_VERSION=1.0.2l +unset IMPALA_OPENSSL_URL export IMPALA_PROTOBUF_VERSION=2.6.1 +unset IMPALA_PROTOBUF_URL export IMPALA_POSTGRES_JDBC_DRIVER_VERSION=9.0-801 +unset IMPALA_POSTGRES_JDBC_DRIVER_URL export IMPALA_RAPIDJSON_VERSION=0.11 +unset IMPALA_RAPIDJSON_URL export IMPALA_RE2_VERSION=20130115-p1 +unset IMPALA_RE2_URL export IMPALA_SNAPPY_VERSION=1.1.4 +unset IMPALA_SNAPPY_URL export IMPALA_SQUEASEL_VERSION=3.3 +unset IMPALA_SQUEASEL_URL # TPC utilities used for test/benchmark data generation. export IMPALA_TPC_DS_VERSION=2.1.0 +unset IMPALA_TPC_DS_URL export IMPALA_TPC_H_VERSION=2.17.0 +unset IMPALA_TPC_H_URL export IMPALA_THRIFT_VERSION=0.9.0-p11 +unset IMPALA_THRIFT_URL export IMPALA_THRIFT_JAVA_VERSION=0.9.0 +unset IMPALA_THRIFT_JAVA_URL export IMPALA_ZLIB_VERSION=1.2.8 +unset IMPALA_ZLIB_URL if [[ $OSTYPE == "darwin"* ]]; then IMPALA_CYRUS_SASL_VERSION=2.1.26 + unset IMPALA_CYRUS_SASL_URL IMPALA_GPERFTOOLS_VERSION=2.3 + unset IMPALA_GPERFTOOLS_URL IMPALA_OPENSSL_VERSION=1.0.1p + unset IMPALA_OPENSSL_URL IMPALA_THRIFT_VERSION=0.9.2 + unset IMPALA_THRIFT_URL IMPALA_THRIFT_JAVA_VERSION=0.9.2 + unset IMPALA_THRIFT_JAVA_URL fi # Kudu version in the toolchain; provides libkudu_client.so and minicluster binaries. export IMPALA_KUDU_VERSION=bec2a24 +unset IMPALA_KUDU_URL # Kudu version used to identify Java client jar from maven export KUDU_JAVA_VERSION=1.6.0-cdh5.14.0-SNAPSHOT @@ -129,11 +168,17 @@ export KUDU_JAVA_VERSION=1.6.0-cdh5.14.0-SNAPSHOT # ------------------------------------------ export CDH_MAJOR_VERSION=5 export IMPALA_HADOOP_VERSION=2.6.0-cdh5.14.0-SNAPSHOT +unset IMPALA_HADOOP_URL export IMPALA_HBASE_VERSION=1.2.0-cdh5.14.0-SNAPSHOT +unset IMPALA_HBASE_URL export IMPALA_HIVE_VERSION=1.1.0-cdh5.14.0-SNAPSHOT +unset IMPALA_HIVE_URL export IMPALA_SENTRY_VERSION=1.5.1-cdh5.14.0-SNAPSHOT +unset IMPALA_SENTRY_URL export IMPALA_PARQUET_VERSION=1.5.0-cdh5.14.0-SNAPSHOT +unset IMPALA_PARQUET_URL export IMPALA_LLAMA_MINIKDC_VERSION=1.0.0 +unset IMPALA_LLAMA_MINIKDC_URL # Source the branch and local config override files here to override any # variables above or any variables below that allow overriding via environment
