IMPALA-6148: Specifying thirdparty deps as URLs

If the environment variable $IMPALA_<NAME>_URL is configured in
impala-config-branch.sh or impala-config-local.sh for a thirdparty
dependency, use that URL to download it instead of the
s3://native-toolchain bucket. This makes testing against arbitrary
versions of the dependencies easier.

I did a little bit of refactoring while here, creating a small class for
a Package to handle reading the environment variables. I also changed
bootstrap_toolchain.py to use Python logging, which cleans up the output
during the multi-threaded downloading.

I tested this both with customized URLs and by running the regular
build (pre-review-test, without most of the slow test suites).

Change-Id: I4628d86022d4bd8b762313f7056d76416a58b422
Reviewed-on: http://gerrit.cloudera.org:8080/8456
Reviewed-by: Joe McDonnell <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/2212a889
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/2212a889
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/2212a889

Branch: refs/heads/master
Commit: 2212a8897e66e1a5c548132d09a8e33e275a0e3b
Parents: 5ab07f0
Author: Philip Zeyliger <[email protected]>
Authored: Thu Nov 2 15:43:10 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Fri Nov 10 02:42:16 2017 +0000

----------------------------------------------------------------------
 bin/bootstrap_toolchain.py | 150 +++++++++++++++++++++++++---------------
 bin/impala-config.sh       |  45 ++++++++++++
 2 files changed, 138 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2212a889/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index bea3a99..8494c6c 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -25,9 +25,16 @@
 # the CDH components (i.e. Hadoop, Hive, HBase and Sentry) into
 # CDH_COMPONENTS_HOME.
 #
+# By default, packages are downloaded from an S3 bucket named native-toolchain.
+# The exact URL is based on IMPALA_<PACKAGE>_VERSION environment variables
+# (configured in impala-config.sh) as well as the OS version being built on.
+# The URL can be overridden with an IMPALA_<PACKAGE>_URL environment variable
+# set in impala-config-{local,branch}.sh.
+#
 # The script is called as follows without any additional parameters:
 #
 #     python bootstrap_toolchain.py
+import logging
 import os
 import random
 import re
@@ -57,6 +64,28 @@ OS_MAPPING = {
   "ubuntu16.04" : "ec2-package-ubuntu-16-04",
 }
 
+class Package(object):
+  """
+  Represents a package to be downloaded. A version, if not specified
+  explicitly, is retrieved from the environment variable IMPALA_<NAME>_VERSION.
+  URLs are retrieved from IMPALA_<NAME>_URL, but are optional.
+  """
+  def __init__(self, name, version=None, url=None):
+    self.name = name
+    self.version = version
+    self.url = url
+    package_env_name = name.replace("-", "_").upper()
+    if self.version is None:
+      version_env_var = "IMPALA_{0}_VERSION".format(package_env_name)
+
+      self.version = os.environ.get(version_env_var)
+      if not self.version:
+        raise Exception("Could not find version for {0} in environment var 
{1}".format(
+          name, version_env_var))
+    if self.url is None:
+      url_env_var = "IMPALA_{0}_URL".format(package_env_name)
+      self.url = os.environ.get(url_env_var)
+
 def try_get_platform_release_label():
   """Gets the right package label from the OS version. Return None if not 
found."""
   try:
@@ -64,24 +93,35 @@ def try_get_platform_release_label():
   except:
     return None
 
+# Cache "lsb_release -irs" to avoid excessive logging from sh, and
+# to shave a little bit of time.
+lsb_release_cache = None
+
 def get_platform_release_label(release=None):
   """Gets the right package label from the OS version. Raise exception if not 
found.
      'release' can be provided to override the underlying OS version.
   """
+  global lsb_release_cache
   if not release:
-    release = "".join(map(lambda x: x.lower(), sh.lsb_release("-irs").split()))
+    if lsb_release_cache:
+      release = lsb_release_cache
+    else:
+      release = "".join(map(lambda x: x.lower(), 
sh.lsb_release("-irs").split()))
+      lsb_release_cache = release
   for k, v in OS_MAPPING.iteritems():
     if re.search(k, release):
       return v
 
   raise Exception("Could not find package label for OS version: 
{0}.".format(release))
 
-
 def wget_and_unpack_package(download_path, file_name, destination, 
wget_no_clobber):
-  print "URL {0}".format(download_path)
+  if not download_path.endswith("/" + file_name):
+    raise Exception("URL {0} does not match with expected file_name {1}"
+        .format(download_path, file_name))
   NUM_ATTEMPTS = 3
   for attempt in range(1, NUM_ATTEMPTS + 1):
-    print "Downloading {0} to {1} (attempt {2})".format(file_name, 
destination, attempt)
+    logging.info("Downloading {0} to {1}/{2} (attempt {3})".format(
+      download_path, destination, file_name, attempt))
     # --no-clobber avoids downloading the file if a file with the name already 
exists
     try:
       sh.wget(download_path, directory_prefix=destination, 
no_clobber=wget_no_clobber)
@@ -89,24 +129,27 @@ def wget_and_unpack_package(download_path, file_name, 
destination, wget_no_clobb
     except Exception, e:
       if attempt == NUM_ATTEMPTS:
         raise
-      print "Download failed; retrying after sleep: " + str(e)
+      logging.error("Download failed; retrying after sleep: " + str(e))
       time.sleep(10 + random.random() * 5) # Sleep between 10 and 15 seconds.
-  print "Extracting {0}".format(file_name)
+  logging.info("Extracting {0}".format(file_name))
   sh.tar(z=True, x=True, f=os.path.join(destination, file_name), 
directory=destination)
   sh.rm(os.path.join(destination, file_name))
 
-def download_package(destination, product, version, compiler, 
platform_release=None):
-  remove_existing_package(destination, product, version)
+def download_package(destination, package, compiler, platform_release=None):
+  remove_existing_package(destination, package.name, package.version)
 
   toolchain_build_id = os.environ["IMPALA_TOOLCHAIN_BUILD_ID"]
   label = get_platform_release_label(release=platform_release)
-  format_params = {'product': product, 'version': version, 'compiler': 
compiler,
-      'label': label, 'toolchain_build_id': toolchain_build_id}
+  format_params = {'product': package.name, 'version': package.version,
+      'compiler': compiler, 'label': label, 'toolchain_build_id': 
toolchain_build_id}
   file_name = 
"{product}-{version}-{compiler}-{label}.tar.gz".format(**format_params)
   format_params['file_name'] = file_name
-  url_path = 
"/{toolchain_build_id}/{product}/{version}-{compiler}/{file_name}".format(
-      **format_params)
-  download_path = HOST + url_path
+  if package.url is None:
+    url_path = 
"/{toolchain_build_id}/{product}/{version}-{compiler}/{file_name}".format(
+        **format_params)
+    download_path = HOST + url_path
+  else:
+    download_path = package.url
 
   wget_and_unpack_package(download_path, file_name, destination, True)
 
@@ -122,14 +165,13 @@ def bootstrap(toolchain_root, packages):
   compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])
 
   def handle_package(p):
-    pkg_name, pkg_version = unpack_name_and_version(p)
-    if check_for_existing_package(toolchain_root, pkg_name, pkg_version, 
compiler):
+    if check_for_existing_package(toolchain_root, p.name, p.version, compiler):
       return
-    if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true":
-      download_package(toolchain_root, pkg_name, pkg_version, compiler)
+    if p.name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true":
+      download_package(toolchain_root, p, compiler)
     else:
-      build_kudu_stub(toolchain_root, pkg_version, compiler)
-    write_version_file(toolchain_root, pkg_name, pkg_version, compiler,
+      build_kudu_stub(toolchain_root, p.version, compiler)
+    write_version_file(toolchain_root, p.name, p.version, compiler,
         get_platform_release_label())
   execute_many(handle_package, packages)
 
@@ -156,18 +198,18 @@ def version_file_path(toolchain_root, pkg_name, 
pkg_version):
 def check_custom_toolchain(toolchain_root, packages):
   missing = []
   for p in packages:
-    pkg_name, pkg_version = unpack_name_and_version(p)
-    pkg_dir = package_directory(toolchain_root, pkg_name, pkg_version)
+    pkg_dir = package_directory(toolchain_root, p.name, p.version)
     if not os.path.isdir(pkg_dir):
       missing.append((p, pkg_dir))
 
   if missing:
-    print("The following packages are not in their expected locations.")
+    msg = "The following packages are not in their expected locations.\n"
     for p, pkg_dir in missing:
-      print("  %s (expected directory %s to exist)" % (p, pkg_dir))
-    print("Pre-built toolchain archives not available for your platform.")
-    print("Clone and build native toolchain from source using this 
repository:")
-    print("    https://github.com/cloudera/native-toolchain";)
+      msg += "  %s (expected directory %s to exist)\n" % (p, pkg_dir)
+    msg += "Pre-built toolchain archives not available for your platform.\n"
+    msg += "Clone and build native toolchain from source using this 
repository:\n"
+    msg += "    https://github.com/cloudera/native-toolchain\n";
+    logging.error(msg)
     raise Exception("Toolchain bootstrap failed: required packages were 
missing")
 
 def check_for_existing_package(toolchain_root, pkg_name, pkg_version, 
compiler):
@@ -190,27 +232,13 @@ def write_version_file(toolchain_root, pkg_name, 
pkg_version, compiler, label):
 def remove_existing_package(toolchain_root, pkg_name, pkg_version):
   dir_path = package_directory(toolchain_root, pkg_name, pkg_version)
   if os.path.exists(dir_path):
-    print "Removing existing package directory {0}".format(dir_path)
+    logging.info("Removing existing package directory {0}".format(dir_path))
     shutil.rmtree(dir_path)
 
-def unpack_name_and_version(package):
-  """A package definition is either a string where the version is fetched from 
the
-  environment or a tuple where the package name and the package version are 
fully
-  specified.
-  """
-  if isinstance(package, basestring):
-    env_var = "IMPALA_{0}_VERSION".format(package).replace("-", "_").upper()
-    try:
-      return package, os.environ[env_var]
-    except KeyError:
-      raise Exception("Could not find version for {0} in environment var 
{1}".format(
-        package, env_var))
-  return package[0], package[1]
-
 def build_kudu_stub(toolchain_root, kudu_version, compiler):
   # When Kudu isn't supported, the CentOS 7 package will be downloaded and the 
client
   # lib will be replaced with a stubbed client.
-  download_package(toolchain_root, "kudu", kudu_version, compiler,
+  download_package(toolchain_root, Package("kudu", kudu_version), compiler,
       platform_release="centos7")
 
   # Find the client lib files in the extracted dir. There may be several files 
with
@@ -330,11 +358,11 @@ def execute_many(f, args):
 
 def download_cdh_components(toolchain_root, cdh_components):
   """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not 
found."""
-  cdh_components_home = os.getenv("CDH_COMPONENTS_HOME")
+  cdh_components_home = os.environ.get("CDH_COMPONENTS_HOME")
   if not cdh_components_home:
-    print("Impala environment not set up correctly, make sure "
+    logging.error("Impala environment not set up correctly, make sure "
           "$CDH_COMPONENTS_HOME is present.")
-    return
+    sys.exit(1)
 
   # Create the directory where CDH components live if necessary.
   if not os.path.exists(cdh_components_home):
@@ -343,16 +371,18 @@ def download_cdh_components(toolchain_root, 
cdh_components):
   # The URL prefix of where CDH components live in S3.
   download_path_prefix = HOST + "/cdh_components/"
 
-
   def download(component):
-    pkg_name, pkg_version = unpack_name_and_version(component)
-    pkg_directory = package_directory(cdh_components_home, pkg_name, 
pkg_version)
+    pkg_directory = package_directory(cdh_components_home, component.name,
+        component.version)
     if os.path.isdir(pkg_directory):
       return
 
     # Download the package if it doesn't exist
-    file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version)
-    download_path = download_path_prefix + file_name
+    file_name = "{0}-{1}.tar.gz".format(component.name, component.version)
+    if component.url is None:
+      download_path = download_path_prefix + file_name
+    else:
+      download_path = component.url
     wget_and_unpack_package(download_path, file_name, cdh_components_home, 
False)
 
   execute_many(download, cdh_components)
@@ -366,15 +396,20 @@ if __name__ == "__main__":
   the CDH components (i.e. hadoop, hbase, hive, llama, llama-minikidc and 
sentry) into the
   directory specified by $CDH_COMPONENTS_HOME.
   """
-  if not os.getenv("IMPALA_HOME"):
-    print("Impala environment not set up correctly, make sure "
+  logging.basicConfig(level=logging.INFO,
+      format='%(asctime)s %(threadName)s %(levelname)s: %(message)s')
+  # 'sh' module logs at every execution, which is too noisy
+  logging.getLogger("sh").setLevel(logging.WARNING)
+
+  if not os.environ.get("IMPALA_HOME"):
+    logging.error("Impala environment not set up correctly, make sure "
           "impala-config.sh is sourced.")
     sys.exit(1)
 
   # Create the destination directory if necessary
-  toolchain_root = os.getenv("IMPALA_TOOLCHAIN")
+  toolchain_root = os.environ.get("IMPALA_TOOLCHAIN")
   if not toolchain_root:
-    print("Impala environment not set up correctly, make sure "
+    logging.error("Impala environment not set up correctly, make sure "
           "$IMPALA_TOOLCHAIN is present.")
     sys.exit(1)
 
@@ -383,14 +418,15 @@ if __name__ == "__main__":
 
   # LLVM and Kudu are the largest packages. Sort them first so that
   # their download starts as soon as possible.
-  packages = ["llvm", ("llvm", "3.9.1-asserts"), "kudu",
+  packages = map(Package, ["llvm", "kudu",
       "avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil",
       "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "libev",
       "lz4", "openldap", "openssl", "protobuf",
-      "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"]
+      "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"])
+  packages.insert(0, Package("llvm", "3.9.1-asserts"))
   bootstrap(toolchain_root, packages)
 
   # Download the CDH components if necessary.
   if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true":
-    cdh_components = ["hadoop", "hbase", "hive", "llama-minikdc", "sentry"]
+    cdh_components = map(Package, ["hadoop", "hbase", "hive", "llama-minikdc", 
"sentry"])
     download_cdh_components(toolchain_root, cdh_components)

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/2212a889/bin/impala-config.sh
----------------------------------------------------------------------
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index eb04be0..0fd976b 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -76,51 +76,90 @@ export IMPALA_TOOLCHAIN_BUILD_ID=474-6c406b4a88
 # Versions of toolchain dependencies.
 # -----------------------------------
 export IMPALA_AVRO_VERSION=1.7.4-p4
+unset IMPALA_AVRO_URL
 export IMPALA_BINUTILS_VERSION=2.26.1
+unset IMPALA_BINUTILS_URL
 export IMPALA_BOOST_VERSION=1.57.0-p3
+unset IMPALA_BOOST_URL
 export IMPALA_BREAKPAD_VERSION=1b704857f1e78a864e6942e613457e55f1aecb60-p3
+unset IMPALA_BREAKPAD_URL
 export IMPALA_BZIP2_VERSION=1.0.6-p2
+unset IMPALA_BZIP2_URL
 export IMPALA_CMAKE_VERSION=3.8.2-p1
+unset IMPALA_CMAKE_URL
 export IMPALA_CRCUTIL_VERSION=440ba7babeff77ffad992df3a10c767f184e946e-p1
+unset IMPALA_CRCUTIL_URL
 export IMPALA_CYRUS_SASL_VERSION=2.1.23
+unset IMPALA_CYRUS_SASL_URL
 export IMPALA_FLATBUFFERS_VERSION=1.6.0
+unset IMPALA_FLATBUFFERS_URL
 export IMPALA_GCC_VERSION=4.9.2
+unset IMPALA_GCC_URL
 export IMPALA_GFLAGS_VERSION=2.2.0-p1
+unset IMPALA_GFLAGS_URL
 export IMPALA_GLOG_VERSION=0.3.4-p2
+unset IMPALA_GLOG_URL
 export IMPALA_GPERFTOOLS_VERSION=2.5
+unset IMPALA_GPERFTOOLS_URL
 export IMPALA_GTEST_VERSION=1.6.0
+unset IMPALA_GTEST_URL
 export IMPALA_LIBEV_VERSION=4.20
+unset IMPALA_LIBEV_URL
 export IMPALA_LLVM_VERSION=3.9.1
+unset IMPALA_LLVM_URL
 export IMPALA_LLVM_ASAN_VERSION=3.9.1
+unset IMPALA_LLVM_ASAN_URL
+
 # Debug builds should use the release+asserts build to get additional coverage.
 # Don't use the LLVM debug build because the binaries are too large to 
distribute.
 export IMPALA_LLVM_DEBUG_VERSION=3.9.1-asserts
+unset IMPALA_LLVM_DEBUG_URL
 export IMPALA_LZ4_VERSION=1.7.5
+unset IMPALA_LZ4_URL
 export IMPALA_OPENLDAP_VERSION=2.4.25
+unset IMPALA_OPENLDAP_URL
 export IMPALA_OPENSSL_VERSION=1.0.2l
+unset IMPALA_OPENSSL_URL
 export IMPALA_PROTOBUF_VERSION=2.6.1
+unset IMPALA_PROTOBUF_URL
 export IMPALA_POSTGRES_JDBC_DRIVER_VERSION=9.0-801
+unset IMPALA_POSTGRES_JDBC_DRIVER_URL
 export IMPALA_RAPIDJSON_VERSION=0.11
+unset IMPALA_RAPIDJSON_URL
 export IMPALA_RE2_VERSION=20130115-p1
+unset IMPALA_RE2_URL
 export IMPALA_SNAPPY_VERSION=1.1.4
+unset IMPALA_SNAPPY_URL
 export IMPALA_SQUEASEL_VERSION=3.3
+unset IMPALA_SQUEASEL_URL
 # TPC utilities used for test/benchmark data generation.
 export IMPALA_TPC_DS_VERSION=2.1.0
+unset IMPALA_TPC_DS_URL
 export IMPALA_TPC_H_VERSION=2.17.0
+unset IMPALA_TPC_H_URL
 export IMPALA_THRIFT_VERSION=0.9.0-p11
+unset IMPALA_THRIFT_URL
 export IMPALA_THRIFT_JAVA_VERSION=0.9.0
+unset IMPALA_THRIFT_JAVA_URL
 export IMPALA_ZLIB_VERSION=1.2.8
+unset IMPALA_ZLIB_URL
 
 if [[ $OSTYPE == "darwin"* ]]; then
   IMPALA_CYRUS_SASL_VERSION=2.1.26
+  unset IMPALA_CYRUS_SASL_URL
   IMPALA_GPERFTOOLS_VERSION=2.3
+  unset IMPALA_GPERFTOOLS_URL
   IMPALA_OPENSSL_VERSION=1.0.1p
+  unset IMPALA_OPENSSL_URL
   IMPALA_THRIFT_VERSION=0.9.2
+  unset IMPALA_THRIFT_URL
   IMPALA_THRIFT_JAVA_VERSION=0.9.2
+  unset IMPALA_THRIFT_JAVA_URL
 fi
 
 # Kudu version in the toolchain; provides libkudu_client.so and minicluster 
binaries.
 export IMPALA_KUDU_VERSION=bec2a24
+unset IMPALA_KUDU_URL
 
 # Kudu version used to identify Java client jar from maven
 export KUDU_JAVA_VERSION=1.6.0-cdh5.14.0-SNAPSHOT
@@ -129,11 +168,17 @@ export KUDU_JAVA_VERSION=1.6.0-cdh5.14.0-SNAPSHOT
 # ------------------------------------------
 export CDH_MAJOR_VERSION=5
 export IMPALA_HADOOP_VERSION=2.6.0-cdh5.14.0-SNAPSHOT
+unset IMPALA_HADOOP_URL
 export IMPALA_HBASE_VERSION=1.2.0-cdh5.14.0-SNAPSHOT
+unset IMPALA_HBASE_URL
 export IMPALA_HIVE_VERSION=1.1.0-cdh5.14.0-SNAPSHOT
+unset IMPALA_HIVE_URL
 export IMPALA_SENTRY_VERSION=1.5.1-cdh5.14.0-SNAPSHOT
+unset IMPALA_SENTRY_URL
 export IMPALA_PARQUET_VERSION=1.5.0-cdh5.14.0-SNAPSHOT
+unset IMPALA_PARQUET_URL
 export IMPALA_LLAMA_MINIKDC_VERSION=1.0.0
+unset IMPALA_LLAMA_MINIKDC_URL
 
 # Source the branch and local config override files here to override any
 # variables above or any variables below that allow overriding via environment

Reply via email to