Repository: incubator-impala
Updated Branches:
  refs/heads/master 999a444e2 -> c189d0a39


Download toolchain in parallel.

By downloading from the toolchain S3 buckets in parallel with
extracting them, this improves bootstrap_toolchain on my machine
from about 1m5s to about 30s.

  $ rm -rf toolchain; time bin/bootstrap_toolchain.py > /dev/null

  real    0m29.226s
  user    0m46.516s
  sys     0m33.820s

On a large EC2 machine, closer to the S3 buckets, the new time is 21s.

Because multiprocessing hasn't always been available (Python 2.4 on RHEL5
won't have it), I fall back to a simpler implementation.

Change-Id: I46a6088bb002402c7653dbc8257dff869afb26ec
Reviewed-on: http://gerrit.cloudera.org:8080/8237
Reviewed-by: Tim Armstrong <[email protected]>
Reviewed-by: Alex Behm <[email protected]>
Reviewed-by: Michael Brown <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/adb92d33
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/adb92d33
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/adb92d33

Branch: refs/heads/master
Commit: adb92d33979c71a62481e7eeae4b96f6588a18d5
Parents: 999a444
Author: Philip Zeyliger <[email protected]>
Authored: Mon Oct 9 12:55:05 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Tue Oct 10 01:25:27 2017 +0000

----------------------------------------------------------------------
 bin/bootstrap_toolchain.py | 37 ++++++++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/adb92d33/bin/bootstrap_toolchain.py
----------------------------------------------------------------------
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 3a1f02d..76d4c5a 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -110,16 +110,17 @@ def bootstrap(toolchain_root, packages):
   # Detect the compiler
   compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])
 
-  for p in packages:
+  def handle_package(p):
     pkg_name, pkg_version = unpack_name_and_version(p)
     if check_for_existing_package(toolchain_root, pkg_name, pkg_version, 
compiler):
-      continue
+      return
     if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true":
       download_package(toolchain_root, pkg_name, pkg_version, compiler)
     else:
       build_kudu_stub(toolchain_root, pkg_version, compiler)
     write_version_file(toolchain_root, pkg_name, pkg_version, compiler,
         get_platform_release_label())
+  execute_many(handle_package, packages)
 
 def check_output(cmd_args):
   """Run the command and return the output. Raise an exception if the command 
returns
@@ -300,6 +301,22 @@ extern "C" void %s() {
   finally:
     shutil.rmtree(stub_build_dir)
 
+def execute_many(f, args):
+  """
+  Executes f(a) for a in args. If possible, uses a threadpool
+  to execute in parallel. The pool uses the number of CPUs
+  in the system as the default size.
+  """
+  pool = None
+  try:
+    import multiprocessing.pool
+    pool = multiprocessing.pool.ThreadPool()
+    return pool.map(f, args, 1)
+  except ImportError:
+    # multiprocessing was introduced in Python 2.6.
+    # For older Pythons (CentOS 5), degrade to single-threaded execution:
+    return [ f(a) for a in args ]
+
 def download_cdh_components(toolchain_root, cdh_components):
   """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not 
found."""
   cdh_components_home = os.getenv("CDH_COMPONENTS_HOME")
@@ -315,17 +332,20 @@ def download_cdh_components(toolchain_root, 
cdh_components):
   # The URL prefix of where CDH components live in S3.
   download_path_prefix = HOST + "/cdh_components/"
 
-  for component in cdh_components:
+
+  def download(component):
     pkg_name, pkg_version = unpack_name_and_version(component)
     pkg_directory = package_directory(cdh_components_home, pkg_name, 
pkg_version)
     if os.path.isdir(pkg_directory):
-      continue
+      return
 
     # Download the package if it doesn't exist
     file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version)
     download_path = download_path_prefix + file_name
     wget_and_unpack_package(download_path, file_name, cdh_components_home, 
False)
 
+  execute_many(download, cdh_components)
+
 if __name__ == "__main__":
   """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the 
environment.-
   By checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION 
will be present
@@ -350,9 +370,12 @@ if __name__ == "__main__":
   if not os.path.exists(toolchain_root):
     os.makedirs(toolchain_root)
 
-  packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "cmake", 
"crcutil",
-      "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "kudu", 
"libev",
-      "llvm", ("llvm", "3.9.1-asserts"), "lz4", "openldap", "openssl", 
"protobuf",
+  # LLVM and Kudu are the largest packages. Sort them first so that
+  # their download starts as soon as possible.
+  packages = ["llvm", ("llvm", "3.9.1-asserts"), "kudu",
+      "avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil",
+      "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "libev",
+      "lz4", "openldap", "openssl", "protobuf",
       "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"]
   bootstrap(toolchain_root, packages)
 

Reply via email to