Repository: incubator-impala Updated Branches: refs/heads/master 999a444e2 -> c189d0a39
Download toolchain in parallel. By downloading from the toolchain S3 buckets in parallel with extracting them, this improves bootstrap_toolchain on my machine from about 1m5s to about 30s. $ rm -rf toolchain; time bin/bootstrap_toolchain.py > /dev/null real 0m29.226s user 0m46.516s sys 0m33.820s On a large EC2 machine, closer to the S3 buckets, the new time is 21s. Because multiprocessing hasn't always been available (Python 2.4 on RHEL5 won't have it), I fall back to a simpler, single-threaded implementation when it cannot be imported. Change-Id: I46a6088bb002402c7653dbc8257dff869afb26ec Reviewed-on: http://gerrit.cloudera.org:8080/8237 Reviewed-by: Tim Armstrong <[email protected]> Reviewed-by: Alex Behm <[email protected]> Reviewed-by: Michael Brown <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/adb92d33 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/adb92d33 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/adb92d33 Branch: refs/heads/master Commit: adb92d33979c71a62481e7eeae4b96f6588a18d5 Parents: 999a444 Author: Philip Zeyliger <[email protected]> Authored: Mon Oct 9 12:55:05 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Tue Oct 10 01:25:27 2017 +0000 ---------------------------------------------------------------------- bin/bootstrap_toolchain.py | 37 ++++++++++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/adb92d33/bin/bootstrap_toolchain.py ---------------------------------------------------------------------- diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py index 3a1f02d..76d4c5a 100755 --- a/bin/bootstrap_toolchain.py +++ b/bin/bootstrap_toolchain.py @@ -110,16 +110,17 @@ def bootstrap(toolchain_root, packages): 
# Detect the compiler compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"]) - for p in packages: + def handle_package(p): pkg_name, pkg_version = unpack_name_and_version(p) if check_for_existing_package(toolchain_root, pkg_name, pkg_version, compiler): - continue + return if pkg_name != "kudu" or os.environ["KUDU_IS_SUPPORTED"] == "true": download_package(toolchain_root, pkg_name, pkg_version, compiler) else: build_kudu_stub(toolchain_root, pkg_version, compiler) write_version_file(toolchain_root, pkg_name, pkg_version, compiler, get_platform_release_label()) + execute_many(handle_package, packages) def check_output(cmd_args): """Run the command and return the output. Raise an exception if the command returns @@ -300,6 +301,22 @@ extern "C" void %s() { finally: shutil.rmtree(stub_build_dir) +def execute_many(f, args): + """ + Executes f(a) for a in args. If possible, uses a threadpool + to execute in parallel. The pool uses the number of CPUs + in the system as the default size. + """ + pool = None + try: + import multiprocessing.pool + pool = multiprocessing.pool.ThreadPool() + return pool.map(f, args, 1) + except ImportError: + # multiprocessing was introduced in Python 2.6. + # For older Pythons (CentOS 5), degrade to single-threaded execution: + return [ f(a) for a in args ] + def download_cdh_components(toolchain_root, cdh_components): """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found.""" cdh_components_home = os.getenv("CDH_COMPONENTS_HOME") @@ -315,17 +332,20 @@ def download_cdh_components(toolchain_root, cdh_components): # The URL prefix of where CDH components live in S3. 
download_path_prefix = HOST + "/cdh_components/" - for component in cdh_components: + + def download(component): pkg_name, pkg_version = unpack_name_and_version(component) pkg_directory = package_directory(cdh_components_home, pkg_name, pkg_version) if os.path.isdir(pkg_directory): - continue + return # Download the package if it doesn't exist file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version) download_path = download_path_prefix + file_name wget_and_unpack_package(download_path, file_name, cdh_components_home, False) + execute_many(download, cdh_components) + if __name__ == "__main__": """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment.- By checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be present @@ -350,9 +370,12 @@ if __name__ == "__main__": if not os.path.exists(toolchain_root): os.makedirs(toolchain_root) - packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil", - "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "kudu", "libev", - "llvm", ("llvm", "3.9.1-asserts"), "lz4", "openldap", "openssl", "protobuf", + # LLVM and Kudu are the largest packages. Sort them first so that + # their download starts as soon as possible. + packages = ["llvm", ("llvm", "3.9.1-asserts"), "kudu", + "avro", "binutils", "boost", "breakpad", "bzip2", "cmake", "crcutil", + "flatbuffers", "gcc", "gflags", "glog", "gperftools", "gtest", "libev", + "lz4", "openldap", "openssl", "protobuf", "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib"] bootstrap(toolchain_root, packages)
