Repository: incubator-impala Updated Branches: refs/heads/master 8a04b170d -> fc3ff1c52
IMPALA-3223: Supports download of CDH components from S3. This change updates the toolchain bootstrapping script to download the CDH components (hadoop, hbase, hive, llama, llama-minikdc and sentry) from the toolchain S3 bucket to the toolchain directory if the environment variable $DOWNLOAD_CDH_COMPONENTS is true. By default, it is false, which means the CDH components in the thirdparty directory will be used instead. To build the ASF tree (https://git-wip-us.apache.org/repos/asf?p=incubator-impala.git), set $DOWNLOAD_CDH_COMPONENTS to true. Currently, the CDH components in S3 are snapshots from the thirdparty directory at 688d0efcd38731e8e27a8236dbdca21c8fd571a1. Once the integration Jenkins job (impala-cdh5-trunk-core-integration) is modified to upload the latest stable builds to the S3 buckets, we can remove the thirdparty directory and always use the CDH components in the toolchain directory. Note that bootstrap_toolchain.py will not overwrite existing directories in the toolchain directory. To force a refresh of components in the toolchain directory, a user should delete the cached copy in the toolchain directory and execute bootstrap_toolchain.py again. This behavior allows users to develop locally without a network connection once the toolchain has been bootstrapped. 
Change-Id: I16fa79db0005554cc0a116e74775647ba99f8dda Reviewed-on: http://gerrit.cloudera.org:8080/3333 Reviewed-by: Michael Ho <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6e71e903 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6e71e903 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6e71e903 Branch: refs/heads/master Commit: 6e71e903ff0b7181c487376512c8dc1421642636 Parents: 8a04b17 Author: Michael Ho <[email protected]> Authored: Fri May 27 19:19:36 2016 -0700 Committer: Tim Armstrong <[email protected]> Committed: Tue Jun 21 00:37:53 2016 -0700 ---------------------------------------------------------------------- CMakeLists.txt | 15 --- bin/bootstrap_toolchain.py | 114 +++++++++++++------ bin/impala-config.sh | 23 +++- buildall.sh | 12 +- .../cdh5/etc/init.d/llama-application | 2 +- 5 files changed, 106 insertions(+), 60 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/CMakeLists.txt b/CMakeLists.txt index c6638bd..1569bd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,21 +14,6 @@ cmake_minimum_required(VERSION 2.6) -if ("$ENV{SKIP_TOOLCHAIN_BOOTSTRAP}" STREQUAL "true") - message(STATUS "SKIP_TOOLCHAIN_BOOTSTRAP is true, skipping toolchain bootstrap.") -else() - # Download any missing toolchain dependencies. If this fails, fail the build. 
- set(BOOTSTRAP_CMD "$ENV{IMPALA_HOME}/bin/bootstrap_toolchain.py") - # Download and unpack the dependencies - message(STATUS "Downloading and extracting dependencies.") - execute_process(COMMAND ${BOOTSTRAP_CMD} RESULT_VARIABLE BOOTSTRAP_RESULT) - if (${BOOTSTRAP_RESULT} EQUAL 0) - message(STATUS "Toolchain bootstrap complete.") - else() - message(FATAL_ERROR "Toolchain bootstrap failed.") - endif() -endif() - # Explicitly define project() to allow modifying the compiler before the project is # initialized. project(Impala) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/bin/bootstrap_toolchain.py ---------------------------------------------------------------------- diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py index 2589d1f..5a43e1d 100755 --- a/bin/bootstrap_toolchain.py +++ b/bin/bootstrap_toolchain.py @@ -11,15 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -# Bootstrapping the native toolchain with prebuilt binaries # -# The purpose of this script is to download prebuilt artifacts of the native toolchain to -# satisfy the third-party dependencies for Impala. The script checks for the presence of -# IMPALA_HOME and IMPALA_TOOLCHAIN. IMPALA_HOME indicates that the environment is -# correctly setup and that we can deduce the version settings of the dependencies from the -# environment. IMPALA_TOOLCHAIN indicates the location where the prebuilt artifacts should -# be extracted to. +# The purpose of this script is to download prebuilt binaries and jar files to satisfy the +# third-party dependencies for Impala. The script checks for the presence of IMPALA_HOME +# and IMPALA_TOOLCHAIN. IMPALA_HOME indicates that the environment is correctly setup and +# that we can deduce the version settings of the dependencies from the environment. 
+# IMPALA_TOOLCHAIN indicates the location where the prebuilt artifacts should be extracted +# to. If DOWNLOAD_CDH_COMPONENTS is set to true, this script will also download and extract +# the CDH components (i.e. Hadoop, Hive, HBase, Llama, Llama-minikdc and Sentry) into +# CDH_COMPONENTS_HOME. # # The script is called as follows without any additional parameters: # @@ -29,6 +29,7 @@ import re import sh import shutil import subprocess +import sys import tempfile HOST = "https://native-toolchain.s3.amazonaws.com/build" @@ -67,45 +68,30 @@ def get_platform_release_label(release=None): raise Exception("Could not find package label for OS version: {0}.".format(release)) -def download_package(destination, product, version, compiler, platform_release=None): - remove_existing_package(destination, product, version) - - label = get_platform_release_label(release=platform_release) - file_name = "{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label) - url_path="/{0}/{1}-{2}/{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label) - download_path = HOST + url_path +def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber): print "URL {0}".format(download_path) print "Downloading {0} to {1}".format(file_name, destination) # --no-clobber avoids downloading the file if a file with the name already exists - sh.wget(download_path, directory_prefix=destination, no_clobber=True) + sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber) print "Extracting {0}".format(file_name) sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination) sh.rm(os.path.join(destination, file_name)) -def bootstrap(packages): - """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment. By - checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be present as - well. Will create the directory specified by $IMPALA_TOOLCHAIN if it does not yet - exist. 
Each of the packages specified in `packages` is downloaded and extracted into - $IMPALA_TOOLCHAIN. - - """ - if not os.getenv("IMPALA_HOME"): - print("Impala environment not set up correctly, make sure " - "impala-config.sh is sourced.") - sys.exit(1) +def download_package(destination, product, version, compiler, platform_release=None): + remove_existing_package(destination, product, version) - # Create the destination directory if necessary - toolchain_root = os.getenv("IMPALA_TOOLCHAIN") - if not toolchain_root: - print("Impala environment not set up correctly, make sure " - "$IMPALA_TOOLCHAIN is present.") - sys.exit(1) + label = get_platform_release_label(release=platform_release) + file_name = "{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label) + url_path="/{0}/{1}-{2}/{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label) + download_path = HOST + url_path - if not os.path.exists(toolchain_root): - os.makedirs(toolchain_root) + wget_and_unpack_package(download_path, file_name, destination, True) +def bootstrap(toolchain_root, packages): + """Downloads and unpacks each package in the list `packages` into `toolchain_root` if it + doesn't exist already. + """ if not try_get_platform_release_label(): check_custom_toolchain(toolchain_root, packages) return @@ -303,8 +289,62 @@ extern "C" void %s() { finally: shutil.rmtree(stub_build_dir) +def download_cdh_components(toolchain_root, cdh_components): + """Downloads and unpacks the CDH components into $CDH_COMPONENTS_HOME if not found.""" + cdh_components_home = os.getenv("CDH_COMPONENTS_HOME") + if not cdh_components_home: + print("Impala environment not set up correctly, make sure " + "$CDH_COMPONENTS_HOME is present.") + return + + # Create the directory where CDH components live if necessary. + if not os.path.exists(cdh_components_home): + os.makedirs(cdh_components_home) + + # The URL prefix of where CDH components live in S3. 
+ download_path_prefix = HOST + "/cdh_components/" + + for component in cdh_components: + pkg_name, pkg_version = unpack_name_and_version(component) + pkg_directory = package_directory(cdh_components_home, pkg_name, pkg_version) + if os.path.isdir(pkg_directory): + continue + + # Download the package if it doesn't exist + file_name = "{0}-{1}.tar.gz".format(pkg_name, pkg_version) + download_path = download_path_prefix + file_name + wget_and_unpack_package(download_path, file_name, cdh_components_home, False) + if __name__ == "__main__": + """Validates the presence of $IMPALA_HOME and $IMPALA_TOOLCHAIN in the environment.- + By checking $IMPALA_HOME is present, we assume that IMPALA_{LIB}_VERSION will be present + as well. Will create the directory specified by $IMPALA_TOOLCHAIN if it doesn't exist + yet. Each of the packages specified in `packages` is downloaded and extracted into + $IMPALA_TOOLCHAIN. If $DOWNLOAD_CDH_COMPONENTS is true, this function will also download + the CDH components (i.e. hadoop, hbase, hive, llama, llama-minikidc and sentry) into the + directory specified by $CDH_COMPONENTS_HOME. + """ + if not os.getenv("IMPALA_HOME"): + print("Impala environment not set up correctly, make sure " + "impala-config.sh is sourced.") + sys.exit(1) + + # Create the destination directory if necessary + toolchain_root = os.getenv("IMPALA_TOOLCHAIN") + if not toolchain_root: + print("Impala environment not set up correctly, make sure " + "$IMPALA_TOOLCHAIN is present.") + sys.exit(1) + + if not os.path.exists(toolchain_root): + os.makedirs(toolchain_root) + packages = ["avro", "binutils", "boost", "breakpad", "bzip2", "gcc", "gflags", "glog", "gperftools", "gtest", "kudu", "llvm", ("llvm", "3.8.0-asserts-p1"), "lz4", "openldap", "rapidjson", "re2", "snappy", "thrift", "zlib"] - bootstrap(packages) + bootstrap(toolchain_root, packages) + + # Download the CDH components if necessary. 
+ if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true": + cdh_components = ["hadoop", "hbase", "hive", "llama", "llama-minikdc", "sentry"] + download_cdh_components(toolchain_root, cdh_components) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/bin/impala-config.sh ---------------------------------------------------------------------- diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 584e5a2..9db640a 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -46,6 +46,9 @@ if [ -z $IMPALA_TOOLCHAIN ]; then return 1 fi +# If true, will not call $IMPALA_HOME/bin/bootstrap_toolchain.py. +: ${SKIP_TOOLCHAIN_BOOTSTRAP=false} + # This flag is used in $IMPALA_HOME/cmake_modules/toolchain.cmake. # If it's 0, Impala will be built with the compiler in the toolchain directory. : ${USE_SYSTEM_GCC=0} @@ -63,11 +66,17 @@ fi # If enabled, debug symbols are added to cross-compiled IR. : ${ENABLE_IMPALA_IR_DEBUG_INFO=false} +# If true, download and use the CDH components from S3 instead of the ones +# in $IMPALA_HOME/thirdparty. 
+: ${DOWNLOAD_CDH_COMPONENTS=false} + export IMPALA_TOOLCHAIN +export SKIP_TOOLCHAIN_BOOTSTRAP export USE_SYSTEM_GCC export USE_GOLD_LINKER export IMPALA_CXX_COMPILER export ENABLE_IMPALA_IR_DEBUG_INFO +export DOWNLOAD_CDH_COMPONENTS export IS_OSX=$(if [[ "$OSTYPE" == "darwin"* ]]; then echo true; else echo false; fi) # To use a local build of Kudu, set KUDU_BUILD_DIR to the path Kudu was built in and @@ -280,7 +289,7 @@ export IMPALA_HIVE_VERSION=1.1.0-cdh5.9.0-SNAPSHOT export IMPALA_SENTRY_VERSION=1.5.1-cdh5.9.0-SNAPSHOT export IMPALA_LLAMA_VERSION=1.0.0-cdh5.9.0-SNAPSHOT export IMPALA_PARQUET_VERSION=1.5.0-cdh5.9.0-SNAPSHOT -export IMPALA_MINIKDC_VERSION=1.0.0 +export IMPALA_LLAMA_MINIKDC_VERSION=1.0.0 export IMPALA_FE_DIR=$IMPALA_HOME/fe export IMPALA_BE_DIR=$IMPALA_HOME/be @@ -292,7 +301,11 @@ export IMPALA_COMMON_DIR=$IMPALA_HOME/common export PATH=$IMPALA_HOME/bin:$PATH # The directory in which all the thirdparty CDH components live. -CDH_COMPONENTS_HOME=$IMPALA_HOME/thirdparty +if [ "${DOWNLOAD_CDH_COMPONENTS}" = true ]; then + export CDH_COMPONENTS_HOME=$IMPALA_TOOLCHAIN/cdh_components +else + export CDH_COMPONENTS_HOME=$IMPALA_HOME/thirdparty +fi # Hadoop dependencies are snapshots in the Impala tree export HADOOP_HOME=$CDH_COMPONENTS_HOME/hadoop-${IMPALA_HADOOP_VERSION}/ @@ -308,7 +321,7 @@ export MINI_DFS_BASE_DATA_DIR=$IMPALA_HOME/cdh-${CDH_MAJOR_VERSION}-hdfs-data export PATH=$HADOOP_HOME/bin:$PATH export LLAMA_HOME=$CDH_COMPONENTS_HOME/llama-${IMPALA_LLAMA_VERSION}/ -export MINIKDC_HOME=$CDH_COMPONENTS_HOME/llama-minikdc-${IMPALA_MINIKDC_VERSION} +export MINIKDC_HOME=$CDH_COMPONENTS_HOME/llama-minikdc-${IMPALA_LLAMA_MINIKDC_VERSION} export SENTRY_HOME=$CDH_COMPONENTS_HOME/sentry-${IMPALA_SENTRY_VERSION} export SENTRY_CONF_DIR=$IMPALA_HOME/fe/src/test/resources @@ -382,10 +395,10 @@ export JAVA_LIBRARY_PATH=${IMPALA_SNAPPY_PATH} LIB_JAVA=`find ${JAVA_HOME}/ -name libjava.so | head -1` LIB_JSIG=`find ${JAVA_HOME}/ -name libjsig.so | head -1` LIB_JVM=` 
find ${JAVA_HOME}/ -name libjvm.so | head -1` -LIB_HDFS=`find ${HADOOP_HOME}/ -name libhdfs.so | head -1` LD_LIBRARY_PATH="${LD_LIBRARY_PATH-}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:`dirname ${LIB_JAVA}`:`dirname ${LIB_JSIG}`" -LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:`dirname ${LIB_JVM}`:`dirname ${LIB_HDFS}`" +LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:`dirname ${LIB_JVM}`" +LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${HADOOP_HOME}/lib/native" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_HOME}/be/build/debug/service" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_SNAPPY_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_LZO}/build" http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/buildall.sh ---------------------------------------------------------------------- diff --git a/buildall.sh b/buildall.sh index 2c49b6f..b4542d8 100755 --- a/buildall.sh +++ b/buildall.sh @@ -244,6 +244,15 @@ if [ $CLEAN_ACTION -eq 1 ]; then $IMPALA_HOME/bin/clean.sh fi +# Populate necessary thirdparty components unless it's set to be skipped. +if [ "${SKIP_TOOLCHAIN_BOOTSTRAP}" = true ]; then + echo "SKIP_TOOLCHAIN_BOOTSTRAP is true, skipping toolchain bootstrap." +else + echo "Downloading and extracting dependencies." + $IMPALA_HOME/bin/bootstrap_toolchain.py + echo "Toolchain bootstrap complete." 
+fi + MAKE_IMPALA_ARGS="${MAKE_IMPALA_ARGS} -build_type=${CMAKE_BUILD_TYPE}" if [ $BUILD_FE_ONLY -eq 1 ]; then @@ -254,8 +263,7 @@ fi if [ -e $HADOOP_LZO/build/native/Linux-*-*/lib/libgplcompression.so ] then - cp $HADOOP_LZO/build/native/Linux-*-*/lib/libgplcompression.* \ - $IMPALA_HOME/thirdparty/hadoop-${IMPALA_HADOOP_VERSION}/lib/native/ + cp $HADOOP_LZO/build/native/Linux-*-*/lib/libgplcompression.* $HADOOP_HOME/lib/native else echo "No hadoop-lzo found" fi http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6e71e903/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application ---------------------------------------------------------------------- diff --git a/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application b/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application index d19a42b..b519528 100755 --- a/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application +++ b/testdata/cluster/node_templates/cdh5/etc/init.d/llama-application @@ -7,7 +7,7 @@ DIR=$(dirname $0) HADOOP_LOG_DIR="$LOG_DIR/llama" HADOOP_CLASSPATH="$NODE_DIR/etc/llama/conf:$HADOOP_CLASSPATH" -for JAR in $(find "$IMPALA_HOME"/thirdparty/llama* -name "*jar"); do +for JAR in $(find "$LLAMA_HOME" -name "*jar"); do HADOOP_CLASSPATH="$JAR:$HADOOP_CLASSPATH" done export HADOOP_CLASSPATH
