This is an automated email from the ASF dual-hosted git repository. stigahuang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 1306c58f29dee4f8f4c0ab18327bc19557a9156e Author: Fucun Chu <[email protected]> AuthorDate: Tue Aug 10 16:26:37 2021 +0800 IMPALA-10870: Add Apache Hive 3.1.2 to the minicluster This patch modifies the minicluster script to optionally use Apache Hive 3.1.2 instead of CDP Hive 3.1.3. In order to make sure that existing setups don't break, this is enabled via an environment variable override to bin/impala-config.sh. When the environment variable USE_APACHE_HIVE is set to true, the bootstrap_toolchain script downloads the Apache Hive 3.1.2 tarballs and extracts them in the toolchain directory. These binaries are used to start the Hive services (Hiveserver2 and metastore). The default is CDP Hive 3.1.3. Since CDP Hive 3 uses some features of Apache Hive 4, this patch uses a different database name so that it is easy to switch from working in one environment which uses the CDP Hive 3.1.3 metastore to another which uses the Apache Hive 3.1.2 metastore. In order to start a minicluster which uses Apache Hive 3.1.2, users should follow the steps below: 1. Make sure that the minicluster, if running, is stopped before you run the following commands. 2. Open a new terminal and run the following commands. > export USE_APACHE_HIVE=true > source bin/impala-config.sh > bin/bootstrap_toolchain.py The above command downloads the Apache Hive 3.1.2 tarballs and extracts them in the toolchain/apache_components directory. > rm $HIVE_HOME/lib/guava-*jar > cp $HADOOP_HOME/share/hadoop/hdfs/lib/guava-*.jar $HIVE_HOME/lib/ The above commands are to fix HIVE-22915. > bin/create-test-configuration.sh -create_metastore The above step should provide "-create_metastore" only the first time so that a new metastore db is created and the Apache Hive 3.1.2 schema is initialized. > testdata/bin/run-all.sh Follow-up: - Add MetastoreShim to support Apache Hive 3.x in IMPALA-10871 Tests: - Made sure that the cluster comes up with Apache Hive 3.1.2 when the steps above are performed. 
- Made sure that existing scripts work as they do currently when argument is not provided. Change-Id: I1978909589ecacb15d32d874e97f050a85adf1f6 Reviewed-on: http://gerrit.cloudera.org:8080/17793 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- bin/bootstrap_toolchain.py | 38 +++++++++++++++++++++++++++--- bin/impala-config.sh | 58 +++++++++++++++++++++++++++++++++++++++------- buildall.sh | 5 ++++ 3 files changed, 89 insertions(+), 12 deletions(-) diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py index 0a53b02..5ec5587 100755 --- a/bin/bootstrap_toolchain.py +++ b/bin/bootstrap_toolchain.py @@ -124,13 +124,16 @@ def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobb if not download_path.endswith("/" + file_name): raise Exception("URL {0} does not match with expected file_name {1}" .format(download_path, file_name)) + if "closer.cgi" in download_path: + download_path += "?action=download" NUM_ATTEMPTS = 3 for attempt in range(1, NUM_ATTEMPTS + 1): logging.info("Downloading {0} to {1}/{2} (attempt {3})".format( download_path, destination, file_name, attempt)) # --no-clobber avoids downloading the file if a file with the name already exists try: - cmd = ["wget", download_path, "--directory-prefix={0}".format(destination)] + cmd = ["wget", download_path, + "--output-document={0}/{1}".format(destination, file_name)] if wget_no_clobber: cmd.append("--no-clobber") check_output(cmd) @@ -322,6 +325,29 @@ class CdpComponent(EnvVersionedPackage): makedir=makedir, template_subs_in=template_subs) +class ApacheComponent(EnvVersionedPackage): + def __init__(self, name, explicit_version=None, archive_basename_tmpl=None, + unpack_directory_tmpl=None, makedir=False, component_path_tmpl=None): + # Compute the apache base URL (based on the APACHE_MIRROR) + if "APACHE_COMPONENTS_HOME" not in os.environ: + logging.error("Impala environment not set up correctly, make sure " + 
"impala-config.sh is sourced.") + sys.exit(1) + template_subs = {"apache_mirror": os.environ["APACHE_MIRROR"]} + # Different components have different sub-paths. For example, hive is hive/hive-xxx, + # hadoop is hadoop/common/hadoop-xxx. The default is hive format. + if component_path_tmpl is None: + component_path_tmpl = "${name}/${name}-${version}/" + url_prefix_tmpl = "${apache_mirror}/" + component_path_tmpl + + # Get the output base directory from APACHE_COMPONENTS_HOME + destination_basedir = os.environ["APACHE_COMPONENTS_HOME"] + super(ApacheComponent, self).__init__(name, url_prefix_tmpl, destination_basedir, + explicit_version=explicit_version, + archive_basename_tmpl=archive_basename_tmpl, + unpack_directory_tmpl=unpack_directory_tmpl, + makedir=makedir, template_subs_in=template_subs) + class ToolchainKudu(ToolchainPackage): def __init__(self, platform_label=None): super(ToolchainKudu, self).__init__('kudu', platform_release=platform_label) @@ -458,8 +484,13 @@ def get_hadoop_downloads(): hadoop = CdpComponent("hadoop") hbase = CdpComponent("hbase", archive_basename_tmpl="hbase-${version}-bin", unpack_directory_tmpl="hbase-${version}") - hive = CdpComponent("hive", archive_basename_tmpl="apache-hive-${version}-bin") - hive_src = CdpComponent("hive-source", + use_apache_hive = os.environ["USE_APACHE_HIVE"] == "true" + if use_apache_hive: + hive = ApacheComponent("hive", archive_basename_tmpl="apache-hive-${version}-bin") + hive_src = ApacheComponent("hive", archive_basename_tmpl="apache-hive-${version}-src") + else: + hive = CdpComponent("hive", archive_basename_tmpl="apache-hive-${version}-bin") + hive_src = CdpComponent("hive-source", explicit_version=os.environ.get("IMPALA_HIVE_VERSION"), archive_basename_tmpl="hive-${version}-source", unpack_directory_tmpl="hive-${version}") @@ -519,6 +550,7 @@ def main(): kudu_download = None if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true": create_directory_from_env_var("CDP_COMPONENTS_HOME") + 
create_directory_from_env_var("APACHE_COMPONENTS_HOME") if platform.processor() != "aarch64": downloads += get_kudu_downloads() downloads += get_hadoop_downloads() diff --git a/bin/impala-config.sh b/bin/impala-config.sh index a5d666a..bb407db 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -187,6 +187,12 @@ export CDP_RANGER_VERSION=2.1.0.7.2.12.0-104 export CDP_TEZ_VERSION=0.9.1.7.2.12.0-104 export CDP_GCS_VERSION=2.1.2.7.2.12.0-104 +# Ref: https://infra.apache.org/release-download-pages.html#closer +: ${APACHE_MIRROR:="https://www.apache.org/dyn/closer.cgi"} +export APACHE_MIRROR +export APACHE_HIVE_VERSION=3.1.2 +export APACHE_HIVE_STORAGE_API_VERSION=2.7.0 + export ARCH_NAME=$(uname -p) export IMPALA_HUDI_VERSION=0.5.0-incubating @@ -228,6 +234,9 @@ export CDP_ICEBERG_URL=${CDP_ICEBERG_URL-} export CDP_RANGER_URL=${CDP_RANGER_URL-} export CDP_TEZ_URL=${CDP_TEZ_URL-} +export APACHE_HIVE_URL=${APACHE_HIVE_URL-} +export APACHE_HIVE_SOURCE_URL=${APACHE_HIVE_SOURCE_URL-} + export CDP_COMPONENTS_HOME="$IMPALA_TOOLCHAIN/cdp_components-$CDP_BUILD_NUMBER" export CDH_MAJOR_VERSION=7 export IMPALA_AVRO_JAVA_VERSION=${CDP_AVRO_JAVA_VERSION} @@ -236,9 +245,6 @@ export IMPALA_HADOOP_URL=${CDP_HADOOP_URL-} export HADOOP_HOME="$CDP_COMPONENTS_HOME/hadoop-${IMPALA_HADOOP_VERSION}/" export IMPALA_HBASE_VERSION=${CDP_HBASE_VERSION} export IMPALA_HBASE_URL=${CDP_HBASE_URL-} -export IMPALA_HIVE_VERSION=${HIVE_VERSION_OVERRIDE:-"$CDP_HIVE_VERSION"} -export IMPALA_HIVE_URL=${CDP_HIVE_URL-} -export IMPALA_HIVE_SOURCE_URL=${CDP_HIVE_SOURCE_URL-} export IMPALA_ICEBERG_VERSION=${CDP_ICEBERG_VERSION} export IMPALA_ICEBERG_URL=${CDP_ICEBERG_URL-} export IMPALA_KNOX_VERSION=${CDP_KNOX_VERSION} @@ -248,9 +254,29 @@ export IMPALA_RANGER_VERSION=${CDP_RANGER_VERSION} export IMPALA_RANGER_URL=${CDP_RANGER_URL-} export IMPALA_TEZ_VERSION=${CDP_TEZ_VERSION} export IMPALA_TEZ_URL=${CDP_TEZ_URL-} -export 
IMPALA_HIVE_STORAGE_API_VERSION=${HIVE_STORAGE_API_VERSION_OVERRIDE:-"2.3.0.$IMPALA_HIVE_VERSION"} export IMPALA_GCS_VERSION=${CDP_GCS_VERSION} +export APACHE_COMPONENTS_HOME="$IMPALA_TOOLCHAIN/apache_components" +export USE_APACHE_HIVE=${USE_APACHE_HIVE-false} +if $USE_APACHE_HIVE; then + # When USE_APACHE_HIVE is set we use the apache hive version to build as well as deploy + # in the minicluster + export IMPALA_HIVE_DIST_TYPE="apache-hive" + export IMPALA_HIVE_VERSION=${APACHE_HIVE_VERSION} + export IMPALA_HIVE_URL=${APACHE_HIVE_URL-} + export IMPALA_HIVE_SOURCE_URL=${APACHE_HIVE_SOURCE_URL-} + export IMPALA_HIVE_STORAGE_API_VERSION=${APACHE_HIVE_STORAGE_API_VERSION} +else + # CDP hive version is used to build and deploy in minicluster when USE_APACHE_HIVE is + # false + export IMPALA_HIVE_DIST_TYPE="hive" + export IMPALA_HIVE_VERSION=${HIVE_VERSION_OVERRIDE:-"$CDP_HIVE_VERSION"} + export IMPALA_HIVE_URL=${CDP_HIVE_URL-} + export IMPALA_HIVE_SOURCE_URL=${CDP_HIVE_SOURCE_URL-} + export IMPALA_HIVE_STORAGE_API_VERSION=${HIVE_STORAGE_API_VERSION_OVERRIDE:-\ +"2.3.0.$IMPALA_HIVE_VERSION"} +fi + # Extract the first component of the hive version. # Allow overriding of Hive source location in case we want to build Impala without # a complete Hive build. 
This is used by various tests and scripts to enable and @@ -365,16 +391,26 @@ export LOCAL_FS="file:${WAREHOUSE_LOCATION_PREFIX}" export IMPALA_CLUSTER_NODES_DIR="${IMPALA_CLUSTER_NODES_DIR-$IMPALA_HOME/testdata/cluster/cdh$CDH_MAJOR_VERSION}" ESCAPED_IMPALA_HOME=$(sed "s/[^0-9a-zA-Z]/_/g" <<< "$IMPALA_HOME") -export HIVE_HOME=${HIVE_HOME_OVERRIDE:-"$CDP_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin"} -export HIVE_SRC_DIR=${HIVE_SRC_DIR_OVERRIDE:-"${CDP_COMPONENTS_HOME}/hive-${IMPALA_HIVE_VERSION}"} +if $USE_APACHE_HIVE; then + export HIVE_HOME="$APACHE_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin" + export HIVE_SRC_DIR="$APACHE_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-src" + # if apache hive is being used change the metastore db name, so we don't have to + # format the metastore db everytime we switch between hive versions + export METASTORE_DB=${METASTORE_DB-"$(cut -c-59 <<< HMS$ESCAPED_IMPALA_HOME)_apache"} +else + export HIVE_HOME=${HIVE_HOME_OVERRIDE:-\ +"$CDP_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin"} + export HIVE_SRC_DIR=${HIVE_SRC_DIR_OVERRIDE:-\ +"${CDP_COMPONENTS_HOME}/hive-${IMPALA_HIVE_VERSION}"} + # Previously, there were multiple configurations and the "_cdp" included below + # allowed the two to be distinct. We keep this "_cdp" for historical reasons. + export METASTORE_DB=${METASTORE_DB-"$(cut -c-59 <<< HMS$ESCAPED_IMPALA_HOME)_cdp"} +fi # Set the path to the hive_metastore.thrift which is used to build thrift code export HIVE_METASTORE_THRIFT_DIR=${HIVE_METASTORE_THRIFT_DIR_OVERRIDE:-\ "$HIVE_SRC_DIR/standalone-metastore/src/main/thrift"} export TEZ_HOME="$CDP_COMPONENTS_HOME/tez-${IMPALA_TEZ_VERSION}-minimal" export HBASE_HOME="$CDP_COMPONENTS_HOME/hbase-${IMPALA_HBASE_VERSION}/" -# Previously, there were multiple configurations and the "_cdp" included below -# allowed the two to be distinct. We keep this "_cdp" for historical reasons. 
-export METASTORE_DB=${METASTORE_DB-"$(cut -c-59 <<< HMS$ESCAPED_IMPALA_HOME)_cdp"} # Set the Hive binaries in the path export PATH="$HIVE_HOME/bin:$PATH" @@ -737,6 +773,10 @@ echo "IMPALA_MAVEN_OPTIONS = $IMPALA_MAVEN_OPTIONS" echo "IMPALA_TOOLCHAIN_HOST = $IMPALA_TOOLCHAIN_HOST" echo "CDP_BUILD_NUMBER = $CDP_BUILD_NUMBER" echo "CDP_COMPONENTS_HOME = $CDP_COMPONENTS_HOME" +if $USE_APACHE_HIVE; then + echo "APACHE_MIRROR = $APACHE_MIRROR" + echo "APACHE_COMPONENTS_HOME = $APACHE_COMPONENTS_HOME" +fi echo "IMPALA_HADOOP_VERSION = $IMPALA_HADOOP_VERSION" echo "IMPALA_AVRO_JAVA_VERSION= $IMPALA_AVRO_JAVA_VERSION" echo "IMPALA_PARQUET_VERSION = $IMPALA_PARQUET_VERSION" diff --git a/buildall.sh b/buildall.sh index 29fd0fd..0f6dc7c 100755 --- a/buildall.sh +++ b/buildall.sh @@ -419,6 +419,11 @@ bootstrap_dependencies() { "$IMPALA_HOME/bin/bootstrap_toolchain.py" echo "Toolchain bootstrap complete." fi + # HIVE-22915 + if [[ "${USE_APACHE_HIVE}" = true ]]; then + rm $HIVE_HOME/lib/guava-*jar + cp $HADOOP_HOME/share/hadoop/hdfs/lib/guava-*.jar $HIVE_HOME/lib/ + fi } # Build the Impala frontend and its dependencies.
