This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1306c58f29dee4f8f4c0ab18327bc19557a9156e
Author: Fucun Chu <[email protected]>
AuthorDate: Tue Aug 10 16:26:37 2021 +0800

    IMPALA-10870: Add Apache Hive 3.1.2 to the minicluster
    
    This patch modifies the minicluster script to optionally use Apache
    Hive 3.1.2 instead of CDP Hive 3.1.3.
    
    In order to make sure that existing setups don't break this is
    enabled via an environment variable override to bin/impala-config.sh.
    When the environment variable USE_APACHE_HIVE is set to true the
    bootstrap_toolchain script downloads Apache Hive 3.1.2 tarballs and
    extracts it in the toolchain directory. These binaries are used to
    start the Hive services (Hiveserver2 and metastore). The default is
    CDP Hive 3.1.3
    
    Since CDP Hive 3 uses some features of Apache Hive 4, this patch uses
    a different database name so that it is easy to switch from one
    environment which uses the CDP Hive 3.1.3 metastore to another
    which uses the Apache Hive 3.1.2 metastore.
    
    In order to start a minicluster which uses Apache Hive 3.1.2 users
    should follow the steps below:
    
    1. Make sure that minicluster, if running, is stopped before you run
    the following commands.
    2. Open a new terminal and run following commands.
    > export USE_APACHE_HIVE=true
    > source bin/impala-config.sh
    > bin/bootstrap_toolchain.py
      The above command downloads the Apache Hive 3.1.2 tarballs and
    extracts them in toolchain/apache_components directory.
    
    > rm $HIVE_HOME/lib/guava-*jar
    > cp $HADOOP_HOME/share/hadoop/hdfs/lib/guava-*.jar $HIVE_HOME/lib/
      The above command is to fix HIVE-22915
    
    > bin/create-test-configuration.sh -create_metastore
      The above step should provide "-create_metastore" only the first time
    so that a new metastore db is created and the Apache Hive 3.1.2 schema
    is initialized.
    
    > testdata/bin/run-all.sh
    
    Follow-up:
     - Add MetastoreShim to support Apache Hive 3.x in IMPALA-10871
    
    Tests:
     - Made sure that the cluster comes up with Apache Hive 3.1.2 when the
       steps above are performed.
     - Made sure that existing scripts work as they do currently when
       the environment variable is not set.
    
    Change-Id: I1978909589ecacb15d32d874e97f050a85adf1f6
    Reviewed-on: http://gerrit.cloudera.org:8080/17793
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 bin/bootstrap_toolchain.py | 38 +++++++++++++++++++++++++++---
 bin/impala-config.sh       | 58 +++++++++++++++++++++++++++++++++++++++-------
 buildall.sh                |  5 ++++
 3 files changed, 89 insertions(+), 12 deletions(-)

diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 0a53b02..5ec5587 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -124,13 +124,16 @@ def wget_and_unpack_package(download_path, file_name, 
destination, wget_no_clobb
   if not download_path.endswith("/" + file_name):
     raise Exception("URL {0} does not match with expected file_name {1}"
         .format(download_path, file_name))
+  if "closer.cgi" in download_path:
+    download_path += "?action=download"
   NUM_ATTEMPTS = 3
   for attempt in range(1, NUM_ATTEMPTS + 1):
     logging.info("Downloading {0} to {1}/{2} (attempt {3})".format(
       download_path, destination, file_name, attempt))
     # --no-clobber avoids downloading the file if a file with the name already 
exists
     try:
-      cmd = ["wget", download_path, 
"--directory-prefix={0}".format(destination)]
+      cmd = ["wget", download_path,
+             "--output-document={0}/{1}".format(destination, file_name)]
       if wget_no_clobber:
         cmd.append("--no-clobber")
       check_output(cmd)
@@ -322,6 +325,29 @@ class CdpComponent(EnvVersionedPackage):
                                        makedir=makedir, 
template_subs_in=template_subs)
 
 
+class ApacheComponent(EnvVersionedPackage):
+  def __init__(self, name, explicit_version=None, archive_basename_tmpl=None,
+               unpack_directory_tmpl=None, makedir=False, 
component_path_tmpl=None):
+    # Compute the apache base URL (based on the APACHE_MIRROR)
+    if "APACHE_COMPONENTS_HOME" not in os.environ:
+      logging.error("Impala environment not set up correctly, make sure "
+                    "impala-config.sh is sourced.")
+      sys.exit(1)
+    template_subs = {"apache_mirror": os.environ["APACHE_MIRROR"]}
+    # Different components have different sub-paths. For example, hive is 
hive/hive-xxx,
+    # hadoop is hadoop/common/hadoop-xxx. The default is hive format.
+    if component_path_tmpl is None:
+      component_path_tmpl = "${name}/${name}-${version}/"
+    url_prefix_tmpl = "${apache_mirror}/" + component_path_tmpl
+
+    # Get the output base directory from APACHE_COMPONENTS_HOME
+    destination_basedir = os.environ["APACHE_COMPONENTS_HOME"]
+    super(ApacheComponent, self).__init__(name, url_prefix_tmpl, 
destination_basedir,
+                                       explicit_version=explicit_version,
+                                       
archive_basename_tmpl=archive_basename_tmpl,
+                                       
unpack_directory_tmpl=unpack_directory_tmpl,
+                                       makedir=makedir, 
template_subs_in=template_subs)
+
 class ToolchainKudu(ToolchainPackage):
   def __init__(self, platform_label=None):
     super(ToolchainKudu, self).__init__('kudu', 
platform_release=platform_label)
@@ -458,8 +484,13 @@ def get_hadoop_downloads():
   hadoop = CdpComponent("hadoop")
   hbase = CdpComponent("hbase", archive_basename_tmpl="hbase-${version}-bin",
                        unpack_directory_tmpl="hbase-${version}")
-  hive = CdpComponent("hive", 
archive_basename_tmpl="apache-hive-${version}-bin")
-  hive_src = CdpComponent("hive-source",
+  use_apache_hive = os.environ["USE_APACHE_HIVE"] == "true"
+  if use_apache_hive:
+    hive = ApacheComponent("hive", 
archive_basename_tmpl="apache-hive-${version}-bin")
+    hive_src = ApacheComponent("hive", 
archive_basename_tmpl="apache-hive-${version}-src")
+  else:
+    hive = CdpComponent("hive", 
archive_basename_tmpl="apache-hive-${version}-bin")
+    hive_src = CdpComponent("hive-source",
                           
explicit_version=os.environ.get("IMPALA_HIVE_VERSION"),
                           archive_basename_tmpl="hive-${version}-source",
                           unpack_directory_tmpl="hive-${version}")
@@ -519,6 +550,7 @@ def main():
   kudu_download = None
   if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true":
     create_directory_from_env_var("CDP_COMPONENTS_HOME")
+    create_directory_from_env_var("APACHE_COMPONENTS_HOME")
     if platform.processor() != "aarch64":
       downloads += get_kudu_downloads()
     downloads += get_hadoop_downloads()
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index a5d666a..bb407db 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -187,6 +187,12 @@ export CDP_RANGER_VERSION=2.1.0.7.2.12.0-104
 export CDP_TEZ_VERSION=0.9.1.7.2.12.0-104
 export CDP_GCS_VERSION=2.1.2.7.2.12.0-104
 
+# Ref: https://infra.apache.org/release-download-pages.html#closer
+: ${APACHE_MIRROR:="https://www.apache.org/dyn/closer.cgi"}
+export APACHE_MIRROR
+export APACHE_HIVE_VERSION=3.1.2
+export APACHE_HIVE_STORAGE_API_VERSION=2.7.0
+
 export ARCH_NAME=$(uname -p)
 
 export IMPALA_HUDI_VERSION=0.5.0-incubating
@@ -228,6 +234,9 @@ export CDP_ICEBERG_URL=${CDP_ICEBERG_URL-}
 export CDP_RANGER_URL=${CDP_RANGER_URL-}
 export CDP_TEZ_URL=${CDP_TEZ_URL-}
 
+export APACHE_HIVE_URL=${APACHE_HIVE_URL-}
+export APACHE_HIVE_SOURCE_URL=${APACHE_HIVE_SOURCE_URL-}
+
 export CDP_COMPONENTS_HOME="$IMPALA_TOOLCHAIN/cdp_components-$CDP_BUILD_NUMBER"
 export CDH_MAJOR_VERSION=7
 export IMPALA_AVRO_JAVA_VERSION=${CDP_AVRO_JAVA_VERSION}
@@ -236,9 +245,6 @@ export IMPALA_HADOOP_URL=${CDP_HADOOP_URL-}
 export HADOOP_HOME="$CDP_COMPONENTS_HOME/hadoop-${IMPALA_HADOOP_VERSION}/"
 export IMPALA_HBASE_VERSION=${CDP_HBASE_VERSION}
 export IMPALA_HBASE_URL=${CDP_HBASE_URL-}
-export IMPALA_HIVE_VERSION=${HIVE_VERSION_OVERRIDE:-"$CDP_HIVE_VERSION"}
-export IMPALA_HIVE_URL=${CDP_HIVE_URL-}
-export IMPALA_HIVE_SOURCE_URL=${CDP_HIVE_SOURCE_URL-}
 export IMPALA_ICEBERG_VERSION=${CDP_ICEBERG_VERSION}
 export IMPALA_ICEBERG_URL=${CDP_ICEBERG_URL-}
 export IMPALA_KNOX_VERSION=${CDP_KNOX_VERSION}
@@ -248,9 +254,29 @@ export IMPALA_RANGER_VERSION=${CDP_RANGER_VERSION}
 export IMPALA_RANGER_URL=${CDP_RANGER_URL-}
 export IMPALA_TEZ_VERSION=${CDP_TEZ_VERSION}
 export IMPALA_TEZ_URL=${CDP_TEZ_URL-}
-export 
IMPALA_HIVE_STORAGE_API_VERSION=${HIVE_STORAGE_API_VERSION_OVERRIDE:-"2.3.0.$IMPALA_HIVE_VERSION"}
 export IMPALA_GCS_VERSION=${CDP_GCS_VERSION}
 
+export APACHE_COMPONENTS_HOME="$IMPALA_TOOLCHAIN/apache_components"
+export USE_APACHE_HIVE=${USE_APACHE_HIVE-false}
+if $USE_APACHE_HIVE; then
+  # When USE_APACHE_HIVE is set we use the apache hive version to build as 
well as deploy
+  # in the minicluster
+  export IMPALA_HIVE_DIST_TYPE="apache-hive"
+  export IMPALA_HIVE_VERSION=${APACHE_HIVE_VERSION}
+  export IMPALA_HIVE_URL=${APACHE_HIVE_URL-}
+  export IMPALA_HIVE_SOURCE_URL=${APACHE_HIVE_SOURCE_URL-}
+  export IMPALA_HIVE_STORAGE_API_VERSION=${APACHE_HIVE_STORAGE_API_VERSION}
+else
+  # CDP hive version is used to build and deploy in minicluster when 
USE_APACHE_HIVE is
+  # false
+  export IMPALA_HIVE_DIST_TYPE="hive"
+  export IMPALA_HIVE_VERSION=${HIVE_VERSION_OVERRIDE:-"$CDP_HIVE_VERSION"}
+  export IMPALA_HIVE_URL=${CDP_HIVE_URL-}
+  export IMPALA_HIVE_SOURCE_URL=${CDP_HIVE_SOURCE_URL-}
+  export IMPALA_HIVE_STORAGE_API_VERSION=${HIVE_STORAGE_API_VERSION_OVERRIDE:-\
+"2.3.0.$IMPALA_HIVE_VERSION"}
+fi
+
 # Extract the first component of the hive version.
 # Allow overriding of Hive source location in case we want to build Impala 
without
 # a complete Hive build. This is used by various tests and scripts to enable 
and
@@ -365,16 +391,26 @@ export LOCAL_FS="file:${WAREHOUSE_LOCATION_PREFIX}"
 export 
IMPALA_CLUSTER_NODES_DIR="${IMPALA_CLUSTER_NODES_DIR-$IMPALA_HOME/testdata/cluster/cdh$CDH_MAJOR_VERSION}"
 
 ESCAPED_IMPALA_HOME=$(sed "s/[^0-9a-zA-Z]/_/g" <<< "$IMPALA_HOME")
-export 
HIVE_HOME=${HIVE_HOME_OVERRIDE:-"$CDP_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin"}
-export 
HIVE_SRC_DIR=${HIVE_SRC_DIR_OVERRIDE:-"${CDP_COMPONENTS_HOME}/hive-${IMPALA_HIVE_VERSION}"}
+if $USE_APACHE_HIVE; then
+  export 
HIVE_HOME="$APACHE_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin"
+  export 
HIVE_SRC_DIR="$APACHE_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-src"
+  # if apache hive is being used change the metastore db name, so we don't 
have to
+  # format the metastore db everytime we switch between hive versions
+  export METASTORE_DB=${METASTORE_DB-"$(cut -c-59 <<< 
HMS$ESCAPED_IMPALA_HOME)_apache"}
+else
+  export HIVE_HOME=${HIVE_HOME_OVERRIDE:-\
+"$CDP_COMPONENTS_HOME/apache-hive-${IMPALA_HIVE_VERSION}-bin"}
+  export HIVE_SRC_DIR=${HIVE_SRC_DIR_OVERRIDE:-\
+"${CDP_COMPONENTS_HOME}/hive-${IMPALA_HIVE_VERSION}"}
+  # Previously, there were multiple configurations and the "_cdp" included 
below
+  # allowed the two to be distinct. We keep this "_cdp" for historical reasons.
+  export METASTORE_DB=${METASTORE_DB-"$(cut -c-59 <<< 
HMS$ESCAPED_IMPALA_HOME)_cdp"}
+fi
 # Set the path to the hive_metastore.thrift which is used to build thrift code
 export HIVE_METASTORE_THRIFT_DIR=${HIVE_METASTORE_THRIFT_DIR_OVERRIDE:-\
 "$HIVE_SRC_DIR/standalone-metastore/src/main/thrift"}
 export TEZ_HOME="$CDP_COMPONENTS_HOME/tez-${IMPALA_TEZ_VERSION}-minimal"
 export HBASE_HOME="$CDP_COMPONENTS_HOME/hbase-${IMPALA_HBASE_VERSION}/"
-# Previously, there were multiple configurations and the "_cdp" included below
-# allowed the two to be distinct. We keep this "_cdp" for historical reasons.
-export METASTORE_DB=${METASTORE_DB-"$(cut -c-59 <<< 
HMS$ESCAPED_IMPALA_HOME)_cdp"}
 # Set the Hive binaries in the path
 export PATH="$HIVE_HOME/bin:$PATH"
 
@@ -737,6 +773,10 @@ echo "IMPALA_MAVEN_OPTIONS    = $IMPALA_MAVEN_OPTIONS"
 echo "IMPALA_TOOLCHAIN_HOST   = $IMPALA_TOOLCHAIN_HOST"
 echo "CDP_BUILD_NUMBER        = $CDP_BUILD_NUMBER"
 echo "CDP_COMPONENTS_HOME     = $CDP_COMPONENTS_HOME"
+if $USE_APACHE_HIVE; then
+  echo "APACHE_MIRROR           = $APACHE_MIRROR"
+  echo "APACHE_COMPONENTS_HOME  = $APACHE_COMPONENTS_HOME"
+fi
 echo "IMPALA_HADOOP_VERSION   = $IMPALA_HADOOP_VERSION"
 echo "IMPALA_AVRO_JAVA_VERSION= $IMPALA_AVRO_JAVA_VERSION"
 echo "IMPALA_PARQUET_VERSION  = $IMPALA_PARQUET_VERSION"
diff --git a/buildall.sh b/buildall.sh
index 29fd0fd..0f6dc7c 100755
--- a/buildall.sh
+++ b/buildall.sh
@@ -419,6 +419,11 @@ bootstrap_dependencies() {
     "$IMPALA_HOME/bin/bootstrap_toolchain.py"
     echo "Toolchain bootstrap complete."
   fi
+  # HIVE-22915
+  if [[ "${USE_APACHE_HIVE}" = true ]]; then
+    rm $HIVE_HOME/lib/guava-*jar
+    cp $HADOOP_HOME/share/hadoop/hdfs/lib/guava-*.jar $HIVE_HOME/lib/
+  fi
 }
 
 # Build the Impala frontend and its dependencies.

Reply via email to