This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 14698c8b99b80db7e6fd99900e32b6742bef1662
Author: Joe McDonnell <[email protected]>
AuthorDate: Fri Nov 4 14:33:04 2022 -0700

    IMPALA-11603: Build against Cloudflare ZLIB by default
    
    Cloudflare Zlib is a fork of the Zlib codebase that
    has been optimized to take advantage of CPU SIMD
    instructions and other platform-specific optimizations.
    It has the same license as regular Zlib. Amazon has
    touted this as a major speedup over regular Zlib:
    
    https://aws.amazon.com/blogs/opensource/improving-zlib-cloudflare-and-comparing-performance-with-other-zlib-forks/
    
    This adds the IMPALA_USE_CLOUDFLARE_ZLIB environment
    variable which allows Impala to be built against
    Cloudflare Zlib. This defaults to true. If set to
    any other value, it will build against regular Zlib.
    
    Cloudflare Zlib shows a clear performance benefit
    over regular Zlib on TPC-H ORC/deflate benchmark:
    
    +----------+-------------------+---------+------------+------------+----------------+
    | Workload | File Format       | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
    +----------+-------------------+---------+------------+------------+----------------+
    | TPCH(42) | orc / def / block | 4.18    | -6.43%     | 3.29       | -6.74%         |
    +----------+-------------------+---------+------------+------------+----------------+
    
    Testing:
     - Ran GVO tests and exhaustive release tests
    
    Change-Id: I82c480890726da0fa5bdc2a646022554eec181f4
    Reviewed-on: http://gerrit.cloudera.org:8080/19207
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Michael Smith <[email protected]>
    Reviewed-by: Wenzhe Zhou <[email protected]>
---
 CMakeLists.txt             | 4 ++++
 bin/bootstrap_toolchain.py | 9 +++++----
 bin/impala-config.sh       | 9 +++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7c7a65cd..b7e858812 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -143,6 +143,7 @@ set_dep_root(ZLIB)
 set_dep_root(CCTZ)
 set_dep_root(CURL)
 set_dep_root(CALLONCEHACK)
+set_dep_root(CLOUDFLAREZLIB)
 
 # The boost-cmake project hasn't been maintained for years. Let's make sure we
 # don't accidentally use it if it can be found.
@@ -214,6 +215,9 @@ IMPALA_ADD_THIRDPARTY_LIB(openssl_crypto "" "" 
${OPENSSL_CRYPTO_LIBRARY})
 find_package(Bzip2 REQUIRED)
 IMPALA_ADD_THIRDPARTY_LIB(bzip2 ${BZIP2_INCLUDE_DIR} ${BZIP2_STATIC_LIBRARIES} 
"")
 
+if ($ENV{IMPALA_USE_CLOUDFLARE_ZLIB} STREQUAL "true")
+  set(ZLIB_ROOT ${CLOUDFLAREZLIB_ROOT})
+endif()
 find_package(Zlib REQUIRED)
 IMPALA_ADD_THIRDPARTY_LIB(zlib ${ZLIB_INCLUDE_DIR} ${ZLIB_STATIC_LIBRARIES}
   ${ZLIB_SHARED_LIBRARIES})
diff --git a/bin/bootstrap_toolchain.py b/bin/bootstrap_toolchain.py
index 78b493d54..59628a3c6 100755
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -465,10 +465,11 @@ def get_toolchain_downloads():
   gcc_package = ToolchainPackage("gcc")
   toolchain_packages += [llvm_package, llvm_package_asserts, gcc_package]
   toolchain_packages += [ToolchainPackage(p) for p in
-      ["avro", "binutils", "boost", "breakpad", "bzip2", "calloncehack", 
"cctz", "cmake",
-       "crcutil", "curl", "flatbuffers", "gdb", "gflags", "glog", 
"gperftools", "gtest",
-       "jwt-cpp", "libev", "libunwind", "lz4", "openldap", "orc", "protobuf",
-       "python", "rapidjson", "re2", "snappy", "tpc-h", "tpc-ds", "zlib", 
"zstd"]]
+      ["avro", "binutils", "boost", "breakpad", "bzip2", "calloncehack", 
"cctz",
+       "cloudflarezlib", "cmake", "crcutil", "curl", "flatbuffers", "gdb", 
"gflags",
+       "glog", "gperftools", "gtest", "jwt-cpp", "libev", "libunwind", "lz4", 
"openldap",
+       "orc", "protobuf", "python", "rapidjson", "re2", "snappy", "tpc-h", 
"tpc-ds",
+       "zlib", "zstd"]]
   python3_package = ToolchainPackage(
       "python", explicit_version=os.environ.get("IMPALA_PYTHON3_VERSION"))
   toolchain_packages += [python3_package]
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index 558989c4c..5f788c37b 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -173,6 +173,8 @@ export IMPALA_TPC_H_VERSION=2.17.0
 unset IMPALA_TPC_H_URL
 export IMPALA_ZLIB_VERSION=1.2.13
 unset IMPALA_ZLIB_URL
+export IMPALA_CLOUDFLAREZLIB_VERSION=9e601a3f37
+unset IMPALA_CLOUDFLAREZLIB_URL
 export IMPALA_CALLONCEHACK_VERSION=1.0.0
 unset IMPALA_CALLONCEHACK_URL
 # Thrift related environment variables.
@@ -269,6 +271,13 @@ export 
IMPALA_REDHAT8_DOCKER_BASE=${IMPALA_REDHAT8_DOCKER_BASE:-"rockylinux:8.5"
 # Impala's Java code.
 export IMPALA_DOCKER_USE_JAVA11=${IMPALA_DOCKER_USE_JAVA11:-"false"}
 
+# There are multiple compatible implementations of zlib. Cloudflare Zlib is an
+# implementation with optimizations to use platform-specific CPU features that are not
+# in the standard Zlib implementation. When set to true, this builds and links against
+# Cloudflare Zlib. When false, the build uses the regular Madler Zlib. This defaults
+# to true due to the large performance benefits.
+export IMPALA_USE_CLOUDFLARE_ZLIB=${IMPALA_USE_CLOUDFLARE_ZLIB:-"true"}
+
 # When IMPALA_(CDP_COMPONENT)_URL are overridden, they may contain '$(platform_label)'
 # which will be substituted for the CDP platform label in bootstrap_toolchain.py
 unset IMPALA_HADOOP_URL

Reply via email to