This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit f15a311065f2d30b727d53d96fae87f07132e4d9 Author: Joe McDonnell <joemcdonn...@cloudera.com> AuthorDate: Sun Apr 26 18:38:26 2020 -0700 IMPALA-9709: Remove Impala-lzo from the development environment This removes Impala-lzo from the Impala development environment. Impala-lzo is not built as part of the Impala build. The LZO plugin is no longer loaded. LZO tables are not loaded during dataload, and LZO is no longer tested. This removes some obsolete scan APIs that were only used by Impala-lzo. With this commit, Impala-lzo would require code changes to build against Impala. The plugin infrastructure is not removed, and this leaves some LZO support code in place. If someone were to decide to revive Impala-lzo, they would still be able to load it as a plugin and get the same functionality as before. This plugin support may be removed later. Testing: - Dryrun of GVO - Modified TestPartitionMetadataUncompressedTextOnly's test_unsupported_text_compression() to add LZO case Change-Id: I3a4f12247d8872b7e14c9feb4b2c58cfd60d4c0e Reviewed-on: http://gerrit.cloudera.org:8080/15814 Reviewed-by: Bikramjeet Vig <bikramjeet....@cloudera.com> Tested-by: Joe McDonnell <joemcdonn...@cloudera.com> --- CMakeLists.txt | 11 ------- be/src/exec/hdfs-plugin-text-scanner.cc | 6 ++-- be/src/exec/hdfs-scan-node-base.cc | 10 +----- be/src/exec/hdfs-scan-node-base.h | 12 ++------ be/src/util/codec.cc | 2 +- bin/bootstrap_system.sh | 23 ++------------ bin/clean.sh | 7 ----- bin/impala-config.sh | 18 +++-------- bin/set-ld-library-path.sh | 3 -- bin/start-impala-cluster.py | 7 ----- buildall.sh | 10 ------ docker/entrypoint.sh | 8 ----- docker/impala_base/Dockerfile | 4 +-- docker/test-with-docker.py | 13 +------- .../org/apache/impala/analysis/ToSqlUtils.java | 3 +- .../org/apache/impala/catalog/HdfsCompression.java | 4 ++- .../org/apache/impala/catalog/HdfsFileFormat.java | 6 ++-- .../org/apache/impala/planner/HdfsScanNode.java | 1 - .../org/apache/impala/planner/HdfsTableSink.java | 6 ++-- 
.../apache/impala/analysis/AnalyzeStmtsTest.java | 10 +++--- .../org/apache/impala/analysis/AnalyzerTest.java | 10 +++--- testdata/bad_text_lzo/bad_text.lzo | Bin 736999 -> 0 bytes testdata/bad_text_lzo/bad_text.lzo.index | Bin 5192 -> 0 bytes testdata/bin/create-load-data.sh | 22 ------------- testdata/bin/generate-schema-statements.py | 31 +++---------------- testdata/bin/generate-test-vectors.py | 1 - testdata/bin/load_nested.py | 5 ++- testdata/bin/lzo_indexer.sh | 20 ------------ .../common/etc/hadoop/conf/core-site.xml.py | 3 -- .../common/etc/hadoop/conf/yarn-site.xml.py | 4 +-- .../functional/functional_schema_template.sql | 11 ------- .../datasets/functional/schema_constraints.csv | 4 --- .../joins-hdfs-num-rows-est-enabled.test | 8 ++--- .../queries/PlannerTest/joins.test | 8 ++--- .../functional-query_dimensions.csv | 2 +- .../functional-query_exhaustive.csv | 1 - .../DataErrorsTest/hdfs-scan-node-errors.test | 18 ----------- .../queries/QueryTest/disable-lzo-plugin.test | 7 ----- .../queries/QueryTest/show-create-table.test | 12 -------- .../unsupported-compression-partitions.test | 9 +++++- .../perf-regression/perf-regression_dimensions.csv | 2 +- .../perf-regression/perf-regression_exhaustive.csv | 1 - .../perf-regression/perf-regression_pairwise.csv | 1 - .../targeted-perf/targeted-perf_dimensions.csv | 2 +- .../targeted-perf/targeted-perf_exhaustive.csv | 1 - .../targeted-perf/targeted-perf_pairwise.csv | 1 - .../targeted-stress/targeted-stress_dimensions.csv | 2 +- .../targeted-stress/targeted-stress_exhaustive.csv | 1 - .../targeted-stress/targeted-stress_pairwise.csv | 1 - .../tpcds-unmodified_dimensions.csv | 2 +- .../tpcds-unmodified_exhaustive.csv | 1 - .../tpcds-unmodified/tpcds-unmodified_pairwise.csv | 1 - testdata/workloads/tpcds/tpcds_dimensions.csv | 2 +- testdata/workloads/tpcds/tpcds_exhaustive.csv | 1 - testdata/workloads/tpcds/tpcds_pairwise.csv | 1 - testdata/workloads/tpch/tpch_dimensions.csv | 2 +- 
testdata/workloads/tpch/tpch_exhaustive.csv | 1 - testdata/workloads/tpch/tpch_pairwise.csv | 1 - tests/common/test_dimensions.py | 2 +- .../custom_cluster/test_hive_text_codec_interop.py | 3 +- tests/custom_cluster/test_scanner_plugin.py | 34 --------------------- tests/metadata/test_metadata_query_statements.py | 2 +- tests/metadata/test_partition_metadata.py | 23 +++++++++++--- tests/query_test/test_compressed_formats.py | 1 - tests/query_test/test_scanners_fuzz.py | 5 ++- 65 files changed, 88 insertions(+), 346 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5719249..bc8c983 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -438,17 +438,6 @@ add_custom_target(cscope ALL DEPENDS gen-deps COMMAND "${CMAKE_SOURCE_DIR}/bin/gen-cscope.sh" ) -# This call is passing IMPALA_TOOLCHAIN_PACKAGES_HOME into Impala-lzo's build.sh, -# but this is known not to work with the current version of Impala-lzo when -# IMPALA_TOOLCHAIN_PACKAGES_HOME is a subdirectory of IMPALA_TOOLCHAIN. Either -# Impala-lzo will need to be fixed or it will need to be removed. -if (DEFINED ENV{IMPALA_LZO} AND EXISTS $ENV{IMPALA_LZO}) - add_custom_target(impala-lzo ALL DEPENDS gen-deps - COMMAND $ENV{IMPALA_LZO}/build.sh ${CMAKE_BUILD_TYPE} ${CMAKE_SOURCE_DIR} - $ENV{IMPALA_TOOLCHAIN_PACKAGES_HOME} - ) -endif() - # Dump include paths to a file if (DUMP_INCLUDE_PATHS) file(REMOVE "${DUMP_INCLUDE_PATHS}") diff --git a/be/src/exec/hdfs-plugin-text-scanner.cc b/be/src/exec/hdfs-plugin-text-scanner.cc index 638c7bc..76772f0 100644 --- a/be/src/exec/hdfs-plugin-text-scanner.cc +++ b/be/src/exec/hdfs-plugin-text-scanner.cc @@ -42,9 +42,9 @@ using boost::upgrade_lock; using boost::upgrade_to_unique_lock; using std::find; -// Allow LZO by default to maintain backwards compatibility. We can add more options -// if we determine that the plugins are well-maintained and generally stable. 
-DEFINE_string(enabled_hdfs_text_scanner_plugins, "LZO", "(Advanced) whitelist of HDFS " +// LZO is no longer supported, so there are no plugins enabled by default. This is +// likely to be removed. +DEFINE_string(enabled_hdfs_text_scanner_plugins, "", "(Advanced) whitelist of HDFS " "text scanner plugins that Impala will try to dynamically load. Must be a " "comma-separated list of upper-case compression codec names. Each plugin implements " "support for decompression and hands off the decompressed bytes to Impala's builtin " diff --git a/be/src/exec/hdfs-scan-node-base.cc b/be/src/exec/hdfs-scan-node-base.cc index 1ef9207..61a9628 100644 --- a/be/src/exec/hdfs-scan-node-base.cc +++ b/be/src/exec/hdfs-scan-node-base.cc @@ -824,14 +824,6 @@ ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file, int6 buffer_opts); } -ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file, - int64_t len, int64_t offset, int64_t partition_id, int disk_id, - int cache_options, bool expected_local, int64_t mtime, - bool is_erasure_coded, const ScanRange* original_split) { - return AllocateScanRange(fs, file, len, offset, partition_id, disk_id, expected_local, - is_erasure_coded, mtime, BufferOpts(cache_options), original_split); -} - void* HdfsScanNodeBase::GetCodegenFn(THdfsFileFormat::type type) { auto it = codegend_fn_map_.find(type); if (it == codegend_fn_map_.end()) return NULL; @@ -1166,7 +1158,7 @@ void HdfsScanNodeBase::UpdateBytesRead( } } -HdfsFileDesc* ScanRangeSharedState::GetFileDesc( +const HdfsFileDesc* ScanRangeSharedState::GetFileDesc( int64_t partition_id, const std::string& filename) { auto file_desc_map_key = make_pair(partition_id, filename); DCHECK(file_descs_.find(file_desc_map_key) != file_descs_.end()); diff --git a/be/src/exec/hdfs-scan-node-base.h b/be/src/exec/hdfs-scan-node-base.h index 859c25d..f1e1195 100644 --- a/be/src/exec/hdfs-scan-node-base.h +++ b/be/src/exec/hdfs-scan-node-base.h @@ -130,9 +130,7 @@ class 
ScanRangeSharedState { public: /// Given a partition_id and filename returns the related file descriptor DCHECK ensures /// there is always file descriptor returned. - /// TODO: The LZO scanner expects a non const object so switch to returning a const once - /// support for LZO scanner is removed. - HdfsFileDesc* GetFileDesc(int64_t partition_id, const std::string& filename); + const HdfsFileDesc* GetFileDesc(int64_t partition_id, const std::string& filename); /// Sets the scanner specific metadata for 'partition_id' and 'filename'. /// Scanners can use this to store file header information. Thread safe. @@ -497,12 +495,6 @@ class HdfsScanNodeBase : public ScanNode { ScanRangeMetadata* metadata, int disk_id, bool expected_local, bool is_erasure_coded, int64_t mtime, const io::BufferOpts& buffer_opts); - /// Old API for compatibility with text scanners (e.g. LZO text scanner). - io::ScanRange* AllocateScanRange(hdfsFS fs, const char* file, int64_t len, - int64_t offset, int64_t partition_id, int disk_id, int cache_options, - bool expected_local, int64_t mtime, bool is_erasure_coded = false, - const io::ScanRange* original_split = nullptr); - /// Adds ranges to be read later by scanners. Must not be called once /// remaining_scan_range_submissions_ is 0. The enqueue_location specifies whether the /// scan ranges are added to the head or tail of the queue. 
Implemented by child classes @@ -525,7 +517,7 @@ class HdfsScanNodeBase : public ScanNode { /// Given a partition_id and filename returns the related file descriptor /// DCHECK ensures there is always file descriptor returned - inline HdfsFileDesc* GetFileDesc( + inline const HdfsFileDesc* GetFileDesc( int64_t partition_id, const std::string& filename) { return shared_state_->GetFileDesc(partition_id, filename); } diff --git a/be/src/util/codec.cc b/be/src/util/codec.cc index f5a8bf1..d788143 100644 --- a/be/src/util/codec.cc +++ b/be/src/util/codec.cc @@ -48,7 +48,7 @@ const char* const Codec::ZSTD_COMPRESSION = const char* const Codec::UNKNOWN_CODEC_ERROR = "This compression codec is currently unsupported: "; const char* const NO_LZO_MSG = "LZO codecs may not be created via the Codec interface. " - "Instead the LZO library is directly invoked."; + "Instead LZO is decoded by an optional text scanner plugin."; const Codec::CodecMap Codec::CODEC_MAP = {{"", THdfsCompression::NONE}, {DEFAULT_COMPRESSION, THdfsCompression::DEFAULT}, diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh index 18cce2b..f5eaa24 100755 --- a/bin/bootstrap_system.sh +++ b/bin/bootstrap_system.sh @@ -199,7 +199,7 @@ function apt-get { echo ">>> Installing build tools" ubuntu apt-get update -ubuntu apt-get --yes install ccache curl gawk g++ gcc libffi-dev liblzo2-dev \ +ubuntu apt-get --yes install ccache curl gawk g++ gcc libffi-dev \ libkrb5-dev krb5-admin-server krb5-kdc krb5-user libsasl2-dev \ libsasl2-modules libsasl2-modules-gssapi-mit libssl-dev make ninja-build \ python-dev python-setuptools postgresql ssh wget vim-common psmisc \ @@ -240,7 +240,7 @@ redhat sudo yum install -y curl gawk gcc gcc-c++ git krb5-devel krb5-server \ krb5-workstation libevent-devel libffi-devel make openssl-devel cyrus-sasl \ cyrus-sasl-gssapi cyrus-sasl-devel cyrus-sasl-plain \ postgresql postgresql-server \ - wget vim-common nscd cmake lzo-devel fuse-devel zlib-devel \ + wget vim-common nscd 
cmake fuse-devel zlib-devel \ psmisc lsof openssh-server redhat-lsb java-1.8.0-openjdk-devel \ java-1.8.0-openjdk-src @@ -453,25 +453,6 @@ eval "$SET_JAVA_HOME" # Assert that we have a java available test -f $JAVA_HOME/bin/java -# LZO is not needed to compile or run Impala, but it is needed for the data load -echo ">>> Checking out Impala-lzo" -: ${IMPALA_LZO_HOME:="${IMPALA_HOME}/../Impala-lzo"} -if ! [[ -d "$IMPALA_LZO_HOME" ]] -then - git clone --branch master https://github.com/cloudera/impala-lzo.git "$IMPALA_LZO_HOME" -fi - -echo ">>> Checking out and building hadoop-lzo" - -: ${HADOOP_LZO_HOME:="${IMPALA_HOME}/../hadoop-lzo"} -if ! [[ -d "$HADOOP_LZO_HOME" ]] -then - git clone https://github.com/cloudera/hadoop-lzo.git "$HADOOP_LZO_HOME" -fi -cd "$HADOOP_LZO_HOME" -time -p ant package -cd "$IMPALA_HOME" - # Try to prepopulate the m2 directory to save time if ! bin/jenkins/populate_m2_directory.py ; then echo "Failed to prepopulate the m2 directory. Continuing..." diff --git a/bin/clean.sh b/bin/clean.sh index d0b7c3b..34781f9 100755 --- a/bin/clean.sh +++ b/bin/clean.sh @@ -67,12 +67,5 @@ popd rm -f "${IMPALA_HOME}/llvm-ir/"impala*.ll rm -f "${IMPALA_HOME}/be/generated-sources/impala-ir/"* -# Cleanup Impala-lzo -if [ -e "${IMPALA_LZO}" ]; then - pushd "${IMPALA_LZO}" - git rev-parse 2>/dev/null && git clean -fdx - popd -fi - # When switching to and from toolchain, make sure to remove all CMake generated files "${IMPALA_HOME}/bin/clean-cmake.sh" diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 728d52a..55e7018 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -325,8 +325,6 @@ export DOWNLOAD_CDH_COMPONENTS=${DOWNLOAD_CDH_COMPONENTS-true} export IS_OSX="$(if [[ "$OSTYPE" == "darwin"* ]]; then echo true; else echo false; fi)" -export HADOOP_LZO="${HADOOP_LZO-$IMPALA_HOME/../hadoop-lzo}" -export IMPALA_LZO="${IMPALA_LZO-$IMPALA_HOME/../Impala-lzo}" export 
IMPALA_AUX_TEST_HOME="${IMPALA_AUX_TEST_HOME-$IMPALA_HOME/../Impala-auxiliary-tests}" export TARGET_FILESYSTEM="${TARGET_FILESYSTEM-hdfs}" export ERASURE_CODING="${ERASURE_CODING-false}" @@ -568,18 +566,13 @@ export HADOOP_CONF_DIR="$IMPALA_FE_DIR/src/test/resources" export HADOOP_INCLUDE_DIR=${HADOOP_INCLUDE_DIR_OVERRIDE:-"${HADOOP_HOME}/include"} export HADOOP_LIB_DIR=${HADOOP_LIB_DIR_OVERRIDE:-"${HADOOP_HOME}/lib"} -# Please note that the * is inside quotes, thus it won't get expanded by bash but -# by java, see "Understanding class path wildcards" at http://goo.gl/f0cfft -export HADOOP_CLASSPATH="${HADOOP_CLASSPATH-}:${HADOOP_HOME}/share/hadoop/tools/lib/*" -# YARN is configured to use LZO so the LZO jar needs to be in the hadoop classpath. -export LZO_JAR_PATH="$HADOOP_LZO/build/hadoop-lzo-0.4.15.jar" -HADOOP_CLASSPATH+=":$LZO_JAR_PATH" - # Beware of adding entries from $HADOOP_HOME here, because they can change # the order of the classpath, leading to configuration not showing up first. -HADOOP_CLASSPATH="$LZO_JAR_PATH" +export HADOOP_CLASSPATH="${HADOOP_CLASSPATH-}" # Add the path containing the hadoop-aws jar, which is required to access AWS from the # minicluster. 
+# Please note that the * is inside quotes, thus it won't get expanded by bash but +# by java, see "Understanding class path wildcards" at http://goo.gl/f0cfft HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*" export PATH="$HADOOP_HOME/bin:$PATH" @@ -610,7 +603,7 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/./src/test/resources" export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar" export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER" -export AUX_CLASSPATH="${LZO_JAR_PATH}" +export AUX_CLASSPATH="" ### Tell hive not to use jline export HADOOP_USER_CLASSPATH_FIRST=true @@ -707,7 +700,6 @@ LIBHDFS_OPTS="${LIBHDFS_OPTS} -XX:MaxPermSize=128m" export CLASSPATH="$IMPALA_FE_DIR/target/dependency:${CLASSPATH:+:${CLASSPATH}}" CLASSPATH="$IMPALA_FE_DIR/target/classes:$CLASSPATH" CLASSPATH="$IMPALA_FE_DIR/src/test/resources:$CLASSPATH" -CLASSPATH="$LZO_JAR_PATH:$CLASSPATH" # A marker in the environment to prove that we really did source this file export IMPALA_CONFIG_SOURCED=1 @@ -726,8 +718,6 @@ echo "HBASE_CONF_DIR = $HBASE_CONF_DIR" echo "RANGER_HOME = $RANGER_HOME" echo "RANGER_CONF_DIR = $RANGER_CONF_DIR " echo "THRIFT_HOME = $THRIFT_HOME" -echo "HADOOP_LZO = $HADOOP_LZO" -echo "IMPALA_LZO = $IMPALA_LZO" echo "CLASSPATH = $CLASSPATH" echo "LIBHDFS_OPTS = $LIBHDFS_OPTS" echo "JAVA_HOME = $JAVA_HOME" diff --git a/bin/set-ld-library-path.sh b/bin/set-ld-library-path.sh index 6913994..8b7ca4e 100644 --- a/bin/set-ld-library-path.sh +++ b/bin/set-ld-library-path.sh @@ -21,9 +21,6 @@ # run Impala binaries in the context of a dev environment. export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}" -# Impala-lzo is loaded at runtime, so needs to be on the search path. -LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_LZO}/build" - # We built against toolchain GCC so we need to dynamically link against matching # library versions. 
(the rpath isn't baked into the binaries) IMPALA_TOOLCHAIN_GCC_LIB=\ diff --git a/bin/start-impala-cluster.py b/bin/start-impala-cluster.py index bc38149..cccf9f1 100755 --- a/bin/start-impala-cluster.py +++ b/bin/start-impala-cluster.py @@ -139,7 +139,6 @@ options, args = parser.parse_args() IMPALA_HOME = os.environ["IMPALA_HOME"] CORE_SITE_PATH = os.path.join(IMPALA_HOME, "fe/src/test/resources/core-site.xml") KNOWN_BUILD_TYPES = ["debug", "release", "latest"] -IMPALA_LZO = os.environ["IMPALA_LZO"] # The location in the container where the cache is always mounted. DATA_CACHE_CONTAINER_PATH = "/opt/impala/cache" @@ -623,12 +622,6 @@ class DockerMiniClusterOperations(object): # Run the container as the current user. user_args = ["--user", "{0}:{1}".format(os.getuid(), os.getgid())] - # Allow loading LZO plugin, if built. - lzo_lib_dir = os.path.join(IMPALA_LZO, "build") - if os.path.isdir(lzo_lib_dir): - mount_args += ["--mount", - "type=bind,src={0},dst=/opt/impala/lib/plugins".format(lzo_lib_dir)] - mem_limit_args = [] if mem_limit is not None: mem_limit_args = ["--memory", str(mem_limit)] diff --git a/buildall.sh b/buildall.sh index dbe4030..1a6f69b 100755 --- a/buildall.sh +++ b/buildall.sh @@ -434,9 +434,6 @@ build_all_components() { if (( build_independent_targets )); then MAKE_TARGETS+=" cscope fe tarballs" fi - if [[ -e "$IMPALA_LZO" ]]; then - MAKE_TARGETS+=" impala-lzo" - fi fi ${MAKE_CMD} -j${IMPALA_BUILD_THREADS:-4} ${IMPALA_MAKE_FLAGS} ${MAKE_TARGETS} } @@ -518,13 +515,6 @@ reconfigure_test_cluster() { # Generate the Hadoop configs needed by Impala "${IMPALA_HOME}/bin/create-test-configuration.sh" ${CREATE_TEST_CONFIG_ARGS} - - # Copy Hadoop-lzo dependencies if available (required to generate Lzo data). 
- if stat "$HADOOP_LZO"/build/native/Linux-*-*/lib/libgplcompression.* > /dev/null ; then - cp "$HADOOP_LZO"/build/native/Linux-*-*/lib/libgplcompression.* "$HADOOP_LIB_DIR/native" - else - echo "No hadoop-lzo found" - fi } # Starts the test cluster processes except for Impala. diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 6584748..f636ee6 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -250,14 +250,6 @@ function build_impdev() { git fetch /git_common_dir --no-tags "$GIT_HEAD_REV" git checkout -b test-with-docker FETCH_HEAD - # Checkout impala-lzo too - mkdir /home/impdev/Impala-lzo - pushd /home/impdev/Impala-lzo - git init - git fetch $IMPALA_LZO_REPO --no-tags "$IMPALA_LZO_REF" - git checkout -b test-with-docker FETCH_HEAD - popd - # Link in logs. Logs are on the host since that's the most important thing to # look at after the tests are run. ln -sf /logs logs diff --git a/docker/impala_base/Dockerfile b/docker/impala_base/Dockerfile index 362d7e3..8d178f5 100644 --- a/docker/impala_base/Dockerfile +++ b/docker/impala_base/Dockerfile @@ -18,13 +18,11 @@ ARG BASE_IMAGE=ubuntu:16.04 FROM ${BASE_IMAGE} # Install minimal dependencies required for Impala services to run. -# liblzo2-2 may be needed by the Impala-lzo plugin, which is used in tests. -# We install it in the base image for convenience. 
RUN apt-get update && \ apt-get install -y openjdk-8-jre-headless \ libsasl2-2 libsasl2-modules libsasl2-modules-gssapi-mit \ sudo netcat-openbsd less curl iproute2 vim iputils-ping \ - tzdata liblzo2-2 krb5-user && \ + tzdata krb5-user && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/test-with-docker.py b/docker/test-with-docker.py index 9dc867c..b348d3b 100755 --- a/docker/test-with-docker.py +++ b/docker/test-with-docker.py @@ -184,11 +184,6 @@ def main(): default=os.path.expanduser("~/.ccache")) parser.add_argument('--tail', action="store_true", help="Run tail on all container log files.") - parser.add_argument('--impala-lzo-repo', - default="https://github.com/cloudera/impala-lzo.git", - help="Git repo for Impala-lzo repo") - parser.add_argument('--impala-lzo-ref', default='master', - help="Branch name for Impala-lzo repo.") parser.add_argument('--env', metavar='K=V', default=[], action='append', help="""Passes given environment variables (expressed as KEY=VALUE) through containers. 
@@ -210,8 +205,6 @@ def main(): suite_concurrency=args.suite_concurrency, impalad_mem_limit_bytes=args.impalad_mem_limit_bytes, tail=args.tail, - impala_lzo_repo=args.impala_lzo_repo, - impala_lzo_ref=args.impala_lzo_ref, env=args.env, base_image=args.base_image) fh = logging.FileHandler(os.path.join(_make_dir_if_not_exist(t.log_dir), "log.txt")) @@ -449,7 +442,7 @@ class TestWithDocker(object): cleanup_image, ccache_dir, test_mode, suite_concurrency, parallel_test_concurrency, impalad_mem_limit_bytes, tail, - impala_lzo_repo, impala_lzo_ref, env, base_image): + env, base_image): self.build_image = build_image self.name = name self.containers = [] @@ -485,8 +478,6 @@ class TestWithDocker(object): self.parallel_test_concurrency = parallel_test_concurrency self.impalad_mem_limit_bytes = impalad_mem_limit_bytes self.tail = tail - self.impala_lzo_repo = impala_lzo_repo - self.impala_lzo_ref = impala_lzo_ref self.env = env self.base_image = base_image @@ -571,8 +562,6 @@ class TestWithDocker(object): "-v", self.git_root + ":/repo:ro", "-v", self.git_common_dir + ":/git_common_dir:ro", "-e", "GIT_HEAD_REV=" + self.git_head_rev, - "-e", "IMPALA_LZO_REPO=" + self.impala_lzo_repo, - "-e", "IMPALA_LZO_REF=" + self.impala_lzo_ref, # Share timezone between host and container "-e", "LOCALTIME_LINK_TARGET=" + localtime_link_target, "-v", self.ccache_dir + ":/ccache", diff --git a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java index 1c5a781..562412b 100644 --- a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java +++ b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java @@ -468,8 +468,7 @@ public class ToSqlUtils { } if (storageHandlerClass == null) { - // TODO: Remove this special case when we have the LZO_TEXT writer - // We must handle LZO_TEXT specially because Impala does not yet support creating + // We must handle LZO_TEXT specially because Impala does not support creating // tables 
with this row format. In this case, we cannot output "WITH // SERDEPROPERTIES" because Hive does not support it with "STORED AS". For any // other HdfsFileFormat we want to output the serdeproperties because it is diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsCompression.java b/fe/src/main/java/org/apache/impala/catalog/HdfsCompression.java index b9218bd..df76463 100644 --- a/fe/src/main/java/org/apache/impala/catalog/HdfsCompression.java +++ b/fe/src/main/java/org/apache/impala/catalog/HdfsCompression.java @@ -27,10 +27,12 @@ import com.google.common.collect.ImmutableMap; * Support for recognizing compression suffixes on data files. * Compression of a file is recognized in mapreduce by looking for suffixes of * supported codecs. - * For now Impala supports LZO, GZIP, SNAPPY, BZIP2 and some additional formats if plugins + * For now Impala supports GZIP, SNAPPY, BZIP2 and some additional formats if plugins * are available. Even if a plugin is available, we need to add the file suffixes here so * that we can resolve the compression type from the file name. LZO can use the specific * HIVE input class. + * Some compression types here are detected even though they are not supported. This + * allows for better error messages (e.g. LZ4, LZO). */ public enum HdfsCompression { NONE, diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java index 460d171..4eeebe1 100644 --- a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java +++ b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java @@ -49,6 +49,9 @@ public enum HdfsFileFormat { "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false, false, true), + // LZO_TEXT is never used as an actual HdfsFileFormat. It is used only to store the + // input format class and match against it (e.g. in HdfsCompression). 
Outside of this + // file, tables that use the LZO input format class use HdfsFileFormat.TEXT. LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat", "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", "", false, false, true), @@ -194,8 +197,7 @@ public enum HdfsFileFormat { case TEXT: if (compressionType == HdfsCompression.LZO || compressionType == HdfsCompression.LZO_INDEX) { - // TODO: Update this when we can write LZO text. - // It is not currently possible to create a table with LZO compressed text files + // It is not possible to create a table with LZO compressed text files // in Impala, but this is valid in Hive. return String.format("INPUTFORMAT '%s' OUTPUTFORMAT '%s'", LZO_TEXT.inputFormat(), LZO_TEXT.outputFormat()); diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java index 70e239d..81b03e0 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java @@ -179,7 +179,6 @@ public class HdfsScanNode extends ScanNode { ImmutableSet.<HdfsFileFormat>builder() .add(HdfsFileFormat.RC_FILE) .add(HdfsFileFormat.TEXT) - .add(HdfsFileFormat.LZO_TEXT) .add(HdfsFileFormat.SEQUENCE_FILE) .add(HdfsFileFormat.AVRO) .build(); diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java index 970b088..af67dc9 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsTableSink.java @@ -62,8 +62,8 @@ public class HdfsTableSink extends TableSink { protected final boolean inputIsClustered_; private static final Set<HdfsFileFormat> SUPPORTED_FILE_FORMATS = ImmutableSet.of( - HdfsFileFormat.PARQUET, HdfsFileFormat.TEXT, HdfsFileFormat.LZO_TEXT, - HdfsFileFormat.RC_FILE, HdfsFileFormat.SEQUENCE_FILE, HdfsFileFormat.AVRO); + HdfsFileFormat.PARQUET, 
HdfsFileFormat.TEXT, HdfsFileFormat.RC_FILE, + HdfsFileFormat.SEQUENCE_FILE, HdfsFileFormat.AVRO); // Stores the indices into the list of non-clustering columns of the target table that // are stored in the 'sort.columns' table property. This is sent to the backend to @@ -150,7 +150,7 @@ public class HdfsTableSink extends TableSink { return 1024L * 1024L * 1024L; } - // For all other supported formats (TEXT, LZO_TEXT, RC_FILE, SEQUENCE_FILE & AVRO) + // For all other supported formats (TEXT, RC_FILE, SEQUENCE_FILE & AVRO) // 100KB is a very approximate estimate of the amount of data buffered. return 100L * 1024L; } diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java index dba2a85..2e57b9f 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java @@ -3385,15 +3385,15 @@ public class AnalyzeStmtsTest extends AnalyzerTest { // File type / table type mismatch. AnalyzesOk(String.format("load data inpath '%s' %s into table " + "tpch.lineitem", - "/test-warehouse/alltypes_text_lzo/year=2009/month=4", overwrite)); + "/test-warehouse/alltypes_text_gzip/year=2009/month=4", overwrite)); // When table type matches, analysis passes for partitioned and unpartitioned // tables. 
AnalyzesOk(String.format("load data inpath '%s' %s into table " + - "functional_text_lzo.alltypes partition(year=2009, month=4)", - "/test-warehouse/alltypes_text_lzo/year=2009/month=4", overwrite)); + "functional_text_gzip.alltypes partition(year=2009, month=4)", + "/test-warehouse/alltypes_text_gzip/year=2009/month=4", overwrite)); AnalyzesOk(String.format("load data inpath '%s' %s into table " + - "functional_text_lzo.jointbl", - "/test-warehouse/alltypes_text_lzo/year=2009/month=4", overwrite)); + "functional_text_gzip.jointbl", + "/test-warehouse/alltypes_text_gzip/year=2009/month=4", overwrite)); // Verify with a read-only table AnalysisError(String.format("load data inpath '%s' into table " + diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzerTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzerTest.java index cc4e44d..97ea64a 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzerTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzerTest.java @@ -101,12 +101,10 @@ public class AnalyzerTest extends FrontendTestBase { @Test public void TestCompressedText() throws AnalysisException { - AnalyzesOk("SELECT count(*) FROM functional_text_lzo.tinyinttable"); - // TODO: Disabling the text/{gzip,bzip,snap} analysis test until the corresponding - // databases are loaded. 
- // AnalyzesOk("SELECT count(*) FROM functional_text_gzip.tinyinttable"); - // AnalyzesOk("SELECT count(*) FROM functional_text_snap.tinyinttable"); - // AnalyzesOk("SELECT count(*) FROM functional_text_bzip.tinyinttable"); + AnalyzesOk("SELECT count(*) FROM functional_text_bzip.tinyinttable"); + AnalyzesOk("SELECT count(*) FROM functional_text_def.tinyinttable"); + AnalyzesOk("SELECT count(*) FROM functional_text_gzip.tinyinttable"); + AnalyzesOk("SELECT count(*) FROM functional_text_snap.tinyinttable"); } @Test diff --git a/testdata/bad_text_lzo/bad_text.lzo b/testdata/bad_text_lzo/bad_text.lzo deleted file mode 100644 index 65bb703..0000000 Binary files a/testdata/bad_text_lzo/bad_text.lzo and /dev/null differ diff --git a/testdata/bad_text_lzo/bad_text.lzo.index b/testdata/bad_text_lzo/bad_text.lzo.index deleted file mode 100755 index b7fdebb..0000000 Binary files a/testdata/bad_text_lzo/bad_text.lzo.index and /dev/null differ diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 8d20b83..4bb4978 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -453,28 +453,6 @@ EOF } function load-custom-data { - # Load the index files for corrupted lzo data. - hadoop fs -mkdir -p /test-warehouse/bad_text_lzo_text_lzo - hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index - hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \ - /test-warehouse/bad_text_lzo_text_lzo/ - - hadoop fs -rm -r -f /bad_text_lzo_text_lzo/ - hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ / - # Cleanup the old bad_text_lzo files, if they exist. - hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/ - - # TODO: Why is there a REMOTE_LOAD condition? 
See IMPALA-4347 - if [[ -z $REMOTE_LOAD ]]; then - # Index all lzo files in HDFS under /test-warehouse - ${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse - fi - - hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/ - - # Remove all index files in this partition. - hadoop fs -rm -f /test-warehouse/alltypes_text_lzo/year=2009/month=1/*.lzo.index - # Add a sequence file that only contains a header (see IMPALA-362) hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \ /test-warehouse/tinytable_seq_snap diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py index 503c749..441fd36 100755 --- a/testdata/bin/generate-schema-statements.py +++ b/testdata/bin/generate-schema-statements.py @@ -171,7 +171,6 @@ COMPRESSION_MAP = {'def': 'org.apache.hadoop.io.compress.DefaultCodec', 'gzip': 'org.apache.hadoop.io.compress.GzipCodec', 'bzip': 'org.apache.hadoop.io.compress.BZip2Codec', 'snap': 'org.apache.hadoop.io.compress.SnappyCodec', - 'lzo': 'com.hadoop.compression.lzo.LzopCodec', 'none': '' } @@ -188,9 +187,6 @@ FILE_FORMAT_MAP = { 'orc': 'ORC', 'parquet': 'PARQUET', 'hudiparquet': 'HUDIPARQUET', - 'text_lzo': - "\nINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'" + - "\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'", 'avro': 'AVRO', 'hbase': "'org.apache.hadoop.hive.hbase.HBaseStorageHandler'", 'kudu': "KUDU", @@ -224,7 +220,7 @@ WITH SERDEPROPERTIES ( "{hbase_column_mapping}") {tbl_properties}{{hdfs_location}}""" -KNOWN_EXPLORATION_STRATEGIES = ['core', 'pairwise', 'exhaustive', 'lzo'] +KNOWN_EXPLORATION_STRATEGIES = ['core', 'pairwise', 'exhaustive'] def build_create_statement(table_template, table_name, db_name, db_suffix, file_format, compression, hdfs_location, @@ -232,8 +228,6 @@ def build_create_statement(table_template, table_name, db_name, db_suffix, create_stmt = '' if (force_reload): create_stmt += 'DROP TABLE IF EXISTS 
%s%s.%s;\n' % (db_name, db_suffix, table_name) - if compression == 'lzo': - file_format = '%s_%s' % (file_format, compression) # hbase / kudu tables are external, and not read from hdfs. We don't need an # hdfs_location. if file_format in ['hbase', 'kudu']: @@ -454,9 +448,9 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for statement += "set hive.auto.convert.join=true;\n" # For some reason (hive bug?) we need to have the CombineHiveInputFormat set - # for cases where we are compressing in bzip or lzo on certain tables that + # for cases where we are compressing in bzip on certain tables that # have multiple files. - if 'multi' in table_name and ('bzip' in db_suffix or 'lzo' in db_suffix): + if 'multi' in table_name and ('bzip' in db_suffix): statement += SET_HIVE_INPUT_FORMAT % "CombineHiveInputFormat" else: statement += SET_HIVE_INPUT_FORMAT % "HiveInputFormat" @@ -682,9 +676,6 @@ def generate_statements(output_name, test_vectors, sections, output = impala_create if create_hive or file_format == 'hbase': output = hive_output - elif codec == 'lzo': - # Impala CREATE TABLE doesn't allow INPUTFORMAT. - output = hive_output # TODO: Currently, Kudu does not support partitioned tables via Impala. # If a CREATE_KUDU section was provided, assume it handles the partition columns @@ -748,21 +739,7 @@ def generate_statements(output_name, test_vectors, sections, # moment, it assumes we're only using ALTER for partitioning the table. if alter and file_format not in ("hbase", "kudu"): use_db = 'USE {db_name};\n'.format(db_name=db) - if output == hive_output and codec == 'lzo': - # Hive ALTER TABLE ADD PARTITION doesn't handle null partitions, so - # we can't run the ALTER section in this case. - if options.force_reload: - # IMPALA-2278: Hive INSERT OVERWRITE won't clear out partition directories - # that weren't already added to the table. So, for force reload, manually - # delete the partition directories. 
- output.create.append(("DFS -rm -R {data_path};").format( - data_path=data_path)) - else: - # If this is not a force reload use msck repair to add the partitions - # into the table. - output.create.append(use_db + 'msck repair table %s;' % (table_name)) - else: - output.create.append(use_db + alter.format(table_name=table_name)) + output.create.append(use_db + alter.format(table_name=table_name)) # If the directory already exists in HDFS, assume that data files already exist # and skip loading the data. Otherwise, the data is generated using either an diff --git a/testdata/bin/generate-test-vectors.py b/testdata/bin/generate-test-vectors.py index 4219aab..9d0be9e 100755 --- a/testdata/bin/generate-test-vectors.py +++ b/testdata/bin/generate-test-vectors.py @@ -92,7 +92,6 @@ def is_valid_combination(vector): if len(vector) == 4: return not ( (vector[FILE_FORMAT_IDX] == 'text' and vector[COMPRESSION_IDX] in ['def']) or - (vector[FILE_FORMAT_IDX] != 'text' and vector[COMPRESSION_IDX] == 'lzo') or (vector[COMPRESSION_IDX] == 'none' and vector[COMPRESSION_TYPE_IDX] != 'none') or (vector[COMPRESSION_IDX] != 'none' and vector[COMPRESSION_TYPE_IDX] == 'none') or (vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or diff --git a/testdata/bin/load_nested.py b/testdata/bin/load_nested.py index 8b6f6ed..76a43b1 100755 --- a/testdata/bin/load_nested.py +++ b/testdata/bin/load_nested.py @@ -44,8 +44,7 @@ COMPRESSION_VALUES_MAP = { "parquet": { "none": "SNAPPY", "snap": "SNAPPY", - "gzip": "GZIP", - "lzo": "LZO" + "gzip": "GZIP" }, # Currently, only three codecs are supported in Hive for ORC. See Hive codes in # org.apache.orc.impl.WriterImpl#createCodec (in module hive-orc) @@ -397,7 +396,7 @@ if __name__ == "__main__": source_db = args.source_db target_db = args.target_db file_format, compression_value = args.table_format.split("/") - # 'compression_value' is one of [none,def,gzip,bzip,snap,lzo]. 
We should translate it + # 'compression_value' is one of [none,def,gzip,bzip,snap]. We should translate it # into values that can be set to Hive. if file_format not in COMPRESSION_KEYS_MAP: raise Exception("Nested types in file format %s are not supported" % file_format) diff --git a/testdata/bin/lzo_indexer.sh b/testdata/bin/lzo_indexer.sh deleted file mode 100755 index 4b95c1d..0000000 --- a/testdata/bin/lzo_indexer.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -hadoop jar ${HADOOP_LZO}/build/hadoop-lzo-0.4.15.jar com.hadoop.compression.lzo.DistributedLzoIndexer $* diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py b/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py index 035795c..611cabf 100644 --- a/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py +++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py @@ -26,8 +26,6 @@ target_filesystem = os.environ.get('TARGET_FILESYSTEM') compression_codecs = [ 'org.apache.hadoop.io.compress.GzipCodec', 'org.apache.hadoop.io.compress.DefaultCodec', - 'com.hadoop.compression.lzo.LzoCodec', - 'com.hadoop.compression.lzo.LzopCodec', 'org.apache.hadoop.io.compress.BZip2Codec' ] @@ -44,7 +42,6 @@ CONFIG = { # Compression codecs 'io.compression.codecs': ",".join(compression_codecs), - 'io.compression.deoc.lzo.class': 'com.hadoop.compression.lzo.LzoCodec', # Set up proxyuser 'hadoop.proxyuser.${USER}.hosts': '*', diff --git a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py index 769685f..0987925 100644 --- a/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py +++ b/testdata/cluster/node_templates/common/etc/hadoop/conf/yarn-site.xml.py @@ -76,9 +76,7 @@ app_classpath = [ '$HADOOP_HDFS_HOME/share/hadoop/hdfs/*', '$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*', '$HADOOP_YARN_HOME/share/hadoop/yarn/*', - '$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*', - # Append the LZO jar for LZO-compressed file support. - '${LZO_JAR_PATH}'] + '$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*'] # Hive 3 needs Tez on the classpath. 
if hive_major_version == 3: diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index c323666..0c1fb9f 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -1563,17 +1563,6 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ---- DATASET functional ---- BASE_TABLE_NAME -bad_text_lzo ----- COLUMNS -field STRING ----- DEPENDENT_LOAD_HIVE --- Error recovery test data for LZO compression. -LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_text_lzo/bad_text.lzo' -OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; -==== ----- DATASET -functional ----- BASE_TABLE_NAME bad_text_gzip ---- COLUMNS s STRING diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index a308c13..95324fa 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -33,7 +33,6 @@ table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:pa table_name:insert_string_partitioned, constraint:restrict_to, table_format:parquet/none/none table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none -table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block table_name:bad_avro_snap_strings, constraint:restrict_to, table_format:avro/snap/block @@ -242,13 +241,11 @@ table_name:date_tbl, constraint:restrict_to, table_format:avro/snap/block table_name:date_tbl, constraint:restrict_to, table_format:orc/def/block table_name:date_tbl, constraint:restrict_to, table_format:hbase/none/none table_name:date_tbl, constraint:restrict_to, table_format:text/none/none -table_name:date_tbl, 
constraint:restrict_to, table_format:text/lzo/block table_name:date_tbl, constraint:restrict_to, table_format:text/bzip/block table_name:date_tbl, constraint:restrict_to, table_format:text/gzip/block table_name:date_tbl, constraint:restrict_to, table_format:text/snap/block table_name:date_tbl, constraint:restrict_to, table_format:text/def/block table_name:date_tbl_error, constraint:restrict_to, table_format:text/none/none -table_name:date_tbl_error, constraint:restrict_to, table_format:text/lzo/block table_name:date_tbl_error, constraint:restrict_to, table_format:text/bzip/block table_name:date_tbl_error, constraint:restrict_to, table_format:text/gzip/block table_name:date_tbl_error, constraint:restrict_to, table_format:text/snap/block @@ -280,7 +277,6 @@ table_name:bucketed_ext_table, constraint:exclude, table_format:hbase/none/none table_name:bucketed_ext_table, constraint:exclude, table_format:kudu/none/none table_name:bucketed_table, constraint:exclude, table_format:hbase/none/none table_name:bucketed_table, constraint:exclude, table_format:kudu/none/none -table_name:bucketed_table, constraint:exclude, table_format:text/lzo/block # The uncompressed ORC tables are mainly used in test_scanners_fuzz.py to avoid creating # them each time when running the test. Developers may run this test many times locally. 
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins-hdfs-num-rows-est-enabled.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins-hdfs-num-rows-est-enabled.test index f711777..fc4644f 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/joins-hdfs-num-rows-est-enabled.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins-hdfs-num-rows-est-enabled.test @@ -531,8 +531,8 @@ PLAN-ROOT SINK # join involving tables with no table stats # one of the tables (alltypes) is a compressed text file # tests that the default join strategy is broadcast -select * from functional_text_lzo.emptytable a inner join -functional_text_lzo.alltypes b on a.f2 = b.int_col +select * from functional_text_gzip.emptytable a inner join +functional_text_gzip.alltypes b on a.f2 = b.int_col ---- PLAN PLAN-ROOT SINK | @@ -541,11 +541,11 @@ PLAN-ROOT SINK | runtime filters: RF000 <- a.f2 | row-size=96B cardinality=5.65K | -|--00:SCAN HDFS [functional_text_lzo.emptytable a] +|--00:SCAN HDFS [functional_text_gzip.emptytable a] | partitions=0/0 files=0 size=0B | row-size=16B cardinality=0 | -01:SCAN HDFS [functional_text_lzo.alltypes b] +01:SCAN HDFS [functional_text_gzip.alltypes b] HDFS partitions=24/24 files=24 size=123.32KB runtime filters: RF000 -> b.int_col row-size=80B cardinality=5.65K diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test index 721b2ab..8e1f693 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/joins.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/joins.test @@ -3056,8 +3056,8 @@ PLAN-ROOT SINK # join involving tables with no table stats # one of the tables (alltypes) is a compressed text file # tests that the default join strategy is broadcast -select * from functional_text_lzo.emptytable a inner join -functional_text_lzo.alltypes b on a.f2 = b.int_col +select 
* from functional_text_gzip.emptytable a inner join +functional_text_gzip.alltypes b on a.f2 = b.int_col ---- PLAN PLAN-ROOT SINK | @@ -3066,11 +3066,11 @@ PLAN-ROOT SINK | runtime filters: RF000 <- b.int_col | row-size=96B cardinality=0 | -|--01:SCAN HDFS [functional_text_lzo.alltypes b] +|--01:SCAN HDFS [functional_text_gzip.alltypes b] | HDFS partitions=24/24 files=24 size=123.32KB | row-size=80B cardinality=unavailable | -00:SCAN HDFS [functional_text_lzo.emptytable a] +00:SCAN HDFS [functional_text_gzip.emptytable a] partitions=0/0 files=0 size=0B runtime filters: RF000 -> a.f2 row-size=16B cardinality=0 diff --git a/testdata/workloads/functional-query/functional-query_dimensions.csv b/testdata/workloads/functional-query/functional-query_dimensions.csv index bcb4406..ecb6a0e 100644 --- a/testdata/workloads/functional-query/functional-query_dimensions.csv +++ b/testdata/workloads/functional-query/functional-query_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq,rc,avro,parquet,orc,hbase,kudu dataset: functional -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/functional-query/functional-query_exhaustive.csv b/testdata/workloads/functional-query/functional-query_exhaustive.csv index c2ef09f..148dd5b 100644 --- a/testdata/workloads/functional-query/functional-query_exhaustive.csv +++ b/testdata/workloads/functional-query/functional-query_exhaustive.csv @@ -4,7 +4,6 @@ file_format: text, dataset: functional, compression_codec: def, compression_type file_format: text, dataset: functional, compression_codec: gzip, compression_type: block file_format: text, dataset: functional, compression_codec: bzip, compression_type: block file_format: text, dataset: functional, compression_codec: snap, compression_type: block -file_format: text, dataset: functional, compression_codec: lzo, compression_type: block file_format: seq, dataset: functional, 
compression_codec: none, compression_type: none file_format: seq, dataset: functional, compression_codec: def, compression_type: block file_format: seq, dataset: functional, compression_codec: def, compression_type: record diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test index c9e4b39..d1ce4ca 100644 --- a/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test +++ b/testdata/workloads/functional-query/queries/DataErrorsTest/hdfs-scan-node-errors.test @@ -107,24 +107,6 @@ row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+ INT, DATE, DATE ==== ---- QUERY -select count(*) from functional_text_lzo.bad_text_lzo ----- ERRORS -Blocksize: 536870911 is greater than LZO_MAX_BLOCK_SIZE: 67108864 ----- RESULTS -5141 ----- TYPES -bigint -==== ----- QUERY -select count(field) from functional_text_lzo.bad_text_lzo ----- ERRORS -Blocksize: 536870911 is greater than LZO_MAX_BLOCK_SIZE: 67108864 ----- RESULTS -5141 ----- TYPES -bigint -==== ----- QUERY select * from alltypeserrornonulls ---- ERRORS Error converting column: 3 to SMALLINT diff --git a/testdata/workloads/functional-query/queries/QueryTest/disable-lzo-plugin.test b/testdata/workloads/functional-query/queries/QueryTest/disable-lzo-plugin.test deleted file mode 100644 index b141fd9..0000000 --- a/testdata/workloads/functional-query/queries/QueryTest/disable-lzo-plugin.test +++ /dev/null @@ -1,7 +0,0 @@ -==== ----- QUERY -# Test that running with plugin disabled fails gracefully. 
-select * from functional_text_lzo.alltypes ----- CATCH -Scanner plugin 'LZO' is not one of the enabled plugins: '' -==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test b/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test index 274bff1..ddac51c 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test +++ b/testdata/workloads/functional-query/queries/QueryTest/show-create-table.test @@ -371,18 +371,6 @@ LOCATION '$$location_uri$$' TBLPROPERTIES ('external.table.purge'='TRUE') ==== ---- QUERY -SHOW CREATE TABLE functional_text_lzo.tinytable ----- RESULTS-HIVE -CREATE EXTERNAL TABLE functional_text_lzo.tinytable ( - a STRING, - b STRING -) -ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' -STORED AS INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat' - OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' -LOCATION '$$location_uri$$' -==== ----- QUERY SHOW CREATE TABLE functional.allcomplextypes ---- RESULTS-HIVE CREATE EXTERNAL TABLE functional.allcomplextypes ( diff --git a/testdata/workloads/functional-query/queries/QueryTest/unsupported-compression-partitions.test b/testdata/workloads/functional-query/queries/QueryTest/unsupported-compression-partitions.test index 23199cc..9bc5c5c 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/unsupported-compression-partitions.test +++ b/testdata/workloads/functional-query/queries/QueryTest/unsupported-compression-partitions.test @@ -13,7 +13,7 @@ BIGINT select count(*) from multi_text_compression where month <= 3 ---- CATCH -Scanner plugin 'LZ4' is not one of the enabled plugins: 'LZO' +Scanner plugin 'LZ4' is not one of the enabled plugins: '' ==== ---- QUERY # Unknown compression suffix is treated as uncompressed text. 
@@ -26,3 +26,10 @@ INT Error converting column: 0 to INT Error parsing row: file: __HDFS_FILENAME__, before offset: 16 ==== +---- QUERY +# Test that querying partition with unsupported plugin fails gracefully. +select count(*) +from multi_text_compression where month = 5 +---- CATCH +Scanner plugin 'LZO' is not one of the enabled plugins: '' +==== \ No newline at end of file diff --git a/testdata/workloads/perf-regression/perf-regression_dimensions.csv b/testdata/workloads/perf-regression/perf-regression_dimensions.csv index 705d48e..218c609 100644 --- a/testdata/workloads/perf-regression/perf-regression_dimensions.csv +++ b/testdata/workloads/perf-regression/perf-regression_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq dataset: tpch -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/perf-regression/perf-regression_exhaustive.csv b/testdata/workloads/perf-regression/perf-regression_exhaustive.csv index 8ef6907..8f52e14 100644 --- a/testdata/workloads/perf-regression/perf-regression_exhaustive.csv +++ b/testdata/workloads/perf-regression/perf-regression_exhaustive.csv @@ -1,6 +1,5 @@ # Generated File. 
file_format: text, dataset: tpch, compression_codec: none, compression_type: none -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block file_format: seq, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: def, compression_type: record diff --git a/testdata/workloads/perf-regression/perf-regression_pairwise.csv b/testdata/workloads/perf-regression/perf-regression_pairwise.csv index 3ba426f..ef758df 100644 --- a/testdata/workloads/perf-regression/perf-regression_pairwise.csv +++ b/testdata/workloads/perf-regression/perf-regression_pairwise.csv @@ -2,4 +2,3 @@ file_format: text, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: gzip, compression_type: record -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block diff --git a/testdata/workloads/targeted-perf/targeted-perf_dimensions.csv b/testdata/workloads/targeted-perf/targeted-perf_dimensions.csv index 1de34aa..db1610e 100644 --- a/testdata/workloads/targeted-perf/targeted-perf_dimensions.csv +++ b/testdata/workloads/targeted-perf/targeted-perf_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq,rc,avro,parquet,kudu dataset: tpch -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/targeted-perf/targeted-perf_exhaustive.csv b/testdata/workloads/targeted-perf/targeted-perf_exhaustive.csv index 098bb65..a2d4fed 100644 --- a/testdata/workloads/targeted-perf/targeted-perf_exhaustive.csv +++ b/testdata/workloads/targeted-perf/targeted-perf_exhaustive.csv @@ -1,6 +1,5 @@ # Generated File. 
file_format: text, dataset: tpch, compression_codec: none, compression_type: none -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block file_format: seq, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: def, compression_type: record diff --git a/testdata/workloads/targeted-perf/targeted-perf_pairwise.csv b/testdata/workloads/targeted-perf/targeted-perf_pairwise.csv index 3ba426f..ef758df 100644 --- a/testdata/workloads/targeted-perf/targeted-perf_pairwise.csv +++ b/testdata/workloads/targeted-perf/targeted-perf_pairwise.csv @@ -2,4 +2,3 @@ file_format: text, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: gzip, compression_type: record -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block diff --git a/testdata/workloads/targeted-stress/targeted-stress_dimensions.csv b/testdata/workloads/targeted-stress/targeted-stress_dimensions.csv index b5a61ff..baca6ac 100644 --- a/testdata/workloads/targeted-stress/targeted-stress_dimensions.csv +++ b/testdata/workloads/targeted-stress/targeted-stress_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq, parquet dataset: tpch -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/targeted-stress/targeted-stress_exhaustive.csv b/testdata/workloads/targeted-stress/targeted-stress_exhaustive.csv index 8ef6907..8f52e14 100644 --- a/testdata/workloads/targeted-stress/targeted-stress_exhaustive.csv +++ b/testdata/workloads/targeted-stress/targeted-stress_exhaustive.csv @@ -1,6 +1,5 @@ # Generated File. 
file_format: text, dataset: tpch, compression_codec: none, compression_type: none -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block file_format: seq, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: def, compression_type: record diff --git a/testdata/workloads/targeted-stress/targeted-stress_pairwise.csv b/testdata/workloads/targeted-stress/targeted-stress_pairwise.csv index 3ba426f..ef758df 100644 --- a/testdata/workloads/targeted-stress/targeted-stress_pairwise.csv +++ b/testdata/workloads/targeted-stress/targeted-stress_pairwise.csv @@ -2,4 +2,3 @@ file_format: text, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: gzip, compression_type: record -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block diff --git a/testdata/workloads/tpcds-unmodified/tpcds-unmodified_dimensions.csv b/testdata/workloads/tpcds-unmodified/tpcds-unmodified_dimensions.csv index 8137b7a..3de278f 100644 --- a/testdata/workloads/tpcds-unmodified/tpcds-unmodified_dimensions.csv +++ b/testdata/workloads/tpcds-unmodified/tpcds-unmodified_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq,rc,avro,parquet dataset: tpcds -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/tpcds-unmodified/tpcds-unmodified_exhaustive.csv b/testdata/workloads/tpcds-unmodified/tpcds-unmodified_exhaustive.csv index c4b4f99..46bcc18 100644 --- a/testdata/workloads/tpcds-unmodified/tpcds-unmodified_exhaustive.csv +++ b/testdata/workloads/tpcds-unmodified/tpcds-unmodified_exhaustive.csv @@ -1,6 +1,5 @@ # Generated File. 
file_format: text, dataset: tpcds, compression_codec: none, compression_type: none -file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block file_format: seq, dataset: tpcds, compression_codec: none, compression_type: none file_format: seq, dataset: tpcds, compression_codec: def, compression_type: block file_format: seq, dataset: tpcds, compression_codec: def, compression_type: record diff --git a/testdata/workloads/tpcds-unmodified/tpcds-unmodified_pairwise.csv b/testdata/workloads/tpcds-unmodified/tpcds-unmodified_pairwise.csv index e643495..b63d108 100644 --- a/testdata/workloads/tpcds-unmodified/tpcds-unmodified_pairwise.csv +++ b/testdata/workloads/tpcds-unmodified/tpcds-unmodified_pairwise.csv @@ -8,7 +8,6 @@ file_format: parquet, dataset: tpcds, compression_codec: def, compression_type: file_format: avro, dataset: tpcds, compression_codec: def, compression_type: block file_format: rc, dataset: tpcds, compression_codec: bzip, compression_type: block file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: record -file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none diff --git a/testdata/workloads/tpcds/tpcds_dimensions.csv b/testdata/workloads/tpcds/tpcds_dimensions.csv index bae5d90..34117d9 100644 --- a/testdata/workloads/tpcds/tpcds_dimensions.csv +++ b/testdata/workloads/tpcds/tpcds_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq,rc,avro,parquet,orc dataset: tpcds -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/tpcds/tpcds_exhaustive.csv b/testdata/workloads/tpcds/tpcds_exhaustive.csv index 57fcddd..6cb3b9b 100644 --- 
a/testdata/workloads/tpcds/tpcds_exhaustive.csv +++ b/testdata/workloads/tpcds/tpcds_exhaustive.csv @@ -1,6 +1,5 @@ # Generated File. file_format: text, dataset: tpcds, compression_codec: none, compression_type: none -file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block file_format: seq, dataset: tpcds, compression_codec: none, compression_type: none file_format: seq, dataset: tpcds, compression_codec: def, compression_type: block file_format: seq, dataset: tpcds, compression_codec: def, compression_type: record diff --git a/testdata/workloads/tpcds/tpcds_pairwise.csv b/testdata/workloads/tpcds/tpcds_pairwise.csv index 61ee66c..7d4515d 100644 --- a/testdata/workloads/tpcds/tpcds_pairwise.csv +++ b/testdata/workloads/tpcds/tpcds_pairwise.csv @@ -8,7 +8,6 @@ file_format: parquet, dataset: tpcds, compression_codec: def, compression_type: file_format: avro, dataset: tpcds, compression_codec: def, compression_type: block file_format: rc, dataset: tpcds, compression_codec: bzip, compression_type: block file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: record -file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none diff --git a/testdata/workloads/tpch/tpch_dimensions.csv b/testdata/workloads/tpch/tpch_dimensions.csv index f1ce5f0..57e0dd4 100644 --- a/testdata/workloads/tpch/tpch_dimensions.csv +++ b/testdata/workloads/tpch/tpch_dimensions.csv @@ -1,4 +1,4 @@ file_format: text,seq,rc,avro,parquet,orc,kudu dataset: tpch -compression_codec: none,def,gzip,bzip,snap,lzo +compression_codec: none,def,gzip,bzip,snap compression_type: none,block,record diff --git a/testdata/workloads/tpch/tpch_exhaustive.csv 
b/testdata/workloads/tpch/tpch_exhaustive.csv index 3513dc5..fffaa92 100644 --- a/testdata/workloads/tpch/tpch_exhaustive.csv +++ b/testdata/workloads/tpch/tpch_exhaustive.csv @@ -1,7 +1,6 @@ # Generated File. file_format: text, dataset: tpch, compression_codec: none, compression_type: none file_format: text, dataset: tpch, compression_codec: gzip, compression_type: block -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block file_format: seq, dataset: tpch, compression_codec: none, compression_type: none file_format: seq, dataset: tpch, compression_codec: def, compression_type: block file_format: seq, dataset: tpch, compression_codec: def, compression_type: record diff --git a/testdata/workloads/tpch/tpch_pairwise.csv b/testdata/workloads/tpch/tpch_pairwise.csv index 2eb4176..e245e3b 100644 --- a/testdata/workloads/tpch/tpch_pairwise.csv +++ b/testdata/workloads/tpch/tpch_pairwise.csv @@ -8,7 +8,6 @@ file_format: parquet, dataset: tpch, compression_codec: def, compression_type: b file_format: avro, dataset: tpch, compression_codec: def, compression_type: block file_format: rc, dataset: tpch, compression_codec: bzip, compression_type: block file_format: seq, dataset: tpch, compression_codec: snap, compression_type: record -file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block file_format: rc, dataset: tpch, compression_codec: def, compression_type: block file_format: avro, dataset: tpch, compression_codec: none, compression_type: none file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py index c352c83..8a144e5 100644 --- a/tests/common/test_dimensions.py +++ b/tests/common/test_dimensions.py @@ -32,7 +32,7 @@ class TableFormatInfo(object): KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 'hbase'] if os.environ['KUDU_IS_SUPPORTED'] == 'true': KNOWN_FILE_FORMATS.append('kudu') - 
KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'lzo', 'zstd', 'lz4'] + KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'zstd', 'lz4'] KNOWN_COMPRESSION_TYPES = ['none', 'block', 'record'] def __init__(self, **kwargs): diff --git a/tests/custom_cluster/test_hive_text_codec_interop.py b/tests/custom_cluster/test_hive_text_codec_interop.py index 556a0e0..7d4f094 100644 --- a/tests/custom_cluster/test_hive_text_codec_interop.py +++ b/tests/custom_cluster/test_hive_text_codec_interop.py @@ -26,7 +26,7 @@ from tests.common.test_dimensions import create_exec_option_dimension from tests.common.test_result_verifier import verify_query_result_is_equal # compression codecs impala support reading in text file type -TEXT_CODECS = ['snappy', 'gzip', 'zstd', 'lzo', 'bzip2', 'deflate', 'default'] +TEXT_CODECS = ['snappy', 'gzip', 'zstd', 'bzip2', 'deflate', 'default'] class TestTextInterop(CustomClusterTestSuite): @@ -84,7 +84,6 @@ class TestTextInterop(CustomClusterTestSuite): 'snappy': 'org.apache.hadoop.io.compress.SnappyCodec', 'gzip': 'org.apache.hadoop.io.compress.GzipCodec', 'zstd': 'org.apache.hadoop.io.compress.ZStandardCodec', - 'lzo': 'com.hadoop.compression.lzo.LzopCodec', 'bzip2': 'org.apache.hadoop.io.compress.BZip2Codec', 'deflate': 'org.apache.hadoop.io.compress.DeflateCodec', 'default': 'org.apache.hadoop.io.compress.DefaultCodec' diff --git a/tests/custom_cluster/test_scanner_plugin.py b/tests/custom_cluster/test_scanner_plugin.py deleted file mode 100644 index e30e6f5..0000000 --- a/tests/custom_cluster/test_scanner_plugin.py +++ /dev/null @@ -1,34 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import pytest - -from tests.common.custom_cluster_test_suite import CustomClusterTestSuite - -class TestScannerPlugin(CustomClusterTestSuite): - """Tests that involve changing the scanner plugin option.""" - - @classmethod - def get_workload(self): - return 'functional-query' - - @pytest.mark.execute_serially - @CustomClusterTestSuite.with_args("--enabled_hdfs_text_scanner_plugins=") - def test_disable_lzo_plugin(self, vector): - """Test that we can gracefully handle a disabled plugin.""" - # Should be able to query valid partitions only. - self.run_test_case('QueryTest/disable-lzo-plugin', vector) diff --git a/tests/metadata/test_metadata_query_statements.py b/tests/metadata/test_metadata_query_statements.py index 6423534..6626567 100644 --- a/tests/metadata/test_metadata_query_statements.py +++ b/tests/metadata/test_metadata_query_statements.py @@ -102,7 +102,7 @@ class TestMetadataQueryStatements(ImpalaTestSuite): self.exec_and_compare_hive_and_impala_hs2("describe formatted functional.alltypes", compare=compare_describe_formatted) self.exec_and_compare_hive_and_impala_hs2( - "describe formatted functional_text_lzo.alltypes", + "describe formatted functional_text_gzip.alltypes", compare=compare_describe_formatted) # Describe an unpartitioned table. 
diff --git a/tests/metadata/test_partition_metadata.py b/tests/metadata/test_partition_metadata.py index 280b7d0..5d7d109 100644 --- a/tests/metadata/test_partition_metadata.py +++ b/tests/metadata/test_partition_metadata.py @@ -181,7 +181,7 @@ class TestPartitionMetadataUncompressedTextOnly(ImpalaTestSuite): FQ_TBL_NAME, TBL_LOCATION)) self.__add_alltypes_partition(vector, FQ_TBL_NAME, "functional", 2009, 1) - self.__add_alltypes_partition(vector, FQ_TBL_NAME, "functional_text_lzo", 2009, 2) + self.__add_alltypes_partition(vector, FQ_TBL_NAME, "functional_text_gzip", 2009, 2) # Create a new partition with a bogus file with the unsupported LZ4 suffix. lz4_year = 2009 @@ -204,8 +204,18 @@ class TestPartitionMetadataUncompressedTextOnly(ImpalaTestSuite): "alter table {0} add partition (year={1}, month={2}) location '{3}'".format( FQ_TBL_NAME, fake_comp_year, fake_comp_month, fake_comp_ym_partition_loc)) + # Create a new partition with a bogus file with the now-unsupported LZO suffix + lzo_year = 2009 + lzo_month = 5 + lzo_ym_partition_loc = self.__make_ym_partition_dir(TBL_LOCATION, lzo_year, lzo_month) + self.filesystem_client.create_file("{0}/fake.lzo".format(lzo_ym_partition_loc)[1:], + "some test data") + self.client.execute( + "alter table {0} add partition (year={1}, month={2}) location '{3}'".format( + FQ_TBL_NAME, lzo_year, lzo_month, lzo_ym_partition_loc)) + show_files_result = self.client.execute("show files in {0}".format(FQ_TBL_NAME)) - assert len(show_files_result.data) == 4, "Expected one file per partition dir" + assert len(show_files_result.data) == 5, "Expected one file per partition dir" self.run_test_case('QueryTest/unsupported-compression-partitions', vector, unique_database) @@ -222,8 +232,11 @@ class TestPartitionMetadataUncompressedTextOnly(ImpalaTestSuite): """Create the year/month partition directory and return the path.""" y_partition_loc = "{0}/year={1}".format(tbl_location, year) ym_partition_loc = "{0}/month={1}".format(y_partition_loc, 
month) - self.filesystem_client.delete_file_dir(tbl_location[1:], recursive=True) - self.filesystem_client.make_dir(tbl_location[1:]) - self.filesystem_client.make_dir(y_partition_loc[1:]) + if not self.filesystem_client.exists(tbl_location[1:]): + self.filesystem_client.make_dir(tbl_location[1:]) + if not self.filesystem_client.exists(y_partition_loc[1:]): + self.filesystem_client.make_dir(y_partition_loc[1:]) + if self.filesystem_client.exists(ym_partition_loc[1:]): + self.filesystem_client.delete_file_dir(ym_partition_loc[1:], recursive=True) self.filesystem_client.make_dir(ym_partition_loc[1:]) return ym_partition_loc diff --git a/tests/query_test/test_compressed_formats.py b/tests/query_test/test_compressed_formats.py index 9f9177a..8aa4705 100644 --- a/tests/query_test/test_compressed_formats.py +++ b/tests/query_test/test_compressed_formats.py @@ -75,7 +75,6 @@ class TestCompressedFormats(ImpalaTestSuite): file_format = vector.get_value('file_format') extension, suffix = vector.get_value('compression_format') if file_format in ['rc', 'seq']: - # TODO: How about LZO? # Test that {gzip,snappy,bzip,deflate}-compressed # {RC,sequence,text} files are supported. 
db_suffix = '_%s_%s' % (file_format, suffix) diff --git a/tests/query_test/test_scanners_fuzz.py b/tests/query_test/test_scanners_fuzz.py index 73d734b..c25dd39 100644 --- a/tests/query_test/test_scanners_fuzz.py +++ b/tests/query_test/test_scanners_fuzz.py @@ -72,7 +72,7 @@ class TestScannersFuzzing(ImpalaTestSuite): cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('table_format').file_format in ('avro', 'parquet', 'orc') or (v.get_value('table_format').file_format == 'text' and - v.get_value('table_format').compression_codec in ('none', 'lzo'))) + v.get_value('table_format').compression_codec in ('none',))) def test_fuzz_alltypes(self, vector, unique_database): @@ -247,8 +247,7 @@ class TestScannersFuzzing(ImpalaTestSuite): msg = "Should not throw error when abort_on_error=0: '{0}'".format(e) LOG.error(msg) # Parquet and compressed text can fail the query for some parse errors. - # E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file - # (IMPALA-4013). + # E.g. corrupt Parquet footer (IMPALA-3773) table_format = vector.get_value('table_format') if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \ and not (table_format.file_format == 'text' and