IMPALA-7102 (Part 1): Disable reading of erasure coding by default In this patch we add a query option, ALLOW_ERASURE_CODED_FILES, that enables or disables support for reading erasure-coded files. Even though Impala should already be able to handle HDFS erasure-coded files, this feature hasn't been tested thoroughly yet. Also, Impala lacks metrics, observability, and DDL commands related to erasure coding. This is a query option instead of a startup flag because we want to make it possible for advanced users to enable the feature.
We may also need a follow on patch to also disable the write path with this flag. Cherry-picks: not for 2.x Change-Id: Icd3b1754541262467a6e67068b0b447882a40fb3 Reviewed-on: http://gerrit.cloudera.org:8080/10646 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/8060f4d5 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/8060f4d5 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/8060f4d5 Branch: refs/heads/master Commit: 8060f4d50e6e9530fe90c32c65b67b2537681302 Parents: 5c880e5 Author: Taras Bobrovytsky <[email protected]> Authored: Thu May 31 18:44:57 2018 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Fri Jun 29 23:26:35 2018 +0000 ---------------------------------------------------------------------- be/src/service/query-options.cc | 4 ++++ be/src/service/query-options.h | 4 +++- bin/run-all-tests.sh | 2 ++ common/thrift/ImpalaInternalService.thrift | 3 +++ common/thrift/ImpalaService.thrift | 3 +++ .../java/org/apache/impala/planner/HdfsScanNode.java | 6 ++++++ testdata/bin/create-load-data.sh | 4 ++++ .../queries/QueryTest/hdfs-erasure-coding.test | 15 +++++++++++++++ tests/common/custom_cluster_test_suite.py | 3 +++ tests/common/skip.py | 1 + tests/query_test/test_observability.py | 12 ++++++++---- tests/query_test/test_scanners.py | 9 +++++++++ 12 files changed, 61 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/be/src/service/query-options.cc ---------------------------------------------------------------------- diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 218e4e6..2e3415f 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -690,6 +690,10 @@ Status 
impala::SetQueryOption(const string& key, const string& value, return Status(Substitute("Invalid kudu_read_mode '$0'. Valid values are " "DEFAULT, READ_LATEST, and READ_AT_SNAPSHOT.", value)); } + } + case TImpalaQueryOptions::ALLOW_ERASURE_CODED_FILES: { + query_options->__set_allow_erasure_coded_files( + iequals(value, "true") || iequals(value, "1")); break; } default: http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/be/src/service/query-options.h ---------------------------------------------------------------------- diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index 5c4f51d..fce042c 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -41,7 +41,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> // the DCHECK. #define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::KUDU_READ_MODE + 1);\ + TImpalaQueryOptions::ALLOW_ERASURE_CODED_FILES + 1);\ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS,\ @@ -138,6 +138,8 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type> QUERY_OPT_FN(thread_reservation_aggregate_limit, THREAD_RESERVATION_AGGREGATE_LIMIT,\ TQueryOptionLevel::REGULAR)\ QUERY_OPT_FN(kudu_read_mode, KUDU_READ_MODE, TQueryOptionLevel::ADVANCED)\ + QUERY_OPT_FN(allow_erasure_coded_files, ALLOW_ERASURE_CODED_FILES,\ + TQueryOptionLevel::DEVELOPMENT)\ ; /// Enforce practical limits on some query options to avoid undesired query state. 
http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/bin/run-all-tests.sh ---------------------------------------------------------------------- diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh index 08d140a..83b3548 100755 --- a/bin/run-all-tests.sh +++ b/bin/run-all-tests.sh @@ -73,6 +73,8 @@ if [[ "${ERASURE_CODING}" = true ]]; then # We do not run FE tests when erasure coding is enabled because planner tests # would fail. FE_TEST=false + TEST_START_CLUSTER_ARGS="${TEST_START_CLUSTER_ARGS} \ + --impalad_args=--default_query_options=allow_erasure_coded_files=true" fi # If KRPC tests are disabled, pass the flag to disable KRPC during cluster start. http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/common/thrift/ImpalaInternalService.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift index d2a9d29..6780138 100644 --- a/common/thrift/ImpalaInternalService.thrift +++ b/common/thrift/ImpalaInternalService.thrift @@ -298,6 +298,9 @@ struct TQueryOptions { // See comment in ImpalaService.thrift. 68: optional TKuduReadMode kudu_read_mode = TKuduReadMode.DEFAULT; + + // Allow reading of erasure coded files in HDFS. + 69: optional bool allow_erasure_coded_files = false; } // Impala currently has two types of sessions: Beeswax and HiveServer2 http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/common/thrift/ImpalaService.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index cf62ddf..529af04 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -326,6 +326,9 @@ enum TImpalaQueryOptions { // Overrides the -kudu_read_mode flag to set the consistency level for Kudu scans. // Possible values are DEFAULT, READ_LATEST, and READ_AT_SNAPSHOT. 
KUDU_READ_MODE, + + // Allow reading of erasure coded files. + ALLOW_ERASURE_CODED_FILES, } // The summary of a DML statement. http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java index 7088a7c..d1d7fd8 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java @@ -799,6 +799,12 @@ public class HdfsScanNode extends ScanNode { totalBytes_ += partitionBytes; totalFiles_ += fileDescs.size(); for (FileDescriptor fileDesc: fileDescs) { + if (!analyzer.getQueryOptions().isAllow_erasure_coded_files() && + fileDesc.getIsEc()) { + throw new ImpalaRuntimeException(String.format( + "Scanning of HDFS erasure-coded file (%s/%s) is not supported", + partition.getLocation(), fileDesc.getFileName())); + } if (!fsHasBlocks) { Preconditions.checkState(fileDesc.getNumFileBlocks() == 0); generateScanRangeSpecs(partition, fileDesc, scanRangeBytesLimit); http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/testdata/bin/create-load-data.sh ---------------------------------------------------------------------- diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index 3d06b0f..bb95f48 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -142,6 +142,10 @@ function start-impala { else START_CLUSTER_ARGS_INT+=("-s 3") fi + if [[ "${ERASURE_CODING}" == true ]]; then + START_CLUSTER_ARGS="${START_CLUSTER_ARGS} \ + --impalad_args=--default_query_options=allow_erasure_coded_files=true" + fi START_CLUSTER_ARGS_INT+=("${START_CLUSTER_ARGS}") ${IMPALA_HOME}/bin/start-impala-cluster.py --log_dir=${IMPALA_DATA_LOADING_LOGS_DIR} \ ${START_CLUSTER_ARGS_INT[@]} 
http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test new file mode 100644 index 0000000..0c773b4 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test @@ -0,0 +1,15 @@ +==== +---- QUERY +set allow_erasure_coded_files=false; +select count(*) from functional.alltypes; +---- CATCH +ImpalaRuntimeException: Scanning of HDFS erasure-coded file (hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=1/090101.txt) is not supported +==== +---- QUERY +set allow_erasure_coded_files=true; +select count(*) from functional.alltypes; +---- RESULTS +7300 +---- TYPES +BIGINT +==== http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/common/custom_cluster_test_suite.py ---------------------------------------------------------------------- diff --git a/tests/common/custom_cluster_test_suite.py b/tests/common/custom_cluster_test_suite.py index 20037bf..51e3f8f 100644 --- a/tests/common/custom_cluster_test_suite.py +++ b/tests/common/custom_cluster_test_suite.py @@ -140,6 +140,9 @@ class CustomClusterTestSuite(ImpalaTestSuite): if pytest.config.option.test_no_krpc: cmd.append("--disable_krpc") + if os.environ.get("ERASURE_CODING") == "true": + cmd.append("--impalad_args=--default_query_options=allow_erasure_coded_files=true") + try: check_call(cmd + options, close_fds=True) finally: http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/common/skip.py ---------------------------------------------------------------------- diff --git a/tests/common/skip.py b/tests/common/skip.py index ca49327..e84c75b 100644 --- a/tests/common/skip.py +++ b/tests/common/skip.py @@ -86,6 +86,7 @@ class 
SkipIf: reason="Kudu is not supported") not_s3 = pytest.mark.skipif(not IS_S3, reason="S3 Filesystem needed") not_hdfs = pytest.mark.skipif(not IS_HDFS, reason="HDFS Filesystem needed") + not_ec = pytest.mark.skipif(not IS_EC, reason="Erasure Coding needed") no_secondary_fs = pytest.mark.skipif(not SECONDARY_FILESYSTEM, reason="Secondary filesystem needed") not_krpc = pytest.mark.skipif(pytest.config.option.test_no_krpc, http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/query_test/test_observability.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_observability.py b/tests/query_test/test_observability.py index 86e78cc..f71a45f 100644 --- a/tests/query_test/test_observability.py +++ b/tests/query_test/test_observability.py @@ -15,9 +15,10 @@ # specific language governing permissions and limitations # under the License. +from tests.common.impala_cluster import ImpalaCluster from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.skip import SkipIfS3, SkipIfADLS, SkipIfIsilon, SkipIfLocal -from tests.common.impala_cluster import ImpalaCluster +from tests.util.filesystem_utils import IS_EC import logging import pytest import re @@ -117,9 +118,12 @@ class TestObservability(ImpalaTestSuite): profile # For this query, the planner sets NUM_NODES=1, NUM_SCANNER_THREADS=1, # RUNTIME_FILTER_MODE=0 and MT_DOP=0 - assert "Query Options (set by configuration and planner): MEM_LIMIT=8589934592," \ - "NUM_NODES=1,NUM_SCANNER_THREADS=1,RUNTIME_FILTER_MODE=0,MT_DOP=0\n" \ - in profile + expected_str = ("Query Options (set by configuration and planner): " + "MEM_LIMIT=8589934592,NUM_NODES=1,NUM_SCANNER_THREADS=1," + "RUNTIME_FILTER_MODE=0,MT_DOP=0{erasure_coding}\n") + expected_str = expected_str.format( + erasure_coding=",ALLOW_ERASURE_CODED_FILES=1" if IS_EC else "") + assert expected_str in profile def test_exec_summary(self): """Test that the exec summary is populated correctly in 
every query state""" http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/query_test/test_scanners.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 3d71e2d..bd3c286 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -33,6 +33,7 @@ from subprocess import check_call from testdata.common import widetable from tests.common.impala_test_suite import ImpalaTestSuite, LOG from tests.common.skip import ( + SkipIf, SkipIfS3, SkipIfADLS, SkipIfEC, @@ -1082,3 +1083,11 @@ class TestScannerReservation(ImpalaTestSuite): def test_scanners(self, vector): self.run_test_case('QueryTest/scanner-reservation', vector) +class TestErasureCoding(ImpalaTestSuite): + @classmethod + def get_workload(cls): + return 'functional-query' + + @SkipIf.not_ec + def test_erasure_coding(self, vector): + self.run_test_case('QueryTest/hdfs-erasure-coding', vector)
