IMPALA-7102 (Part 1): Disable reading of erasure coding by default

This patch adds a query option, ALLOW_ERASURE_CODED_FILES, that enables
or disables reading of erasure-coded files. Even though Impala should
already be able to handle HDFS erasure-coded files,
this feature hasn't been tested thoroughly yet. Also, Impala lacks
metrics, observability and DDL commands related to erasure coding. This
is a query option instead of a startup flag because we want to make it
possible for advanced users to enable the feature.

We may also need a follow-on patch to disable the write path with this
flag.

Cherry-picks: not for 2.x

Change-Id: Icd3b1754541262467a6e67068b0b447882a40fb3
Reviewed-on: http://gerrit.cloudera.org:8080/10646
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/8060f4d5
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/8060f4d5
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/8060f4d5

Branch: refs/heads/master
Commit: 8060f4d50e6e9530fe90c32c65b67b2537681302
Parents: 5c880e5
Author: Taras Bobrovytsky <[email protected]>
Authored: Thu May 31 18:44:57 2018 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Fri Jun 29 23:26:35 2018 +0000

----------------------------------------------------------------------
 be/src/service/query-options.cc                      |  4 ++++
 be/src/service/query-options.h                       |  4 +++-
 bin/run-all-tests.sh                                 |  2 ++
 common/thrift/ImpalaInternalService.thrift           |  3 +++
 common/thrift/ImpalaService.thrift                   |  3 +++
 .../java/org/apache/impala/planner/HdfsScanNode.java |  6 ++++++
 testdata/bin/create-load-data.sh                     |  4 ++++
 .../queries/QueryTest/hdfs-erasure-coding.test       | 15 +++++++++++++++
 tests/common/custom_cluster_test_suite.py            |  3 +++
 tests/common/skip.py                                 |  1 +
 tests/query_test/test_observability.py               | 12 ++++++++----
 tests/query_test/test_scanners.py                    |  9 +++++++++
 12 files changed, 61 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/be/src/service/query-options.cc
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc
index 218e4e6..2e3415f 100644
--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -690,6 +690,10 @@ Status impala::SetQueryOption(const string& key, const 
string& value,
           return Status(Substitute("Invalid kudu_read_mode '$0'. Valid values 
are "
               "DEFAULT, READ_LATEST, and READ_AT_SNAPSHOT.", value));
         }
+      }
+      case TImpalaQueryOptions::ALLOW_ERASURE_CODED_FILES: {
+        query_options->__set_allow_erasure_coded_files(
+            iequals(value, "true") || iequals(value, "1"));
         break;
       }
       default:

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/be/src/service/query-options.h
----------------------------------------------------------------------
diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h
index 5c4f51d..fce042c 100644
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -41,7 +41,7 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
 // the DCHECK.
 #define QUERY_OPTS_TABLE\
   DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\
-      TImpalaQueryOptions::KUDU_READ_MODE + 1);\
+      TImpalaQueryOptions::ALLOW_ERASURE_CODED_FILES + 1);\
   REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, 
ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\
   QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\
   QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS,\
@@ -138,6 +138,8 @@ typedef std::unordered_map<string, 
beeswax::TQueryOptionLevel::type>
   QUERY_OPT_FN(thread_reservation_aggregate_limit, 
THREAD_RESERVATION_AGGREGATE_LIMIT,\
       TQueryOptionLevel::REGULAR)\
   QUERY_OPT_FN(kudu_read_mode, KUDU_READ_MODE, TQueryOptionLevel::ADVANCED)\
+  QUERY_OPT_FN(allow_erasure_coded_files, ALLOW_ERASURE_CODED_FILES,\
+      TQueryOptionLevel::DEVELOPMENT)\
   ;
 
 /// Enforce practical limits on some query options to avoid undesired query 
state.

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/bin/run-all-tests.sh
----------------------------------------------------------------------
diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index 08d140a..83b3548 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -73,6 +73,8 @@ if [[ "${ERASURE_CODING}" = true ]]; then
   # We do not run FE tests when erasure coding is enabled because planner tests
   # would fail.
   FE_TEST=false
+  TEST_START_CLUSTER_ARGS="${TEST_START_CLUSTER_ARGS} \
+    --impalad_args=--default_query_options=allow_erasure_coded_files=true"
 fi
 
 # If KRPC tests are disabled, pass the flag to disable KRPC during cluster 
start.

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/common/thrift/ImpalaInternalService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaInternalService.thrift 
b/common/thrift/ImpalaInternalService.thrift
index d2a9d29..6780138 100644
--- a/common/thrift/ImpalaInternalService.thrift
+++ b/common/thrift/ImpalaInternalService.thrift
@@ -298,6 +298,9 @@ struct TQueryOptions {
 
   // See comment in ImpalaService.thrift.
   68: optional TKuduReadMode kudu_read_mode = TKuduReadMode.DEFAULT;
+
+  // Allow reading of erasure coded files in HDFS.
+  69: optional bool allow_erasure_coded_files = false;
 }
 
 // Impala currently has two types of sessions: Beeswax and HiveServer2

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/common/thrift/ImpalaService.thrift
----------------------------------------------------------------------
diff --git a/common/thrift/ImpalaService.thrift 
b/common/thrift/ImpalaService.thrift
index cf62ddf..529af04 100644
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -326,6 +326,9 @@ enum TImpalaQueryOptions {
   // Overrides the -kudu_read_mode flag to set the consistency level for Kudu 
scans.
   // Possible values are DEFAULT, READ_LATEST, and READ_AT_SNAPSHOT.
   KUDU_READ_MODE,
+
+  // Allow reading of erasure coded files.
+  ALLOW_ERASURE_CODED_FILES,
 }
 
 // The summary of a DML statement.

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java 
b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 7088a7c..d1d7fd8 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -799,6 +799,12 @@ public class HdfsScanNode extends ScanNode {
       totalBytes_ += partitionBytes;
       totalFiles_ += fileDescs.size();
       for (FileDescriptor fileDesc: fileDescs) {
+        if (!analyzer.getQueryOptions().isAllow_erasure_coded_files() &&
+            fileDesc.getIsEc()) {
+          throw new ImpalaRuntimeException(String.format(
+              "Scanning of HDFS erasure-coded file (%s/%s) is not supported",
+              partition.getLocation(), fileDesc.getFileName()));
+        }
         if (!fsHasBlocks) {
           Preconditions.checkState(fileDesc.getNumFileBlocks() == 0);
           generateScanRangeSpecs(partition, fileDesc, scanRangeBytesLimit);

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 3d06b0f..bb95f48 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -142,6 +142,10 @@ function start-impala {
   else
     START_CLUSTER_ARGS_INT+=("-s 3")
   fi
+  if [[ "${ERASURE_CODING}" == true ]]; then
+    START_CLUSTER_ARGS="${START_CLUSTER_ARGS} \
+      --impalad_args=--default_query_options=allow_erasure_coded_files=true"
+  fi
   START_CLUSTER_ARGS_INT+=("${START_CLUSTER_ARGS}")
   ${IMPALA_HOME}/bin/start-impala-cluster.py 
--log_dir=${IMPALA_DATA_LOADING_LOGS_DIR} \
     ${START_CLUSTER_ARGS_INT[@]}

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test
 
b/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test
new file mode 100644
index 0000000..0c773b4
--- /dev/null
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/hdfs-erasure-coding.test
@@ -0,0 +1,15 @@
+====
+---- QUERY
+set allow_erasure_coded_files=false;
+select count(*) from functional.alltypes;
+---- CATCH
+ImpalaRuntimeException: Scanning of HDFS erasure-coded file 
(hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=1/090101.txt) 
is not supported
+====
+---- QUERY
+set allow_erasure_coded_files=true;
+select count(*) from functional.alltypes;
+---- RESULTS
+7300
+---- TYPES
+BIGINT
+====

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/common/custom_cluster_test_suite.py
----------------------------------------------------------------------
diff --git a/tests/common/custom_cluster_test_suite.py 
b/tests/common/custom_cluster_test_suite.py
index 20037bf..51e3f8f 100644
--- a/tests/common/custom_cluster_test_suite.py
+++ b/tests/common/custom_cluster_test_suite.py
@@ -140,6 +140,9 @@ class CustomClusterTestSuite(ImpalaTestSuite):
     if pytest.config.option.test_no_krpc:
       cmd.append("--disable_krpc")
 
+    if os.environ.get("ERASURE_CODING") == "true":
+      
cmd.append("--impalad_args=--default_query_options=allow_erasure_coded_files=true")
+
     try:
       check_call(cmd + options, close_fds=True)
     finally:

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/common/skip.py
----------------------------------------------------------------------
diff --git a/tests/common/skip.py b/tests/common/skip.py
index ca49327..e84c75b 100644
--- a/tests/common/skip.py
+++ b/tests/common/skip.py
@@ -86,6 +86,7 @@ class SkipIf:
       reason="Kudu is not supported")
   not_s3 = pytest.mark.skipif(not IS_S3, reason="S3 Filesystem needed")
   not_hdfs = pytest.mark.skipif(not IS_HDFS, reason="HDFS Filesystem needed")
+  not_ec = pytest.mark.skipif(not IS_EC, reason="Erasure Coding needed")
   no_secondary_fs = pytest.mark.skipif(not SECONDARY_FILESYSTEM,
       reason="Secondary filesystem needed")
   not_krpc = pytest.mark.skipif(pytest.config.option.test_no_krpc,

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/query_test/test_observability.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_observability.py 
b/tests/query_test/test_observability.py
index 86e78cc..f71a45f 100644
--- a/tests/query_test/test_observability.py
+++ b/tests/query_test/test_observability.py
@@ -15,9 +15,10 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from tests.common.impala_cluster import ImpalaCluster
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIfS3, SkipIfADLS, SkipIfIsilon, SkipIfLocal
-from tests.common.impala_cluster import ImpalaCluster
+from tests.util.filesystem_utils import IS_EC
 import logging
 import pytest
 import re
@@ -117,9 +118,12 @@ class TestObservability(ImpalaTestSuite):
         profile
     # For this query, the planner sets NUM_NODES=1, NUM_SCANNER_THREADS=1,
     # RUNTIME_FILTER_MODE=0 and MT_DOP=0
-    assert "Query Options (set by configuration and planner): 
MEM_LIMIT=8589934592," \
-        "NUM_NODES=1,NUM_SCANNER_THREADS=1,RUNTIME_FILTER_MODE=0,MT_DOP=0\n" \
-        in profile
+    expected_str = ("Query Options (set by configuration and planner): "
+        "MEM_LIMIT=8589934592,NUM_NODES=1,NUM_SCANNER_THREADS=1,"
+        "RUNTIME_FILTER_MODE=0,MT_DOP=0{erasure_coding}\n")
+    expected_str = expected_str.format(
+        erasure_coding=",ALLOW_ERASURE_CODED_FILES=1" if IS_EC else "")
+    assert expected_str in profile
 
   def test_exec_summary(self):
     """Test that the exec summary is populated correctly in every query 
state"""

http://git-wip-us.apache.org/repos/asf/impala/blob/8060f4d5/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py 
b/tests/query_test/test_scanners.py
index 3d71e2d..bd3c286 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -33,6 +33,7 @@ from subprocess import check_call
 from testdata.common import widetable
 from tests.common.impala_test_suite import ImpalaTestSuite, LOG
 from tests.common.skip import (
+    SkipIf,
     SkipIfS3,
     SkipIfADLS,
     SkipIfEC,
@@ -1082,3 +1083,11 @@ class TestScannerReservation(ImpalaTestSuite):
   def test_scanners(self, vector):
     self.run_test_case('QueryTest/scanner-reservation', vector)
 
+class TestErasureCoding(ImpalaTestSuite):
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @SkipIf.not_ec
+  def test_erasure_coding(self, vector):
+    self.run_test_case('QueryTest/hdfs-erasure-coding', vector)

Reply via email to