Repository: incubator-impala Updated Branches: refs/heads/master 07a713881 -> f2580acba
IMPALA-4988: Add query option parquet_read_statistics This change adds a query option to disable reading Parquet statistics. It provides a workaround when dealing with files that have corrupt Parquet statistics. Note that Impala handles Parquet files affected by PARQUET-251 correctly by ignoring statistics for anything but plain numeric types. This query option is supposed to help with files affected by unknown errors or by errors that are yet to be made. Change-Id: I427f7fde40d0f4b703751e40f3c2109a850643f7 Reviewed-on: http://gerrit.cloudera.org:8080/7001 Reviewed-by: Dan Hecht <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/f7f8c451 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/f7f8c451 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/f7f8c451 Branch: refs/heads/master Commit: f7f8c4518a6a329ad8c0fe19fcae05272adfab0f Parents: 07a7138 Author: Lars Volker <[email protected]> Authored: Fri May 26 15:30:32 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Thu Jun 1 01:00:13 2017 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 2 ++ be/src/service/query-options.cc | 5 +++++ be/src/service/query-options.h | 3 ++- common/thrift/ImpalaInternalService.thrift | 5 +++++ common/thrift/ImpalaService.thrift | 5 +++++ .../queries/QueryTest/parquet_stats.test | 11 +++++++++++ 6 files changed, 30 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7f8c451/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index 4f90cb1..4e2c5fd 100644 --- 
a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -495,6 +495,8 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts( bool* skip_row_group) { *skip_row_group = false; + if (!state_->query_options().parquet_read_statistics) return Status::OK(); + const TupleDescriptor* min_max_tuple_desc = scan_node_->min_max_tuple_desc(); if (!min_max_tuple_desc) return Status::OK(); http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7f8c451/be/src/service/query-options.cc ---------------------------------------------------------------------- diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 1df0aae..0f0d30c 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -475,6 +475,11 @@ Status impala::SetQueryOption(const string& key, const string& value, iequals(value, "true") || iequals(value, "1")); break; } + case TImpalaQueryOptions::PARQUET_READ_STATISTICS: { + query_options->__set_parquet_read_statistics( + iequals(value, "true") || iequals(value, "1")); + break; + } default: // We hit this DCHECK(false) if we forgot to add the corresponding entry here // when we add a new query option. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7f8c451/be/src/service/query-options.h ---------------------------------------------------------------------- diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index ddb1b23..3a11383 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -35,7 +35,7 @@ class TQueryOptions; // the DCHECK. 
#define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::PARQUET_ARRAY_RESOLUTION + 1);\ + TImpalaQueryOptions::PARQUET_READ_STATISTICS + 1);\ QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR)\ QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -90,6 +90,7 @@ class TQueryOptions; QUERY_OPT_FN(decimal_v2, DECIMAL_V2)\ QUERY_OPT_FN(parquet_dictionary_filtering, PARQUET_DICTIONARY_FILTERING)\ QUERY_OPT_FN(parquet_array_resolution, PARQUET_ARRAY_RESOLUTION)\ + QUERY_OPT_FN(parquet_read_statistics, PARQUET_READ_STATISTICS)\ ; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7f8c451/common/thrift/ImpalaInternalService.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift index b17aeec..17bfc4a 100644 --- a/common/thrift/ImpalaInternalService.thrift +++ b/common/thrift/ImpalaInternalService.thrift @@ -236,6 +236,11 @@ struct TQueryOptions { // Policy for resolving nested array fields in Parquet files. 54: optional TParquetArrayResolution parquet_array_resolution = TParquetArrayResolution.TWO_LEVEL_THEN_THREE_LEVEL + + // Indicates whether to read statistics from Parquet files and use them during query + // processing. This includes skipping data based on the statistics and computing query + // results like "select min()". 
+ 55: optional bool parquet_read_statistics = true } // Impala currently has two types of sessions: Beeswax and HiveServer2 http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7f8c451/common/thrift/ImpalaService.thrift ---------------------------------------------------------------------- diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index c238e85..dd89e52 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -266,6 +266,11 @@ enum TImpalaQueryOptions { // TODO: Remove the TWO_LEVEL_THEN_THREE_LEVEL mode completely or at least make // it non-default in a compatibility breaking release. PARQUET_ARRAY_RESOLUTION, + + // Indicates whether to read statistics from Parquet files and use them during query + // processing. This includes skipping data based on the statistics and computing query + // results like "select min()". + PARQUET_READ_STATISTICS, } // The summary of a DML statement. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f7f8c451/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test index c149dc6..d03b4c9 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test @@ -447,3 +447,14 @@ select count(*) from chars where c <= "aaaa" aggregation(SUM, NumRowGroups): 1 aggregation(SUM, NumStatsFilteredRowGroups): 0 ==== +---- QUERY +# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics +# query option. 
+set parquet_read_statistics=0; +select count(*) from functional_parquet.alltypes where id < 0; +---- RESULTS +0 +---- RUNTIME_PROFILE +aggregation(SUM, NumRowGroups): 24 +aggregation(SUM, NumStatsFilteredRowGroups): 0 +====
