Repository: impala
Updated Branches:
  refs/heads/master a018038df -> c856b30e3


IMPALA-6113: Skip row groups with predicates on NULL columns

Impala's Parquet scanner now uses the existing null_count statistic
stored at the Parquet column chunk level to skip an entire row group
when null_count indicates that every value in the predicated column is
NULL, because such a row group cannot produce any result rows.

Change-Id: I141317af0e0df30da8f220b29b0bfba364f40ddf
Reviewed-on: http://gerrit.cloudera.org:8080/9140
Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/097f2f3f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/097f2f3f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/097f2f3f

Branch: refs/heads/master
Commit: 097f2f3f3b25a0f148bb8f5465debe7d690b07fe
Parents: a018038
Author: Gabor Kaszab <gaborkas...@cloudera.com>
Authored: Wed Jan 24 17:01:34 2018 +0100
Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org>
Committed: Sat Feb 3 03:24:37 2018 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             |  7 +++++
 be/src/exec/parquet-column-stats.cc             | 13 ++++++++
 be/src/exec/parquet-column-stats.h              |  6 ++++
 .../queries/QueryTest/parquet-stats.test        | 33 ++++++++++++++++++++
 4 files changed, 59 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index c14edd7..7a10f3c 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -584,6 +584,13 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
       DCHECK(false) << "Unsupported function name for statistics evaluation: " 
<< fn_name;
     }
 
+    int64_t null_count = 0;
+    bool null_count_result = ColumnStatsBase::ReadNullCountStat(col_chunk, &null_count);
+    if (null_count_result && null_count == col_chunk.meta_data.num_values) {
+      *skip_row_group = true;
+      break;
+    }
+
     if (stats_read) {
       TupleRow row;
       row.SetTuple(0, min_max_tuple_);

http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/be/src/exec/parquet-column-stats.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.cc b/be/src/exec/parquet-column-stats.cc
index 4443309..a1d1155 100644
--- a/be/src/exec/parquet-column-stats.cc
+++ b/be/src/exec/parquet-column-stats.cc
@@ -129,6 +129,19 @@ bool ColumnStatsBase::ReadFromThrift(const parquet::ColumnChunk& col_chunk,
   return false;
 }
 
+bool ColumnStatsBase::ReadNullCountStat(const parquet::ColumnChunk& col_chunk,
+    int64_t* null_count) {
+  if (!(col_chunk.__isset.meta_data && col_chunk.meta_data.__isset.statistics)) {
+    return false;
+  }
+  const parquet::Statistics& stats = col_chunk.meta_data.statistics;
+  if (stats.__isset.null_count) {
+    *null_count = stats.null_count;
+    return true;
+  }
+  return false;
+}
+
 Status ColumnStatsBase::CopyToBuffer(StringBuffer* buffer, StringValue* value) {
   if (value->ptr == buffer->buffer()) return Status::OK();
   buffer->Clear();

http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/be/src/exec/parquet-column-stats.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-stats.h b/be/src/exec/parquet-column-stats.h
index 0ff277c..e9cf801 100644
--- a/be/src/exec/parquet-column-stats.h
+++ b/be/src/exec/parquet-column-stats.h
@@ -73,6 +73,12 @@ class ColumnStatsBase {
       const ColumnType& col_type, const parquet::ColumnOrder* col_order,
       StatsField stats_field, void* slot);
 
+  // Gets the null_count statistics from the given column chunk's metadata and returns
+  // it via an output parameter.
+  // Returns true if the null_count stats were read successfully, false otherwise.
+  static bool ReadNullCountStat(const parquet::ColumnChunk& col_chunk,
+      int64_t* null_count);
+
   /// Merges this statistics object with values from 'other'. If other has not been
   /// initialized, then this object will not be changed.
   virtual void Merge(const ColumnStatsBase& other) = 0;

http://git-wip-us.apache.org/repos/asf/impala/blob/097f2f3f/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index d03b4c9..70b5f27 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
@@ -458,3 +458,36 @@ select count(*) from functional_parquet.alltypes where id < 0;
 aggregation(SUM, NumRowGroups): 24
 aggregation(SUM, NumStatsFilteredRowGroups): 0
 ====
+---- QUERY
+# Check that all the row groups are skipped using null_count stat
+create table table_for_null_count_test (i int, j int) stored as parquet;
+insert into table_for_null_count_test values (1, NULL), (2, NULL), (3, NULL);
+select count(*) from table_for_null_count_test where j < 3;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Insert another row group where not all the 'j' values are NULL
+insert into table_for_null_count_test values (4, 1), (5, NULL);
+select i from table_for_null_count_test where j < 3;
+---- RESULTS
+4
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 2
+aggregation(SUM, NumStatsFilteredRowGroups): 1
+====
+---- QUERY
+# Turning off parquet stats and verifying that no row groups are skipped
+set PARQUET_READ_STATISTICS=0;
+create table table_for_null_count_test2 (i int, j int) stored as parquet;
+insert into table_for_null_count_test2 values (1, NULL), (2, NULL), (3, NULL);
+select count(*) from table_for_null_count_test2 where j < 3;
+---- RESULTS
+0
+---- RUNTIME_PROFILE
+aggregation(SUM, NumRowGroups): 1
+aggregation(SUM, NumStatsFilteredRowGroups): 0
+====

Reply via email to