IMPALA-5679: Fix Parquet count(*) with group by string

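In a recent patch (IMPALA-5036) a bug was introduced where a count(*)
query that groups by a string partition column returned incorrect
results. The parquet_count_star_slot_offset() getter was declared to
return bool instead of int, so the slot offset was truncated to 0 or 1
and data was being written into the tuple at an incorrect offset.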

Testing:
- Added an end-to-end test that selects from a table partitioned by a
  string column.

Change-Id: I225547574c2b2259ca81cb642d082e151f3bed6b
Reviewed-on: http://gerrit.cloudera.org:8080/7481
Reviewed-by: Tim Armstrong <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/408b0aac
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/408b0aac
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/408b0aac

Branch: refs/heads/master
Commit: 408b0aac831ab7d6d6459353848f9a11b811e281
Parents: 1653419
Author: Taras Bobrovytsky <[email protected]>
Authored: Fri Jul 21 14:13:28 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Sat Jul 22 05:53:06 2017 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-scan-node-base.h               |  2 +-
 .../queries/QueryTest/parquet-stats-agg.test    | 22 ++++++++++++++++++++
 tests/query_test/test_aggregation.py            |  6 +++---
 3 files changed, 26 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/408b0aac/be/src/exec/hdfs-scan-node-base.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.h 
b/be/src/exec/hdfs-scan-node-base.h
index f71f5b4..c79a6e8 100644
--- a/be/src/exec/hdfs-scan-node-base.h
+++ b/be/src/exec/hdfs-scan-node-base.h
@@ -155,7 +155,7 @@ class HdfsScanNodeBase : public ScanNode {
   int skip_header_line_count() const { return skip_header_line_count_; }
   DiskIoRequestContext* reader_context() { return reader_context_; }
   bool optimize_parquet_count_star() const { return 
optimize_parquet_count_star_; }
-  bool parquet_count_star_slot_offset() const { return 
parquet_count_star_slot_offset_; }
+  int parquet_count_star_slot_offset() const { return 
parquet_count_star_slot_offset_; }
 
   typedef std::unordered_map<TupleId, std::vector<ScalarExprEvaluator*>>
     ConjunctEvaluatorsMap;

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/408b0aac/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test
----------------------------------------------------------------------
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test
index 3b1c33b..620c50b 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test
@@ -115,3 +115,25 @@ select count(*) from tpch_parquet.lineitem
 ---- TYPES
 bigint
 =====
+---- QUERY
+# IMPALA-5679: Count(*) with group by on a string partition column.
+drop table if exists $DATABASE.string_partitioned_table;
+create table $DATABASE.string_partitioned_table (int_col integer)
+partitioned by (string_col STRING) stored as parquet;
+insert into $DATABASE.string_partitioned_table partition(string_col)
+select int_col, string_col from functional.alltypes;
+select string_col, count(*) from $DATABASE.string_partitioned_table group by 
string_col;
+---- RESULTS
+'0',730
+'1',730
+'2',730
+'3',730
+'4',730
+'5',730
+'6',730
+'7',730
+'8',730
+'9',730
+---- TYPES
+string, bigint
+=====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/408b0aac/tests/query_test/test_aggregation.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_aggregation.py 
b/tests/query_test/test_aggregation.py
index 289a867..4999afe 100644
--- a/tests/query_test/test_aggregation.py
+++ b/tests/query_test/test_aggregation.py
@@ -271,14 +271,14 @@ class TestAggregationQueries(ImpalaTestSuite):
       # Verify codegen was enabled for all four stages of the aggregation.
       assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
 
-  def test_parquet_count_star_optimization(self, vector):
+  def test_parquet_count_star_optimization(self, vector, unique_database):
     if (vector.get_value('table_format').file_format != 'text' or
         vector.get_value('table_format').compression_codec != 'none'):
       # No need to run this test on all file formats
       pytest.skip()
-    self.run_test_case('QueryTest/parquet-stats-agg', vector)
+    self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database)
     vector.get_value('exec_option')['batch_size'] = 1
-    self.run_test_case('QueryTest/parquet-stats-agg', vector)
+    self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database)
 
 class TestWideAggregationQueries(ImpalaTestSuite):
   """Test that aggregations with many grouping columns work"""

Reply via email to