IMPALA-5679: Fix Parquet count(*) with group by string. A recent patch (IMPALA-5036) introduced a bug where a count(*) query grouped by a string partition column returned incorrect results. The accessor parquet_count_star_slot_offset() was declared to return bool instead of int, so the slot offset was converted to 0 or 1 and data was being written into the tuple at an incorrect offset.
Testing: - Added an end to end test where we are selecting from a table partitioned by string. Change-Id: I225547574c2b2259ca81cb642d082e151f3bed6b Reviewed-on: http://gerrit.cloudera.org:8080/7481 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/408b0aac Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/408b0aac Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/408b0aac Branch: refs/heads/master Commit: 408b0aac831ab7d6d6459353848f9a11b811e281 Parents: 1653419 Author: Taras Bobrovytsky <[email protected]> Authored: Fri Jul 21 14:13:28 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Sat Jul 22 05:53:06 2017 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-scan-node-base.h | 2 +- .../queries/QueryTest/parquet-stats-agg.test | 22 ++++++++++++++++++++ tests/query_test/test_aggregation.py | 6 +++--- 3 files changed, 26 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/408b0aac/be/src/exec/hdfs-scan-node-base.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scan-node-base.h b/be/src/exec/hdfs-scan-node-base.h index f71f5b4..c79a6e8 100644 --- a/be/src/exec/hdfs-scan-node-base.h +++ b/be/src/exec/hdfs-scan-node-base.h @@ -155,7 +155,7 @@ class HdfsScanNodeBase : public ScanNode { int skip_header_line_count() const { return skip_header_line_count_; } DiskIoRequestContext* reader_context() { return reader_context_; } bool optimize_parquet_count_star() const { return optimize_parquet_count_star_; } - bool parquet_count_star_slot_offset() const { return parquet_count_star_slot_offset_; } + int parquet_count_star_slot_offset() const { 
return parquet_count_star_slot_offset_; } typedef std::unordered_map<TupleId, std::vector<ScalarExprEvaluator*>> ConjunctEvaluatorsMap; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/408b0aac/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test index 3b1c33b..620c50b 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test @@ -115,3 +115,25 @@ select count(*) from tpch_parquet.lineitem ---- TYPES bigint ===== +---- QUERY +# IMPALA-5679: Count(*) with group by on a string partition column. +drop table if exists $DATABASE.string_partitioned_table; +create table $DATABASE.string_partitioned_table (int_col integer) +partitioned by (string_col STRING) stored as parquet; +insert into $DATABASE.string_partitioned_table partition(string_col) +select int_col, string_col from functional.alltypes; +select string_col, count(*) from $DATABASE.string_partitioned_table group by string_col; +---- RESULTS +'0',730 +'1',730 +'2',730 +'3',730 +'4',730 +'5',730 +'6',730 +'7',730 +'8',730 +'9',730 +---- TYPES +string, bigint +===== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/408b0aac/tests/query_test/test_aggregation.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py index 289a867..4999afe 100644 --- a/tests/query_test/test_aggregation.py +++ b/tests/query_test/test_aggregation.py @@ -271,14 +271,14 @@ class TestAggregationQueries(ImpalaTestSuite): # Verify codegen was enabled for all four stages of the aggregation. 
assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6]) - def test_parquet_count_star_optimization(self, vector): + def test_parquet_count_star_optimization(self, vector, unique_database): if (vector.get_value('table_format').file_format != 'text' or vector.get_value('table_format').compression_codec != 'none'): # No need to run this test on all file formats pytest.skip() - self.run_test_case('QueryTest/parquet-stats-agg', vector) + self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database) vector.get_value('exec_option')['batch_size'] = 1 - self.run_test_case('QueryTest/parquet-stats-agg', vector) + self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database) class TestWideAggregationQueries(ImpalaTestSuite): """Test that aggregations with many grouping columns work"""
