IMPALA-6187: Fix missing conjuncts evaluation with empty projection Previously, scanners will assume that there are no conjuncts associated with a scan node for queries with no materialized slots (e.g. count(*)). This is not necessarily the case as one can write queries such as select count(*) from tpch.lineitem where rand() * 10 < 0; or select count(*) from tpch.lineitem where rand() > <a partition column>. In which case, the conjuncts should still be evaluated once per row.
This change fixes the problem in the short-circuit handling logic for count(*) to evaluate the conjuncts once per row and only commits a row to the output row batch if the conjuncts evaluate to true. Testing done: Added the example above to the scanner test Change-Id: Ib530f1fdcd2c6de699977db163b3f6eb38481517 Reviewed-on: http://gerrit.cloudera.org:8080/8623 Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> Reviewed-by: Alex Behm <alex.b...@cloudera.com> Reviewed-by: Dan Hecht <dhe...@cloudera.com> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/63f17e9c Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/63f17e9c Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/63f17e9c Branch: refs/heads/master Commit: 63f17e9ceaed92a28ea12567a36b746e54fffdb3 Parents: 2fba80e Author: Michael Ho <k...@cloudera.com> Authored: Mon Nov 20 19:35:06 2017 -0800 Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org> Committed: Wed Nov 29 05:53:15 2017 +0000 ---------------------------------------------------------------------- be/src/exec/hdfs-parquet-scanner.cc | 2 +- be/src/exec/hdfs-scanner.cc | 25 +++++++++++++++----- be/src/exec/hdfs-scanner.h | 8 +++++-- be/src/exec/hdfs-sequence-scanner.cc | 2 +- be/src/exec/kudu-scanner.cc | 13 +++++++++- be/src/exec/kudu-scanner.h | 2 ++ .../queries/QueryTest/kudu-scan-node.test | 9 +++++++ .../queries/QueryTest/scanners.test | 24 +++++++++++++++++++ 8 files changed, 74 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-parquet-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc index f407877..4e4abef 100644 --- a/be/src/exec/hdfs-parquet-scanner.cc +++ b/be/src/exec/hdfs-parquet-scanner.cc @@ -461,7 +461,7 @@ Status HdfsParquetScanner::GetNextInternal(RowBatch* row_batch) { Status status = CommitRows(row_batch, num_to_commit); assemble_rows_timer_.Stop(); RETURN_IF_ERROR(status); - row_group_rows_read_ += num_to_commit; + row_group_rows_read_ += max_tuples; COUNTER_ADD(scan_node_->rows_read_counter(), row_group_rows_read_); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc index 0dbfc5f..f53b7d0 100644 --- a/be/src/exec/hdfs-scanner.cc +++ b/be/src/exec/hdfs-scanner.cc @@ -207,12 +207,25 @@ Status HdfsScanner::CommitRows(int num_rows, RowBatch* row_batch) { int HdfsScanner::WriteTemplateTuples(TupleRow* row, int num_tuples) { DCHECK_GE(num_tuples, 0); DCHECK_EQ(scan_node_->tuple_idx(), 0); - DCHECK_EQ(conjunct_evals_->size(), 0); - if (num_tuples == 0 || template_tuple_ == NULL) return num_tuples; - - Tuple** row_tuple = reinterpret_cast<Tuple**>(row); - for (int i = 0; i < num_tuples; ++i) row_tuple[i] = template_tuple_; - return num_tuples; + DCHECK_EQ(scan_node_->materialized_slots().size(), 0); + int num_to_commit = 0; + if (LIKELY(conjunct_evals_->size() == 0)) { + num_to_commit = num_tuples; + } else { + TupleRow template_tuple_row; + template_tuple_row.SetTuple(0, template_tuple_); + // Evaluate any conjuncts which may reference the partition columns. + for (int i = 0; i < num_tuples; ++i) { + if (EvalConjuncts(&template_tuple_row)) ++num_to_commit; + } + } + if (template_tuple_ != nullptr) { + Tuple** row_tuple = reinterpret_cast<Tuple**>(row); + for (int i = 0; i < num_to_commit; ++i) row_tuple[i] = template_tuple_; + } else { + DCHECK_EQ(scan_node_->tuple_desc()->byte_size(), 0); + } + return num_to_commit; } bool HdfsScanner::WriteCompleteTuple(MemPool* pool, FieldLocation* fields, http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-scanner.h b/be/src/exec/hdfs-scanner.h index c603593..9b80d6c 100644 --- a/be/src/exec/hdfs-scanner.h +++ b/be/src/exec/hdfs-scanner.h @@ -328,8 +328,12 @@ class HdfsScanner { return ExecNode::EvalConjuncts(conjunct_evals_->data(), conjunct_evals_->size(), row); } - /// Sets 'num_tuples' template tuples in the batch that 'row' points to. Assumes the - /// 'tuple_row' only has a single tuple. Returns the number of tuples set. + /// Handles the case when there are no slots materialized (e.g. count(*)) by adding + /// up to 'num_tuples' rows to the row batch which 'row' points to. Assumes each tuple + /// row only has one tuple. Set the added tuples in the row batch with the template + /// tuple if it's not NULL. In the rare case when there are conjuncts, evaluate them + /// once for each row and only add a row when they evaluate to true. Returns the number + /// of tuple rows added. int WriteTemplateTuples(TupleRow* row, int num_tuples); /// Processes batches of fields and writes them out to tuple_row_mem. http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/hdfs-sequence-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/hdfs-sequence-scanner.cc b/be/src/exec/hdfs-sequence-scanner.cc index 67f598c..346a18a 100644 --- a/be/src/exec/hdfs-sequence-scanner.cc +++ b/be/src/exec/hdfs-sequence-scanner.cc @@ -341,7 +341,7 @@ Status HdfsSequenceScanner::ProcessRange(RowBatch* row_batch) { } } } else { - add_row = WriteTemplateTuples(tuple_row_mem, 1); + add_row = WriteTemplateTuples(tuple_row_mem, 1) > 0; } num_rows_read++; if (add_row) RETURN_IF_ERROR(CommitRows(1, row_batch)); http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/kudu-scanner.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/kudu-scanner.cc b/be/src/exec/kudu-scanner.cc index 7db8878..a9b56fe 100644 --- a/be/src/exec/kudu-scanner.cc +++ b/be/src/exec/kudu-scanner.cc @@ -244,8 +244,19 @@ Status KuduScanner::HandleEmptyProjection(RowBatch* row_batch) { int num_rows_remaining = cur_kudu_batch_.NumRows() - cur_kudu_batch_num_read_; int rows_to_add = std::min(row_batch->capacity() - row_batch->num_rows(), num_rows_remaining); + int num_to_commit = 0; + if (LIKELY(conjunct_evals_.empty())) { + num_to_commit = rows_to_add; + } else { + for (int i = 0; i < rows_to_add; ++i) { + if (ExecNode::EvalConjuncts(conjunct_evals_.data(), + conjunct_evals_.size(), nullptr)) { + ++num_to_commit; + } + } + } cur_kudu_batch_num_read_ += rows_to_add; - row_batch->CommitRows(rows_to_add); + row_batch->CommitRows(num_to_commit); return Status::OK(); } http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/be/src/exec/kudu-scanner.h ---------------------------------------------------------------------- diff --git a/be/src/exec/kudu-scanner.h b/be/src/exec/kudu-scanner.h index e6d4ca9..5617847 100644 --- a/be/src/exec/kudu-scanner.h +++ b/be/src/exec/kudu-scanner.h @@ -64,6 +64,8 @@ class KuduScanner { private: /// Handles the case where the projection is empty (e.g. count(*)). /// Does this by adding sets of rows to 'row_batch' instead of adding one-by-one. + /// If in the rare case where there is any conjunct, evaluate them once for each row + /// and add a row to the row batch only when the conjuncts evaluate to true. Status HandleEmptyProjection(RowBatch* row_batch); /// Decodes rows previously fetched from kudu, now in 'cur_rows_' into a RowBatch. http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test b/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test index 115affa..f32c8a1 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test +++ b/testdata/workloads/functional-query/queries/QueryTest/kudu-scan-node.test @@ -140,3 +140,12 @@ order by id; ---- TYPES INT, TIMESTAMP ==== +---- QUERY +# Regression test for IMPALA-6187. Make sure count(*) queries with partition columns only +# won't miss conjuncts evaluation. 'id' is the partition column here. +select count(*) from functional_kudu.alltypes where rand() + id < 0.0; +---- RESULTS +0 +---- TYPES +BIGINT +==== \ No newline at end of file http://git-wip-us.apache.org/repos/asf/impala/blob/63f17e9c/testdata/workloads/functional-query/queries/QueryTest/scanners.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/scanners.test b/testdata/workloads/functional-query/queries/QueryTest/scanners.test index 658e4cf..99ec5c5 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/scanners.test +++ b/testdata/workloads/functional-query/queries/QueryTest/scanners.test @@ -74,3 +74,27 @@ from nulltable where b = '' ---- TYPES STRING, STRING ==== +---- QUERY +# The following 3 tests are regression tests for IMPALA-6187. Make sure the conjuncts are +# evaluated when there are no materialized slots or only partition columns are accessed. +select count(*) from alltypes where rand() * 10 >= 0.0; +---- RESULTS +7300 +---- TYPES +BIGINT +==== +---- QUERY +select count(*) from alltypes where rand() * 10 < 0.0; +---- RESULTS +0 +---- TYPES +BIGINT +==== +---- QUERY +# 'year' and 'month' are partition columns. +select count(*) from alltypes where rand() - year > month; +---- RESULTS +0 +---- TYPES +BIGINT +====