IMPALA-5554: sorter DCHECK on null column The bug was in the DCHECK. The DCHECK is intended to make sure that a tuple's string data didn't get split across blocks. The logic assumed that if the second-or-later string column was in the next-block, that the strings were split between blocks. However, that assumption is invalid if there are NULL strings, which do not belong in any block.
The fix for the DCHECK (which is still useful) is to count the number of non-NULL strings and make sure that no non-NULL strings were split between blocks. Testing: Added a test that reproduces the crash. Change-Id: I7a8dee982501008efff5b5abc192cfb5e6544a90 Reviewed-on: http://gerrit.cloudera.org:8080/7295 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/4a3ef9c7 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/4a3ef9c7 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/4a3ef9c7 Branch: refs/heads/master Commit: 4a3ef9c7731a45c028a36ef75d3a6a10bd4bf4cd Parents: d9fc9be Author: Tim Armstrong <[email protected]> Authored: Sun Jun 25 15:37:33 2017 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Tue Jun 27 05:36:11 2017 +0000 ---------------------------------------------------------------------- be/src/runtime/sorter.cc | 6 +++- .../QueryTest/single-node-large-sorts.test | 36 ++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4a3ef9c7/be/src/runtime/sorter.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc index b8182e2..6760373 100644 --- a/be/src/runtime/sorter.cc +++ b/be/src/runtime/sorter.cc @@ -1006,9 +1006,11 @@ bool Sorter::Run::ConvertOffsetsToPtrs(Tuple* tuple) { var_len_blocks_[var_len_blocks_index_]->buffer(); const vector<SlotDescriptor*>& string_slots = sort_tuple_desc_->string_slots(); + int num_non_null_string_slots = 0; for (int i = 0; i < string_slots.size(); ++i) { SlotDescriptor* slot_desc = string_slots[i]; if (tuple->IsNull(slot_desc->null_indicator_offset())) continue; + ++num_non_null_string_slots; DCHECK(slot_desc->type().IsVarLenStringType()); StringValue* value = reinterpret_cast<StringValue*>( @@ -1026,7 +1028,9 @@ bool Sorter::Run::ConvertOffsetsToPtrs(Tuple* tuple) { DCHECK_LE(block_index, var_len_blocks_.size()); DCHECK_EQ(block_index, var_len_blocks_index_ + 1); DCHECK_EQ(block_offset, 0); // The data is the first thing in the next block. - DCHECK_EQ(i, 0); // Var len data for tuple shouldn't be split across blocks. + // This must be the first slot with var len data for the tuple. Var len data + // for tuple shouldn't be split across blocks. + DCHECK_EQ(num_non_null_string_slots, 1); return false; } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/4a3ef9c7/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test index 8000ad3..74b7eee 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test +++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-large-sorts.test @@ -29,6 +29,38 @@ STRING,STRING,STRING 'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','','' 'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','','' ---- RUNTIME_PROFILE -row_regex: .* TotalMergesPerformed: [^0] .* -row_regex: .* SpilledRuns: [^0] .* +row_regex: .* TotalMergesPerformed: [^0].* +row_regex: .* SpilledRuns: [^0].* +==== +---- QUERY +# Regression test for IMPALA-5554: first string column in sort tuple is null +# on boundary of spilled block. Test does two sorts with a NULL and non-NULL +# string column in both potential orders. +set max_block_mgr_memory=50m; +select * +from ( + select *, first_value(col) over (order by sort_col) fv + from ( + select concat(l_linestatus, repeat('a', 63)) sort_col, if(l_returnflag = 'foo', l_returnflag, NULL) col + from tpch_parquet.lineitem limit 100000 + union all + select if(l_returnflag = 'foo', l_returnflag, NULL) sort_col, concat(l_linestatus, repeat('a', 63)) col + from tpch_parquet.lineitem) q limit 100000 + ) q2 +limit 10 +---- TYPES +STRING,STRING,STRING +---- RESULTS +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +'Faaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa','NULL','NULL' +---- RUNTIME_PROFILE +row_regex: .* SpilledRuns: [^0].* ====
