IMPALA-6595: fix crash in NljBuilder::Close() The bug is that the right child of a blocking join node could be closed before the builder if an error was encountered when sending a batch to the sink. This hits a DCHECK because Buffers owned by the sink may still be accounted against the child node.
Testing: Added the test that originally triggered the problem. It reproduced the failure when based on the IMPALA-4835 patch, but I can't reproduce the failure after rebase onto master. Change-Id: Ie46b87a4889d7cee907124796c830db41125cf15 Reviewed-on: http://gerrit.cloudera.org:8080/9493 Tested-by: Impala Public Jenkins Reviewed-by: Tim Armstrong <tarmstr...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/a8630cfd Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/a8630cfd Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/a8630cfd Branch: refs/heads/2.x Commit: a8630cfd7df8f7743f5db5e35e210d28bf3f59bc Parents: 18823f6 Author: Tim Armstrong <tarmstr...@cloudera.com> Authored: Sat Mar 3 16:44:00 2018 -0800 Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org> Committed: Wed Mar 7 19:32:21 2018 +0000 ---------------------------------------------------------------------- be/src/exec/blocking-join-node.cc | 5 +++-- be/src/exec/data-sink.h | 1 + be/src/exec/exec-node.h | 3 +-- be/src/exec/nested-loop-join-node.cc | 6 +++++- .../QueryTest/single-node-nlj-exhaustive.test | 15 ++++++++++++++- 5 files changed, 24 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/a8630cfd/be/src/exec/blocking-join-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/blocking-join-node.cc b/be/src/exec/blocking-join-node.cc index e8c0948..cfaf91a 100644 --- a/be/src/exec/blocking-join-node.cc +++ b/be/src/exec/blocking-join-node.cc @@ -159,9 +159,10 @@ void BlockingJoinNode::ProcessBuildInputAsync( // probed. BlockingJoinNode::Open() will return failure as soon as child(0)->Open() // completes. if (CanCloseBuildEarly() || !status->ok()) { - // Release resources in 'build_batch_' before closing the children as some of the - // resources are still accounted towards the children node. + // Release resources in 'build_batch_' and 'build_sink' before closing the children + // as some of the resources are still accounted towards the children node. build_batch_.reset(); + if (!status->ok()) build_sink->Close(state); child(1)->Close(state); } http://git-wip-us.apache.org/repos/asf/impala/blob/a8630cfd/be/src/exec/data-sink.h ---------------------------------------------------------------------- diff --git a/be/src/exec/data-sink.h b/be/src/exec/data-sink.h index 23df628..d2e80af 100644 --- a/be/src/exec/data-sink.h +++ b/be/src/exec/data-sink.h @@ -99,6 +99,7 @@ class DataSink { const std::vector<ScalarExprEvaluator*>& output_expr_evals() const { return output_expr_evals_; } + bool is_closed() const { return closed_; } protected: /// Set to true after Close() has been called. Subclasses should check and set this in http://git-wip-us.apache.org/repos/asf/impala/blob/a8630cfd/be/src/exec/exec-node.h ---------------------------------------------------------------------- diff --git a/be/src/exec/exec-node.h b/be/src/exec/exec-node.h index 187d9a7..302f2e5 100644 --- a/be/src/exec/exec-node.h +++ b/be/src/exec/exec-node.h @@ -210,6 +210,7 @@ class ExecNode { MemTracker* expr_mem_tracker() { return expr_mem_tracker_.get(); } MemPool* expr_perm_pool() { return expr_perm_pool_.get(); } MemPool* expr_results_pool() { return expr_results_pool_.get(); } + bool is_closed() const { return is_closed_; } /// Return true if codegen was disabled by the planner for this ExecNode. Does not /// check to see if codegen was enabled for the enclosing fragment. @@ -335,8 +336,6 @@ class ExecNode { /// reservations pool in Close(). BufferPool::ClientHandle buffer_pool_client_; - bool is_closed() const { return is_closed_; } - /// Pointer to the containing SubplanNode or NULL if not inside a subplan. /// Set by SubplanNode::Init(). Not owned. SubplanNode* containing_subplan_; http://git-wip-us.apache.org/repos/asf/impala/blob/a8630cfd/be/src/exec/nested-loop-join-node.cc ---------------------------------------------------------------------- diff --git a/be/src/exec/nested-loop-join-node.cc b/be/src/exec/nested-loop-join-node.cc index c5ecf53..e0a375b 100644 --- a/be/src/exec/nested-loop-join-node.cc +++ b/be/src/exec/nested-loop-join-node.cc @@ -138,7 +138,11 @@ void NestedLoopJoinNode::Close(RuntimeState* state) { if (is_closed()) return; ScalarExprEvaluator::Close(join_conjunct_evals_, state); ScalarExpr::Close(join_conjuncts_); - if (builder_ != NULL) builder_->Close(state); + if (builder_ != NULL) { + // IMPALA-6595: builder must be closed before child. + DCHECK(builder_->is_closed() || !children_[1]->is_closed()); + builder_->Close(state); + } build_batches_ = NULL; if (matching_build_rows_ != NULL) { mem_tracker()->Release(matching_build_rows_->MemUsage()); http://git-wip-us.apache.org/repos/asf/impala/blob/a8630cfd/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj-exhaustive.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj-exhaustive.test b/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj-exhaustive.test index a6b3cae..e194465 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj-exhaustive.test +++ b/testdata/workloads/functional-query/queries/QueryTest/single-node-nlj-exhaustive.test @@ -6,7 +6,7 @@ select straight_join * from (values(1 id), (2), (3)) v1, (select *, count(*) over() from tpch.lineitem where l_orderkey < 100000) v2 order by id, l_orderkey, l_partkey, l_suppkey, l_linenumber limit 5 -----RESULTS +---- RESULTS 1,1,2132,4633,4,28.00,28955.64,0.09,0.06,'N','O','1996-04-21','1996-03-30','1996-05-16','NONE','AIR','lites. fluffily even de',100382 1,1,15635,638,6,32.00,49620.16,0.07,0.02,'N','O','1996-01-30','1996-02-07','1996-02-03','DELIVER IN PERSON','MAIL','arefully slyly ex',100382 1,1,24027,1534,5,24.00,22824.48,0.10,0.04,'N','O','1996-03-30','1996-03-14','1996-04-01','NONE','FOB',' pending foxes. slyly re',100382 @@ -15,3 +15,16 @@ limit 5 ---- TYPES tinyint,bigint,bigint,bigint,int,decimal,decimal,decimal,decimal,string,string,string,string,string,string,string,string,bigint ==== +---- QUERY +# IMPALA-6595: same query as above, but tuned to trigger a different bug where we hit +# "Memory Limit Exceeded" when adding a batch to NljBuilder. +# TODO: after IMPALA-4835 goes in, retune this query. +set buffer_pool_limit=50m; +set mem_limit=70m; +select straight_join * from (values(1 id), (2), (3)) v1, +(select *, count(*) over() from tpch.lineitem where l_orderkey < 100000) v2 +order by id, l_orderkey, l_partkey, l_suppkey, l_linenumber +limit 5 +---- CATCH +Memory limit exceeded +====