This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 6b292bdd1527ec5501685c4564145d2a725195d9 Author: wzhou-code <[email protected]> AuthorDate: Tue Dec 8 23:55:17 2020 -0800 IMPALA-10336: Coordinator return incorrect error to client Due to race condition, coordinator could set execution status as RPC aborted due to cancellation. This internal error should not be returned to client. This patch fixed the issue by setting the backend status as CANCELLED instead of ABORTED if the exec RPC was aborted due to cancellation. Testing: - Manual tests Since this is a racy bug, I could only reproduce the situation by adding some artificial delays in 3 places: QueryExecMgr.StartQuery(), Coordinator.UpdateBackendExecStatus(), and Coordinator::StartBackendExec() when running test case test_scanners.py::TestOrc::test_type_conversions_hive3. Verified that the issue did not happen after applying this patch by running test_scanners.py::TestOrc::test_type_conversions_hive3 in a loop for hours. - Passed exhausive test. Change-Id: I75f252e43006c6ff6980800e3254672de396b318 Reviewed-on: http://gerrit.cloudera.org:8080/16849 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- be/src/runtime/coordinator-backend-state.cc | 14 ++++++++++++-- be/src/runtime/coordinator-backend-state.h | 3 +++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc index c7beac4..df75f35 100644 --- a/be/src/runtime/coordinator-backend-state.cc +++ b/be/src/runtime/coordinator-backend-state.cc @@ -213,8 +213,17 @@ void Coordinator::BackendState::ExecCompleteCb( rpc_latency_ = MonotonicMillis() - start_ms; if (!exec_rpc_status_.ok()) { - SetExecError( - FromKuduStatus(exec_rpc_status_, "Exec() rpc failed"), exec_status_barrier); + // Return CANCELLED instead of ABORTED if the RPC is cancelled. + if (cancel_exec_rpc_ && exec_rpc_status_.IsAborted()) { + LOG(ERROR) << "ExecQueryFInstances rpc query_id=" << PrintId(query_id_) + << " was aborted by cancellation"; + status_ = Status::CANCELLED; + exec_done_ = true; + exec_status_barrier->NotifyRemaining(status_); + } else { + SetExecError( + FromKuduStatus(exec_rpc_status_, "Exec() rpc failed"), exec_status_barrier); + } goto done; } @@ -584,6 +593,7 @@ Coordinator::BackendState::CancelResult Coordinator::BackendState::Cancel( // and then wait for it to be done. if (!exec_done_) { VLogForBackend("Attempting to cancel Exec() rpc"); + cancel_exec_rpc_ = true; exec_rpc_controller_.Cancel(); WaitOnExecLocked(&l); } diff --git a/be/src/runtime/coordinator-backend-state.h b/be/src/runtime/coordinator-backend-state.h index d9b5bf8..9c6fc55 100644 --- a/be/src/runtime/coordinator-backend-state.h +++ b/be/src/runtime/coordinator-backend-state.h @@ -418,6 +418,9 @@ class Coordinator::BackendState { /// True if a CancelQueryFInstances RPC was already sent to this backend. bool sent_cancel_rpc_ = false; + /// True if Exec() RPC is cancelled. + bool cancel_exec_rpc_ = false; + /// Total scan ranges complete across all scan nodes. Set in ApplyExecStatusReport(). int64_t total_ranges_complete_ = 0;
