IMPALA-7055: fix race with DML errors Error statuses could be lost because backend_exec_complete_barrier_ went to 0 before the query was transitioned to an error state. Reordering the UpdateExecState() and backend_exec_complete_barrier_ calls prevents this race.
Change-Id: Idafd0b342e77a065be7cc28fa8c8a9df445622c2 Reviewed-on: http://gerrit.cloudera.org:8080/10491 Reviewed-by: Tim Armstrong <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/2362b672 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/2362b672 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/2362b672 Branch: refs/heads/master Commit: 2362b672ccd94ed97331fe9c84ac1603ecb3772f Parents: a22ee64 Author: Tim Armstrong <[email protected]> Authored: Wed May 23 14:03:12 2018 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Thu May 24 19:09:49 2018 +0000 ---------------------------------------------------------------------- be/src/runtime/coordinator.cc | 24 ++++++++++++++++-------- be/src/runtime/coordinator.h | 3 +++ 2 files changed, 19 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/2362b672/be/src/runtime/coordinator.cc ---------------------------------------------------------------------- diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc index 998fee2..90e2390 100644 --- a/be/src/runtime/coordinator.cc +++ b/be/src/runtime/coordinator.cc @@ -669,25 +669,33 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param if (backend_state->ApplyExecStatusReport(params, &exec_summary_, &progress_)) { // This backend execution has completed. + if (VLOG_QUERY_IS_ON) { + // Don't log backend completion if the query has already been cancelled. + int pending_backends = backend_exec_complete_barrier_->pending(); + if (pending_backends >= 1) { + VLOG_QUERY << "Backend completed:" + << " host=" << TNetworkAddressToString(backend_state->impalad_address()) + << " remaining=" << pending_backends + << " query_id=" << PrintId(query_id()); + BackendState::LogFirstInProgress(backend_states_); + } + } bool is_fragment_failure; TUniqueId failed_instance_id; Status status = backend_state->GetStatus(&is_fragment_failure, &failed_instance_id); - int pending_backends = backend_exec_complete_barrier_->Notify(); - if (VLOG_QUERY_IS_ON && pending_backends >= 0) { - VLOG_QUERY << "Backend completed:" - << " host=" << TNetworkAddressToString(backend_state->impalad_address()) - << " remaining=" << pending_backends - << " query_id=" << PrintId(query_id()); - BackendState::LogFirstInProgress(backend_states_); - } if (!status.ok()) { // We may start receiving status reports before all exec rpcs are complete. // Can't apply state transition until no more exec rpcs will be sent. exec_rpcs_complete_barrier_->Wait(); + // Transition the status if we're not already in a terminal state. This won't block + // because either this transitions to an ERROR state or the query is already in + // a terminal state. discard_result(UpdateExecState(status, is_fragment_failure ? &failed_instance_id : nullptr, TNetworkAddressToString(backend_state->impalad_address()))); } + // We've applied all changes from the final status report - notify waiting threads. + backend_exec_complete_barrier_->Notify(); } // If all results have been returned, return a cancelled status to force the fragment // instance to stop executing. http://git-wip-us.apache.org/repos/asf/impala/blob/2362b672/be/src/runtime/coordinator.h ---------------------------------------------------------------------- diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h index ae85bcd..5bb399f 100644 --- a/be/src/runtime/coordinator.h +++ b/be/src/runtime/coordinator.h @@ -350,6 +350,9 @@ class Coordinator { // NOLINT: The member variables could be re-ordered to save /// the Coordinator object), then finalizes execution (cancels remaining backends if /// transitioning to CANCELLED; in all cases releases resources and calls /// ComputeQuerySummary()). Must not be called if exec RPCs are pending. + /// Will block waiting for backends to completed if transitioning to the + /// RETURNED_RESULTS terminal state. Does not block if already in terminal state or + /// transitioning to ERROR or CANCELLED. void HandleExecStateTransition(const ExecState old_state, const ExecState new_state); /// Return true if 'exec_state_' is RETURNED_RESULTS.
