This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 6b292bdd1527ec5501685c4564145d2a725195d9
Author: wzhou-code <[email protected]>
AuthorDate: Tue Dec 8 23:55:17 2020 -0800

    IMPALA-10336: Coordinator return incorrect error to client
    
    Due to race condition, coordinator could set execution status as RPC
    aborted due to cancellation. This internal error should not be
    returned to client.
    This patch fixed the issue by setting the backend status as CANCELLED
    instead of ABORTED if the exec RPC was aborted due to cancellation.
    
    Testing:
     - Manual tests
       Since this is a racy bug, I could only reproduce the situation by
       adding some artificial delays in 3 places: QueryExecMgr.StartQuery(),
       Coordinator.UpdateBackendExecStatus(), and
       Coordinator::StartBackendExec() when running test case
       test_scanners.py::TestOrc::test_type_conversions_hive3.
       Verified that the issue did not happen after applying this patch
       by running test_scanners.py::TestOrc::test_type_conversions_hive3
       in a loop for hours.
     - Passed exhausive test.
    
    Change-Id: I75f252e43006c6ff6980800e3254672de396b318
    Reviewed-on: http://gerrit.cloudera.org:8080/16849
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/runtime/coordinator-backend-state.cc | 14 ++++++++++++--
 be/src/runtime/coordinator-backend-state.h  |  3 +++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/be/src/runtime/coordinator-backend-state.cc 
b/be/src/runtime/coordinator-backend-state.cc
index c7beac4..df75f35 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -213,8 +213,17 @@ void Coordinator::BackendState::ExecCompleteCb(
     rpc_latency_ = MonotonicMillis() - start_ms;
 
     if (!exec_rpc_status_.ok()) {
-      SetExecError(
-          FromKuduStatus(exec_rpc_status_, "Exec() rpc failed"), 
exec_status_barrier);
+      // Return CANCELLED instead of ABORTED if the RPC is cancelled.
+      if (cancel_exec_rpc_ && exec_rpc_status_.IsAborted()) {
+        LOG(ERROR) << "ExecQueryFInstances rpc query_id=" << PrintId(query_id_)
+                   << " was aborted by cancellation";
+        status_ = Status::CANCELLED;
+        exec_done_ = true;
+        exec_status_barrier->NotifyRemaining(status_);
+      } else {
+        SetExecError(
+            FromKuduStatus(exec_rpc_status_, "Exec() rpc failed"), 
exec_status_barrier);
+      }
       goto done;
     }
 
@@ -584,6 +593,7 @@ Coordinator::BackendState::CancelResult 
Coordinator::BackendState::Cancel(
   // and then wait for it to be done.
   if (!exec_done_) {
     VLogForBackend("Attempting to cancel Exec() rpc");
+    cancel_exec_rpc_ = true;
     exec_rpc_controller_.Cancel();
     WaitOnExecLocked(&l);
   }
diff --git a/be/src/runtime/coordinator-backend-state.h 
b/be/src/runtime/coordinator-backend-state.h
index d9b5bf8..9c6fc55 100644
--- a/be/src/runtime/coordinator-backend-state.h
+++ b/be/src/runtime/coordinator-backend-state.h
@@ -418,6 +418,9 @@ class Coordinator::BackendState {
   /// True if a CancelQueryFInstances RPC was already sent to this backend.
   bool sent_cancel_rpc_ = false;
 
+  /// True if Exec() RPC is cancelled.
+  bool cancel_exec_rpc_ = false;
+
   /// Total scan ranges complete across all scan nodes. Set in 
ApplyExecStatusReport().
   int64_t total_ranges_complete_ = 0;
 

Reply via email to