This is an automated email from the ASF dual-hosted git repository.

jasonmfehr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new ec809fc16 IMPALA-14433: Fix OpenTelemetry Tracing Deadlock
ec809fc16 is described below

commit ec809fc16c32afd47ad795d4b8a26880413bf660
Author: jasonmfehr <jf...@cloudera.com>
AuthorDate: Fri Sep 12 16:24:07 2025 -0700

    IMPALA-14433: Fix OpenTelemetry Tracing Deadlock
    
    All functions in the SpanManager class operate under the assumption
    that child_span_mu_ in the SpanManager class will be locked before
    the ClientRequestState lock. However, the
    ImpalaServer::ExecuteInternal function takes the ClientRequestState
    lock before calling SpanManager::EndChildSpanPlanning. If another
    function in the SpanManager class has already taken the
    child_span_mu_ lock and is waiting for the ClientRequestState lock,
    a deadlock occurs.
    
    This issue was found by running end-to-end tests with OpenTelemetry
    tracing enabled and a release buildof Impala.
    
    Testing accomplished by re-running the end-to-end tests with
    OpenTelemetry tracing enabled and verifying that the deadlock no
    longer occurs.
    
    Change-Id: I7b43dba794cfe61d283bdd476e4056b9304d8947
    Reviewed-on: http://gerrit.cloudera.org:8080/23422
    Reviewed-by: Joe McDonnell <joemcdonn...@cloudera.com>
    Reviewed-by: Michael Smith <michael.sm...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 be/src/observe/span-manager.cc  | 1 +
 be/src/observe/span-manager.h   | 9 ++++-----
 be/src/service/impala-server.cc | 6 +++---
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/be/src/observe/span-manager.cc b/be/src/observe/span-manager.cc
index f5b21ac15..c556fb3a9 100644
--- a/be/src/observe/span-manager.cc
+++ b/be/src/observe/span-manager.cc
@@ -252,6 +252,7 @@ void SpanManager::StartChildSpanPlanning() {
 
 void SpanManager::EndChildSpanPlanning() {
   lock_guard<mutex> l(child_span_mu_);
+  lock_guard<mutex> crs_lock(*(client_request_state_->lock()));
   DoEndChildSpanPlanning();
 } // function EndChildSpanPlanning
 
diff --git a/be/src/observe/span-manager.h b/be/src/observe/span-manager.h
index 643e404b0..2d8d4ccd5 100644
--- a/be/src/observe/span-manager.h
+++ b/be/src/observe/span-manager.h
@@ -92,15 +92,11 @@ public:
   // client_request_state_->lock().
   void EndChildSpanInit();
   void EndChildSpanSubmitted();
+  void EndChildSpanPlanning();
   void EndChildSpanAdmissionControl(const Status& cause);
   void EndChildSpanQueryExecution();
   void EndChildSpanClose();
 
-  // Function to end the Planning child span. This function takes ownership of
-  // child_span_mu_ BUT NOT client_request_state_->lock(). The code calling 
this function
-  // already holds client_request_state_->lock().
-  void EndChildSpanPlanning();
-
 private:
   // Tracer instance used to construct spans.
   opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer_;
@@ -120,6 +116,9 @@ private:
   std::shared_ptr<TimedSpan> root_;
 
   // TimedSpan instance for the current child span and the mutex to protect it.
+  // To ensure no deadlocks, locks must be acquired in the following order:
+  // 1. SpanManager::child_span_mu_
+  // 2. ClientRequestState::lock_
   std::unique_ptr<TimedSpan> current_child_;
   std::mutex child_span_mu_;
 
diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc
index 23be7ee4b..7b9b639ab 100644
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -1422,10 +1422,10 @@ Status ImpalaServer::ExecuteInternal(const TQueryCtx& 
query_ctx,
     if (result.__isset.result_set_metadata) {
       (*query_handle)->set_result_metadata(result.result_set_metadata);
     }
+  }
 
-    if ((*query_handle)->otel_trace_query()) {
-      (*query_handle)->otel_span_manager()->EndChildSpanPlanning();
-    }
+  if ((*query_handle)->otel_trace_query()) {
+    (*query_handle)->otel_span_manager()->EndChildSpanPlanning();
   }
 
   VLOG(2) << "Execution request: "

Reply via email to