This is an automated email from the ASF dual-hosted git repository. jasonmfehr pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push: new ec809fc16 IMPALA-14433: Fix OpenTelemetry Tracing Deadlock ec809fc16 is described below commit ec809fc16c32afd47ad795d4b8a26880413bf660 Author: jasonmfehr <jf...@cloudera.com> AuthorDate: Fri Sep 12 16:24:07 2025 -0700 IMPALA-14433: Fix OpenTelemetry Tracing Deadlock All functions in the SpanManager class operate under the assumption that child_span_mu_ in the SpanManager class will be locked before the ClientRequestState lock. However, the ImpalaServer::ExecuteInternal function takes the ClientRequestState lock before calling SpanManager::EndChildSpanPlanning. If another function in the SpanManager class has already taken the child_span_mu_ lock and is waiting for the ClientRequestState lock, a deadlock occurs. This issue was found by running end-to-end tests with OpenTelemetry tracing enabled and a release buildof Impala. Testing accomplished by re-running the end-to-end tests with OpenTelemetry tracing enabled and verifying that the deadlock no longer occurs. Change-Id: I7b43dba794cfe61d283bdd476e4056b9304d8947 Reviewed-on: http://gerrit.cloudera.org:8080/23422 Reviewed-by: Joe McDonnell <joemcdonn...@cloudera.com> Reviewed-by: Michael Smith <michael.sm...@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com> --- be/src/observe/span-manager.cc | 1 + be/src/observe/span-manager.h | 9 ++++----- be/src/service/impala-server.cc | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/be/src/observe/span-manager.cc b/be/src/observe/span-manager.cc index f5b21ac15..c556fb3a9 100644 --- a/be/src/observe/span-manager.cc +++ b/be/src/observe/span-manager.cc @@ -252,6 +252,7 @@ void SpanManager::StartChildSpanPlanning() { void SpanManager::EndChildSpanPlanning() { lock_guard<mutex> l(child_span_mu_); + lock_guard<mutex> crs_lock(*(client_request_state_->lock())); DoEndChildSpanPlanning(); } // function EndChildSpanPlanning diff --git a/be/src/observe/span-manager.h b/be/src/observe/span-manager.h index 643e404b0..2d8d4ccd5 100644 --- a/be/src/observe/span-manager.h +++ b/be/src/observe/span-manager.h @@ -92,15 +92,11 @@ public: // client_request_state_->lock(). void EndChildSpanInit(); void EndChildSpanSubmitted(); + void EndChildSpanPlanning(); void EndChildSpanAdmissionControl(const Status& cause); void EndChildSpanQueryExecution(); void EndChildSpanClose(); - // Function to end the Planning child span. This function takes ownership of - // child_span_mu_ BUT NOT client_request_state_->lock(). The code calling this function - // already holds client_request_state_->lock(). - void EndChildSpanPlanning(); - private: // Tracer instance used to construct spans. opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer_; @@ -120,6 +116,9 @@ private: std::shared_ptr<TimedSpan> root_; // TimedSpan instance for the current child span and the mutex to protect it. + // To ensure no deadlocks, locks must be acquired in the following order: + // 1. SpanManager::child_span_mu_ + // 2. ClientRequestState::lock_ std::unique_ptr<TimedSpan> current_child_; std::mutex child_span_mu_; diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc index 23be7ee4b..7b9b639ab 100644 --- a/be/src/service/impala-server.cc +++ b/be/src/service/impala-server.cc @@ -1422,10 +1422,10 @@ Status ImpalaServer::ExecuteInternal(const TQueryCtx& query_ctx, if (result.__isset.result_set_metadata) { (*query_handle)->set_result_metadata(result.result_set_metadata); } + } - if ((*query_handle)->otel_trace_query()) { - (*query_handle)->otel_span_manager()->EndChildSpanPlanning(); - } + if ((*query_handle)->otel_trace_query()) { + (*query_handle)->otel_span_manager()->EndChildSpanPlanning(); } VLOG(2) << "Execution request: "