This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new d616fb44db1 branch-4.1: [fix](fragment) avoid query-ctx map clear self-deadlock when stop FragmentMgr (#64552)
d616fb44db1 is described below

commit d616fb44db17549800736477c1f9dda62f1f1249
Author: Pxl <[email protected]>
AuthorDate: Tue Jun 16 18:15:45 2026 +0800

    branch-4.1: [fix](fragment) avoid query-ctx map clear self-deadlock when stop FragmentMgr (#64552)
    
    ### What problem does this PR solve?
    
    Issue Number: close #xxx
    
    Related PR: #62954
    
    Problem Summary: Backport #62954 to branch-4.1. During BE shutdown,
    FragmentMgr clears `_query_ctx_map_delay_delete`. The old
    `ConcurrentContextMap::clear()` destroyed map values while holding the
    shard mutex. Releasing the last `QueryContext` reference can run
    `QueryContext::~QueryContext()`, which calls back into
    `FragmentMgr::remove_query_context()` and attempts to erase from the
    same delay-delete map, causing `std::system_error` with `Resource
    deadlock avoided` and then BE abort. The fix swaps each shard map into a
    local map under the lock, releases the lock, and only then destroys the
    values.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test:
        - Manual test: `git diff --check upstream/branch-4.1..HEAD`
        - Manual test: `./build-support/check-format.sh`
    - Unit Test: `./run-be-ut.sh --run
    --filter=FragmentMgrDelayDeleteMapTest.*` was started locally but not
    completed because it triggered a full ASAN_UT rebuild (`14920` ninja
    targets); the run was interrupted before test execution.
    - Behavior changed: No
    - Does this need documentation: No
    
    Co-authored-by: linrrarity <[email protected]>
---
 be/src/runtime/fragment_mgr.cpp                    | 17 ++++++++++--
 .../fragment_mgr_cross_cluster_cancel_test.cpp     | 30 ++++++++++++++++++++++
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp
index 7fc67d20f56..2b61032502c 100644
--- a/be/src/runtime/fragment_mgr.cpp
+++ b/be/src/runtime/fragment_mgr.cpp
@@ -298,9 +298,22 @@ void ConcurrentContextMap<Key, Value, ValueType>::insert(const Key& query_id,
 
 template <typename Key, typename Value, typename ValueType>
 void ConcurrentContextMap<Key, Value, ValueType>::clear() {
+    // Avoid self-deadlock from releasing the last QueryContext
+    // in _query_ctx_map_delay_delete:
+    // FragmentMgr::stop()
+    //   -> _query_ctx_map_delay_delete.clear()
+    //     -> unique_lock(query_id_lock)
+    //     -> map.clear()
+    //       -> QueryContext::~QueryContext()
+    //         -> FragmentMgr::remove_query_context(query_id)
+    //           -> _query_ctx_map_delay_delete.erase(query_id)
+    //             -> unique_lock(query_id_lock) <- deadlock
     for (auto& pair : _internal_map) {
-        std::unique_lock lock(*pair.first);
-        auto& map = pair.second;
+        phmap::flat_hash_map<Key, Value> map;
+        {
+            std::unique_lock lock(*pair.first);
+            map.swap(pair.second);
+        }
         map.clear();
     }
 }
diff --git a/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp b/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
index 91c242f4b8f..70dd2e874ba 100644
--- a/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
+++ b/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
@@ -153,4 +153,34 @@ TEST_F(FragmentMgrCrossClusterCancelTest, CancelWorkerInvalidQueryDetectionSkips
     EXPECT_TRUE(queries_pipeline_task_leak.empty());
 }
 
+TEST(FragmentMgrDelayDeleteMapTest, ClearShouldNotAbortWhenReleasingLastQueryContextRef) {
+    auto* exec_env = ExecEnv::GetInstance();
+    auto* previous_fragment_mgr = exec_env->_fragment_mgr;
+    exec_env->_fragment_mgr = new FragmentMgr(exec_env);
+
+    TUniqueId query_id;
+    query_id.__set_hi(101);
+    query_id.__set_lo(202);
+
+    TQueryOptions query_options;
+    query_options.__set_query_type(TQueryType::SELECT);
+    query_options.__set_execution_timeout(60);
+    query_options.__set_mem_limit(64L * 1024 * 1024);
+
+    TNetworkAddress fe_addr;
+    fe_addr.hostname = "127.0.0.1";
+    fe_addr.port = 9030;
+
+    auto query_ctx =
+            QueryContext::create(query_id, exec_env, query_options, fe_addr,
+                                 /*is_nereids*/ true, fe_addr, QuerySource::INTERNAL_FRONTEND);
+    exec_env->_fragment_mgr->_query_ctx_map_delay_delete.insert(query_id, query_ctx);
+    query_ctx.reset();
+
+    exec_env->_fragment_mgr->_query_ctx_map_delay_delete.clear();
+    exec_env->_fragment_mgr->stop();
+    delete exec_env->_fragment_mgr;
+    exec_env->_fragment_mgr = previous_fragment_mgr;
+}
+
 } // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to