This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new d616fb44db1 branch-4.1: [fix](fragment) avoid query-ctx map clear
self-deadlock when stop FragmentMgr (#64552)
d616fb44db1 is described below
commit d616fb44db17549800736477c1f9dda62f1f1249
Author: Pxl <[email protected]>
AuthorDate: Tue Jun 16 18:15:45 2026 +0800
branch-4.1: [fix](fragment) avoid query-ctx map clear self-deadlock when
stop FragmentMgr (#64552)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #62954
Problem Summary: Backport #62954 to branch-4.1. During BE shutdown,
FragmentMgr clears `_query_ctx_map_delay_delete`. The old
`ConcurrentContextMap::clear()` destroyed map values while holding the
shard mutex. Releasing the last `QueryContext` reference can run
`QueryContext::~QueryContext()`, which calls back into
`FragmentMgr::remove_query_context()` and attempts to erase from the
same delay-delete map, re-locking the shard mutex that the same thread
already holds. That throws `std::system_error` (`Resource deadlock
avoided`), and the BE then aborts. The fix swaps each shard map into a
local map under the lock, releases the lock, and only then destroys the
values.
### Release note
None
### Check List (For Author)
- Test:
- Manual test: `git diff --check upstream/branch-4.1..HEAD`
- Manual test: `./build-support/check-format.sh`
- Unit Test: `./run-be-ut.sh --run
--filter=FragmentMgrDelayDeleteMapTest.*` was started locally but did
not complete: it triggered a full ASAN_UT rebuild (`14920` ninja
targets) and was interrupted before any test executed, so the new test
has not been run locally.
- Behavior changed: No
- Does this need documentation: No
Co-authored-by: linrrarity <[email protected]>
---
be/src/runtime/fragment_mgr.cpp | 17 ++++++++++--
.../fragment_mgr_cross_cluster_cancel_test.cpp | 30 ++++++++++++++++++++++
2 files changed, 45 insertions(+), 2 deletions(-)
diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp
index 7fc67d20f56..2b61032502c 100644
--- a/be/src/runtime/fragment_mgr.cpp
+++ b/be/src/runtime/fragment_mgr.cpp
@@ -298,9 +298,22 @@ void ConcurrentContextMap<Key, Value,
ValueType>::insert(const Key& query_id,
template <typename Key, typename Value, typename ValueType>
void ConcurrentContextMap<Key, Value, ValueType>::clear() {
+ // Avoid self-deadlock from releasing the last QueryContext
+ // in _query_ctx_map_delay_delete:
+ // FragmentMgr::stop()
+ // -> _query_ctx_map_delay_delete.clear()
+ // -> unique_lock(query_id_lock)
+ // -> map.clear()
+ // -> QueryContext::~QueryContext()
+ // -> FragmentMgr::remove_query_context(query_id)
+ // -> _query_ctx_map_delay_delete.erase(query_id)
+ // -> unique_lock(query_id_lock) <- deadlock
for (auto& pair : _internal_map) {
- std::unique_lock lock(*pair.first);
- auto& map = pair.second;
+ phmap::flat_hash_map<Key, Value> map;
+ {
+ std::unique_lock lock(*pair.first);
+ map.swap(pair.second);
+ }
map.clear();
}
}
diff --git a/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
b/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
index 91c242f4b8f..70dd2e874ba 100644
--- a/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
+++ b/be/test/runtime/fragment_mgr_cross_cluster_cancel_test.cpp
@@ -153,4 +153,34 @@ TEST_F(FragmentMgrCrossClusterCancelTest,
CancelWorkerInvalidQueryDetectionSkips
EXPECT_TRUE(queries_pipeline_task_leak.empty());
}
+TEST(FragmentMgrDelayDeleteMapTest,
ClearShouldNotAbortWhenReleasingLastQueryContextRef) {
+ auto* exec_env = ExecEnv::GetInstance();
+ auto* previous_fragment_mgr = exec_env->_fragment_mgr;
+ exec_env->_fragment_mgr = new FragmentMgr(exec_env);
+
+ TUniqueId query_id;
+ query_id.__set_hi(101);
+ query_id.__set_lo(202);
+
+ TQueryOptions query_options;
+ query_options.__set_query_type(TQueryType::SELECT);
+ query_options.__set_execution_timeout(60);
+ query_options.__set_mem_limit(64L * 1024 * 1024);
+
+ TNetworkAddress fe_addr;
+ fe_addr.hostname = "127.0.0.1";
+ fe_addr.port = 9030;
+
+ auto query_ctx =
+ QueryContext::create(query_id, exec_env, query_options, fe_addr,
+ /*is_nereids*/ true, fe_addr,
QuerySource::INTERNAL_FRONTEND);
+ exec_env->_fragment_mgr->_query_ctx_map_delay_delete.insert(query_id,
query_ctx);
+ query_ctx.reset();
+
+ exec_env->_fragment_mgr->_query_ctx_map_delay_delete.clear();
+ exec_env->_fragment_mgr->stop();
+ delete exec_env->_fragment_mgr;
+ exec_env->_fragment_mgr = previous_fragment_mgr;
+}
+
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]