This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 6567223b6eb branch-4.1: [feat](cloud) Add lock wait and bthread
schedule delay metrics to SyncRowset profile #61036 (#61450)
6567223b6eb is described below
commit 6567223b6eb65892f9e8ca30c6aed8a90ddced9f
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Mar 18 14:39:25 2026 +0800
branch-4.1: [feat](cloud) Add lock wait and bthread schedule delay metrics
to SyncRowset profile #61036 (#61450)
Cherry-picked from #61036
Co-authored-by: bobhan1 <[email protected]>
---
be/src/cloud/cloud_meta_mgr.cpp | 14 +++++++++++
be/src/cloud/cloud_tablet.cpp | 36 +++++++++++++++++++++++++++++
be/src/cloud/cloud_tablet.h | 5 ++++
be/src/pipeline/exec/olap_scan_operator.cpp | 22 +++++++++++++++++-
be/src/pipeline/exec/olap_scan_operator.h | 3 +++
5 files changed, 79 insertions(+), 1 deletion(-)
diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp
index 8afb9e42560..44ee033fb33 100644
--- a/be/src/cloud/cloud_meta_mgr.cpp
+++ b/be/src/cloud/cloud_meta_mgr.cpp
@@ -590,7 +590,14 @@ Status
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
idx->set_index_id(index_id);
idx->set_partition_id(tablet->partition_id());
{
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(tablet->get_header_lock());
+ if (sync_stats) {
+ sync_stats->meta_lock_wait_ns +=
+ std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (options.full_sync) {
req.set_start_version(0);
} else {
@@ -693,7 +700,14 @@ Status
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
});
{
const auto& stats = resp.stats();
+ auto lock_start = std::chrono::steady_clock::now();
std::unique_lock wlock(tablet->get_header_lock());
+ if (sync_stats) {
+ sync_stats->meta_lock_wait_ns +=
+ std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
// ATTN: we are facing following data race
//
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 014c16d180e..9d382bfc134 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -301,16 +301,34 @@ Status CloudTablet::sync_rowsets(const SyncOptions&
options, SyncRowsetStats* st
RETURN_IF_ERROR(sync_if_not_running(stats));
if (options.query_version > 0) {
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (_max_version >= options.query_version) {
return Status::OK();
}
}
// serially execute sync to reduce unnecessary network overhead
+ auto sync_lock_start = std::chrono::steady_clock::now();
std::unique_lock lock(_sync_meta_lock);
+ if (stats) {
+ stats->sync_meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - sync_lock_start)
+ .count();
+ }
if (options.query_version > 0) {
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (_max_version >= options.query_version) {
return Status::OK();
}
@@ -333,10 +351,22 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats*
stats) {
}
// Serially execute sync to reduce unnecessary network overhead
+ auto sync_lock_start = std::chrono::steady_clock::now();
std::unique_lock lock(_sync_meta_lock);
+ if (stats) {
+ stats->sync_meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - sync_lock_start)
+ .count();
+ }
{
+ auto lock_start = std::chrono::steady_clock::now();
std::shared_lock rlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
if (tablet_state() == TABLET_RUNNING) {
return Status::OK();
}
@@ -358,7 +388,13 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats*
stats) {
TimestampedVersionTracker empty_tracker;
{
+ auto lock_start = std::chrono::steady_clock::now();
std::lock_guard wlock(_meta_lock);
+ if (stats) {
+ stats->meta_lock_wait_ns +=
std::chrono::duration_cast<std::chrono::nanoseconds>(
+
std::chrono::steady_clock::now() - lock_start)
+ .count();
+ }
RETURN_IF_ERROR(set_tablet_state(TABLET_RUNNING));
_rs_version_map.clear();
_stale_rs_version_map.clear();
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 7b35b7c4d3f..f4642266f47 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -53,6 +53,11 @@ struct SyncRowsetStats {
int64_t get_remote_tablet_meta_rpc_ns {0};
int64_t tablet_meta_cache_hit {0};
int64_t tablet_meta_cache_miss {0};
+
+ int64_t bthread_schedule_delay_ns {0};
+ int64_t meta_lock_wait_ns {0}; // _meta_lock (std::shared_mutex) wait
across all acquisitions
+ int64_t sync_meta_lock_wait_ns {
+ 0}; // _sync_meta_lock (bthread::Mutex) wait across all
acquisitions
};
struct SyncOptions {
diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp
b/be/src/pipeline/exec/olap_scan_operator.cpp
index 2082dd7ea87..6fa5f52c0e8 100644
--- a/be/src/pipeline/exec/olap_scan_operator.cpp
+++ b/be/src/pipeline/exec/olap_scan_operator.cpp
@@ -172,6 +172,12 @@ Status OlapScanLocalState::_init_profile() {
TUnit::BYTES, sync_rowset_timer_name);
_sync_rowset_get_remote_delete_bitmap_rpc_timer = ADD_CHILD_TIMER(
_scanner_profile, "SyncRowsetGetRemoteDeleteBitmapRpcTime",
sync_rowset_timer_name);
+ _sync_rowset_bthread_schedule_wait_timer = ADD_CHILD_TIMER(
+ _scanner_profile, "SyncRowsetBthreadScheduleWaitTime",
sync_rowset_timer_name);
+ _sync_rowset_meta_lock_wait_timer = ADD_CHILD_TIMER(
+ _scanner_profile, "SyncRowsetMetaLockWaitTime",
sync_rowset_timer_name);
+ _sync_rowset_sync_meta_lock_wait_timer = ADD_CHILD_TIMER(
+ _scanner_profile, "SyncRowsetSyncMetaLockWaitTime",
sync_rowset_timer_name);
}
_block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime");
_block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime");
@@ -614,7 +620,16 @@ Status
OlapScanLocalState::_sync_cloud_tablets(RuntimeState* state) {
_scan_ranges[i]->version.data() +
_scan_ranges[i]->version.size(),
version);
auto task_ctx = state->get_task_execution_context();
- tasks.emplace_back([this, sync_stats, version, i, task_ctx]() {
+ auto task_create_time = std::chrono::steady_clock::now();
+ tasks.emplace_back([this, sync_stats, version, i, task_ctx,
task_create_time]() {
+ // Record bthread scheduling delay
+ auto task_start_time = std::chrono::steady_clock::now();
+ if (sync_stats) {
+ sync_stats->bthread_schedule_delay_ns +=
+
std::chrono::duration_cast<std::chrono::nanoseconds>(
+ task_start_time - task_create_time)
+ .count();
+ }
auto task_lock = task_ctx.lock();
if (task_lock == nullptr) {
return Status::OK();
@@ -688,6 +703,11 @@ Status OlapScanLocalState::prepare(RuntimeState* state) {
sync_stats.get_remote_delete_bitmap_bytes);
COUNTER_UPDATE(_sync_rowset_get_remote_delete_bitmap_rpc_timer,
sync_stats.get_remote_delete_bitmap_rpc_ns);
+ COUNTER_UPDATE(_sync_rowset_bthread_schedule_wait_timer,
+ sync_stats.bthread_schedule_delay_ns);
+ COUNTER_UPDATE(_sync_rowset_meta_lock_wait_timer,
sync_stats.meta_lock_wait_ns);
+ COUNTER_UPDATE(_sync_rowset_sync_meta_lock_wait_timer,
+ sync_stats.sync_meta_lock_wait_ns);
}
auto time_ms = _sync_cloud_tablets_watcher.elapsed_time_microseconds();
if (time_ms >= config::sync_rowsets_slow_threshold_ms) {
diff --git a/be/src/pipeline/exec/olap_scan_operator.h
b/be/src/pipeline/exec/olap_scan_operator.h
index 292081b29a6..5077b46b7cb 100644
--- a/be/src/pipeline/exec/olap_scan_operator.h
+++ b/be/src/pipeline/exec/olap_scan_operator.h
@@ -181,6 +181,9 @@ private:
RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_key_count =
nullptr;
RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_bytes =
nullptr;
RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_rpc_timer =
nullptr;
+ RuntimeProfile::Counter* _sync_rowset_bthread_schedule_wait_timer =
nullptr;
+ RuntimeProfile::Counter* _sync_rowset_meta_lock_wait_timer = nullptr;
+ RuntimeProfile::Counter* _sync_rowset_sync_meta_lock_wait_timer = nullptr;
RuntimeProfile::Counter* _block_load_timer = nullptr;
RuntimeProfile::Counter* _block_load_counter = nullptr;
// Add more detail seek timer and counter profile
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]