This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 6567223b6eb branch-4.1: [feat](cloud) Add lock wait and bthread 
schedule delay metrics to SyncRowset profile #61036 (#61450)
6567223b6eb is described below

commit 6567223b6eb65892f9e8ca30c6aed8a90ddced9f
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Mar 18 14:39:25 2026 +0800

    branch-4.1: [feat](cloud) Add lock wait and bthread schedule delay metrics 
to SyncRowset profile #61036 (#61450)
    
    Cherry-picked from #61036
    
    Co-authored-by: bobhan1 <[email protected]>
---
 be/src/cloud/cloud_meta_mgr.cpp             | 14 +++++++++++
 be/src/cloud/cloud_tablet.cpp               | 36 +++++++++++++++++++++++++++++
 be/src/cloud/cloud_tablet.h                 |  5 ++++
 be/src/pipeline/exec/olap_scan_operator.cpp | 22 +++++++++++++++++-
 be/src/pipeline/exec/olap_scan_operator.h   |  3 +++
 5 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp
index 8afb9e42560..44ee033fb33 100644
--- a/be/src/cloud/cloud_meta_mgr.cpp
+++ b/be/src/cloud/cloud_meta_mgr.cpp
@@ -590,7 +590,14 @@ Status 
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
         idx->set_index_id(index_id);
         idx->set_partition_id(tablet->partition_id());
         {
+            auto lock_start = std::chrono::steady_clock::now();
             std::shared_lock rlock(tablet->get_header_lock());
+            if (sync_stats) {
+                sync_stats->meta_lock_wait_ns +=
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                std::chrono::steady_clock::now() - lock_start)
+                                .count();
+            }
             if (options.full_sync) {
                 req.set_start_version(0);
             } else {
@@ -693,7 +700,14 @@ Status 
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
         });
         {
             const auto& stats = resp.stats();
+            auto lock_start = std::chrono::steady_clock::now();
             std::unique_lock wlock(tablet->get_header_lock());
+            if (sync_stats) {
+                sync_stats->meta_lock_wait_ns +=
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                std::chrono::steady_clock::now() - lock_start)
+                                .count();
+            }
 
             // ATTN: we are facing following data race
             //
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 014c16d180e..9d382bfc134 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -301,16 +301,34 @@ Status CloudTablet::sync_rowsets(const SyncOptions& 
options, SyncRowsetStats* st
     RETURN_IF_ERROR(sync_if_not_running(stats));
 
     if (options.query_version > 0) {
+        auto lock_start = std::chrono::steady_clock::now();
         std::shared_lock rlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         if (_max_version >= options.query_version) {
             return Status::OK();
         }
     }
 
     // serially execute sync to reduce unnecessary network overhead
+    auto sync_lock_start = std::chrono::steady_clock::now();
     std::unique_lock lock(_sync_meta_lock);
+    if (stats) {
+        stats->sync_meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                 
std::chrono::steady_clock::now() - sync_lock_start)
+                                                 .count();
+    }
     if (options.query_version > 0) {
+        auto lock_start = std::chrono::steady_clock::now();
         std::shared_lock rlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         if (_max_version >= options.query_version) {
             return Status::OK();
         }
@@ -333,10 +351,22 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats* 
stats) {
     }
 
     // Serially execute sync to reduce unnecessary network overhead
+    auto sync_lock_start = std::chrono::steady_clock::now();
     std::unique_lock lock(_sync_meta_lock);
+    if (stats) {
+        stats->sync_meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                 
std::chrono::steady_clock::now() - sync_lock_start)
+                                                 .count();
+    }
 
     {
+        auto lock_start = std::chrono::steady_clock::now();
         std::shared_lock rlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         if (tablet_state() == TABLET_RUNNING) {
             return Status::OK();
         }
@@ -358,7 +388,13 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats* 
stats) {
 
     TimestampedVersionTracker empty_tracker;
     {
+        auto lock_start = std::chrono::steady_clock::now();
         std::lock_guard wlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         RETURN_IF_ERROR(set_tablet_state(TABLET_RUNNING));
         _rs_version_map.clear();
         _stale_rs_version_map.clear();
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 7b35b7c4d3f..f4642266f47 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -53,6 +53,11 @@ struct SyncRowsetStats {
     int64_t get_remote_tablet_meta_rpc_ns {0};
     int64_t tablet_meta_cache_hit {0};
     int64_t tablet_meta_cache_miss {0};
+
+    int64_t bthread_schedule_delay_ns {0};
+    int64_t meta_lock_wait_ns {0}; // _meta_lock (std::shared_mutex) wait 
across all acquisitions
+    int64_t sync_meta_lock_wait_ns {
+            0}; // _sync_meta_lock (bthread::Mutex) wait across all 
acquisitions
 };
 
 struct SyncOptions {
diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp 
b/be/src/pipeline/exec/olap_scan_operator.cpp
index 2082dd7ea87..6fa5f52c0e8 100644
--- a/be/src/pipeline/exec/olap_scan_operator.cpp
+++ b/be/src/pipeline/exec/olap_scan_operator.cpp
@@ -172,6 +172,12 @@ Status OlapScanLocalState::_init_profile() {
                                   TUnit::BYTES, sync_rowset_timer_name);
         _sync_rowset_get_remote_delete_bitmap_rpc_timer = ADD_CHILD_TIMER(
                 _scanner_profile, "SyncRowsetGetRemoteDeleteBitmapRpcTime", 
sync_rowset_timer_name);
+        _sync_rowset_bthread_schedule_wait_timer = ADD_CHILD_TIMER(
+                _scanner_profile, "SyncRowsetBthreadScheduleWaitTime", 
sync_rowset_timer_name);
+        _sync_rowset_meta_lock_wait_timer = ADD_CHILD_TIMER(
+                _scanner_profile, "SyncRowsetMetaLockWaitTime", 
sync_rowset_timer_name);
+        _sync_rowset_sync_meta_lock_wait_timer = ADD_CHILD_TIMER(
+                _scanner_profile, "SyncRowsetSyncMetaLockWaitTime", 
sync_rowset_timer_name);
     }
     _block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime");
     _block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime");
@@ -614,7 +620,16 @@ Status 
OlapScanLocalState::_sync_cloud_tablets(RuntimeState* state) {
                                 _scan_ranges[i]->version.data() + 
_scan_ranges[i]->version.size(),
                                 version);
                 auto task_ctx = state->get_task_execution_context();
-                tasks.emplace_back([this, sync_stats, version, i, task_ctx]() {
+                auto task_create_time = std::chrono::steady_clock::now();
+                tasks.emplace_back([this, sync_stats, version, i, task_ctx, 
task_create_time]() {
+                    // Record bthread scheduling delay
+                    auto task_start_time = std::chrono::steady_clock::now();
+                    if (sync_stats) {
+                        sync_stats->bthread_schedule_delay_ns +=
+                                
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                        task_start_time - task_create_time)
+                                        .count();
+                    }
                     auto task_lock = task_ctx.lock();
                     if (task_lock == nullptr) {
                         return Status::OK();
@@ -688,6 +703,11 @@ Status OlapScanLocalState::prepare(RuntimeState* state) {
                            sync_stats.get_remote_delete_bitmap_bytes);
             COUNTER_UPDATE(_sync_rowset_get_remote_delete_bitmap_rpc_timer,
                            sync_stats.get_remote_delete_bitmap_rpc_ns);
+            COUNTER_UPDATE(_sync_rowset_bthread_schedule_wait_timer,
+                           sync_stats.bthread_schedule_delay_ns);
+            COUNTER_UPDATE(_sync_rowset_meta_lock_wait_timer, 
sync_stats.meta_lock_wait_ns);
+            COUNTER_UPDATE(_sync_rowset_sync_meta_lock_wait_timer,
+                           sync_stats.sync_meta_lock_wait_ns);
         }
         auto time_ms = _sync_cloud_tablets_watcher.elapsed_time_microseconds();
         if (time_ms >= config::sync_rowsets_slow_threshold_ms) {
diff --git a/be/src/pipeline/exec/olap_scan_operator.h 
b/be/src/pipeline/exec/olap_scan_operator.h
index 292081b29a6..5077b46b7cb 100644
--- a/be/src/pipeline/exec/olap_scan_operator.h
+++ b/be/src/pipeline/exec/olap_scan_operator.h
@@ -181,6 +181,9 @@ private:
     RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_key_count = 
nullptr;
     RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_bytes = 
nullptr;
     RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_rpc_timer = 
nullptr;
+    RuntimeProfile::Counter* _sync_rowset_bthread_schedule_wait_timer = 
nullptr;
+    RuntimeProfile::Counter* _sync_rowset_meta_lock_wait_timer = nullptr;
+    RuntimeProfile::Counter* _sync_rowset_sync_meta_lock_wait_timer = nullptr;
     RuntimeProfile::Counter* _block_load_timer = nullptr;
     RuntimeProfile::Counter* _block_load_counter = nullptr;
     // Add more detail seek timer and counter profile


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to