This is an automated email from the ASF dual-hosted git repository.

hellostephen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new bf27b1845ee [feat](cloud) Add lock wait and bthread schedule delay 
metrics to SyncRowset profile (#61036)
bf27b1845ee is described below

commit bf27b1845eececa8df4e28afe86d811c8ee613dd
Author: bobhan1 <[email protected]>
AuthorDate: Wed Mar 18 10:34:45 2026 +0800

    [feat](cloud) Add lock wait and bthread schedule delay metrics to 
SyncRowset profile (#61036)
    
    ## Proposed changes
    
    This PR adds visibility into "hidden" time sinks in SyncRowset that are
    not covered by existing RPC metrics.
---
 be/src/cloud/cloud_meta_mgr.cpp             | 14 +++++++++++
 be/src/cloud/cloud_tablet.cpp               | 36 +++++++++++++++++++++++++++++
 be/src/cloud/cloud_tablet.h                 |  5 ++++
 be/src/exec/operator/olap_scan_operator.cpp | 22 +++++++++++++++++-
 be/src/exec/operator/olap_scan_operator.h   |  3 +++
 5 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp
index f7bc86a7308..f51b58bb757 100644
--- a/be/src/cloud/cloud_meta_mgr.cpp
+++ b/be/src/cloud/cloud_meta_mgr.cpp
@@ -590,7 +590,14 @@ Status 
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
         idx->set_index_id(index_id);
         idx->set_partition_id(tablet->partition_id());
         {
+            auto lock_start = std::chrono::steady_clock::now();
             std::shared_lock rlock(tablet->get_header_lock());
+            if (sync_stats) {
+                sync_stats->meta_lock_wait_ns +=
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                std::chrono::steady_clock::now() - lock_start)
+                                .count();
+            }
             if (options.full_sync) {
                 req.set_start_version(0);
             } else {
@@ -693,7 +700,14 @@ Status 
CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet,
         });
         {
             const auto& stats = resp.stats();
+            auto lock_start = std::chrono::steady_clock::now();
             std::unique_lock wlock(tablet->get_header_lock());
+            if (sync_stats) {
+                sync_stats->meta_lock_wait_ns +=
+                        std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                std::chrono::steady_clock::now() - lock_start)
+                                .count();
+            }
 
             // ATTN: we are facing following data race
             //
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index eb07def6006..9936ac374f6 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -275,16 +275,34 @@ Status CloudTablet::sync_rowsets(const SyncOptions& 
options, SyncRowsetStats* st
     RETURN_IF_ERROR(sync_if_not_running(stats));
 
     if (options.query_version > 0) {
+        auto lock_start = std::chrono::steady_clock::now();
         std::shared_lock rlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         if (_max_version >= options.query_version) {
             return Status::OK();
         }
     }
 
     // serially execute sync to reduce unnecessary network overhead
+    auto sync_lock_start = std::chrono::steady_clock::now();
     std::unique_lock lock(_sync_meta_lock);
+    if (stats) {
+        stats->sync_meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                 
std::chrono::steady_clock::now() - sync_lock_start)
+                                                 .count();
+    }
     if (options.query_version > 0) {
+        auto lock_start = std::chrono::steady_clock::now();
         std::shared_lock rlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         if (_max_version >= options.query_version) {
             return Status::OK();
         }
@@ -307,10 +325,22 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats* 
stats) {
     }
 
     // Serially execute sync to reduce unnecessary network overhead
+    auto sync_lock_start = std::chrono::steady_clock::now();
     std::unique_lock lock(_sync_meta_lock);
+    if (stats) {
+        stats->sync_meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                 
std::chrono::steady_clock::now() - sync_lock_start)
+                                                 .count();
+    }
 
     {
+        auto lock_start = std::chrono::steady_clock::now();
         std::shared_lock rlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         if (tablet_state() == TABLET_RUNNING) {
             return Status::OK();
         }
@@ -332,7 +362,13 @@ Status CloudTablet::sync_if_not_running(SyncRowsetStats* 
stats) {
 
     TimestampedVersionTracker empty_tracker;
     {
+        auto lock_start = std::chrono::steady_clock::now();
         std::lock_guard wlock(_meta_lock);
+        if (stats) {
+            stats->meta_lock_wait_ns += 
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                                
std::chrono::steady_clock::now() - lock_start)
+                                                .count();
+        }
         RETURN_IF_ERROR(set_tablet_state(TABLET_RUNNING));
         _rs_version_map.clear();
         _stale_rs_version_map.clear();
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 38675681db7..e7a9b13e851 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -53,6 +53,11 @@ struct SyncRowsetStats {
     int64_t get_remote_tablet_meta_rpc_ns {0};
     int64_t tablet_meta_cache_hit {0};
     int64_t tablet_meta_cache_miss {0};
+
+    int64_t bthread_schedule_delay_ns {0};
+    int64_t meta_lock_wait_ns {0}; // _meta_lock (std::shared_mutex) wait 
across all acquisitions
+    int64_t sync_meta_lock_wait_ns {
+            0}; // _sync_meta_lock (bthread::Mutex) wait across all 
acquisitions
 };
 
 struct SyncOptions {
diff --git a/be/src/exec/operator/olap_scan_operator.cpp 
b/be/src/exec/operator/olap_scan_operator.cpp
index 0c6dc914a09..083d1326f9e 100644
--- a/be/src/exec/operator/olap_scan_operator.cpp
+++ b/be/src/exec/operator/olap_scan_operator.cpp
@@ -171,6 +171,12 @@ Status OlapScanLocalState::_init_profile() {
                                   TUnit::BYTES, sync_rowset_timer_name);
         _sync_rowset_get_remote_delete_bitmap_rpc_timer = ADD_CHILD_TIMER(
                 _scanner_profile, "SyncRowsetGetRemoteDeleteBitmapRpcTime", 
sync_rowset_timer_name);
+        _sync_rowset_bthread_schedule_wait_timer = ADD_CHILD_TIMER(
+                _scanner_profile, "SyncRowsetBthreadScheduleWaitTime", 
sync_rowset_timer_name);
+        _sync_rowset_meta_lock_wait_timer = ADD_CHILD_TIMER(
+                _scanner_profile, "SyncRowsetMetaLockWaitTime", 
sync_rowset_timer_name);
+        _sync_rowset_sync_meta_lock_wait_timer = ADD_CHILD_TIMER(
+                _scanner_profile, "SyncRowsetSyncMetaLockWaitTime", 
sync_rowset_timer_name);
     }
     _block_init_timer = ADD_TIMER(_segment_profile, "BlockInitTime");
     _block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime");
@@ -616,7 +622,16 @@ Status 
OlapScanLocalState::_sync_cloud_tablets(RuntimeState* state) {
                                 _scan_ranges[i]->version.data() + 
_scan_ranges[i]->version.size(),
                                 version);
                 auto task_ctx = state->get_task_execution_context();
-                tasks.emplace_back([this, sync_stats, version, i, task_ctx]() {
+                auto task_create_time = std::chrono::steady_clock::now();
+                tasks.emplace_back([this, sync_stats, version, i, task_ctx, 
task_create_time]() {
+                    // Record bthread scheduling delay
+                    auto task_start_time = std::chrono::steady_clock::now();
+                    if (sync_stats) {
+                        sync_stats->bthread_schedule_delay_ns +=
+                                
std::chrono::duration_cast<std::chrono::nanoseconds>(
+                                        task_start_time - task_create_time)
+                                        .count();
+                    }
                     auto task_lock = task_ctx.lock();
                     if (task_lock == nullptr) {
                         return Status::OK();
@@ -690,6 +705,11 @@ Status OlapScanLocalState::prepare(RuntimeState* state) {
                            sync_stats.get_remote_delete_bitmap_bytes);
             COUNTER_UPDATE(_sync_rowset_get_remote_delete_bitmap_rpc_timer,
                            sync_stats.get_remote_delete_bitmap_rpc_ns);
+            COUNTER_UPDATE(_sync_rowset_bthread_schedule_wait_timer,
+                           sync_stats.bthread_schedule_delay_ns);
+            COUNTER_UPDATE(_sync_rowset_meta_lock_wait_timer, 
sync_stats.meta_lock_wait_ns);
+            COUNTER_UPDATE(_sync_rowset_sync_meta_lock_wait_timer,
+                           sync_stats.sync_meta_lock_wait_ns);
         }
         auto time_ms = _sync_cloud_tablets_watcher.elapsed_time_microseconds();
         if (time_ms >= config::sync_rowsets_slow_threshold_ms) {
diff --git a/be/src/exec/operator/olap_scan_operator.h 
b/be/src/exec/operator/olap_scan_operator.h
index 2ccf7e3b65e..e38d1eaf8f8 100644
--- a/be/src/exec/operator/olap_scan_operator.h
+++ b/be/src/exec/operator/olap_scan_operator.h
@@ -180,6 +180,9 @@ private:
     RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_key_count = 
nullptr;
     RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_bytes = 
nullptr;
     RuntimeProfile::Counter* _sync_rowset_get_remote_delete_bitmap_rpc_timer = 
nullptr;
+    RuntimeProfile::Counter* _sync_rowset_bthread_schedule_wait_timer = 
nullptr;
+    RuntimeProfile::Counter* _sync_rowset_meta_lock_wait_timer = nullptr;
+    RuntimeProfile::Counter* _sync_rowset_sync_meta_lock_wait_timer = nullptr;
     RuntimeProfile::Counter* _block_load_timer = nullptr;
     RuntimeProfile::Counter* _block_load_counter = nullptr;
     // Add more detail seek timer and counter profile


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to