(doris) branch master updated: [improvement](executor) use real elapsed time to compute workload group metrics refresh interval (#63537)

zclll Tue, 26 May 2026 00:19:02 -0700

This is an automated email from the ASF dual-hosted git repository.

zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 1d72e0f22c3 [improvement](executor) use real elapsed time to compute 
workload group metrics refresh interval (#63537)
1d72e0f22c3 is described below

commit 1d72e0f22c330f6f8efc479847a0f6cc658e4019
Author: Nelson Boss <[email protected]>
AuthorDate: Tue May 26 15:18:13 2026 +0800

    [improvement](executor) use real elapsed time to compute workload group 
metrics refresh interval (#63537)
    
    The original implementation of `WorkloadGroupMetrics::refresh_metrics()`
    uses `config::workload_group_metrics_interval_ms / 1000` as a fixed
    divisor to compute per-second CPU and scan IO rates. This is inaccurate
    when:
    1. The refresh thread is delayed due to system load or scheduling jitter
    2. The configured interval is changed at runtime
    
    In both cases, the reported per-second CPU/IO rates diverge from
    reality.
    
    This PR replaces the fixed config-based interval with the actual
    monotonic time delta between two consecutive refreshes, so the rates
    stay
    accurate regardless of thread scheduling delays or runtime config
    changes. It also adds a division-by-zero guard for sub-second refresh
      intervals and corresponding unit tests.
    
    ---------
    
    Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
---
 .../workload_group/workload_group_metrics.cpp      |   9 +-
 .../workload_group/workload_group_metrics.h        |   1 +
 .../workload_group/workload_group_metrics_test.cpp | 152 +++++++++++++++++++++
 3 files changed, 161 insertions(+), 1 deletion(-)

diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp 
b/be/src/runtime/workload_group/workload_group_metrics.cpp
index 41da5af1e78..c951cb7c44b 100644
--- a/be/src/runtime/workload_group/workload_group_metrics.cpp
+++ b/be/src/runtime/workload_group/workload_group_metrics.cpp
@@ -23,6 +23,7 @@
 #include "runtime/workload_group/workload_group.h"
 #include "runtime/workload_management/io_throttle.h"
 #include "storage/olap_common.h"
+#include "util/time.h"
 
 namespace doris {
 
@@ -83,7 +84,13 @@ void 
WorkloadGroupMetrics::update_remote_scan_io_bytes(uint64_t delta_io_bytes)
 }
 
 void WorkloadGroupMetrics::refresh_metrics() {
-    int interval_second = config::workload_group_metrics_interval_ms / 1000;
+    uint64_t current_time_ms = MonotonicMillis();
+    uint64_t interval_second = (current_time_ms - _last_refresh_time_ms) / 
1000;
+    _last_refresh_time_ms = current_time_ms;
+
+    if (interval_second == 0) {
+        return;
+    }
 
     // cpu
     uint64_t _current_cpu_time_nanos = _cpu_time_nanos.load();
diff --git a/be/src/runtime/workload_group/workload_group_metrics.h 
b/be/src/runtime/workload_group/workload_group_metrics.h
index 67085d8374f..ca04e0929ae 100644
--- a/be/src/runtime/workload_group/workload_group_metrics.h
+++ b/be/src/runtime/workload_group/workload_group_metrics.h
@@ -73,6 +73,7 @@ private:
 
     std::atomic<uint64_t> _cpu_time_nanos {0};
     std::atomic<uint64_t> _last_cpu_time_nanos {0};
+    std::atomic<uint64_t> _last_refresh_time_ms {0};
     std::atomic<uint64_t> _per_sec_cpu_time_nanos {0}; // used for system table
 
     std::atomic<uint64_t> _per_sec_local_scan_bytes {0};
diff --git a/be/test/runtime/workload_group/workload_group_metrics_test.cpp 
b/be/test/runtime/workload_group/workload_group_metrics_test.cpp
new file mode 100644
index 00000000000..9f163dc3a29
--- /dev/null
+++ b/be/test/runtime/workload_group/workload_group_metrics_test.cpp
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/workload_group/workload_group_metrics.h"
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+
+#include "runtime/workload_group/workload_group.h"
+#include "util/time.h"
+
+namespace doris {
+
+class WorkloadGroupMetricsTest : public testing::Test {
+protected:
+    void SetUp() override {
+        // Use a unique id for each test instance to avoid metric entity 
conflicts
+        static std::atomic<uint64_t> next_id {1};
+        uint64_t id = next_id.fetch_add(1);
+        WorkloadGroupInfo wg_info {.id = id, .name = "test_wg_" + 
std::to_string(id)};
+        _wg = std::make_shared<WorkloadGroup>(wg_info);
+        _metrics = _wg->get_metrics();
+    }
+
+    void TearDown() override {
+        _metrics.reset();
+        _wg.reset();
+    }
+
+    std::shared_ptr<WorkloadGroup> _wg;
+    std::shared_ptr<WorkloadGroupMetrics> _metrics;
+};
+
+// Test that refresh_metrics uses real elapsed time to compute per-second 
rates.
+// After sleeping for a known interval, the per-second CPU rate should reflect
+// the actual elapsed time rather than the config-based fixed interval.
+TEST_F(WorkloadGroupMetricsTest, refresh_uses_real_elapsed_time) {
+    // First call to refresh_metrics to initialize _last_refresh_time_ms
+    _metrics->refresh_metrics();
+
+    // Add known CPU time: 2,000,000,000 nanos = 2 seconds of CPU
+    const uint64_t cpu_delta_nanos = 2000000000ULL;
+    _metrics->update_cpu_time_nanos(cpu_delta_nanos);
+
+    // Sleep for ~1.1 seconds so the real interval is ~1 second
+    std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+
+    // Refresh metrics — should compute rate based on real ~1 second interval
+    _metrics->refresh_metrics();
+
+    // Expected: 2,000,000,000 nanos / 1 second = ~2,000,000,000 nanos per 
second
+    // Allow generous tolerance for timing imprecision in CI
+    uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+    EXPECT_GT(cpu_per_sec, 500000000ULL) << "CPU per-second rate too low: " << 
cpu_per_sec;
+    EXPECT_LT(cpu_per_sec, 4000000000ULL) << "CPU per-second rate too high: " 
<< cpu_per_sec;
+}
+
+// Test that when interval is less than 1 second, refresh_metrics does not
+// cause division by zero and preserves previous rates.
+TEST_F(WorkloadGroupMetricsTest, 
refresh_skips_when_interval_less_than_one_second) {
+    // First call to initialize _last_refresh_time_ms
+    _metrics->refresh_metrics();
+
+    // Add some CPU time
+    _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos
+
+    // Call refresh immediately (< 1 second elapsed) — should not crash
+    // and should not update per-second rates (interval_second == 0 → early 
return)
+    _metrics->refresh_metrics();
+
+    // Per-second rate should still be 0 (from the initial state)
+    // because the sub-second refresh skips the rate calculation
+    uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+    EXPECT_EQ(cpu_per_sec, 0) << "CPU per-second rate should remain unchanged 
when interval < 1s";
+}
+
+// Test that different real intervals produce proportionally different rates.
+// A shorter interval with the same delta should yield a higher per-second 
rate.
+TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) {
+    // --- First measurement: 1 second interval ---
+    _metrics->refresh_metrics();
+    _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+    _metrics->refresh_metrics();
+
+    uint64_t rate_1s = _metrics->get_cpu_time_nanos_per_second();
+
+    // --- Second measurement: add same delta, wait 2 seconds ---
+    _metrics->update_cpu_time_nanos(1000000000ULL); // another 1B nanos
+
+    std::this_thread::sleep_for(std::chrono::milliseconds(2100));
+    _metrics->refresh_metrics();
+
+    uint64_t rate_2s = _metrics->get_cpu_time_nanos_per_second();
+
+    // With the same absolute delta (1B nanos) but double the interval,
+    // the per-second rate should be roughly half.
+    // Allow generous tolerance for timing jitter
+    EXPECT_GT(rate_1s, rate_2s) << "1s interval rate (" << rate_1s
+                                << ") should be higher than 2s interval rate 
(" << rate_2s << ")";
+}
+
+// Test that memory metrics are correctly reported
+TEST_F(WorkloadGroupMetricsTest, memory_used_reported_correctly) {
+    const int64_t mem_used = 1024L * 1024 * 512; // 512 MB
+    _metrics->update_memory_used_bytes(mem_used);
+    _metrics->refresh_metrics();
+
+    // Need to wait > 1 second for refresh to take effect
+    std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+    _metrics->refresh_metrics();
+
+    EXPECT_EQ(_metrics->get_memory_used(), mem_used);
+}
+
+// Test that the first refresh (from _last_refresh_time_ms == 0) does not 
produce
+// unreasonable rates since the interval is very large (time since boot).
+TEST_F(WorkloadGroupMetricsTest, first_refresh_produces_near_zero_rate) {
+    // Add some CPU time before the first refresh
+    _metrics->update_cpu_time_nanos(5000000000ULL); // 5B nanos
+
+    // First refresh: interval = current_time_ms / 1000 (time since boot in 
seconds)
+    // For a system with uptime > 5 seconds, rate = 5B / uptime_seconds
+    // This should be small relative to the delta
+    _metrics->refresh_metrics();
+
+    uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+    // With system uptime of at least 60 seconds (reasonable assumption),
+    // rate = 5B / 60+ < 84M nanos/sec
+    EXPECT_LT(cpu_per_sec, 1000000000ULL)
+            << "First refresh rate should be modest since interval is system 
uptime";
+}
+
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) branch master updated: [improvement](executor) use real elapsed time to compute workload group metrics refresh interval (#63537)

Reply via email to