This is an automated email from the ASF dual-hosted git repository.
zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 1d72e0f22c3 [improvement](executor) use real elapsed time to compute
workload group metrics refresh interval (#63537)
1d72e0f22c3 is described below
commit 1d72e0f22c330f6f8efc479847a0f6cc658e4019
Author: Nelson Boss <[email protected]>
AuthorDate: Tue May 26 15:18:13 2026 +0800
[improvement](executor) use real elapsed time to compute workload group
metrics refresh interval (#63537)
The original implementation of `WorkloadGroupMetrics::refresh_metrics()`
uses `config::workload_group_metrics_interval_ms / 1000` as a fixed
divisor to compute per-second CPU and scan IO rates. This is inaccurate
when:
1. The refresh thread is delayed due to system load or scheduling jitter
2. The configured interval is changed at runtime
In both cases, the reported per-second CPU/IO rates diverge from
reality.
This PR replaces the fixed config-based interval with the actual
monotonic time delta between two consecutive refreshes, so the rates
stay
accurate regardless of thread scheduling delays or runtime config
changes. It also adds a division-by-zero guard for sub-second refresh
intervals and corresponding unit tests.
---------
Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
---
.../workload_group/workload_group_metrics.cpp | 9 +-
.../workload_group/workload_group_metrics.h | 1 +
.../workload_group/workload_group_metrics_test.cpp | 152 +++++++++++++++++++++
3 files changed, 161 insertions(+), 1 deletion(-)
diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp
b/be/src/runtime/workload_group/workload_group_metrics.cpp
index 41da5af1e78..c951cb7c44b 100644
--- a/be/src/runtime/workload_group/workload_group_metrics.cpp
+++ b/be/src/runtime/workload_group/workload_group_metrics.cpp
@@ -23,6 +23,7 @@
#include "runtime/workload_group/workload_group.h"
#include "runtime/workload_management/io_throttle.h"
#include "storage/olap_common.h"
+#include "util/time.h"
namespace doris {
@@ -83,7 +84,13 @@ void
WorkloadGroupMetrics::update_remote_scan_io_bytes(uint64_t delta_io_bytes)
}
void WorkloadGroupMetrics::refresh_metrics() {
- int interval_second = config::workload_group_metrics_interval_ms / 1000;
+ uint64_t current_time_ms = MonotonicMillis();
+ uint64_t interval_second = (current_time_ms - _last_refresh_time_ms) /
1000;
+ _last_refresh_time_ms = current_time_ms;
+
+ if (interval_second == 0) {
+ return;
+ }
// cpu
uint64_t _current_cpu_time_nanos = _cpu_time_nanos.load();
diff --git a/be/src/runtime/workload_group/workload_group_metrics.h
b/be/src/runtime/workload_group/workload_group_metrics.h
index 67085d8374f..ca04e0929ae 100644
--- a/be/src/runtime/workload_group/workload_group_metrics.h
+++ b/be/src/runtime/workload_group/workload_group_metrics.h
@@ -73,6 +73,7 @@ private:
std::atomic<uint64_t> _cpu_time_nanos {0};
std::atomic<uint64_t> _last_cpu_time_nanos {0};
+ std::atomic<uint64_t> _last_refresh_time_ms {0};
std::atomic<uint64_t> _per_sec_cpu_time_nanos {0}; // used for system table
std::atomic<uint64_t> _per_sec_local_scan_bytes {0};
diff --git a/be/test/runtime/workload_group/workload_group_metrics_test.cpp
b/be/test/runtime/workload_group/workload_group_metrics_test.cpp
new file mode 100644
index 00000000000..9f163dc3a29
--- /dev/null
+++ b/be/test/runtime/workload_group/workload_group_metrics_test.cpp
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "runtime/workload_group/workload_group_metrics.h"
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <chrono>
+#include <thread>
+
+#include "runtime/workload_group/workload_group.h"
+#include "util/time.h"
+
+namespace doris {
+
+class WorkloadGroupMetricsTest : public testing::Test {
+protected:
+ void SetUp() override {
+ // Use a unique id for each test instance to avoid metric entity
conflicts
+ static std::atomic<uint64_t> next_id {1};
+ uint64_t id = next_id.fetch_add(1);
+ WorkloadGroupInfo wg_info {.id = id, .name = "test_wg_" +
std::to_string(id)};
+ _wg = std::make_shared<WorkloadGroup>(wg_info);
+ _metrics = _wg->get_metrics();
+ }
+
+ void TearDown() override {
+ _metrics.reset();
+ _wg.reset();
+ }
+
+ std::shared_ptr<WorkloadGroup> _wg;
+ std::shared_ptr<WorkloadGroupMetrics> _metrics;
+};
+
+// Test that refresh_metrics uses real elapsed time to compute per-second
rates.
+// After sleeping for a known interval, the per-second CPU rate should reflect
+// the actual elapsed time rather than the config-based fixed interval.
+TEST_F(WorkloadGroupMetricsTest, refresh_uses_real_elapsed_time) {
+ // First call to refresh_metrics to initialize _last_refresh_time_ms
+ _metrics->refresh_metrics();
+
+ // Add known CPU time: 2,000,000,000 nanos = 2 seconds of CPU
+ const uint64_t cpu_delta_nanos = 2000000000ULL;
+ _metrics->update_cpu_time_nanos(cpu_delta_nanos);
+
+ // Sleep for ~1.1 seconds so the real interval is ~1 second
+ std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+
+ // Refresh metrics — should compute rate based on real ~1 second interval
+ _metrics->refresh_metrics();
+
+ // Expected: 2,000,000,000 nanos / 1 second = ~2,000,000,000 nanos per
second
+ // Allow generous tolerance for timing imprecision in CI
+ uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+ EXPECT_GT(cpu_per_sec, 500000000ULL) << "CPU per-second rate too low: " <<
cpu_per_sec;
+ EXPECT_LT(cpu_per_sec, 4000000000ULL) << "CPU per-second rate too high: "
<< cpu_per_sec;
+}
+
+// Test that when interval is less than 1 second, refresh_metrics does not
+// cause division by zero and preserves previous rates.
+TEST_F(WorkloadGroupMetricsTest,
refresh_skips_when_interval_less_than_one_second) {
+ // First call to initialize _last_refresh_time_ms
+ _metrics->refresh_metrics();
+
+ // Add some CPU time
+ _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos
+
+ // Call refresh immediately (< 1 second elapsed) — should not crash
+ // and should not update per-second rates (interval_second == 0 → early
return)
+ _metrics->refresh_metrics();
+
+ // Per-second rate should still be 0 (from the initial state)
+ // because the sub-second refresh skips the rate calculation
+ uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+ EXPECT_EQ(cpu_per_sec, 0) << "CPU per-second rate should remain unchanged
when interval < 1s";
+}
+
+// Test that different real intervals produce proportionally different rates.
+// A shorter interval with the same delta should yield a higher per-second
rate.
+TEST_F(WorkloadGroupMetricsTest, shorter_interval_yields_higher_rate) {
+ // --- First measurement: 1 second interval ---
+ _metrics->refresh_metrics();
+ _metrics->update_cpu_time_nanos(1000000000ULL); // 1B nanos
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+ _metrics->refresh_metrics();
+
+ uint64_t rate_1s = _metrics->get_cpu_time_nanos_per_second();
+
+ // --- Second measurement: add same delta, wait 2 seconds ---
+ _metrics->update_cpu_time_nanos(1000000000ULL); // another 1B nanos
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(2100));
+ _metrics->refresh_metrics();
+
+ uint64_t rate_2s = _metrics->get_cpu_time_nanos_per_second();
+
+ // With the same absolute delta (1B nanos) but double the interval,
+ // the per-second rate should be roughly half.
+ // Allow generous tolerance for timing jitter
+ EXPECT_GT(rate_1s, rate_2s) << "1s interval rate (" << rate_1s
+ << ") should be higher than 2s interval rate
(" << rate_2s << ")";
+}
+
+// Test that memory metrics are correctly reported
+TEST_F(WorkloadGroupMetricsTest, memory_used_reported_correctly) {
+ const int64_t mem_used = 1024L * 1024 * 512; // 512 MB
+ _metrics->update_memory_used_bytes(mem_used);
+ _metrics->refresh_metrics();
+
+ // Need to wait > 1 second for refresh to take effect
+ std::this_thread::sleep_for(std::chrono::milliseconds(1100));
+ _metrics->refresh_metrics();
+
+ EXPECT_EQ(_metrics->get_memory_used(), mem_used);
+}
+
+// Test that the first refresh (from _last_refresh_time_ms == 0) does not
produce
+// unreasonable rates since the interval is very large (time since boot).
+TEST_F(WorkloadGroupMetricsTest, first_refresh_produces_near_zero_rate) {
+ // Add some CPU time before the first refresh
+ _metrics->update_cpu_time_nanos(5000000000ULL); // 5B nanos
+
+ // First refresh: interval = current_time_ms / 1000 (time since boot in
seconds)
+ // For a system with uptime > 5 seconds, rate = 5B / uptime_seconds
+ // This should be small relative to the delta
+ _metrics->refresh_metrics();
+
+ uint64_t cpu_per_sec = _metrics->get_cpu_time_nanos_per_second();
+ // With system uptime of at least 60 seconds (reasonable assumption),
+ // rate = 5B / 60+ < 84M nanos/sec
+ EXPECT_LT(cpu_per_sec, 1000000000ULL)
+ << "First refresh rate should be modest since interval is system
uptime";
+}
+
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]