This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 3ac3bdf16d1 branch-4.0: [fix](cloud) Fix the residual metrics of
cluster after drop compute group #57235 (#60055)
3ac3bdf16d1 is described below
commit 3ac3bdf16d1ac5c22fa5515d0376fbf83d4c1007
Author: deardeng <[email protected]>
AuthorDate: Wed Jan 21 05:37:25 2026 +0800
branch-4.0: [fix](cloud) Fix the residual metrics of cluster after drop
compute group #57235 (#60055)
cherry pick from #57235
---
docker/runtime/doris-compose/Dockerfile | 4 +-
.../doris/cloud/catalog/CloudClusterChecker.java | 2 +-
.../cloud/catalog/CloudInstanceStatusChecker.java | 2 +
.../doris/cloud/system/CloudSystemInfoService.java | 1 +
.../org/apache/doris/metric/AutoMappedMetric.java | 3 +
.../java/org/apache/doris/metric/MetricRepo.java | 73 ++++++++++++++++++++++
6 files changed, 82 insertions(+), 3 deletions(-)
diff --git a/docker/runtime/doris-compose/Dockerfile b/docker/runtime/doris-compose/Dockerfile
index d3f6f1a22ce..f3b4c77fdea 100644
--- a/docker/runtime/doris-compose/Dockerfile
+++ b/docker/runtime/doris-compose/Dockerfile
@@ -23,7 +23,7 @@
# choose a base image
# doris 2.1, 3.0+, master use JDK 17
-ARG JDK_IMAGE=openjdk:17-jdk-slim
+ARG JDK_IMAGE=openjdk:17.0.1-jdk-slim
# doris 2.0 use JDK 8
# build 2.0 image, example:
@@ -88,4 +88,4 @@ COPY --chmod=777 ${OUTPUT_PATH} /opt/apache-doris/
RUN sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/be/bin/start_be.sh
RUN if [ -d /opt/apache-doris/ms/bin ]; then \
sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/ms/bin/start.sh ; \
- fi
\ No newline at end of file
+ fi
diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
index 4ba162dfe11..338d619604f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
@@ -152,10 +152,10 @@ public class CloudClusterChecker extends MasterDaemon {
// del clusterName
String delClusterName =
cloudSystemInfoService.getClusterNameByClusterId(delId);
if (delClusterName.isEmpty()) {
- LOG.warn("can't get delClusterName, clusterId: {}, plz
check", delId);
return;
}
// del clusterID
+ MetricRepo.unregisterCloudMetrics(delId, delClusterName,
toDel);
cloudSystemInfoService.dropCluster(delId, delClusterName);
}
);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
index e1648466a27..90c8ea42573 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
@@ -529,6 +529,8 @@ public class CloudInstanceStatusChecker extends MasterDaemon {
// in fe mem, but not in meta server
if (!msVirtualClusters.contains(computeGroup.getId())) {
LOG.info("virtual compute group {} will be removed.",
computeGroup.getName());
+ MetricRepo.unregisterCloudMetrics(computeGroup.getId(),
computeGroup.getName(),
+ Collections.emptyList());
cloudSystemInfoService.removeComputeGroup(computeGroup.getId(),
computeGroup.getName());
// cancel invalid job
if
(!computeGroup.getPolicy().getCacheWarmupJobIds().isEmpty()) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
index f21da5e039a..3daf14e0dfa 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
@@ -408,6 +408,7 @@ public class CloudSystemInfoService extends SystemInfoService {
// ATTN: Empty clusters are treated as dropped clusters.
if (be.isEmpty()) {
LOG.info("del clusterId {} and clusterName {} due to be nodes
eq 0", clusterId, clusterName);
+ MetricRepo.unregisterCloudMetrics(clusterId, clusterName,
toDel);
boolean succ = clusterNameToId.remove(clusterName, clusterId);
// remove from computeGroupIdToComputeGroup
diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java b/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java
index 440e00330b4..5b348c73b15 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java
@@ -38,4 +38,7 @@ public class AutoMappedMetric<M> {
return nameToMetric;
}
+ public void remove(String name) {
+ nameToMetric.remove(name);
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
index 7060817a62a..b7bbfea437b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
@@ -1532,4 +1532,77 @@ public final class MetricRepo {
String key = clusterId + CloudMetrics.CLOUD_CLUSTER_DELIMITER +
clusterName;
CloudMetrics.CLUSTER_QUERY_LATENCY_HISTO.getOrAdd(key).update(elapseMs);
}
+
+ public static void unregisterCloudMetrics(String clusterId, String
clusterName, List<Backend> backends) {
+ if (!MetricRepo.isInit || Config.isNotCloudMode() ||
Strings.isNullOrEmpty(clusterId)) {
+ return;
+ }
+ LOG.debug("unregister cloud metrics for cluster {}", clusterId);
+ try {
+ List<MetricLabel> labels = new ArrayList<>();
+ labels.add(new MetricLabel("cluster_id", clusterId));
+ labels.add(new MetricLabel("cluster_name", clusterName));
+
+ LongCounterMetric requestAllCounter =
CloudMetrics.CLUSTER_REQUEST_ALL_COUNTER.getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_REQUEST_ALL_COUNTER.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(requestAllCounter.getName(),
labels);
+
+ LongCounterMetric queryAllCounter =
CloudMetrics.CLUSTER_QUERY_ALL_COUNTER.getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_QUERY_ALL_COUNTER.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryAllCounter.getName(),
labels);
+
+ LongCounterMetric queryErrCounter =
CloudMetrics.CLUSTER_QUERY_ERR_COUNTER.getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_QUERY_ERR_COUNTER.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryErrCounter.getName(),
labels);
+
+ LongCounterMetric warmUpJobExecCounter =
CloudMetrics.CLUSTER_WARM_UP_JOB_EXEC_COUNT.getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_WARM_UP_JOB_EXEC_COUNT.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(warmUpJobExecCounter.getName(),
labels);
+
+ LongCounterMetric warmUpJobRequestedTablets =
+
CloudMetrics.CLUSTER_WARM_UP_JOB_REQUESTED_TABLETS.getOrAdd(clusterId);
+
CloudMetrics.CLUSTER_WARM_UP_JOB_REQUESTED_TABLETS.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(warmUpJobRequestedTablets.getName(),
labels);
+
+ LongCounterMetric warmUpJobFinishedTablets =
+
CloudMetrics.CLUSTER_WARM_UP_JOB_FINISHED_TABLETS.getOrAdd(clusterId);
+
CloudMetrics.CLUSTER_WARM_UP_JOB_FINISHED_TABLETS.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(warmUpJobFinishedTablets.getName(),
labels);
+
+ GaugeMetricImpl<Double> requestPerSecondGauge =
CloudMetrics.CLUSTER_REQUEST_PER_SECOND_GAUGE
+ .getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_REQUEST_PER_SECOND_GAUGE.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(requestPerSecondGauge.getName(),
labels);
+
+ GaugeMetricImpl<Double> queryPerSecondGauge =
CloudMetrics.CLUSTER_QUERY_PER_SECOND_GAUGE
+ .getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_QUERY_PER_SECOND_GAUGE.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryPerSecondGauge.getName(),
labels);
+
+ GaugeMetricImpl<Double> queryErrRateGauge =
CloudMetrics.CLUSTER_QUERY_ERR_RATE_GAUGE.getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_QUERY_ERR_RATE_GAUGE.remove(clusterId);
+
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryErrRateGauge.getName(),
labels);
+
+ METRIC_REGISTER.getHistograms().keySet().stream()
+ .filter(k -> k.contains(clusterId))
+ .forEach(METRIC_REGISTER::remove);
+
+ for (Backend backend : backends) {
+ List<MetricLabel> backendLabels = new ArrayList<>();
+ backendLabels.add(new MetricLabel("cluster_id", clusterId));
+ backendLabels.add(new MetricLabel("cluster_name",
clusterName));
+ backendLabels.add(new MetricLabel("address",
backend.getAddress()));
+ String key = clusterId + "_" + backend.getAddress();
+ GaugeMetricImpl<Integer> metric =
CloudMetrics.CLUSTER_BACKEND_ALIVE.getOrAdd(key);
+
MetricRepo.DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(metric.getName(),
backendLabels);
+ }
+
+ GaugeMetricImpl<Integer> backendAliveTotal =
CloudMetrics.CLUSTER_BACKEND_ALIVE_TOTAL.getOrAdd(clusterId);
+ CloudMetrics.CLUSTER_BACKEND_ALIVE_TOTAL.remove(clusterId);
+
MetricRepo.DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(backendAliveTotal.getName(),
labels);
+
+ } catch (Throwable t) {
+ LOG.warn("unregister cloud metrics for cluster {} failed",
clusterId, t);
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]