This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new 3ac3bdf16d1 branch-4.0: [fix](cloud) Fix the residual metrics of cluster after drop compute group #57235 (#60055)
3ac3bdf16d1 is described below

commit 3ac3bdf16d1ac5c22fa5515d0376fbf83d4c1007
Author: deardeng <[email protected]>
AuthorDate: Wed Jan 21 05:37:25 2026 +0800

    branch-4.0: [fix](cloud) Fix the residual metrics of cluster after drop compute group #57235 (#60055)
    
    cherry pick from #57235
---
 docker/runtime/doris-compose/Dockerfile            |  4 +-
 .../doris/cloud/catalog/CloudClusterChecker.java   |  2 +-
 .../cloud/catalog/CloudInstanceStatusChecker.java  |  2 +
 .../doris/cloud/system/CloudSystemInfoService.java |  1 +
 .../org/apache/doris/metric/AutoMappedMetric.java  |  3 +
 .../java/org/apache/doris/metric/MetricRepo.java   | 73 ++++++++++++++++++++++
 6 files changed, 82 insertions(+), 3 deletions(-)

diff --git a/docker/runtime/doris-compose/Dockerfile 
b/docker/runtime/doris-compose/Dockerfile
index d3f6f1a22ce..f3b4c77fdea 100644
--- a/docker/runtime/doris-compose/Dockerfile
+++ b/docker/runtime/doris-compose/Dockerfile
@@ -23,7 +23,7 @@
 
 # choose a base image
 # doris 2.1, 3.0+, master use JDK 17
-ARG JDK_IMAGE=openjdk:17-jdk-slim
+ARG JDK_IMAGE=openjdk:17.0.1-jdk-slim
 
 # doris 2.0 use JDK 8
 # build 2.0 image, example:
@@ -88,4 +88,4 @@ COPY --chmod=777 ${OUTPUT_PATH} /opt/apache-doris/
 RUN sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/be/bin/start_be.sh
 RUN if [ -d /opt/apache-doris/ms/bin ]; then                            \
         sed -i 's/\<chmod\>/echo/g' /opt/apache-doris/ms/bin/start.sh ; \
-    fi
\ No newline at end of file
+    fi
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
index 4ba162dfe11..338d619604f 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
@@ -152,10 +152,10 @@ public class CloudClusterChecker extends MasterDaemon {
                 // del clusterName
                 String delClusterName = 
cloudSystemInfoService.getClusterNameByClusterId(delId);
                 if (delClusterName.isEmpty()) {
-                    LOG.warn("can't get delClusterName, clusterId: {}, plz 
check", delId);
                     return;
                 }
                 // del clusterID
+                MetricRepo.unregisterCloudMetrics(delId, delClusterName, 
toDel);
                 cloudSystemInfoService.dropCluster(delId, delClusterName);
             }
         );
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
index e1648466a27..90c8ea42573 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudInstanceStatusChecker.java
@@ -529,6 +529,8 @@ public class CloudInstanceStatusChecker extends 
MasterDaemon {
             // in fe mem, but not in meta server
             if (!msVirtualClusters.contains(computeGroup.getId())) {
                 LOG.info("virtual compute group {} will be removed.", 
computeGroup.getName());
+                MetricRepo.unregisterCloudMetrics(computeGroup.getId(), 
computeGroup.getName(),
+                        Collections.emptyList());
                 
cloudSystemInfoService.removeComputeGroup(computeGroup.getId(), 
computeGroup.getName());
                 // cancel invalid job
                 if 
(!computeGroup.getPolicy().getCacheWarmupJobIds().isEmpty()) {
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
index f21da5e039a..3daf14e0dfa 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
@@ -408,6 +408,7 @@ public class CloudSystemInfoService extends 
SystemInfoService {
             // ATTN: Empty clusters are treated as dropped clusters.
             if (be.isEmpty()) {
                 LOG.info("del clusterId {} and clusterName {} due to be nodes 
eq 0", clusterId, clusterName);
+                MetricRepo.unregisterCloudMetrics(clusterId, clusterName, 
toDel);
                 boolean succ = clusterNameToId.remove(clusterName, clusterId);
 
                 // remove from computeGroupIdToComputeGroup
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java 
b/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java
index 440e00330b4..5b348c73b15 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/AutoMappedMetric.java
@@ -38,4 +38,7 @@ public class AutoMappedMetric<M> {
         return nameToMetric;
     }
 
+    public void remove(String name) {
+        nameToMetric.remove(name);
+    }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java 
b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
index 7060817a62a..b7bbfea437b 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
@@ -1532,4 +1532,77 @@ public final class MetricRepo {
         String key = clusterId + CloudMetrics.CLOUD_CLUSTER_DELIMITER + 
clusterName;
         
CloudMetrics.CLUSTER_QUERY_LATENCY_HISTO.getOrAdd(key).update(elapseMs);
     }
+
+    public static void unregisterCloudMetrics(String clusterId, String 
clusterName, List<Backend> backends) {
+        if (!MetricRepo.isInit || Config.isNotCloudMode() || 
Strings.isNullOrEmpty(clusterId)) {
+            return;
+        }
+        LOG.debug("unregister cloud metrics for cluster {}", clusterId);
+        try {
+            List<MetricLabel> labels = new ArrayList<>();
+            labels.add(new MetricLabel("cluster_id", clusterId));
+            labels.add(new MetricLabel("cluster_name", clusterName));
+
+            LongCounterMetric requestAllCounter = 
CloudMetrics.CLUSTER_REQUEST_ALL_COUNTER.getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_REQUEST_ALL_COUNTER.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(requestAllCounter.getName(), 
labels);
+
+            LongCounterMetric queryAllCounter = 
CloudMetrics.CLUSTER_QUERY_ALL_COUNTER.getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_QUERY_ALL_COUNTER.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryAllCounter.getName(), 
labels);
+
+            LongCounterMetric queryErrCounter = 
CloudMetrics.CLUSTER_QUERY_ERR_COUNTER.getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_QUERY_ERR_COUNTER.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryErrCounter.getName(), 
labels);
+
+            LongCounterMetric warmUpJobExecCounter = 
CloudMetrics.CLUSTER_WARM_UP_JOB_EXEC_COUNT.getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_WARM_UP_JOB_EXEC_COUNT.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(warmUpJobExecCounter.getName(),
 labels);
+
+            LongCounterMetric warmUpJobRequestedTablets =
+                    
CloudMetrics.CLUSTER_WARM_UP_JOB_REQUESTED_TABLETS.getOrAdd(clusterId);
+            
CloudMetrics.CLUSTER_WARM_UP_JOB_REQUESTED_TABLETS.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(warmUpJobRequestedTablets.getName(),
 labels);
+
+            LongCounterMetric warmUpJobFinishedTablets =
+                    
CloudMetrics.CLUSTER_WARM_UP_JOB_FINISHED_TABLETS.getOrAdd(clusterId);
+            
CloudMetrics.CLUSTER_WARM_UP_JOB_FINISHED_TABLETS.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(warmUpJobFinishedTablets.getName(),
 labels);
+
+            GaugeMetricImpl<Double> requestPerSecondGauge = 
CloudMetrics.CLUSTER_REQUEST_PER_SECOND_GAUGE
+                    .getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_REQUEST_PER_SECOND_GAUGE.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(requestPerSecondGauge.getName(),
 labels);
+
+            GaugeMetricImpl<Double> queryPerSecondGauge = 
CloudMetrics.CLUSTER_QUERY_PER_SECOND_GAUGE
+                    .getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_QUERY_PER_SECOND_GAUGE.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryPerSecondGauge.getName(),
 labels);
+
+            GaugeMetricImpl<Double> queryErrRateGauge = 
CloudMetrics.CLUSTER_QUERY_ERR_RATE_GAUGE.getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_QUERY_ERR_RATE_GAUGE.remove(clusterId);
+            
DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(queryErrRateGauge.getName(), 
labels);
+
+            METRIC_REGISTER.getHistograms().keySet().stream()
+                    .filter(k -> k.contains(clusterId))
+                    .forEach(METRIC_REGISTER::remove);
+
+            for (Backend backend : backends) {
+                List<MetricLabel> backendLabels = new ArrayList<>();
+                backendLabels.add(new MetricLabel("cluster_id", clusterId));
+                backendLabels.add(new MetricLabel("cluster_name", 
clusterName));
+                backendLabels.add(new MetricLabel("address", 
backend.getAddress()));
+                String key = clusterId + "_" + backend.getAddress();
+                GaugeMetricImpl<Integer> metric = 
CloudMetrics.CLUSTER_BACKEND_ALIVE.getOrAdd(key);
+                
MetricRepo.DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(metric.getName(), 
backendLabels);
+            }
+
+            GaugeMetricImpl<Integer> backendAliveTotal = 
CloudMetrics.CLUSTER_BACKEND_ALIVE_TOTAL.getOrAdd(clusterId);
+            CloudMetrics.CLUSTER_BACKEND_ALIVE_TOTAL.remove(clusterId);
+            
MetricRepo.DORIS_METRIC_REGISTER.removeMetricsByNameAndLabels(backendAliveTotal.getName(),
 labels);
+
+        } catch (Throwable t) {
+            LOG.warn("unregister cloud metrics for cluster {} failed", 
clusterId, t);
+        }
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to