This is an automated email from the ASF dual-hosted git repository.
gavinchou pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new a4a9e801cf4 [Fix](cloud) Fix cluster status inconsistent with bes and
add config disable auto (#40799)
a4a9e801cf4 is described below
commit a4a9e801cf477746cb3665eaa904d8ea20ceee6c
Author: deardeng <[email protected]>
AuthorDate: Sat Sep 14 19:37:54 2024 +0800
[Fix](cloud) Fix cluster status inconsistent with bes and add config
disable auto (#40799)
1. Add a config switch to enable/disable auto start in cloud mode.
2. Fix inconsistent cluster status in BE tags when adding a BE node.
3. Fix the docker compose error `[INVALID_ARGUMENT]cloud_instance_id in fe.conf
and be.conf are not same, fe: , be: reg_cloud_instance`.
---
.../main/java/org/apache/doris/common/Config.java | 9 ++++++++
.../doris/cloud/catalog/CloudClusterChecker.java | 18 ++++++++++++---
.../doris/cloud/system/CloudSystemInfoService.java | 20 +++++++++++++----
.../main/java/org/apache/doris/system/Backend.java | 2 +-
.../java/org/apache/doris/system/HeartbeatMgr.java | 4 +++-
.../cloud_p0/multi_cluster/test_auto_start.groovy | 26 +++++++++++++++++++++-
6 files changed, 69 insertions(+), 10 deletions(-)
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index 968a307c842..282fbf3a7bc 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -3026,6 +3026,15 @@ public class Config extends ConfigBase {
@ConfField(mutable = true, description =
{"存算分离模式下,当tablet分布的be异常,是否立即映射tablet到新的be上,默认true"})
public static boolean enable_immediate_be_assign = true;
+ @ConfField(mutable = true, description = {"存算分离模式下是否启用自动启停功能,默认true",
+ "Whether to enable the automatic start-stop feature in cloud model,
default is true."})
+ public static boolean enable_auto_start_for_cloud_cluster = true;
+
+ @ConfField(mutable = true, description =
{"存算分离模式下自动启停等待cluster唤醒退避重试次数,默认300次大约5分钟",
+ "The automatic start-stop wait time for cluster wake-up backoff retry
count in the cloud "
+ + "model is set to 300 times, which is approximately 5 minutes by
default."})
+ public static int auto_start_wait_to_resume_times = 300;
+
// ATTN: DONOT add any config not related to cloud mode here
// ATTN: DONOT add any config not related to cloud mode here
// ATTN: DONOT add any config not related to cloud mode here
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
index 567dc4b3124..0dfcf322a0c 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudClusterChecker.java
@@ -219,7 +219,19 @@ public class CloudClusterChecker extends MasterDaemon {
if (LOG.isDebugEnabled()) {
LOG.debug("current cluster status {} {}",
currentClusterStatus, newClusterStatus);
}
- if (!currentClusterStatus.equals(newClusterStatus)) {
+ boolean needChange = false;
+ // ATTN: found bug, In the same cluster, the cluster status in the
tags of BE nodes is inconsistent.
+ // Using a set to collect the cluster statuses from the BE nodes.
+ Set<String> clusterStatusInMem = new HashSet<>();
+ for (Backend backend : currentBes) {
+ String beClusterStatus =
backend.getTagMap().get(Tag.CLOUD_CLUSTER_STATUS);
+ clusterStatusInMem.add(beClusterStatus == null ? "NOT_SET" :
beClusterStatus);
+ }
+ if (clusterStatusInMem.size() != 1) {
+ LOG.warn("cluster {}, multi be nodes cluster status
inconsistent, fix it {}", cid, clusterStatusInMem);
+ needChange = true;
+ }
+ if (!currentClusterStatus.equals(newClusterStatus) || needChange) {
// cluster's status changed
LOG.info("cluster_status corresponding to cluster_id has been
changed,"
+ " cluster_id : {} , current_cluster_status : {},
new_cluster_status :{}",
@@ -426,8 +438,8 @@ public class CloudClusterChecker extends MasterDaemon {
}
return nodeMap;
});
- LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {}",
- expectedFes, currentFes, toAdd, toDel);
+ LOG.info("diffFrontends nodes: {}, current: {}, toAdd: {}, toDel: {},
enable auto start: {}",
+ expectedFes, currentFes, toAdd, toDel,
Config.enable_auto_start_for_cloud_cluster);
if (toAdd.isEmpty() && toDel.isEmpty()) {
if (LOG.isDebugEnabled()) {
LOG.debug("runAfterCatalogReady getObserverFes nothing todo");
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
index 606f52369e5..a91892870d6 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/cloud/system/CloudSystemInfoService.java
@@ -55,6 +55,7 @@ import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
+import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
@@ -567,10 +568,18 @@ public class CloudSystemInfoService extends
SystemInfoService {
}
}
+ public Set<String> getClusterStatus(List<Backend> backends) {
+ // ATTN: found bug, In the same cluster, the cluster status in the
tags of BE nodes is inconsistent.
+ // Using a set to collect the cluster statuses from the BE nodes.
+ return
backends.stream().map(Backend::getCloudClusterStatus).collect(Collectors.toSet());
+ }
+
public String getCloudStatusByIdNoLock(final String clusterId) {
- return clusterIdToBackend.getOrDefault(clusterId, new ArrayList<>())
- .stream().map(Backend::getCloudClusterStatus).findFirst()
- .orElse(String.valueOf(Cloud.ClusterStatus.UNKNOWN));
+ List<Backend> bes = clusterIdToBackend.getOrDefault(clusterId, new
ArrayList<>());
+ Optional<String> hasNormal =
bes.stream().map(Backend::getCloudClusterStatus)
+ .filter(status ->
status.equals(String.valueOf(Cloud.ClusterStatus.NORMAL))).findAny();
+ return hasNormal.orElseGet(() ->
bes.stream().map(Backend::getCloudClusterStatus).findFirst()
+ .orElse(String.valueOf(Cloud.ClusterStatus.NORMAL)));
}
public void updateClusterNameToId(final String newName,
@@ -949,6 +958,9 @@ public class CloudSystemInfoService extends
SystemInfoService {
if (Config.isNotCloudMode()) {
return null;
}
+ if (!Config.enable_auto_start_for_cloud_cluster) {
+ return null;
+ }
clusterName = getClusterNameAutoStart(clusterName);
if (Strings.isNullOrEmpty(clusterName)) {
LOG.warn("auto start in cloud mode, but clusterName empty {}",
clusterName);
@@ -999,7 +1011,7 @@ public class CloudSystemInfoService extends
SystemInfoService {
}
}
// wait 5 mins
- int retryTimes = 5 * 60;
+ int retryTimes = Config.auto_start_wait_to_resume_times < 0 ? 300 :
Config.auto_start_wait_to_resume_times;
int retryTime = 0;
StopWatch stopWatch = new StopWatch();
stopWatch.start();
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
index 876e6ca40b4..01bf800e97e 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/Backend.java
@@ -188,7 +188,7 @@ public class Backend implements Writable {
}
public String getCloudClusterStatus() {
- return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS,
String.valueOf(Cloud.ClusterStatus.UNKNOWN));
+ return tagMap.getOrDefault(Tag.CLOUD_CLUSTER_STATUS,
String.valueOf(Cloud.ClusterStatus.NORMAL));
}
public void setCloudClusterStatus(final String clusterStatus) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
index 5dd8dd9fca1..ee85a242239 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java
@@ -99,7 +99,9 @@ public class HeartbeatMgr extends MasterDaemon {
// Set cloud_instance_id and meta_service_endpoint even if there
are empty
// Be can knowns that fe is working in cloud mode.
// Set the cloud instance ID for cloud deployment identification
- tMasterInfo.setCloudInstanceId(Config.cloud_instance_id);
+ if (!Strings.isNullOrEmpty(Config.cloud_instance_id)) {
+ tMasterInfo.setCloudInstanceId(Config.cloud_instance_id);
+ }
// Set the endpoint for the metadata service in cloud mode
tMasterInfo.setMetaServiceEndpoint(Config.meta_service_endpoint);
}
diff --git
a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
index 2ce9a9d8f4b..d6db6364d38 100644
--- a/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
+++ b/regression-test/suites/cloud_p0/multi_cluster/test_auto_start.groovy
@@ -22,7 +22,7 @@ import org.awaitility.Awaitility;
import org.apache.doris.regression.util.Http
import static java.util.concurrent.TimeUnit.SECONDS;
-suite('test_auto_start_in_cloud', 'multi_cluster') {
+suite('test_auto_start_in_cloud', 'multi_cluster, docker') {
if (!isCloudMode()) {
return;
}
@@ -168,5 +168,29 @@ suite('test_auto_start_in_cloud', 'multi_cluster') {
future1.get()
future2.get()
+
+ tag = getCloudBeTagByName(clusterName)
+ logger.info("tag check = {}", tag)
+ jsonObject = jsonSlurper.parseText(tag)
+ String cluster_status = jsonObject.cloud_cluster_status
+ assertEquals("NORMAL", cluster_status)
+
+ // add 1 nodes, check it status NORMAL
+ cluster.addBackend(1, null)
+ dockerAwaitUntil(5) {
+ result = sql """SHOW BACKENDS"""
+ result.size() == 4
+ }
+
+ def bes = sql_return_maparray "SHOW BACKENDS"
+ bes.each {
+ tag = it.Tag
+ if (!tag.contains(clusterName)) {
+ return
+ }
+ jsonObject = jsonSlurper.parseText(tag)
+ cluster_status = jsonObject.cloud_cluster_status
+ assertEquals("NORMAL", cluster_status)
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]