This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new c4193bdb676 [feat](regression) Add dockers to manage multi docker
clusters (#57236)
c4193bdb676 is described below
commit c4193bdb676d0777cb1480a300bafadfd8f0b6bc
Author: walter <[email protected]>
AuthorDate: Wed Oct 22 22:09:03 2025 +0800
[feat](regression) Add dockers to manage multi docker clusters (#57236)
and support sharing MS between clusters.
---
docker/runtime/doris-compose/Readme.md | 56 +++++++++
docker/runtime/doris-compose/cluster.py | 86 +++++++++++--
docker/runtime/doris-compose/command.py | 116 +++++++++++++++--
docker/runtime/doris-compose/resource/common.sh | 37 ++++++
docker/runtime/doris-compose/resource/init_be.sh | 6 +-
.../runtime/doris-compose/resource/init_cloud.sh | 35 +-----
docker/runtime/doris-compose/resource/init_fe.sh | 17 ++-
docker/runtime/doris-compose/utils.py | 6 +-
.../org/apache/doris/regression/Config.groovy | 1 +
.../org/apache/doris/regression/suite/Suite.groovy | 138 ++++++++++++++++++++-
.../doris/regression/suite/SuiteCluster.groovy | 43 +++++++
.../suites/demo_p0/test_external_ms_cluster.groovy | 83 +++++++++++++
12 files changed, 565 insertions(+), 59 deletions(-)
diff --git a/docker/runtime/doris-compose/Readme.md
b/docker/runtime/doris-compose/Readme.md
index c4b60460e49..d8fbf0f4ed6 100644
--- a/docker/runtime/doris-compose/Readme.md
+++ b/docker/runtime/doris-compose/Readme.md
@@ -207,6 +207,62 @@ steps:
2. Generate regression-conf-custom.groovy: `python
docker/runtime/doris-compose/doris-compose.py config my-cluster
<doris-root-path> --connect-follow-fe`
3. Run regression test: `bash run-regression-test.sh --run -times 1 -parallel
1 -suiteParallel 1 -d cloud/multi_cluster`
+### Multi cloud cluster with shared Meta Service
+
+Doris compose now supports creating multiple cloud clusters that share the
same Meta Service (MS), FDB, and Recycler services. This is useful for testing
cross-cluster operations (such as cloning, backup/restore) under the same Meta
Service instance.
+
+#### Create the first cluster
+
+First, create a complete cloud cluster that will provide MS/FDB/Recycler
services:
+
+```shell
+python docker/runtime/doris-compose/doris-compose.py up cluster1 <image>
--cloud --add-fe-num 1 --add-be-num 3
+```
+
+This creates the first cluster with:
+- 1 FDB node
+- 1 Meta Service (MS) node
+- 1 Recycler node
+- 1 FE node
+- 3 BE nodes
+
+#### Create additional clusters sharing the same MS
+
+Now you can create additional SQL/compute clusters that share the first
cluster's Meta Service:
+
+```shell
+# Create second cluster sharing cluster1's MS
+python docker/runtime/doris-compose/doris-compose.py up cluster2 <image>
--cloud --external-ms cluster1 --instance-id instance_cluster2 --add-fe-num 1
--add-be-num 3
+
+# Create third cluster sharing cluster1's MS
+python docker/runtime/doris-compose/doris-compose.py up cluster3 <image>
--cloud --external-ms cluster1 --instance-id instance_cluster3 --add-fe-num 1
--add-be-num 3
+```
+
+Key points:
+- `--external-ms cluster1`: Specifies that this cluster will use cluster1's
MS/FDB/Recycler services
+- `--instance-id`: Must be unique for each cluster. If not specified, will
auto-generate as `instance_<cluster-name>`
+- The new clusters will NOT create their own MS/FDB/Recycler nodes, saving
resources
+- All clusters share the same object storage and meta service infrastructure
+- Each cluster maintains its own FE/BE nodes for compute isolation
+
+#### Network architecture
+
+When using external MS:
+- Each cluster has its own Docker network
+- Compute clusters join the external MS cluster's network as well
+- DNS resolution is configured automatically for all MS/FDB/Recycler nodes
+- BE and FE nodes can communicate with MS nodes using their container names
+
+#### Validation
+
+Doris compose automatically validates:
+1. External MS cluster exists
+2. External cluster is a cloud cluster
+3. MS and FDB nodes are present
+4. MS and FDB containers are running
+
+If validation fails, you'll get a clear error message explaining what needs to
be fixed.
+
## Problem investigation
### Log
diff --git a/docker/runtime/doris-compose/cluster.py
b/docker/runtime/doris-compose/cluster.py
index b54a9a413d3..0aff78f04cc 100644
--- a/docker/runtime/doris-compose/cluster.py
+++ b/docker/runtime/doris-compose/cluster.py
@@ -440,6 +440,8 @@ class Node(object):
f"{port_name} = {port}"
for port_name, port in self.meta["ports"].items()
]
+ else:
+ cfg.append(f"priority_networks = {self.cluster.get_cidr()}")
return cfg
def docker_ports(self):
@@ -497,15 +499,47 @@ class Node(object):
content["network_mode"] = "host"
else:
content["hostname"] = self.get_name()
- content["networks"] = {
+
+ # Configure container networks: local cluster network + external
MS network (if any)
+ networks = {
utils.with_doris_prefix(self.cluster.name): {
"ipv4_address": self.get_ip(),
}
}
+
+ # If using external MS cluster, let the container join the
external MS cluster network
+ if self.cluster.external_ms_cluster:
+ external_network_name =
utils.get_network_name(self.cluster.external_ms_cluster)
+ networks[external_network_name] = {}
+ LOG.debug(f"Node {self.get_name()} joins external network:
{external_network_name}")
+
+ content["networks"] = networks
+
extra_hosts.extend([
"{}:{}".format(node.get_name(), node.get_ip())
for node in self.cluster.get_all_nodes()
])
+
+ # Add external MS cluster nodes to extra_hosts
+ if self.cluster.external_ms_cluster:
+ try:
+ external_cluster =
Cluster.load(self.cluster.external_ms_cluster)
+ for ms_node in
external_cluster.get_all_nodes(Node.TYPE_MS):
+ extra_hosts.append(
+ "{}:{}".format(ms_node.get_name(),
ms_node.get_ip())
+ )
+ for fdb_node in
external_cluster.get_all_nodes(Node.TYPE_FDB):
+ extra_hosts.append(
+ "{}:{}".format(fdb_node.get_name(),
fdb_node.get_ip())
+ )
+ for recycle_node in
external_cluster.get_all_nodes(Node.TYPE_RECYCLE):
+ extra_hosts.append(
+ "{}:{}".format(recycle_node.get_name(),
recycle_node.get_ip())
+ )
+ LOG.debug(f"Added external MS cluster hosts for
{self.get_name()}")
+ except Exception as e:
+ LOG.warning(f"Failed to add external MS cluster hosts:
{e}")
+
content["ports"] = self.docker_ports()
user_hosts = getattr(self.cluster, "extra_hosts", [])
if user_hosts:
@@ -573,8 +607,14 @@ class FE(Node):
def docker_env(self):
envs = super().docker_env()
+ # Create instance when using external MS cluster, and pass cloud store
config
+ if self.cluster.external_ms_cluster:
+ envs["AUTO_CREATE_INSTANCE"] = 1
+ for key, value in self.cluster.cloud_store_config.items():
+ envs[key] = value
if self.cluster.is_cloud:
envs["CLOUD_UNIQUE_ID"] = self.cloud_unique_id()
+ envs["INSTANCE_ID"] = self.cluster.instance_id
if self.meta["is_cloud_follower"]:
envs["IS_FE_FOLLOWER"] = 1
envs["MY_QUERY_PORT"] = self.meta["ports"]["query_port"]
@@ -592,7 +632,7 @@ class FE(Node):
}
def cloud_unique_id(self):
- return "sql_server_{}".format(self.id)
+ return "{}_sql_server_{}".format(self.cluster.name, self.id)
def start_script(self):
return ["init_fe.sh"]
@@ -693,6 +733,7 @@ class BE(Node):
"heartbeat_service_port"]
if self.cluster.is_cloud:
envs["CLOUD_UNIQUE_ID"] = self.cloud_unique_id()
+ envs["INSTANCE_ID"] = self.cluster.instance_id
envs["REG_BE_TO_MS"] = 1 if self.cluster.reg_be else 0
envs["CLUSTER_NAME"] = self.meta["cluster_name"]
return envs
@@ -707,7 +748,7 @@ class BE(Node):
}
def cloud_unique_id(self):
- return "compute_node_{}".format(self.id)
+ return "{}_compute_node_{}".format(self.cluster.name, self.id)
def docker_home_dir(self):
return os.path.join(DOCKER_DORIS_PATH, "be")
@@ -757,6 +798,7 @@ class MS(CLOUD):
def docker_env(self):
envs = super().docker_env()
+ envs["INSTANCE_ID"] = self.cluster.instance_id
for key, value in self.cluster.cloud_store_config.items():
envs[key] = value
return envs
@@ -818,7 +860,8 @@ class Cluster(object):
be_config, ms_config, recycle_config, remote_master_fe,
local_network_ip, fe_follower, be_disks, be_cluster, reg_be,
extra_hosts, coverage_dir, cloud_store_config,
- sql_mode_node_mgr, be_metaservice_endpoint, be_cluster_id,
tde_ak, tde_sk):
+ sql_mode_node_mgr, be_metaservice_endpoint, be_cluster_id,
tde_ak, tde_sk,
+ external_ms_cluster, instance_id):
self.name = name
self.subnet = subnet
self.image = image
@@ -837,6 +880,10 @@ class Cluster(object):
self.extra_hosts = extra_hosts
self.coverage_dir = coverage_dir
self.cloud_store_config = cloud_store_config
+ self.external_ms_cluster = external_ms_cluster
+ self.instance_id = instance_id
+ if not self.instance_id:
+ self.instance_id = f"instance_{name}" if self.external_ms_cluster
else "default_instance_id"
self.groups = {
node_type: Group(node_type)
for node_type in Node.TYPE_ALL
@@ -855,7 +902,8 @@ class Cluster(object):
ms_config, recycle_config, remote_master_fe, local_network_ip,
fe_follower, be_disks, be_cluster, reg_be, extra_hosts,
coverage_dir, cloud_store_config, sql_mode_node_mgr,
- be_metaservice_endpoint, be_cluster_id, tde_ak, tde_sk):
+ be_metaservice_endpoint, be_cluster_id, tde_ak, tde_sk,
+ external_ms_cluster, instance_id):
if not os.path.exists(LOCAL_DORIS_PATH):
os.makedirs(LOCAL_DORIS_PATH, exist_ok=True)
os.chmod(LOCAL_DORIS_PATH, 0o777)
@@ -870,7 +918,7 @@ class Cluster(object):
be_disks, be_cluster, reg_be, extra_hosts,
coverage_dir, cloud_store_config,
sql_mode_node_mgr, be_metaservice_endpoint,
- be_cluster_id, tde_ak, tde_sk)
+ be_cluster_id, tde_ak, tde_sk,
external_ms_cluster, instance_id)
os.makedirs(cluster.get_path(), exist_ok=True)
os.makedirs(get_status_path(name), exist_ok=True)
cluster._save_meta()
@@ -996,20 +1044,32 @@ class Cluster(object):
return node
def get_fdb_cluster(self):
+ if self.external_ms_cluster:
+ external_cluster = Cluster.load(self.external_ms_cluster)
+ return external_cluster.get_fdb_cluster()
fdb = self.get_node(Node.TYPE_FDB, 1)
return "123456:123456@{}:{}".format(fdb.get_ip(),
fdb.meta["ports"]["fdb_port"])
def get_meta_server_addr(self):
+ if self.external_ms_cluster:
+ external_cluster = Cluster.load(self.external_ms_cluster)
+ return external_cluster.get_meta_server_addr()
meta_server = self.get_node(Node.TYPE_MS, 1)
return "{}:{}".format(meta_server.get_ip(),
meta_server.meta["ports"]["brpc_listen_port"])
def get_recycle_addr(self):
+ if self.external_ms_cluster:
+ external_cluster = Cluster.load(self.external_ms_cluster)
+ return external_cluster.get_recycle_addr()
recycler = self.get_node(Node.TYPE_RECYCLE, 1)
return "{}:{}".format(recycler.get_ip(),
recycler.meta["ports"]["brpc_listen_port"])
+ def get_cidr(self):
+ return "{}.0.0/16".format(self.subnet)
+
def remove(self, node_type, id):
group = self.get_group(node_type)
group.remove(id)
@@ -1032,17 +1092,27 @@ class Cluster(object):
"services": services,
}
if not self.is_host_network():
- compose["networks"] = {
+ networks = {
utils.with_doris_prefix(self.name): {
"driver": "bridge",
"ipam": {
"config": [{
- "subnet": "{}.0.0/16".format(self.subnet),
+ "subnet": self.get_cidr(),
}]
},
},
}
+ # If using external MS cluster, declare the external network
+ if self.external_ms_cluster:
+ external_network_name =
utils.get_network_name(self.external_ms_cluster)
+ networks[external_network_name] = {
+ "external": True
+ }
+ LOG.debug(f"Added external network: {external_network_name}")
+
+ compose["networks"] = networks
+
utils.write_compose_file(self.get_compose_file(), compose)
def get_compose_file(self):
diff --git a/docker/runtime/doris-compose/command.py
b/docker/runtime/doris-compose/command.py
index c4a4f9ceff8..2e0fee727ca 100644
--- a/docker/runtime/doris-compose/command.py
+++ b/docker/runtime/doris-compose/command.py
@@ -462,6 +462,25 @@ class UpCommand(Command):
"Only use when creating new cluster and specify
--remote-master-fe."
)
+ parser.add_argument(
+ "--external-ms",
+ type=str,
+ help=
+ "Use external meta service cluster (specify cluster name). " \
+ "This cluster will not create its own MS/FDB/Recycler, but use the
specified cluster's services. " \
+ "The external cluster must be a cloud cluster with MS/FDB already
running. " \
+ "Example: --external-ms shared-meta. Only use when creating new
cloud cluster."
+ )
+
+ parser.add_argument(
+ "--instance-id",
+ type=str,
+ help=
+ "Specify instance ID for cloud mode. If not specified, will
auto-generate 'default_instance_id'. " \
+ "When using external MS with multiple clusters, each cluster
should have a unique instance ID. " \
+ "Example: --instance-id prod_instance_1"
+ )
+
if self._support_boolean_action():
parser.add_argument(
"--be-metaservice-endpoint",
@@ -583,17 +602,29 @@ class UpCommand(Command):
cloud_store_config = {}
if args.cloud:
- add_fdb_num = 1
- if not args.add_ms_num:
- args.add_ms_num = 1
- if not args.add_recycle_num:
- args.add_recycle_num = 1
+ external_ms_cluster = getattr(args, 'external_ms', None)
+ if external_ms_cluster:
+ # Using the MS nodes from external cluster, no need to add
FDB/MS/Recycler
+ self._validate_external_ms_cluster(external_ms_cluster)
+ add_fdb_num = 0
+ args.add_ms_num = 0
+ args.add_recycle_num = 0
+ LOG.info(f"Using external MS cluster:
{external_ms_cluster}")
+ else:
+ add_fdb_num = 1
+ if not args.add_ms_num:
+ args.add_ms_num = 1
+ if not args.add_recycle_num:
+ args.add_recycle_num = 1
+ external_ms_cluster = None
+
if not args.be_cluster:
args.be_cluster = "compute_cluster"
cloud_store_config = self._get_cloud_store_config()
else:
args.add_ms_num = 0
args.add_recycle_num = 0
+ external_ms_cluster = None
if args.remote_master_fe:
if not args.local_network_ip:
@@ -609,13 +640,16 @@ class UpCommand(Command):
if args.cloud:
args.sql_mode_node_mgr = True
+ instance_id = getattr(args, 'instance_id', None)
+
cluster = CLUSTER.Cluster.new(
args.NAME, args.IMAGE, args.cloud, args.root, args.fe_config,
args.be_config, args.ms_config, args.recycle_config,
args.remote_master_fe, args.local_network_ip, args.fe_follower,
args.be_disks, args.be_cluster, args.reg_be, args.extra_hosts,
args.coverage_dir, cloud_store_config, args.sql_mode_node_mgr,
- args.be_metaservice_endpoint, args.be_cluster_id, args.tde_ak,
args.tde_sk)
+ args.be_metaservice_endpoint, args.be_cluster_id, args.tde_ak,
args.tde_sk,
+ external_ms_cluster, instance_id)
LOG.info("Create new cluster {} succ, cluster path is {}".format(
args.NAME, cluster.get_path()))
@@ -825,6 +859,70 @@ class UpCommand(Command):
},
}
+ def _validate_external_ms_cluster(self, external_ms_cluster_name):
+ # 1. Does the external cluster exist?
+ try:
+ external_cluster = CLUSTER.Cluster.load(external_ms_cluster_name)
+ except Exception as e:
+ raise Exception(
+ f"External MS cluster '{external_ms_cluster_name}' not found. "
+ f"Please create it first with: "
+ f"python doris-compose.py up {external_ms_cluster_name}
<image> --cloud --add-fe-num 0 --add-be-num 0"
+ ) from e
+
+ # 2. Is the external cluster a cloud cluster?
+ if not external_cluster.is_cloud:
+ raise Exception(
+ f"External MS cluster '{external_ms_cluster_name}' is not a
cloud cluster. "
+ f"Only cloud clusters can be used as external MS."
+ )
+
+ # 3. Does the external cluster have MS and FDB nodes?
+ ms_group = external_cluster.get_group(CLUSTER.Node.TYPE_MS)
+ fdb_group = external_cluster.get_group(CLUSTER.Node.TYPE_FDB)
+
+ if ms_group.get_node_num() == 0:
+ raise Exception(
+ f"External MS cluster '{external_ms_cluster_name}' has no MS
nodes. "
+ f"Please add MS nodes first."
+ )
+
+ if fdb_group.get_node_num() == 0:
+ raise Exception(
+ f"External MS cluster '{external_ms_cluster_name}' has no FDB
nodes. "
+ f"Please add FDB nodes first."
+ )
+
+ # 4. Are the MS and FDB containers running?
+ containers =
utils.get_doris_running_containers(external_ms_cluster_name)
+
+ ms_running = False
+ fdb_running = False
+ for container_name in containers.keys():
+ _, node_type, _ = utils.parse_service_name(container_name)
+ if node_type == CLUSTER.Node.TYPE_MS:
+ ms_running = True
+ elif node_type == CLUSTER.Node.TYPE_FDB:
+ fdb_running = True
+
+ if not ms_running:
+ raise Exception(
+ f"External MS cluster '{external_ms_cluster_name}' MS node is
not running. "
+ f"Please start it with: python doris-compose.py start
{external_ms_cluster_name}"
+ )
+
+ if not fdb_running:
+ raise Exception(
+ f"External MS cluster '{external_ms_cluster_name}' FDB node is
not running. "
+ f"Please start it with: python doris-compose.py start
{external_ms_cluster_name}"
+ )
+
+ LOG.info(utils.render_green(
+ f"✓ External MS cluster '{external_ms_cluster_name}' validation
passed: "
+ f"MS={external_cluster.get_meta_server_addr()}, "
+ f"FDB={external_cluster.get_fdb_cluster()}"
+ ))
+
def _get_cloud_store_config(self):
example_cfg_file = os.path.join(CLUSTER.LOCAL_RESOURCE_PATH,
"cloud.ini.example")
@@ -1394,9 +1492,9 @@ class ListCommand(Command):
if cluster and cluster.is_host_network():
node.ip = cluster.local_network_ip
else:
- node.ip = list(
- container.attrs["NetworkSettings"]["Networks"].
- values())[0]["IPAMConfig"]["IPv4Address"]
+ network_name = utils.get_network_name(cluster.name)
+ node.ip =
container.attrs["NetworkSettings"]["Networks"][network_name] \
+ ["IPAMConfig"]["IPv4Address"]
node.image = container.attrs["Config"]["Image"]
if not node.image:
node.image = ",".join(container.image.tags)
diff --git a/docker/runtime/doris-compose/resource/common.sh
b/docker/runtime/doris-compose/resource/common.sh
index 2c53ca587a5..63073fbab15 100644
--- a/docker/runtime/doris-compose/resource/common.sh
+++ b/docker/runtime/doris-compose/resource/common.sh
@@ -147,3 +147,40 @@ wait_pid() {
health_log "wait end"
}
+
+create_doris_instance() {
+ while true; do
+
+ lock_cluster
+
+ output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/create_instance?token=greedisgood9999"
\
+ -d '{"instance_id":"'"${INSTANCE_ID}"'",
+ "name": "'"${INSTANCE_ID}"'",
+ "user_id": "'"${DORIS_CLOUD_USER}"'",
+ "obj_info": {
+ "ak": "'"${DORIS_CLOUD_AK}"'",
+ "sk": "'"${DORIS_CLOUD_SK}"'",
+ "bucket": "'"${DORIS_CLOUD_BUCKET}"'",
+ "endpoint": "'"${DORIS_CLOUD_ENDPOINT}"'",
+ "external_endpoint":
"'"${DORIS_CLOUD_EXTERNAL_ENDPOINT}"'",
+ "prefix": "'"${DORIS_CLOUD_PREFIX}"'",
+ "region": "'"${DORIS_CLOUD_REGION}"'",
+ "provider": "'"${DORIS_CLOUD_PROVIDER}"'"
+ }}')
+
+ unlock_cluster
+
+ health_log "create instance output: $output"
+ code=$(jq -r '.code' <<<$output)
+
+ if [ "$code" != "OK" ]; then
+ health_log "create instance failed"
+ sleep 1
+ continue
+ fi
+
+ health_log "create doris instance succ, output: $output"
+ touch $HAS_CREATE_INSTANCE_FILE
+ break
+ done
+}
diff --git a/docker/runtime/doris-compose/resource/init_be.sh
b/docker/runtime/doris-compose/resource/init_be.sh
index e4ac48bda76..c8716d02da6 100755
--- a/docker/runtime/doris-compose/resource/init_be.sh
+++ b/docker/runtime/doris-compose/resource/init_be.sh
@@ -73,7 +73,7 @@ add_cloud_be() {
lock_cluster
output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/add_cluster?token=greedisgood9999" \
- -d '{"instance_id": "default_instance_id",
+ -d '{"instance_id": "'"${INSTANCE_ID}"'",
"cluster": {
"type": "COMPUTE",
"cluster_name": "'"${cluster_name}"'",
@@ -87,7 +87,7 @@ add_cloud_be() {
# cluster has exists
if [ "$code" == "ALREADY_EXISTED" ]; then
output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/add_node?token=greedisgood9999" \
- -d '{"instance_id": "default_instance_id",
+ -d '{"instance_id": "'"${INSTANCE_ID}"'",
"cluster": {
"type": "COMPUTE",
"cluster_name": "'"${cluster_name}"'",
@@ -107,7 +107,7 @@ add_cloud_be() {
fi
output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/get_cluster?token=greedisgood9999" \
- -d '{"instance_id": "default_instance_id",
+ -d '{"instance_id": "'"${INSTANCE_ID}"'",
"cloud_unique_id": "'"${CLOUD_UNIQUE_ID}"'",
"cluster_name": "'"${cluster_name}"'",
"cluster_id": "'"${cluster_id}"'"
diff --git a/docker/runtime/doris-compose/resource/init_cloud.sh
b/docker/runtime/doris-compose/resource/init_cloud.sh
index 22883ab2e40..dc0a1745060 100644
--- a/docker/runtime/doris-compose/resource/init_cloud.sh
+++ b/docker/runtime/doris-compose/resource/init_cloud.sh
@@ -53,40 +53,7 @@ check_init_cloud() {
return
fi
- while true; do
-
- lock_cluster
-
- output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/create_instance?token=greedisgood9999"
\
- -d '{"instance_id":"default_instance_id",
- "name": "default_instance",
- "user_id": "'"${DORIS_CLOUD_USER}"'",
- "obj_info": {
- "ak": "'"${DORIS_CLOUD_AK}"'",
- "sk": "'"${DORIS_CLOUD_SK}"'",
- "bucket": "'"${DORIS_CLOUD_BUCKET}"'",
- "endpoint": "'"${DORIS_CLOUD_ENDPOINT}"'",
- "external_endpoint":
"'"${DORIS_CLOUD_EXTERNAL_ENDPOINT}"'",
- "prefix": "'"${DORIS_CLOUD_PREFIX}"'",
- "region": "'"${DORIS_CLOUD_REGION}"'",
- "provider": "'"${DORIS_CLOUD_PROVIDER}"'"
- }}')
-
- unlock_cluster
-
- health_log "create instance output: $output"
- code=$(jq -r '.code' <<<$output)
-
- if [ "$code" != "OK" ]; then
- health_log "create instance failed"
- sleep 1
- continue
- fi
-
- health_log "create doris instance succ, output: $output"
- touch $HAS_CREATE_INSTANCE_FILE
- break
- done
+ create_doris_instance
}
stop_cloud() {
diff --git a/docker/runtime/doris-compose/resource/init_fe.sh
b/docker/runtime/doris-compose/resource/init_fe.sh
index 4e846ed182f..68c70f92821 100755
--- a/docker/runtime/doris-compose/resource/init_fe.sh
+++ b/docker/runtime/doris-compose/resource/init_fe.sh
@@ -115,7 +115,18 @@ start_cloud_fe() {
return
fi
- wait_create_instance
+ # Support to create instance in FE startup.
+ AUTO_CREATE_INSTANCE=${AUTO_CREATE_INSTANCE:-"0"}
+ if [ "a$MY_ID" == "a1" ] && [ "a$AUTO_CREATE_INSTANCE" == "a1" ]; then
+ health_log "auto create instance is enabled, trying to create instance"
+ if [ -f $HAS_CREATE_INSTANCE_FILE ]; then
+ health_log "instance has been created before, skip create instance"
+ else
+ create_doris_instance
+ fi
+ else
+ wait_create_instance
+ fi
action=add_cluster
node_type=FE_MASTER
@@ -139,7 +150,7 @@ start_cloud_fe() {
lock_cluster
output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/${action}?token=greedisgood9999" \
- -d '{"instance_id": "default_instance_id",
+ -d '{"instance_id": "'"${INSTANCE_ID}"'",
"cluster": {
"type": "SQL",
"cluster_name": "RESERVED_CLUSTER_NAME_FOR_SQL_SERVER",
@@ -158,7 +169,7 @@ start_cloud_fe() {
fi
output=$(curl -s
"${META_SERVICE_ENDPOINT}/MetaService/http/get_cluster?token=greedisgood9999" \
- -d '{"instance_id": "default_instance_id",
+ -d '{"instance_id": "'"${INSTANCE_ID}"'",
"cloud_unique_id": "'"${CLOUD_UNIQUE_ID}"'",
"cluster_name": "RESERVED_CLUSTER_NAME_FOR_SQL_SERVER",
"cluster_id": "RESERVED_CLUSTER_ID_FOR_SQL_SERVER"}')
diff --git a/docker/runtime/doris-compose/utils.py
b/docker/runtime/doris-compose/utils.py
index 4ff87ef2229..917c83426e4 100644
--- a/docker/runtime/doris-compose/utils.py
+++ b/docker/runtime/doris-compose/utils.py
@@ -177,10 +177,14 @@ def get_doris_running_containers(cluster_name):
}
+def get_network_name(cluster_name):
+ return cluster_name + "_" + with_doris_prefix(cluster_name)
+
+
def remove_docker_network(cluster_name):
client = docker.client.from_env()
for network in client.networks.list(
- names=[cluster_name + "_" + with_doris_prefix(cluster_name)]):
+ names=[get_network_name(cluster_name)]):
network.remove()
diff --git
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
index 3cc4ba72b71..1b98e808987 100644
---
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
+++
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
@@ -660,6 +660,7 @@ class Config {
config.ccrDownstreamUser = configToString(obj.ccrDownstreamUser)
config.ccrDownstreamPassword =
configToString(obj.ccrDownstreamPassword)
config.image = configToString(obj.image)
+ config.dorisComposePath = configToString(obj.dorisComposePath)
config.dockerCoverageOutputDir =
configToString(obj.dockerCoverageOutputDir)
config.dockerEndDeleteFiles = configToBoolean(obj.dockerEndDeleteFiles)
config.dockerEndNoKill = configToBoolean(obj.dockerEndNoKill)
diff --git
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
index d94e25e5caf..7565028db19 100644
---
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
+++
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
@@ -278,6 +278,16 @@ class Suite implements GroovyInterceptable {
return context.connect(user, password, url, actionSupplier)
}
+ public <T> T connectWithDockerCluster(
+ SuiteCluster cluster,
+ Boolean connectToFollower = false,
+ String user = context.config.jdbcUser,
+ String password = context.config.jdbcPassword,
+ Closure<T> actionSupplier) {
+ def jdbcUrl = cluster.getJdbcUrl(connectToFollower)
+ return context.connect(user, password, jdbcUrl, actionSupplier)
+ }
+
public <T> T connectInDocker(String user = context.config.jdbcUser, String
password = context.config.jdbcPassword,
Closure<T> actionSupplier) {
def connInfo = context.threadLocalConn.get()
@@ -336,7 +346,6 @@ class Suite implements GroovyInterceptable {
}
}
-
private void dockerImpl(ClusterOptions options, boolean isCloud, Closure
actionSupplier) throws Exception {
logger.info("=== start run suite {} in {} mode. ===", name, (isCloud ?
"cloud" : "not_cloud"))
def originConnection = context.threadLocalConn.get()
@@ -384,6 +393,133 @@ class Suite implements GroovyInterceptable {
}
}
+ /**
+ * Create and manage multiple Docker clusters for multi-cluster test
scenarios.
+ *
+ * Usage example:
+ * dockers([
+ * "cluster_1": new ClusterOptions(cloudMode: true, feNum: 1, beNum:
1, msNum: 1),
+ * "cluster_2": new ClusterOptions(cloudMode: true, feNum: 1, beNum:
1, msNum: 0, externalMsCluster: "cluster_1")
+ * ]) { clusters ->
+ * connectWithDockerCluster(clusters.cluster_1) { sql "..." }
+ * connectWithDockerCluster(clusters.cluster_2) { sql "..." }
+ * }
+ *
+ * Important:
+ * - Must use LinkedHashMap to preserve insertion order
+ * - Clusters are created in map insertion order
+ * - Clusters are destroyed in reverse order (dependent clusters first)
+ * - If using externalMsCluster, the referenced cluster must appear
earlier in the map
+ *
+ * @param clusterConfigs LinkedHashMap of cluster name to ClusterOptions
+ * @param actionSupplier Closure receiving Map<String, SuiteCluster> for
test execution
+ */
+ void dockers(LinkedHashMap<String, ClusterOptions> clusterConfigs, Closure
actionSupplier) throws Exception {
+ if (context.config.excludeDockerTest) {
+ logger.info("do not run the docker suite {}, because regression
config excludeDockerTest=true", name)
+ return
+ }
+
+ if (RegressionTest.getGroupExecType(group) !=
RegressionTest.GroupExecType.DOCKER) {
+ throw new Exception("Need to add 'docker' to docker suite's belong
groups, "
+ + "see example demo_p0/docker_action.groovy")
+ }
+
+ // Validate cluster configs
+ Set<String> clusterNames = new HashSet<>()
+ for (def entry : clusterConfigs.entrySet()) {
+ String clusterName = entry.key
+ ClusterOptions options = entry.value
+
+ if (clusterNames.contains(clusterName)) {
+ throw new Exception("Duplicate cluster name: ${clusterName}")
+ }
+ clusterNames.add(clusterName)
+
+ // Validate externalMsCluster reference
+ if (options.externalMsCluster != null &&
!options.externalMsCluster.isEmpty()) {
+ if (!clusterNames.contains(options.externalMsCluster)) {
+ throw new Exception("Cluster ${clusterName} references
non-existent external MS cluster: ${options.externalMsCluster}")
+ }
+ if (options.msNum > 0) {
+ throw new Exception("Cluster ${clusterName} cannot have
its own MS when using external MS cluster")
+ }
+ }
+ }
+
+ List<String> clusterNamesReversed = new
ArrayList<>(clusterConfigs.keySet())
+ Collections.reverse(clusterNamesReversed)
+
+ // Use LinkedHashMap to preserve order
+ Map<String, SuiteCluster> clusters = new LinkedHashMap<>()
+
+ try {
+ // Create and initialize clusters in order
+ for (def entry : clusterConfigs.entrySet()) {
+ String clusterName = entry.key
+ ClusterOptions options = entry.value
+
+ logger.info("Creating cluster: ${clusterName}")
+ SuiteCluster cluster = new SuiteCluster(clusterName,
context.config)
+
+ clusters.put(clusterName, cluster)
+ }
+
+ for (String clusterName : clusterNamesReversed) {
+ clusters.get(clusterName).destroy(true)
+ }
+
+ for (def entry : clusterConfigs.entrySet()) {
+ String clusterName = entry.key
+ ClusterOptions options = entry.value
+ SuiteCluster cluster = clusters.get(clusterName)
+
+ // Determine cloud mode
+ boolean isCloud = false
+ if (options.cloudMode == null) {
+ // If not specified, use config default or run both modes
+ if (context.config.runMode == RunMode.CLOUD) {
+ isCloud = true
+ } else if (context.config.runMode == RunMode.NOT_CLOUD) {
+ isCloud = false
+ } else {
+ throw new Exception("cloudMode must be specified when
runMode is UNKNOWN for multi-cluster setup")
+ }
+ } else {
+ if (options.cloudMode == true && context.config.runMode ==
RunMode.NOT_CLOUD) {
+ logger.info("Skip cluster ${clusterName} because
cloudMode=true but regression test is in local mode")
+ continue
+ }
+ if (options.cloudMode == false && context.config.runMode
== RunMode.CLOUD) {
+ logger.info("Skip cluster ${clusterName} because
cloudMode=false but regression test is in cloud mode")
+ continue
+ }
+ isCloud = options.cloudMode
+ }
+ logger.info("Initializing cluster ${cluster.name} in ${isCloud
? 'cloud' : 'not_cloud'} mode")
+ cluster.init(options, isCloud)
+ logger.info("Cluster ${clusterName} initialized successfully")
+ }
+
+ // Wait for BE to report
+ Thread.sleep(5000)
+
+ actionSupplier.call(clusters)
+ } finally {
+ // Destroy clusters in reverse order
+ if (!context.config.dockerEndNoKill) {
+ for (String clusterName : clusterNamesReversed) {
+ try {
+ logger.info("Destroying cluster: ${clusterName}")
+
clusters.get(clusterName).destroy(context.config.dockerEndDeleteFiles)
+ } catch (Throwable t) {
+ logger.warn("Failed to destroy cluster
${clusterName}", t)
+ }
+ }
+ }
+ }
+ }
+
String get_ccr_body(String table, String db = null) {
if (db == null) {
db = context.dbName
diff --git
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
index cb9c34b064a..536889fc0e2 100644
---
a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
+++
b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
@@ -94,6 +94,16 @@ class ClusterOptions {
String tdeAk = "";
String tdeSk = "";
+ // Use external meta service cluster (shared MS/FDB)
+ // Specify the cluster name that provides MS/FDB services
+ // When set, this cluster will not create its own MS/FDB/Recycler
+ // Example: externalMsCluster = "shared-meta" (Cloud mode only)
+ String externalMsCluster = null
+
+ // Specify the instance id.
+ // When not set, "default_instance_id" will be used. (Cloud mode only)
+ String instanceId = null;
+
void enableDebugPoints() {
feConfigs.add('enable_debug_points=true')
beConfigs.add('enable_debug_points=true')
@@ -380,6 +390,14 @@ class SuiteCluster {
cmd += options.tdeSk
}
+ if (options.externalMsCluster != null && options.externalMsCluster !=
"") {
+ cmd += ['--external-ms', options.externalMsCluster]
+ }
+
+ if (options.instanceId != null && options.instanceId != "") {
+ cmd += ['--instance-id', options.instanceId]
+ }
+
cmd += ['--wait-timeout', String.valueOf(options.waitTimeout)]
sqlModeNodeMgr = options.sqlModeNodeMgr
@@ -392,6 +410,31 @@ class SuiteCluster {
running = true
}
+ String getJdbcUrl(boolean connectToFollower) {
+ def user = config.jdbcUser
+ def password = config.jdbcPassword
+ Frontend fe = null
+ for (def i=0; (fe == null || !fe.alive) && i<30; i++) {
+ if (connectToFollower) {
+ fe = getOneFollowerFe()
+ } else {
+ fe = getMasterFe()
+ }
+ Thread.sleep(1000)
+ }
+
+ if (fe == null) {
+ throw new Exception('No available frontend found in cluster: ' +
name)
+ }
+
+ logger.info("get fe host {} , queryPort {}", fe.host, fe.queryPort)
+
+ jdbcUrl = String.format(
+
"jdbc:mysql://%s:%s/?useLocalSessionState=true&allowLoadLocalInfile=false",
+ fe.host, fe.queryPort)
+ return jdbcUrl
+ }
+
void injectDebugPoints(NodeType type, Map<String, Map<String, String>>
injectPoints) {
if (injectPoints == null || injectPoints.isEmpty()) {
return
diff --git a/regression-test/suites/demo_p0/test_external_ms_cluster.groovy
b/regression-test/suites/demo_p0/test_external_ms_cluster.groovy
new file mode 100644
index 00000000000..17d53ea5977
--- /dev/null
+++ b/regression-test/suites/demo_p0/test_external_ms_cluster.groovy
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.suite.SuiteCluster
+
+suite("test_external_ms_cluster", "docker") {
+
+ // This test demonstrates how to use a shared MS/FDB to create multiple
isolated Doris clusters.
+ // Scenario:
+ // Create two clusters where the second one shares the first one's
+ // MS/FDB/Recycler services (the second cluster creates no MS/FDB of
+ // its own, only FE/BE).
+
+ // ATTN: This test only runs in cloud mode.
+ if (!isCloudMode()) {
+ logger.info("Skip test_external_ms_cluster because not in cloud mode")
+ return
+ }
+
+ def opt1 = new ClusterOptions(
+ cloudMode: true, feNum: 1, beNum: 1, msNum: 1)
+
+ def opt2 = new ClusterOptions(
+ cloudMode: true, feNum: 1, beNum: 1, msNum: 0, externalMsCluster:
"cluster_2")
+
+ // cluster_1 depends on cluster_2's MS/FDB
+ dockers([
+ "cluster_2": opt1,
+ "cluster_1": opt2
+ ]) { clusters ->
+ connectWithDockerCluster(clusters.cluster_2) {
+ // Create database and table on cluster_2
+ sql "CREATE DATABASE IF NOT EXISTS test_db"
+ sql "USE test_db"
+ sql """
+ CREATE TABLE IF NOT EXISTS test_table (
+ id INT,
+ name VARCHAR(100)
+ ) DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 3
+ PROPERTIES ("replication_num" = "1")
+ """
+ sql "INSERT INTO test_table VALUES (1, 'cluster1_data')"
+
+ def result1 = sql "SELECT * FROM test_table"
+ logger.info("Cluster1 data: ${result1}")
+ assert result1.size() == 1
+ }
+
+ connectWithDockerCluster(clusters.cluster_1) {
+ // Create different database and table on cluster_1
+ sql "CREATE DATABASE IF NOT EXISTS test_db"
+ sql "USE test_db"
+ sql """
+ CREATE TABLE IF NOT EXISTS test_table (
+ id INT,
+ value VARCHAR(100)
+ ) DUPLICATE KEY(id)
+ DISTRIBUTED BY HASH(id) BUCKETS 3
+ PROPERTIES ("replication_num" = "1")
+ """
+ sql "INSERT INTO test_table VALUES (2, 'cluster2_data')"
+
+ def result2 = sql "SELECT * FROM test_table"
+ logger.info("Cluster2 data: ${result2}")
+ assert result2.size() == 1
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]