This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new c4193bdb676 [feat](regression) Add dockers to manage multi docker clusters (#57236)
c4193bdb676 is described below

commit c4193bdb676d0777cb1480a300bafadfd8f0b6bc
Author: walter <[email protected]>
AuthorDate: Wed Oct 22 22:09:03 2025 +0800

    [feat](regression) Add dockers to manage multi docker clusters (#57236)
    
    and support sharing MS between clusters.
---
 docker/runtime/doris-compose/Readme.md             |  56 +++++++++
 docker/runtime/doris-compose/cluster.py            |  86 +++++++++++--
 docker/runtime/doris-compose/command.py            | 116 +++++++++++++++--
 docker/runtime/doris-compose/resource/common.sh    |  37 ++++++
 docker/runtime/doris-compose/resource/init_be.sh   |   6 +-
 .../runtime/doris-compose/resource/init_cloud.sh   |  35 +-----
 docker/runtime/doris-compose/resource/init_fe.sh   |  17 ++-
 docker/runtime/doris-compose/utils.py              |   6 +-
 .../org/apache/doris/regression/Config.groovy      |   1 +
 .../org/apache/doris/regression/suite/Suite.groovy | 138 ++++++++++++++++++++-
 .../doris/regression/suite/SuiteCluster.groovy     |  43 +++++++
 .../suites/demo_p0/test_external_ms_cluster.groovy |  83 +++++++++++++
 12 files changed, 565 insertions(+), 59 deletions(-)

diff --git a/docker/runtime/doris-compose/Readme.md b/docker/runtime/doris-compose/Readme.md
index c4b60460e49..d8fbf0f4ed6 100644
--- a/docker/runtime/doris-compose/Readme.md
+++ b/docker/runtime/doris-compose/Readme.md
@@ -207,6 +207,62 @@ steps:
 2. Generate regression-conf-custom.groovy: `python docker/runtime/doris-compose/doris-compose.py config my-cluster <doris-root-path> --connect-follow-fe`
 3. Run regression test: `bash run-regression-test.sh --run -times 1 -parallel 1 -suiteParallel 1 -d cloud/multi_cluster`
 
+### Multiple cloud clusters with a shared Meta Service
+
+Doris compose now supports creating multiple cloud clusters that share the same Meta Service (MS), FDB, and Recycler services. This is useful for testing cross-cluster operations (such as cloning or backup/restore) against a single Meta Service instance.
+
+#### Create the first cluster
+
+First, create a complete cloud cluster that will provide MS/FDB/Recycler services:
+
+```shell
+python docker/runtime/doris-compose/doris-compose.py up cluster1 <image> --cloud --add-fe-num 1 --add-be-num 3
+```
+
+This creates the first cluster with:
+- 1 FDB node
+- 1 Meta Service (MS) node
+- 1 Recycler node
+- 1 FE node
+- 3 BE nodes
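+
+You can confirm the nodes are up with the list command (a quick check, assuming the `ls` subcommand backed by `ListCommand` in command.py; the output shape may vary):
+
+```shell
+python docker/runtime/doris-compose/doris-compose.py ls cluster1
+```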
+
+#### Create additional clusters sharing the same MS
+
+Now you can create additional SQL/compute clusters that share the first cluster's Meta Service:
+
+```shell
+# Create second cluster sharing cluster1's MS
+python docker/runtime/doris-compose/doris-compose.py up cluster2 <image> --cloud --external-ms cluster1 --instance-id instance_cluster2 --add-fe-num 1 --add-be-num 3
+
+# Create third cluster sharing cluster1's MS
+python docker/runtime/doris-compose/doris-compose.py up cluster3 <image> --cloud --external-ms cluster1 --instance-id instance_cluster3 --add-fe-num 1 --add-be-num 3
+```
+
+Key points:
+- `--external-ms cluster1`: Specifies that this cluster will use cluster1's MS/FDB/Recycler services
+- `--instance-id`: Must be unique for each cluster. If not specified, it is auto-generated as `instance_<cluster-name>`
+- The new clusters will NOT create their own MS/FDB/Recycler nodes, which saves resources
+- All clusters share the same object storage and meta service infrastructure
+- Each cluster maintains its own FE/BE nodes for compute isolation
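+
+When tearing down, remove the dependent clusters before the cluster that owns the MS. A suggested order (a sketch, assuming the existing `down` subcommand):
+
+```shell
+# Remove the compute-only clusters first, then the MS-owning cluster
+python docker/runtime/doris-compose/doris-compose.py down cluster3
+python docker/runtime/doris-compose/doris-compose.py down cluster2
+python docker/runtime/doris-compose/doris-compose.py down cluster1
+```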
+
+#### Network architecture
+
+When using external MS:
+- Each cluster has its own Docker network
+- Compute clusters join the external MS cluster's network as well
+- DNS resolution is configured automatically for all MS/FDB/Recycler nodes
+- BE and FE nodes can communicate with MS nodes using their container names
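+
+You can verify the shared network from the Docker CLI (a quick check; the exact network name is derived from the cluster name, so look it up first):
+
+```shell
+# Find cluster1's network, then confirm cluster2's FE/BE containers joined it
+docker network ls | grep cluster1
+docker network inspect <cluster1-network> --format '{{range .Containers}}{{.Name}} {{end}}'
+```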
+
+#### Validation
+
+Doris compose automatically validates:
+1. External MS cluster exists
+2. External cluster is a cloud cluster
+3. MS and FDB nodes are present
+4. MS and FDB containers are running
+
+If validation fails, you'll get a clear error message explaining what needs to be fixed.
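+
+A dedicated, meta-service-only cluster also works as the shared backbone (a sketch; `shared-meta` is an example name):
+
+```shell
+# MS/FDB/Recycler only, no FE/BE compute nodes
+python docker/runtime/doris-compose/doris-compose.py up shared-meta <image> --cloud --add-fe-num 0 --add-be-num 0
+```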
+
 ## Problem investigation
 
 ### Log
diff --git a/docker/runtime/doris-compose/cluster.py b/docker/runtime/doris-compose/cluster.py
index b54a9a413d3..0aff78f04cc 100644
--- a/docker/runtime/doris-compose/cluster.py
+++ b/docker/runtime/doris-compose/cluster.py
@@ -440,6 +440,8 @@ class Node(object):
                 f"{port_name} = {port}"
                 for port_name, port in self.meta["ports"].items()
             ]
+        else:
+            cfg.append(f"priority_networks = {self.cluster.get_cidr()}")
         return cfg
 
     def docker_ports(self):
@@ -497,15 +499,47 @@ class Node(object):
             content["network_mode"] = "host"
         else:
             content["hostname"] = self.get_name()
-            content["networks"] = {
+
+            # Configure container networks: local cluster network + external MS network (if any)
+            networks = {
                 utils.with_doris_prefix(self.cluster.name): {
                     "ipv4_address": self.get_ip(),
                 }
             }
+
+            # If using external MS cluster, let the container join the external MS cluster network
+            if self.cluster.external_ms_cluster:
+                external_network_name = utils.get_network_name(self.cluster.external_ms_cluster)
+                networks[external_network_name] = {}
+                LOG.debug(f"Node {self.get_name()} joins external network: {external_network_name}")
+
+            content["networks"] = networks
+
             extra_hosts.extend([
                 "{}:{}".format(node.get_name(), node.get_ip())
                 for node in self.cluster.get_all_nodes()
             ])
+
+            # Add external MS cluster nodes to extra_hosts
+            if self.cluster.external_ms_cluster:
+                try:
+                    external_cluster = Cluster.load(self.cluster.external_ms_cluster)
+                    for ms_node in external_cluster.get_all_nodes(Node.TYPE_MS):
+                        extra_hosts.append(
+                            "{}:{}".format(ms_node.get_name(), ms_node.get_ip())
+                        )
+                    for fdb_node in external_cluster.get_all_nodes(Node.TYPE_FDB):
+                        extra_hosts.append(
+                            "{}:{}".format(fdb_node.get_name(), fdb_node.get_ip())
+                        )
+                    for recycle_node in external_cluster.get_all_nodes(Node.TYPE_RECYCLE):
+                        extra_hosts.append(
+                            "{}:{}".format(recycle_node.get_name(), recycle_node.get_ip())
+                        )
+                    LOG.debug(f"Added external MS cluster hosts for {self.get_name()}")
+                except Exception as e:
+                    LOG.warning(f"Failed to add external MS cluster hosts: {e}")
+
             content["ports"] = self.docker_ports()
         user_hosts = getattr(self.cluster, "extra_hosts", [])
         if user_hosts:
@@ -573,8 +607,14 @@ class FE(Node):
 
     def docker_env(self):
         envs = super().docker_env()
+        # Create instance when using external MS cluster, and pass cloud store config
+        if self.cluster.external_ms_cluster:
+            envs["AUTO_CREATE_INSTANCE"] = 1
+            for key, value in self.cluster.cloud_store_config.items():
+                envs[key] = value
         if self.cluster.is_cloud:
             envs["CLOUD_UNIQUE_ID"] = self.cloud_unique_id()
+            envs["INSTANCE_ID"] = self.cluster.instance_id
             if self.meta["is_cloud_follower"]:
                 envs["IS_FE_FOLLOWER"] = 1
         envs["MY_QUERY_PORT"] = self.meta["ports"]["query_port"]
@@ -592,7 +632,7 @@ class FE(Node):
         }
 
     def cloud_unique_id(self):
-        return "sql_server_{}".format(self.id)
+        return "{}_sql_server_{}".format(self.cluster.name, self.id)
 
     def start_script(self):
         return ["init_fe.sh"]
@@ -693,6 +733,7 @@ class BE(Node):
             "heartbeat_service_port"]
         if self.cluster.is_cloud:
             envs["CLOUD_UNIQUE_ID"] = self.cloud_unique_id()
+            envs["INSTANCE_ID"] = self.cluster.instance_id
             envs["REG_BE_TO_MS"] = 1 if self.cluster.reg_be else 0
             envs["CLUSTER_NAME"] = self.meta["cluster_name"]
         return envs
@@ -707,7 +748,7 @@ class BE(Node):
         }
 
     def cloud_unique_id(self):
-        return "compute_node_{}".format(self.id)
+        return "{}_compute_node_{}".format(self.cluster.name, self.id)
 
     def docker_home_dir(self):
         return os.path.join(DOCKER_DORIS_PATH, "be")
@@ -757,6 +798,7 @@ class MS(CLOUD):
 
     def docker_env(self):
         envs = super().docker_env()
+        envs["INSTANCE_ID"] = self.cluster.instance_id
         for key, value in self.cluster.cloud_store_config.items():
             envs[key] = value
         return envs
@@ -818,7 +860,8 @@ class Cluster(object):
                  be_config, ms_config, recycle_config, remote_master_fe,
                  local_network_ip, fe_follower, be_disks, be_cluster, reg_be,
                  extra_hosts, coverage_dir, cloud_store_config,
-                 sql_mode_node_mgr, be_metaservice_endpoint, be_cluster_id, tde_ak, tde_sk):
+                 sql_mode_node_mgr, be_metaservice_endpoint, be_cluster_id, tde_ak, tde_sk,
+                 external_ms_cluster, instance_id):
         self.name = name
         self.subnet = subnet
         self.image = image
@@ -837,6 +880,10 @@ class Cluster(object):
         self.extra_hosts = extra_hosts
         self.coverage_dir = coverage_dir
         self.cloud_store_config = cloud_store_config
+        self.external_ms_cluster = external_ms_cluster
+        self.instance_id = instance_id
+        if not self.instance_id:
+            self.instance_id = f"instance_{name}" if self.external_ms_cluster else "default_instance_id"
         self.groups = {
             node_type: Group(node_type)
             for node_type in Node.TYPE_ALL
@@ -855,7 +902,8 @@ class Cluster(object):
             ms_config, recycle_config, remote_master_fe, local_network_ip,
             fe_follower, be_disks, be_cluster, reg_be, extra_hosts,
             coverage_dir, cloud_store_config, sql_mode_node_mgr,
-            be_metaservice_endpoint, be_cluster_id, tde_ak, tde_sk):
+            be_metaservice_endpoint, be_cluster_id, tde_ak, tde_sk,
+            external_ms_cluster, instance_id):
         if not os.path.exists(LOCAL_DORIS_PATH):
             os.makedirs(LOCAL_DORIS_PATH, exist_ok=True)
             os.chmod(LOCAL_DORIS_PATH, 0o777)
@@ -870,7 +918,7 @@ class Cluster(object):
                               be_disks, be_cluster, reg_be, extra_hosts,
                               coverage_dir, cloud_store_config,
                               sql_mode_node_mgr, be_metaservice_endpoint,
-                              be_cluster_id, tde_ak, tde_sk)
+                              be_cluster_id, tde_ak, tde_sk, external_ms_cluster, instance_id)
             os.makedirs(cluster.get_path(), exist_ok=True)
             os.makedirs(get_status_path(name), exist_ok=True)
             cluster._save_meta()
@@ -996,20 +1044,32 @@ class Cluster(object):
         return node
 
     def get_fdb_cluster(self):
+        if self.external_ms_cluster:
+            external_cluster = Cluster.load(self.external_ms_cluster)
+            return external_cluster.get_fdb_cluster()
         fdb = self.get_node(Node.TYPE_FDB, 1)
         return "123456:123456@{}:{}".format(fdb.get_ip(),
                                             fdb.meta["ports"]["fdb_port"])
 
     def get_meta_server_addr(self):
+        if self.external_ms_cluster:
+            external_cluster = Cluster.load(self.external_ms_cluster)
+            return external_cluster.get_meta_server_addr()
         meta_server = self.get_node(Node.TYPE_MS, 1)
         return "{}:{}".format(meta_server.get_ip(),
                               meta_server.meta["ports"]["brpc_listen_port"])
 
     def get_recycle_addr(self):
+        if self.external_ms_cluster:
+            external_cluster = Cluster.load(self.external_ms_cluster)
+            return external_cluster.get_recycle_addr()
         recycler = self.get_node(Node.TYPE_RECYCLE, 1)
         return "{}:{}".format(recycler.get_ip(),
                               recycler.meta["ports"]["brpc_listen_port"])
 
+    def get_cidr(self):
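+        # self.subnet holds the first two octets, e.g. "172.20" -> "172.20.0.0/16"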
+        return "{}.0.0/16".format(self.subnet)
+
     def remove(self, node_type, id):
         group = self.get_group(node_type)
         group.remove(id)
@@ -1032,17 +1092,27 @@ class Cluster(object):
             "services": services,
         }
         if not self.is_host_network():
-            compose["networks"] = {
+            networks = {
                 utils.with_doris_prefix(self.name): {
                     "driver": "bridge",
                     "ipam": {
                         "config": [{
-                            "subnet": "{}.0.0/16".format(self.subnet),
+                            "subnet": self.get_cidr(),
                         }]
                     },
                 },
             }
 
+            # If using external MS cluster, declare the external network
+            if self.external_ms_cluster:
+                external_network_name = utils.get_network_name(self.external_ms_cluster)
+                networks[external_network_name] = {
+                    "external": True
+                }
+                LOG.debug(f"Added external network: {external_network_name}")
+
+            compose["networks"] = networks
+
         utils.write_compose_file(self.get_compose_file(), compose)
 
     def get_compose_file(self):
diff --git a/docker/runtime/doris-compose/command.py b/docker/runtime/doris-compose/command.py
index c4a4f9ceff8..2e0fee727ca 100644
--- a/docker/runtime/doris-compose/command.py
+++ b/docker/runtime/doris-compose/command.py
@@ -462,6 +462,25 @@ class UpCommand(Command):
             "Only use when creating new cluster and specify --remote-master-fe."
         )
 
+        parser.add_argument(
+            "--external-ms",
+            type=str,
+            help=
+            "Use external meta service cluster (specify cluster name). " \
+            "This cluster will not create its own MS/FDB/Recycler, but use the specified cluster's services. " \
+            "The external cluster must be a cloud cluster with MS/FDB already running. " \
+            "Example: --external-ms shared-meta. Only use when creating a new cloud cluster."
+        )
+
+        parser.add_argument(
+            "--instance-id",
+            type=str,
+            help=
+            "Specify instance ID for cloud mode. If not specified, defaults to 'default_instance_id', " \
+            "or 'instance_<cluster-name>' when --external-ms is set. " \
+            "When using external MS with multiple clusters, each cluster should have a unique instance ID. " \
+            "Example: --instance-id prod_instance_1"
+        )
+
         if self._support_boolean_action():
             parser.add_argument(
                 "--be-metaservice-endpoint",
@@ -583,17 +602,29 @@ class UpCommand(Command):
 
             cloud_store_config = {}
             if args.cloud:
-                add_fdb_num = 1
-                if not args.add_ms_num:
-                    args.add_ms_num = 1
-                if not args.add_recycle_num:
-                    args.add_recycle_num = 1
+                external_ms_cluster = getattr(args, 'external_ms', None)
+                if external_ms_cluster:
+                    # Using the MS nodes from external cluster, no need to add FDB/MS/Recycler
+                    self._validate_external_ms_cluster(external_ms_cluster)
+                    add_fdb_num = 0
+                    args.add_ms_num = 0
+                    args.add_recycle_num = 0
+                    LOG.info(f"Using external MS cluster: {external_ms_cluster}")
+                else:
+                    add_fdb_num = 1
+                    if not args.add_ms_num:
+                        args.add_ms_num = 1
+                    if not args.add_recycle_num:
+                        args.add_recycle_num = 1
+                    external_ms_cluster = None
+
                 if not args.be_cluster:
                     args.be_cluster = "compute_cluster"
                 cloud_store_config = self._get_cloud_store_config()
             else:
                 args.add_ms_num = 0
                 args.add_recycle_num = 0
+                external_ms_cluster = None
 
             if args.remote_master_fe:
                 if not args.local_network_ip:
@@ -609,13 +640,16 @@ class UpCommand(Command):
                 if args.cloud:
                     args.sql_mode_node_mgr = True
 
+            instance_id = getattr(args, 'instance_id', None)
+
             cluster = CLUSTER.Cluster.new(
                 args.NAME, args.IMAGE, args.cloud, args.root, args.fe_config,
                 args.be_config, args.ms_config, args.recycle_config,
                 args.remote_master_fe, args.local_network_ip, args.fe_follower,
                 args.be_disks, args.be_cluster, args.reg_be, args.extra_hosts,
                 args.coverage_dir, cloud_store_config, args.sql_mode_node_mgr,
-                args.be_metaservice_endpoint, args.be_cluster_id, args.tde_ak, args.tde_sk)
+                args.be_metaservice_endpoint, args.be_cluster_id, args.tde_ak, args.tde_sk,
+                external_ms_cluster, instance_id)
             LOG.info("Create new cluster {} succ, cluster path is {}".format(
                 args.NAME, cluster.get_path()))
 
@@ -825,6 +859,70 @@ class UpCommand(Command):
             },
         }
 
+    def _validate_external_ms_cluster(self, external_ms_cluster_name):
+        # 1. Does the external cluster exist?
+        try:
+            external_cluster = CLUSTER.Cluster.load(external_ms_cluster_name)
+        except Exception as e:
+            raise Exception(
+                f"External MS cluster '{external_ms_cluster_name}' not found. "
+                f"Please create it first with: "
+                f"python doris-compose.py up {external_ms_cluster_name} <image> --cloud --add-fe-num 0 --add-be-num 0"
+            ) from e
+
+        # 2. Is the external cluster a cloud cluster?
+        if not external_cluster.is_cloud:
+            raise Exception(
+                f"External MS cluster '{external_ms_cluster_name}' is not a cloud cluster. "
+                f"Only cloud clusters can be used as external MS."
+            )
+
+        # 3. Does the external cluster have MS and FDB nodes?
+        ms_group = external_cluster.get_group(CLUSTER.Node.TYPE_MS)
+        fdb_group = external_cluster.get_group(CLUSTER.Node.TYPE_FDB)
+
+        if ms_group.get_node_num() == 0:
+            raise Exception(
+                f"External MS cluster '{external_ms_cluster_name}' has no MS nodes. "
+                f"Please add MS nodes first."
+            )
+
+        if fdb_group.get_node_num() == 0:
+            raise Exception(
+                f"External MS cluster '{external_ms_cluster_name}' has no FDB nodes. "
+                f"Please add FDB nodes first."
+            )
+
+        # 4. Are the MS and FDB containers running?
+        containers = utils.get_doris_running_containers(external_ms_cluster_name)
+
+        ms_running = False
+        fdb_running = False
+        for container_name in containers.keys():
+            _, node_type, _ = utils.parse_service_name(container_name)
+            if node_type == CLUSTER.Node.TYPE_MS:
+                ms_running = True
+            elif node_type == CLUSTER.Node.TYPE_FDB:
+                fdb_running = True
+
+        if not ms_running:
+            raise Exception(
+                f"External MS cluster '{external_ms_cluster_name}' MS node is not running. "
+                f"Please start it with: python doris-compose.py start {external_ms_cluster_name}"
+            )
+
+        if not fdb_running:
+            raise Exception(
+                f"External MS cluster '{external_ms_cluster_name}' FDB node is not running. "
+                f"Please start it with: python doris-compose.py start {external_ms_cluster_name}"
+            )
+
+        LOG.info(utils.render_green(
+            f"✓ External MS cluster '{external_ms_cluster_name}' validation passed: "
+            f"MS={external_cluster.get_meta_server_addr()}, "
+            f"FDB={external_cluster.get_fdb_cluster()}"
+        ))
+
     def _get_cloud_store_config(self):
         example_cfg_file = os.path.join(CLUSTER.LOCAL_RESOURCE_PATH,
                                         "cloud.ini.example")
@@ -1394,9 +1492,9 @@ class ListCommand(Command):
                     if cluster and cluster.is_host_network():
                         node.ip = cluster.local_network_ip
                     else:
-                        node.ip = list(
-                            container.attrs["NetworkSettings"]["Networks"].
-                            values())[0]["IPAMConfig"]["IPv4Address"]
+                        network_name = utils.get_network_name(cluster.name)
+                        node.ip = container.attrs["NetworkSettings"]["Networks"][network_name] \
+                                ["IPAMConfig"]["IPv4Address"]
                     node.image = container.attrs["Config"]["Image"]
                     if not node.image:
                         node.image = ",".join(container.image.tags)
diff --git a/docker/runtime/doris-compose/resource/common.sh b/docker/runtime/doris-compose/resource/common.sh
index 2c53ca587a5..63073fbab15 100644
--- a/docker/runtime/doris-compose/resource/common.sh
+++ b/docker/runtime/doris-compose/resource/common.sh
@@ -147,3 +147,40 @@ wait_pid() {
 
     health_log "wait end"
 }
+
+create_doris_instance() {
+    while true; do
+
+        lock_cluster
+
+        output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/create_instance?token=greedisgood9999" \
+            -d '{"instance_id":"'"${INSTANCE_ID}"'",
+                    "name": "'"${INSTANCE_ID}"'",
+                    "user_id": "'"${DORIS_CLOUD_USER}"'",
+                    "obj_info": {
+                    "ak": "'"${DORIS_CLOUD_AK}"'",
+                    "sk": "'"${DORIS_CLOUD_SK}"'",
+                    "bucket": "'"${DORIS_CLOUD_BUCKET}"'",
+                    "endpoint": "'"${DORIS_CLOUD_ENDPOINT}"'",
+                    "external_endpoint": "'"${DORIS_CLOUD_EXTERNAL_ENDPOINT}"'",
+                    "prefix": "'"${DORIS_CLOUD_PREFIX}"'",
+                    "region": "'"${DORIS_CLOUD_REGION}"'",
+                    "provider": "'"${DORIS_CLOUD_PROVIDER}"'"
+                }}')
+
+        unlock_cluster
+
+        health_log "create instance output: $output"
+        code=$(jq -r '.code' <<<$output)
+
+        if [ "$code" != "OK" ]; then
+            health_log "create instance failed"
+            sleep 1
+            continue
+        fi
+
+        health_log "create doris instance succ, output: $output"
+        touch $HAS_CREATE_INSTANCE_FILE
+        break
+    done
+}
diff --git a/docker/runtime/doris-compose/resource/init_be.sh b/docker/runtime/doris-compose/resource/init_be.sh
index e4ac48bda76..c8716d02da6 100755
--- a/docker/runtime/doris-compose/resource/init_be.sh
+++ b/docker/runtime/doris-compose/resource/init_be.sh
@@ -73,7 +73,7 @@ add_cloud_be() {
     lock_cluster
 
     output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/add_cluster?token=greedisgood9999" \
-        -d '{"instance_id": "default_instance_id",
+        -d '{"instance_id": "'"${INSTANCE_ID}"'",
         "cluster": {
         "type": "COMPUTE",
         "cluster_name": "'"${cluster_name}"'",
@@ -87,7 +87,7 @@ add_cloud_be() {
     # cluster has exists
     if [ "$code" == "ALREADY_EXISTED" ]; then
         output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/add_node?token=greedisgood9999" \
-            -d '{"instance_id": "default_instance_id",
+            -d '{"instance_id": "'"${INSTANCE_ID}"'",
             "cluster": {
             "type": "COMPUTE",
             "cluster_name": "'"${cluster_name}"'",
@@ -107,7 +107,7 @@ add_cloud_be() {
     fi
 
     output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/get_cluster?token=greedisgood9999" \
-        -d '{"instance_id": "default_instance_id",
+        -d '{"instance_id": "'"${INSTANCE_ID}"'",
             "cloud_unique_id": "'"${CLOUD_UNIQUE_ID}"'",
             "cluster_name": "'"${cluster_name}"'",
             "cluster_id": "'"${cluster_id}"'"
diff --git a/docker/runtime/doris-compose/resource/init_cloud.sh b/docker/runtime/doris-compose/resource/init_cloud.sh
index 22883ab2e40..dc0a1745060 100644
--- a/docker/runtime/doris-compose/resource/init_cloud.sh
+++ b/docker/runtime/doris-compose/resource/init_cloud.sh
@@ -53,40 +53,7 @@ check_init_cloud() {
         return
     fi
 
-    while true; do
-
-        lock_cluster
-
-        output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/create_instance?token=greedisgood9999" \
-            -d '{"instance_id":"default_instance_id",
-                    "name": "default_instance",
-                    "user_id": "'"${DORIS_CLOUD_USER}"'",
-                    "obj_info": {
-                    "ak": "'"${DORIS_CLOUD_AK}"'",
-                    "sk": "'"${DORIS_CLOUD_SK}"'",
-                    "bucket": "'"${DORIS_CLOUD_BUCKET}"'",
-                    "endpoint": "'"${DORIS_CLOUD_ENDPOINT}"'",
-                    "external_endpoint": "'"${DORIS_CLOUD_EXTERNAL_ENDPOINT}"'",
-                    "prefix": "'"${DORIS_CLOUD_PREFIX}"'",
-                    "region": "'"${DORIS_CLOUD_REGION}"'",
-                    "provider": "'"${DORIS_CLOUD_PROVIDER}"'"
-                }}')
-
-        unlock_cluster
-
-        health_log "create instance output: $output"
-        code=$(jq -r '.code' <<<$output)
-
-        if [ "$code" != "OK" ]; then
-            health_log "create instance failed"
-            sleep 1
-            continue
-        fi
-
-        health_log "create doris instance succ, output: $output"
-        touch $HAS_CREATE_INSTANCE_FILE
-        break
-    done
+    create_doris_instance
 }
 
 stop_cloud() {
diff --git a/docker/runtime/doris-compose/resource/init_fe.sh b/docker/runtime/doris-compose/resource/init_fe.sh
index 4e846ed182f..68c70f92821 100755
--- a/docker/runtime/doris-compose/resource/init_fe.sh
+++ b/docker/runtime/doris-compose/resource/init_fe.sh
@@ -115,7 +115,18 @@ start_cloud_fe() {
         return
     fi
 
-    wait_create_instance
+    # Support creating the instance during FE startup.
+    AUTO_CREATE_INSTANCE=${AUTO_CREATE_INSTANCE:-"0"}
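+    # Only the first FE (MY_ID == 1) creates the instance; other FEs wait for it below.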
+    if [ "a$MY_ID" == "a1" ] && [ "a$AUTO_CREATE_INSTANCE" == "a1" ]; then
+        health_log "auto create instance is enabled, trying to create instance"
+        if [ -f $HAS_CREATE_INSTANCE_FILE ]; then
+            health_log "instance has been created before, skip create instance"
+        else
+            create_doris_instance
+        fi
+    else
+        wait_create_instance
+    fi
 
     action=add_cluster
     node_type=FE_MASTER
@@ -139,7 +150,7 @@ start_cloud_fe() {
     lock_cluster
 
     output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/${action}?token=greedisgood9999" \
-        -d '{"instance_id": "default_instance_id",
+        -d '{"instance_id": "'"${INSTANCE_ID}"'",
         "cluster": {
             "type": "SQL",
             "cluster_name": "RESERVED_CLUSTER_NAME_FOR_SQL_SERVER",
@@ -158,7 +169,7 @@ start_cloud_fe() {
     fi
 
     output=$(curl -s "${META_SERVICE_ENDPOINT}/MetaService/http/get_cluster?token=greedisgood9999" \
-        -d '{"instance_id": "default_instance_id",
+        -d '{"instance_id": "'"${INSTANCE_ID}"'",
             "cloud_unique_id": "'"${CLOUD_UNIQUE_ID}"'",
             "cluster_name": "RESERVED_CLUSTER_NAME_FOR_SQL_SERVER",
             "cluster_id": "RESERVED_CLUSTER_ID_FOR_SQL_SERVER"}')
diff --git a/docker/runtime/doris-compose/utils.py b/docker/runtime/doris-compose/utils.py
index 4ff87ef2229..917c83426e4 100644
--- a/docker/runtime/doris-compose/utils.py
+++ b/docker/runtime/doris-compose/utils.py
@@ -177,10 +177,14 @@ def get_doris_running_containers(cluster_name):
     }
 
 
+def get_network_name(cluster_name):
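+    # docker compose names networks "<project>_<network>"; here the project name is the cluster name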
+    return cluster_name + "_" + with_doris_prefix(cluster_name)
+
+
 def remove_docker_network(cluster_name):
     client = docker.client.from_env()
     for network in client.networks.list(
-            names=[cluster_name + "_" + with_doris_prefix(cluster_name)]):
+            names=[get_network_name(cluster_name)]):
         network.remove()
 
 
diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
index 3cc4ba72b71..1b98e808987 100644
--- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
+++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/Config.groovy
@@ -660,6 +660,7 @@ class Config {
         config.ccrDownstreamUser = configToString(obj.ccrDownstreamUser)
         config.ccrDownstreamPassword = configToString(obj.ccrDownstreamPassword)
         config.image = configToString(obj.image)
+        config.dorisComposePath = configToString(obj.dorisComposePath)
         config.dockerCoverageOutputDir = configToString(obj.dockerCoverageOutputDir)
         config.dockerEndDeleteFiles = configToBoolean(obj.dockerEndDeleteFiles)
         config.dockerEndNoKill = configToBoolean(obj.dockerEndNoKill)
diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
index d94e25e5caf..7565028db19 100644
--- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
+++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/Suite.groovy
@@ -278,6 +278,16 @@ class Suite implements GroovyInterceptable {
         return context.connect(user, password, url, actionSupplier)
     }
 
+    public <T> T connectWithDockerCluster(
+            SuiteCluster cluster,
+            Boolean connectToFollower = false,
+            String user = context.config.jdbcUser,
+            String password = context.config.jdbcPassword,
+            Closure<T> actionSupplier) {
+        def jdbcUrl = cluster.getJdbcUrl(connectToFollower)
+        return context.connect(user, password, jdbcUrl, actionSupplier)
+    }
+
     public <T> T connectInDocker(String user = context.config.jdbcUser, String password = context.config.jdbcPassword,
                         Closure<T> actionSupplier) {
         def connInfo = context.threadLocalConn.get()
@@ -336,7 +346,6 @@ class Suite implements GroovyInterceptable {
         }
     }
 
-
     private void dockerImpl(ClusterOptions options, boolean isCloud, Closure actionSupplier) throws Exception {
         logger.info("=== start run suite {} in {} mode. ===", name, (isCloud ? "cloud" : "not_cloud"))
         def originConnection = context.threadLocalConn.get()
@@ -384,6 +393,133 @@ class Suite implements GroovyInterceptable {
         }
     }
 
+    /**
+     * Create and manage multiple Docker clusters for multi-cluster test scenarios.
+     *
+     * Usage example:
+     *   dockers([
+     *       "cluster_1": new ClusterOptions(cloudMode: true, feNum: 1, beNum: 1, msNum: 1),
+     *       "cluster_2": new ClusterOptions(cloudMode: true, feNum: 1, beNum: 1, msNum: 0, externalMsCluster: "cluster_1")
+     *   ]) { clusters ->
+     *       connectWithDockerCluster(clusters.cluster_1) { sql "..." }
+     *       connectWithDockerCluster(clusters.cluster_2) { sql "..." }
+     *   }
+     *
+     * Important:
+     *   - Must use LinkedHashMap to preserve insertion order
+     *   - Clusters are created in map insertion order
+     *   - Clusters are destroyed in reverse order (dependent clusters first)
+     *   - If using externalMsCluster, the referenced cluster must appear earlier in the map
+     *
+     * @param clusterConfigs LinkedHashMap of cluster name to ClusterOptions
+     * @param actionSupplier Closure receiving Map<String, SuiteCluster> for test execution
+     */
+    void dockers(LinkedHashMap<String, ClusterOptions> clusterConfigs, Closure actionSupplier) throws Exception {
+        if (context.config.excludeDockerTest) {
+            logger.info("do not run the docker suite {}, because regression config excludeDockerTest=true", name)
+            return
+        }
+
+        if (RegressionTest.getGroupExecType(group) != RegressionTest.GroupExecType.DOCKER) {
+            throw new Exception("Need to add 'docker' to the docker suite's groups, "
+                    + "see example demo_p0/docker_action.groovy")
+        }
+
+        // Validate cluster configs
+        Set<String> clusterNames = new HashSet<>()
+        for (def entry : clusterConfigs.entrySet()) {
+            String clusterName = entry.key
+            ClusterOptions options = entry.value
+
+            if (clusterNames.contains(clusterName)) {
+                throw new Exception("Duplicate cluster name: ${clusterName}")
+            }
+            clusterNames.add(clusterName)
+
+            // Validate externalMsCluster reference
+            if (options.externalMsCluster != null && !options.externalMsCluster.isEmpty()) {
+                if (!clusterNames.contains(options.externalMsCluster)) {
+                    throw new Exception("Cluster ${clusterName} references non-existent external MS cluster: ${options.externalMsCluster}")
+                }
+                if (options.msNum > 0) {
+                    throw new Exception("Cluster ${clusterName} cannot have its own MS when using external MS cluster")
+                }
+            }
+        }
+
+        List<String> clusterNamesReversed = new ArrayList<>(clusterConfigs.keySet())
+        Collections.reverse(clusterNamesReversed)
+
+        // Use LinkedHashMap to preserve order
+        Map<String, SuiteCluster> clusters = new LinkedHashMap<>()
+
+        try {
+            // Create and initialize clusters in order
+            for (def entry : clusterConfigs.entrySet()) {
+                String clusterName = entry.key
+                ClusterOptions options = entry.value
+
+                logger.info("Creating cluster: ${clusterName}")
+                SuiteCluster cluster = new SuiteCluster(clusterName, context.config)
+
+                clusters.put(clusterName, cluster)
+            }
+
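+            // Tear down any leftover clusters with the same names from previous runs before initializing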
+            for (String clusterName : clusterNamesReversed) {
+                clusters.get(clusterName).destroy(true)
+            }
+
+            for (def entry : clusterConfigs.entrySet()) {
+                String clusterName = entry.key
+                ClusterOptions options = entry.value
+                SuiteCluster cluster = clusters.get(clusterName)
+
+                // Determine cloud mode
+                boolean isCloud = false
+                if (options.cloudMode == null) {
+                    // If not specified, derive the mode from the regression config's run mode
+                    if (context.config.runMode == RunMode.CLOUD) {
+                        isCloud = true
+                    } else if (context.config.runMode == RunMode.NOT_CLOUD) {
+                        isCloud = false
+                    } else {
+                        throw new Exception("cloudMode must be specified when runMode is UNKNOWN for multi-cluster setup")
+                    }
+                } else {
+                    if (options.cloudMode == true && context.config.runMode == RunMode.NOT_CLOUD) {
+                        logger.info("Skip cluster ${clusterName} because cloudMode=true but regression test is in local mode")
+                        continue
+                    }
+                    if (options.cloudMode == false && context.config.runMode == RunMode.CLOUD) {
+                        logger.info("Skip cluster ${clusterName} because cloudMode=false but regression test is in cloud mode")
+                        continue
+                    }
+                    isCloud = options.cloudMode
+                }
+                logger.info("Initializing cluster ${cluster.name} in ${isCloud ? 'cloud' : 'not_cloud'} mode")
+                cluster.init(options, isCloud)
+                logger.info("Cluster ${clusterName} initialized successfully")
+            }
+
+            // Wait a moment for the BEs to register and report to the FE
+            Thread.sleep(5000)
+
+            actionSupplier.call(clusters)
+        } finally {
+            // Destroy clusters in reverse order
+            if (!context.config.dockerEndNoKill) {
+                for (String clusterName : clusterNamesReversed) {
+                    try {
+                        logger.info("Destroying cluster: ${clusterName}")
+                        clusters.get(clusterName).destroy(context.config.dockerEndDeleteFiles)
+                    } catch (Throwable t) {
+                        logger.warn("Failed to destroy cluster ${clusterName}", t)
+                    }
+                }
+            }
+        }
+    }
+
     String get_ccr_body(String table, String db = null) {
         if (db == null) {
             db = context.dbName
diff --git a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
index cb9c34b064a..536889fc0e2 100644
--- a/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
+++ b/regression-test/framework/src/main/groovy/org/apache/doris/regression/suite/SuiteCluster.groovy
@@ -94,6 +94,16 @@ class ClusterOptions {
     String tdeAk = "";
     String tdeSk = "";
 
+    // Use external meta service cluster (shared MS/FDB)
+    // Specify the cluster name that provides MS/FDB services
+    // When set, this cluster will not create its own MS/FDB/Recycler
+    // Example: externalMsCluster = "shared-meta" (Cloud mode only)
+    String externalMsCluster = null
+
+    // Specify the instance id.
+    // When not set, "default_instance_id" will be used. (Cloud mode only)
+    String instanceId = null;
+
     void enableDebugPoints() {
         feConfigs.add('enable_debug_points=true')
         beConfigs.add('enable_debug_points=true')
@@ -380,6 +390,14 @@ class SuiteCluster {
             cmd += options.tdeSk
         }
 
+        if (options.externalMsCluster != null && options.externalMsCluster != "") {
+            cmd += ['--external-ms', options.externalMsCluster]
+        }
+
+        if (options.instanceId != null && options.instanceId != "") {
+            cmd += ['--instance-id', options.instanceId]
+        }
+
         cmd += ['--wait-timeout', String.valueOf(options.waitTimeout)]
 
         sqlModeNodeMgr = options.sqlModeNodeMgr
@@ -392,6 +410,31 @@ class SuiteCluster {
         running = true
     }
 
+    String getJdbcUrl(boolean connectToFollower) {
+        def user = config.jdbcUser
+        def password = config.jdbcPassword
+        Frontend fe = null
+        for (def i=0; (fe == null || !fe.alive) && i<30; i++) {
+            if (connectToFollower) {
+                fe = getOneFollowerFe()
+            } else {
+                fe = getMasterFe()
+            }
+            Thread.sleep(1000)
+        }
+
+        if (fe == null) {
+            throw new Exception('No available frontend found in cluster: ' + name)
+        }
+
+        logger.info("get fe host {}, queryPort {}", fe.host, fe.queryPort)
+
+        def jdbcUrl = String.format(
+                "jdbc:mysql://%s:%s/?useLocalSessionState=true&allowLoadLocalInfile=false",
+                fe.host, fe.queryPort)
+        return jdbcUrl
+    }
+
     void injectDebugPoints(NodeType type, Map<String, Map<String, String>> 
injectPoints) {
         if (injectPoints == null || injectPoints.isEmpty()) {
             return
diff --git a/regression-test/suites/demo_p0/test_external_ms_cluster.groovy b/regression-test/suites/demo_p0/test_external_ms_cluster.groovy
new file mode 100644
index 00000000000..17d53ea5977
--- /dev/null
+++ b/regression-test/suites/demo_p0/test_external_ms_cluster.groovy
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.suite.SuiteCluster
+
+suite("test_external_ms_cluster", "docker") {
+
+    // This test demonstrates how to use a shared MS/FDB to create multiple isolated Doris clusters.
+    // Scenario:
+    //  Create two clusters where the second one shares the first one's MS/FDB
+    //  and creates no MS/FDB/Recycler nodes of its own.
+
+    // ATTN: This test only runs in cloud mode.
+    if (!isCloudMode()) {
+        logger.info("Skip test_external_ms_cluster because not in cloud mode")
+        return
+    }
+
+    def opt1 = new ClusterOptions(
+        cloudMode: true, feNum: 1, beNum: 1, msNum: 1)
+
+    def opt2 = new ClusterOptions(
+        cloudMode: true, feNum: 1, beNum: 1, msNum: 0, externalMsCluster: "cluster_2")
+
+    // cluster_1 depends on cluster_2's MS/FDB
+    dockers([
+        "cluster_2": opt1,
+        "cluster_1": opt2
+    ]) { clusters ->
+        connectWithDockerCluster(clusters.cluster_2) {
+            // Create database and table on cluster_2
+            sql "CREATE DATABASE IF NOT EXISTS test_db"
+            sql "USE test_db"
+            sql """
+                CREATE TABLE IF NOT EXISTS test_table (
+                    id INT,
+                    name VARCHAR(100)
+                ) DUPLICATE KEY(id)
+                DISTRIBUTED BY HASH(id) BUCKETS 3
+                PROPERTIES ("replication_num" = "1")
+            """
+            sql "INSERT INTO test_table VALUES (1, 'cluster_2_data')"
+
+            def result1 = sql "SELECT * FROM test_table"
+            logger.info("cluster_2 data: ${result1}")
+            assert result1.size() == 1
+        }
+
+        connectWithDockerCluster(clusters.cluster_1) {
+            // Create a table with a different schema on cluster_1; the shared MS keeps the clusters isolated
+            sql "CREATE DATABASE IF NOT EXISTS test_db"
+            sql "USE test_db"
+            sql """
+                CREATE TABLE IF NOT EXISTS test_table (
+                    id INT,
+                    value VARCHAR(100)
+                ) DUPLICATE KEY(id)
+                DISTRIBUTED BY HASH(id) BUCKETS 3
+                PROPERTIES ("replication_num" = "1")
+            """
+            sql "INSERT INTO test_table VALUES (2, 'cluster_1_data')"
+
+            def result2 = sql "SELECT * FROM test_table"
+            logger.info("cluster_1 data: ${result2}")
+            assert result2.size() == 1
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

