This is an automated email from the ASF dual-hosted git repository.

CRZbulabula pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git


The following commit(s) were added to refs/heads/master by this push:
     new c5240154a70 Clarify RemoveDataNode single-replica error and add diagnostics for the no-available-RegionGroup race (#17878)
c5240154a70 is described below

commit c5240154a7044d9328de7c908f3c5b55972e3284
Author: Yongzao <[email protected]>
AuthorDate: Wed Jun 10 19:19:22 2026 +0800

    Clarify RemoveDataNode single-replica error and add diagnostics for the no-available-RegionGroup race (#17878)
---
 .../removedatanode/IoTDBRemoveLastDataNodeIT.java  | 111 +++++++++++++++++++++
 .../iotdb/confignode/i18n/ProcedureMessages.java   |   5 +
 .../iotdb/confignode/i18n/ProcedureMessages.java   |   5 +
 .../manager/partition/PartitionManager.java        |  18 ++++
 .../procedure/env/ConfigNodeProcedureEnv.java      |   7 +-
 .../procedure/env/RemoveDataNodeHandler.java       |  52 ++++++----
 .../impl/region/CreateRegionGroupsProcedure.java   |  10 +-
 7 files changed, 188 insertions(+), 20 deletions(-)

diff --git a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java
new file mode 100644
index 00000000000..ca5a2929fd6
--- /dev/null
+++ b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.confignode.it.removedatanode;
+
+import org.apache.iotdb.commons.schema.column.ColumnHeaderConstant;
+import org.apache.iotdb.consensus.ConsensusFactory;
+import org.apache.iotdb.it.env.EnvFactory;
+import org.apache.iotdb.it.framework.IoTDBTestRunner;
+import org.apache.iotdb.itbase.category.LocalStandaloneIT;
+import org.apache.iotdb.jdbc.IoTDBSQLException;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.util.HashSet;
+import java.util.Set;
+
+import static 
org.apache.iotdb.confignode.it.removedatanode.IoTDBRemoveDataNodeUtils.generateRemoveString;
+import static 
org.apache.iotdb.confignode.it.removedatanode.IoTDBRemoveDataNodeUtils.selectRemoveDataNodes;
+import static org.apache.iotdb.util.MagicUtils.makeItCloseQuietly;
+
+/**
+ * Removing the last DataNode of a single-replica cluster must be rejected. This only needs a 1C1D
+ * cluster, so it lives in the 1C1D (LocalStandaloneIT) suite, separate from the multi-DataNode
+ * removal tests in {@link IoTDBRemoveDataNodeNormalIT}.
+ */
+@Category({LocalStandaloneIT.class})
+@RunWith(IoTDBTestRunner.class)
+public class IoTDBRemoveLastDataNodeIT {
+
+  private static final String SHOW_DATANODES = "show datanodes";
+
+  @Before
+  public void setUp() throws Exception {
+    EnvFactory.getEnv()
+        .getConfig()
+        .getCommonConfig()
+        .setConfigNodeConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS)
+        
.setSchemaRegionConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS);
+  }
+
+  @After
+  public void tearDown() throws InterruptedException {
+    EnvFactory.getEnv().cleanClusterEnvironment();
+  }
+
+  @Test
+  public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws 
Exception {
+    // With a single replica (schema_replication_factor and data_replication_factor are both 1),
+    // removing DataNodes is still supported as long as more than one DataNode remains, but the last
+    // remaining DataNode cannot be removed because there is nowhere to migrate its regions to.
+    // Here we set up 1C1D with single replica and try to remove the only DataNode, which must fail
+    // because removing it would leave the cluster with no DataNode.
+    EnvFactory.getEnv()
+        .getConfig()
+        .getCommonConfig()
+        .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS)
+        .setSchemaReplicationFactor(1)
+        .setDataReplicationFactor(1)
+        .setDefaultDataRegionGroupNumPerDatabase(1);
+    EnvFactory.getEnv().initClusterEnvironment(1, 1);
+
+    try (final Connection connection = 
makeItCloseQuietly(EnvFactory.getEnv().getConnection());
+        final Statement statement = 
makeItCloseQuietly(connection.createStatement());
+        final ResultSet resultSet = statement.executeQuery(SHOW_DATANODES)) {
+      final Set<Integer> allDataNodeId = new HashSet<>();
+      while (resultSet.next()) {
+        allDataNodeId.add(resultSet.getInt(ColumnHeaderConstant.NODE_ID));
+      }
+
+      final String removeDataNodeSQL =
+          generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1));
+      try {
+        statement.execute(removeDataNodeSQL);
+        Assert.fail(
+            "Remove DataNode should fail when it would leave no DataNode under 
single replica");
+      } catch (final IoTDBSQLException e) {
+        // The unified rejection message reports the gap and, for a single replica, appends the
+        // "at least one DataNode must always remain" hint.
+        Assert.assertTrue(e.getMessage(), e.getMessage().contains("Cannot 
remove"));
+        Assert.assertTrue(e.getMessage(), e.getMessage().contains("single 
replica"));
+        Assert.assertFalse(
+            e.getMessage(), e.getMessage().contains("Failed to remove all 
requested data nodes"));
+      }
+    }
+  }
+}
diff --git 
a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
 
b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
index 085a803777e..a43e5151d40 100644
--- 
a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
+++ 
b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
@@ -445,6 +445,11 @@ public final class ProcedureMessages {
       "Failed to push topic meta to dataNodes, details: %s";
   public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN =
       "Failed to remove data node {} because it is not in running and the 
configuration of cluster is one replication";
+
+  public static final String FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW =
+      "Cannot remove %d DataNode(s): the cluster has %d available DataNode(s) 
and must retain at least %d of them (max(schema_replication_factor=%d, 
data_replication_factor=%d)) so that every region keeps enough replicas, but 
this request would leave only %d.";
+  public static final String FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT =
+      " With a single replica there is nowhere to migrate regions to, so at 
least one DataNode must always remain.";
   public static final String 
FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED =
       "Failed to rollback alter pipe {}, details: {}, metadata will be 
synchronized later.";
   public static final String 
FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO =
diff --git 
a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
 
b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
index d928f4f6bae..dbad526cce7 100644
--- 
a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
+++ 
b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
@@ -445,6 +445,11 @@ public final class ProcedureMessages {
       "Failed to push topic meta to dataNodes, details: %s";
   public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN =
       "Failed to remove data node {} because it is not in running and the 
configuration of cluster is one replication";
+
+  public static final String FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW =
+      "无法移除 %d 个 DataNode:集群当前有 %d 个可用 DataNode,且至少需保留 %d 
个(max(schema_replication_factor=%d, data_replication_factor=%d)),以保证每个 Region 
仍有足够的副本;但本次请求执行后将只剩 %d 个。";
+  public static final String FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT =
+      " 单副本下没有其它节点可供迁移 Region,因此必须始终保留至少一个 DataNode。";
   public static final String 
FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED =
       "Failed to rollback alter pipe {}, details: {}, metadata will be 
synchronized later.";
   public static final String 
FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO =
diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
index 5be81256b5c..dfa3448bc0f 100644
--- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
+++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
@@ -108,6 +108,7 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.LinkedHashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -993,6 +994,23 @@ public class PartitionManager {
     }
 
     if (result.isEmpty()) {
+      // Diagnostic for the intermittent "no available RegionGroup" CI failures: dump every
+      // RegionGroup visible in PartitionInfo for this Database together with its LoadCache status.
+      // This pinpoints whether PartitionInfo simply has no RegionGroup yet (newly created
+      // RegionGroup not exposed) or it has some but all of them are currently Disabled.
+      // Only logged on the failure path right before throwing, so it never floods the log.
+      final Map<TConsensusGroupId, RegionGroupStatus> 
visibleRegionGroupStatusMap =
+          new LinkedHashMap<>();
+      regionGroupSlotsCounter.forEach(
+          slotsCounter ->
+              visibleRegionGroupStatusMap.put(
+                  slotsCounter.getRight(),
+                  
getLoadManager().getRegionGroupStatus(slotsCounter.getRight())));
+      LOGGER.warn(
+          "No available {} RegionGroup for Database: {}. RegionGroups visible 
in PartitionInfo and their LoadCache status: {}",
+          type,
+          database,
+          visibleRegionGroupStatusMap);
       throw new NoAvailableRegionGroupException(type, 
Collections.singletonList(database));
     }
 
diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
index d271d5ef33b..4105b8ecf5c 100644
--- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
+++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
@@ -511,12 +511,15 @@ public class ConfigNodeProcedureEnv {
     return clientHandler.getResponseList();
   }
 
-  public void persistRegionGroup(CreateRegionGroupsPlan 
createRegionGroupsPlan) {
+  public TSStatus persistRegionGroup(CreateRegionGroupsPlan 
createRegionGroupsPlan) {
     // Persist the allocation result
     try {
-      getConsensusManager().write(createRegionGroupsPlan);
+      return getConsensusManager().write(createRegionGroupsPlan);
     } catch (ConsensusException e) {
       LOG.warn("Failed in the write API executing the consensus layer due to: 
", e);
+      return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode())
+          .setMessage(
+              "Failed to persist RegionGroup allocation in the consensus 
layer: " + e.getMessage());
     }
   }
 
diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
index 5b505ec001b..6782c1b652a 100644
--- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
+++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
@@ -567,19 +567,25 @@ public class RemoveDataNodeHandler {
     // when the configuration is one replication, it will be failed if the 
data node is not in
     // running state.
     if (CONF.getSchemaReplicationFactor() == 1 || 
CONF.getDataReplicationFactor() == 1) {
-      for (TDataNodeLocation dataNodeLocation : removedDataNodes) {
-        // check whether removed data node is in running state
-        if (!NodeStatus.Running.equals(
-            
configManager.getLoadManager().getNodeStatus(dataNodeLocation.getDataNodeId())))
 {
-          removedDataNodes.remove(dataNodeLocation);
-          LOGGER.error(
-              
ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN, 
dataNodeLocation);
-        }
-        if (removedDataNodes.isEmpty()) {
-          status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode());
-          
status.setMessage(ProcedureMessages.FAILED_TO_REMOVE_ALL_REQUESTED_DATA_NODES);
-          return status;
-        }
+      final List<TDataNodeLocation> notRunningDataNodes =
+          removedDataNodes.stream()
+              .filter(
+                  dataNodeLocation ->
+                      !NodeStatus.Running.equals(
+                          configManager
+                              .getLoadManager()
+                              
.getNodeStatus(dataNodeLocation.getDataNodeId())))
+              .collect(Collectors.toList());
+      notRunningDataNodes.forEach(
+          dataNodeLocation ->
+              LOGGER.error(
+                  
ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN,
+                  dataNodeLocation));
+      removedDataNodes.removeAll(notRunningDataNodes);
+      if (removedDataNodes.isEmpty()) {
+        status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode());
+        
status.setMessage(ProcedureMessages.FAILED_TO_REMOVE_ALL_REQUESTED_DATA_NODES);
+        return status;
       }
     }
 
@@ -593,13 +599,25 @@ public class RemoveDataNodeHandler {
                 .count();
     if (availableDatanodeSize - removedDataNodeSize < 
NodeInfo.getMinimumDataNode()) {
       status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode());
-      status.setMessage(
+      // Report the concrete numbers so operators can see the gap: how many DataNodes are being
+      // removed, how many are available, the minimum that must remain (the larger of the schema and
+      // data replication factors) and how many would be left.
+      String message =
           String.format(
-              "Can't remove datanode due to the limit of replication factor, "
-                  + "availableDataNodeSize: %s, maxReplicaFactor: %s, max 
allowed removed Data Node size is: %s",
+              ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW,
+              removedDataNodeSize,
               availableDatanodeSize,
               NodeInfo.getMinimumDataNode(),
-              (availableDatanodeSize - NodeInfo.getMinimumDataNode())));
+              CONF.getSchemaReplicationFactor(),
+              CONF.getDataReplicationFactor(),
+              availableDatanodeSize - removedDataNodeSize);
+      if (NodeInfo.getMinimumDataNode() == 1) {
+        // With a single replica (schema_replication_factor and data_replication_factor are both 1)
+        // the only copy of each region lives on one DataNode, so at least one DataNode must always
+        // remain: there is nowhere to migrate its regions to.
+        message += 
ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT;
+      }
+      status.setMessage(message);
     }
     return status;
   }
diff --git 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
index 2cb283d400e..e9cce807e77 100644
--- 
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
+++ 
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
@@ -23,7 +23,9 @@ import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId;
 import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType;
 import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation;
 import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet;
+import org.apache.iotdb.common.rpc.thrift.TSStatus;
 import org.apache.iotdb.commons.cluster.RegionStatus;
+import org.apache.iotdb.commons.exception.IoTDBException;
 import org.apache.iotdb.commons.utils.TestOnly;
 import org.apache.iotdb.commons.utils.ThriftCommonsSerDeUtils;
 import org.apache.iotdb.confignode.conf.ConfigNodeConfig;
@@ -36,10 +38,12 @@ import 
org.apache.iotdb.confignode.manager.load.cache.region.RegionHeartbeatSamp
 import 
org.apache.iotdb.confignode.persistence.partition.maintainer.RegionCreateTask;
 import 
org.apache.iotdb.confignode.persistence.partition.maintainer.RegionDeleteTask;
 import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv;
+import org.apache.iotdb.confignode.procedure.exception.ProcedureException;
 import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure;
 import org.apache.iotdb.confignode.procedure.state.CreateRegionGroupsState;
 import org.apache.iotdb.confignode.procedure.store.ProcedureType;
 import org.apache.iotdb.consensus.exception.ConsensusException;
+import org.apache.iotdb.rpc.TSStatusCode;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -173,7 +177,11 @@ public class CreateRegionGroupsProcedure
                           }
                         }));
 
-        env.persistRegionGroup(persistPlan);
+        final TSStatus persistStatus = env.persistRegionGroup(persistPlan);
+        if (persistStatus.getCode() != 
TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
+          setFailure(new ProcedureException(new 
IoTDBException(persistStatus)));
+          return Flow.NO_MORE_STATE;
+        }
         try {
           env.getConfigManager().getConsensusManager().write(offerPlan);
         } catch (final ConsensusException e) {

Reply via email to