This is an automated email from the ASF dual-hosted git repository.
CRZbulabula pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iotdb.git
The following commit(s) were added to refs/heads/master by this push:
new c5240154a70 Clarify RemoveDataNode single-replica error and add
diagnostics for the no-available-RegionGroup race (#17878)
c5240154a70 is described below
commit c5240154a7044d9328de7c908f3c5b55972e3284
Author: Yongzao <[email protected]>
AuthorDate: Wed Jun 10 19:19:22 2026 +0800
Clarify RemoveDataNode single-replica error and add diagnostics for the
no-available-RegionGroup race (#17878)
---
.../removedatanode/IoTDBRemoveLastDataNodeIT.java | 111 +++++++++++++++++++++
.../iotdb/confignode/i18n/ProcedureMessages.java | 5 +
.../iotdb/confignode/i18n/ProcedureMessages.java | 5 +
.../manager/partition/PartitionManager.java | 18 ++++
.../procedure/env/ConfigNodeProcedureEnv.java | 7 +-
.../procedure/env/RemoveDataNodeHandler.java | 52 ++++++----
.../impl/region/CreateRegionGroupsProcedure.java | 10 +-
7 files changed, 188 insertions(+), 20 deletions(-)
diff --git
a/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java
b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java
new file mode 100644
index 00000000000..ca5a2929fd6
--- /dev/null
+++
b/integration-test/src/test/java/org/apache/iotdb/confignode/it/removedatanode/IoTDBRemoveLastDataNodeIT.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iotdb.confignode.it.removedatanode;
+
+import org.apache.iotdb.commons.schema.column.ColumnHeaderConstant;
+import org.apache.iotdb.consensus.ConsensusFactory;
+import org.apache.iotdb.it.env.EnvFactory;
+import org.apache.iotdb.it.framework.IoTDBTestRunner;
+import org.apache.iotdb.itbase.category.LocalStandaloneIT;
+import org.apache.iotdb.jdbc.IoTDBSQLException;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.runner.RunWith;
+
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.Statement;
+import java.util.HashSet;
+import java.util.Set;
+
+import static
org.apache.iotdb.confignode.it.removedatanode.IoTDBRemoveDataNodeUtils.generateRemoveString;
+import static
org.apache.iotdb.confignode.it.removedatanode.IoTDBRemoveDataNodeUtils.selectRemoveDataNodes;
+import static org.apache.iotdb.util.MagicUtils.makeItCloseQuietly;
+
+/**
+ * Removing the last DataNode of a single-replica cluster must be rejected.
This only needs a 1C1D
+ * cluster, so it lives in the 1C1D (LocalStandaloneIT) suite, separate from
the multi-DataNode
+ * removal tests in {@link IoTDBRemoveDataNodeNormalIT}.
+ */
+@Category({LocalStandaloneIT.class})
+@RunWith(IoTDBTestRunner.class)
+public class IoTDBRemoveLastDataNodeIT {
+
+ private static final String SHOW_DATANODES = "show datanodes";
+
+ @Before
+ public void setUp() throws Exception {
+ EnvFactory.getEnv()
+ .getConfig()
+ .getCommonConfig()
+ .setConfigNodeConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS)
+
.setSchemaRegionConsensusProtocolClass(ConsensusFactory.RATIS_CONSENSUS);
+ }
+
+ @After
+ public void tearDown() throws InterruptedException {
+ EnvFactory.getEnv().cleanClusterEnvironment();
+ }
+
+ @Test
+ public void failWhenRemovingLastSingleReplicaDataNodeUseSQL() throws
Exception {
+ // With a single replica (schema_replication_factor and
data_replication_factor are both 1),
+ // removing DataNodes is still supported as long as more than one DataNode
remains, but the last
+ // remaining DataNode cannot be removed because there is nowhere to
migrate its regions to.
+ // Here we set up 1C1D with single replica and try to remove the only
DataNode, which must fail
+ // because removing it would leave the cluster with no DataNode.
+ EnvFactory.getEnv()
+ .getConfig()
+ .getCommonConfig()
+ .setDataRegionConsensusProtocolClass(ConsensusFactory.IOT_CONSENSUS)
+ .setSchemaReplicationFactor(1)
+ .setDataReplicationFactor(1)
+ .setDefaultDataRegionGroupNumPerDatabase(1);
+ EnvFactory.getEnv().initClusterEnvironment(1, 1);
+
+ try (final Connection connection =
makeItCloseQuietly(EnvFactory.getEnv().getConnection());
+ final Statement statement =
makeItCloseQuietly(connection.createStatement());
+ final ResultSet resultSet = statement.executeQuery(SHOW_DATANODES)) {
+ final Set<Integer> allDataNodeId = new HashSet<>();
+ while (resultSet.next()) {
+ allDataNodeId.add(resultSet.getInt(ColumnHeaderConstant.NODE_ID));
+ }
+
+ final String removeDataNodeSQL =
+ generateRemoveString(selectRemoveDataNodes(allDataNodeId, 1));
+ try {
+ statement.execute(removeDataNodeSQL);
+ Assert.fail(
+ "Remove DataNode should fail when it would leave no DataNode under
single replica");
+ } catch (final IoTDBSQLException e) {
+ // The unified rejection message reports the gap and, for a single
replica, appends the
+ // "at least one DataNode must always remain" hint.
+ Assert.assertTrue(e.getMessage(), e.getMessage().contains("Cannot
remove"));
+ Assert.assertTrue(e.getMessage(), e.getMessage().contains("single
replica"));
+ Assert.assertFalse(
+ e.getMessage(), e.getMessage().contains("Failed to remove all
requested data nodes"));
+ }
+ }
+ }
+}
diff --git
a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
index 085a803777e..a43e5151d40 100644
---
a/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
+++
b/iotdb-core/confignode/src/main/i18n/en/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
@@ -445,6 +445,11 @@ public final class ProcedureMessages {
"Failed to push topic meta to dataNodes, details: %s";
public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN =
"Failed to remove data node {} because it is not in running and the
configuration of cluster is one replication";
+
+ public static final String FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW =
+ "Cannot remove %d DataNode(s): the cluster has %d available DataNode(s)
and must retain at least %d of them (max(schema_replication_factor=%d,
data_replication_factor=%d)) so that every region keeps enough replicas, but
this request would leave only %d.";
+ public static final String FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT =
+ " With a single replica there is nowhere to migrate regions to, so at
least one DataNode must always remain.";
public static final String
FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED =
"Failed to rollback alter pipe {}, details: {}, metadata will be
synchronized later.";
public static final String
FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO =
diff --git
a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
index d928f4f6bae..dbad526cce7 100644
---
a/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
+++
b/iotdb-core/confignode/src/main/i18n/zh/org/apache/iotdb/confignode/i18n/ProcedureMessages.java
@@ -445,6 +445,11 @@ public final class ProcedureMessages {
"Failed to push topic meta to dataNodes, details: %s";
public static final String FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN =
"Failed to remove data node {} because it is not in running and the
configuration of cluster is one replication";
+
+ public static final String FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW =
+ "无法移除 %d 个 DataNode:集群当前有 %d 个可用 DataNode,且至少需保留 %d
个(max(schema_replication_factor=%d, data_replication_factor=%d)),以保证每个 Region
仍有足够的副本;但本次请求执行后将只剩 %d 个。";
+ public static final String FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT =
+ " 单副本下没有其它节点可供迁移 Region,因此必须始终保留至少一个 DataNode。";
public static final String
FAILED_TO_ROLLBACK_ALTER_PIPE_DETAILS_METADATA_WILL_BE_SYNCHRONIZED =
"Failed to rollback alter pipe {}, details: {}, metadata will be
synchronized later.";
public static final String
FAILED_TO_ROLLBACK_COMMIT_SET_TEMPLATE_ON_PATH_DUE_TO =
diff --git
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
index 5be81256b5c..dfa3448bc0f 100644
---
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
+++
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/manager/partition/PartitionManager.java
@@ -108,6 +108,7 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
@@ -993,6 +994,23 @@ public class PartitionManager {
}
if (result.isEmpty()) {
+ // Diagnostic for the intermittent "no available RegionGroup" CI
failures: dump every
+ // RegionGroup visible in PartitionInfo for this Database together with
its LoadCache status.
+ // This pinpoints whether PartitionInfo simply has no RegionGroup yet
(newly created
+ // RegionGroup not exposed) or it has some but all of them are currently
Disabled.
+ // Only logged on the failure path right before throwing, so it never
floods the log.
+ final Map<TConsensusGroupId, RegionGroupStatus>
visibleRegionGroupStatusMap =
+ new LinkedHashMap<>();
+ regionGroupSlotsCounter.forEach(
+ slotsCounter ->
+ visibleRegionGroupStatusMap.put(
+ slotsCounter.getRight(),
+
getLoadManager().getRegionGroupStatus(slotsCounter.getRight())));
+ LOGGER.warn(
+ "No available {} RegionGroup for Database: {}. RegionGroups visible
in PartitionInfo and their LoadCache status: {}",
+ type,
+ database,
+ visibleRegionGroupStatusMap);
throw new NoAvailableRegionGroupException(type,
Collections.singletonList(database));
}
diff --git
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
index d271d5ef33b..4105b8ecf5c 100644
---
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
+++
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/ConfigNodeProcedureEnv.java
@@ -511,12 +511,15 @@ public class ConfigNodeProcedureEnv {
return clientHandler.getResponseList();
}
- public void persistRegionGroup(CreateRegionGroupsPlan
createRegionGroupsPlan) {
+ public TSStatus persistRegionGroup(CreateRegionGroupsPlan
createRegionGroupsPlan) {
// Persist the allocation result
try {
- getConsensusManager().write(createRegionGroupsPlan);
+ return getConsensusManager().write(createRegionGroupsPlan);
} catch (ConsensusException e) {
LOG.warn("Failed in the write API executing the consensus layer due to:
", e);
+ return new TSStatus(TSStatusCode.CREATE_REGION_ERROR.getStatusCode())
+ .setMessage(
+ "Failed to persist RegionGroup allocation in the consensus
layer: " + e.getMessage());
}
}
diff --git
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
index 5b505ec001b..6782c1b652a 100644
---
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
+++
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/env/RemoveDataNodeHandler.java
@@ -567,19 +567,25 @@ public class RemoveDataNodeHandler {
// when the configuration is one replication, it will be failed if the
data node is not in
// running state.
if (CONF.getSchemaReplicationFactor() == 1 ||
CONF.getDataReplicationFactor() == 1) {
- for (TDataNodeLocation dataNodeLocation : removedDataNodes) {
- // check whether removed data node is in running state
- if (!NodeStatus.Running.equals(
-
configManager.getLoadManager().getNodeStatus(dataNodeLocation.getDataNodeId())))
{
- removedDataNodes.remove(dataNodeLocation);
- LOGGER.error(
-
ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN,
dataNodeLocation);
- }
- if (removedDataNodes.isEmpty()) {
- status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode());
-
status.setMessage(ProcedureMessages.FAILED_TO_REMOVE_ALL_REQUESTED_DATA_NODES);
- return status;
- }
+ final List<TDataNodeLocation> notRunningDataNodes =
+ removedDataNodes.stream()
+ .filter(
+ dataNodeLocation ->
+ !NodeStatus.Running.equals(
+ configManager
+ .getLoadManager()
+
.getNodeStatus(dataNodeLocation.getDataNodeId())))
+ .collect(Collectors.toList());
+ notRunningDataNodes.forEach(
+ dataNodeLocation ->
+ LOGGER.error(
+
ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_BECAUSE_IT_IS_NOT_IN,
+ dataNodeLocation));
+ removedDataNodes.removeAll(notRunningDataNodes);
+ if (removedDataNodes.isEmpty()) {
+ status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode());
+
status.setMessage(ProcedureMessages.FAILED_TO_REMOVE_ALL_REQUESTED_DATA_NODES);
+ return status;
}
}
@@ -593,13 +599,25 @@ public class RemoveDataNodeHandler {
.count();
if (availableDatanodeSize - removedDataNodeSize <
NodeInfo.getMinimumDataNode()) {
status.setCode(TSStatusCode.NO_ENOUGH_DATANODE.getStatusCode());
- status.setMessage(
+ // Report the concrete numbers so operators can see the gap: how many
DataNodes are being
+ // removed, how many are available, the minimum that must remain (the
larger of the schema and
+ // data replication factors) and how many would be left.
+ String message =
String.format(
- "Can't remove datanode due to the limit of replication factor, "
- + "availableDataNodeSize: %s, maxReplicaFactor: %s, max
allowed removed Data Node size is: %s",
+ ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_WOULD_LEAVE_TOO_FEW,
+ removedDataNodeSize,
availableDatanodeSize,
NodeInfo.getMinimumDataNode(),
- (availableDatanodeSize - NodeInfo.getMinimumDataNode())));
+ CONF.getSchemaReplicationFactor(),
+ CONF.getDataReplicationFactor(),
+ availableDatanodeSize - removedDataNodeSize);
+ if (NodeInfo.getMinimumDataNode() == 1) {
+ // With a single replica (schema_replication_factor and
data_replication_factor are both 1)
+ // the only copy of each region lives on one DataNode, so at least one
DataNode must always
+ // remain: there is nowhere to migrate its regions to.
+ message +=
ProcedureMessages.FAILED_TO_REMOVE_DATA_NODE_SINGLE_REPLICA_HINT;
+ }
+ status.setMessage(message);
}
return status;
}
diff --git
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
index 2cb283d400e..e9cce807e77 100644
---
a/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
+++
b/iotdb-core/confignode/src/main/java/org/apache/iotdb/confignode/procedure/impl/region/CreateRegionGroupsProcedure.java
@@ -23,7 +23,9 @@ import org.apache.iotdb.common.rpc.thrift.TConsensusGroupId;
import org.apache.iotdb.common.rpc.thrift.TConsensusGroupType;
import org.apache.iotdb.common.rpc.thrift.TDataNodeLocation;
import org.apache.iotdb.common.rpc.thrift.TRegionReplicaSet;
+import org.apache.iotdb.common.rpc.thrift.TSStatus;
import org.apache.iotdb.commons.cluster.RegionStatus;
+import org.apache.iotdb.commons.exception.IoTDBException;
import org.apache.iotdb.commons.utils.TestOnly;
import org.apache.iotdb.commons.utils.ThriftCommonsSerDeUtils;
import org.apache.iotdb.confignode.conf.ConfigNodeConfig;
@@ -36,10 +38,12 @@ import
org.apache.iotdb.confignode.manager.load.cache.region.RegionHeartbeatSamp
import
org.apache.iotdb.confignode.persistence.partition.maintainer.RegionCreateTask;
import
org.apache.iotdb.confignode.persistence.partition.maintainer.RegionDeleteTask;
import org.apache.iotdb.confignode.procedure.env.ConfigNodeProcedureEnv;
+import org.apache.iotdb.confignode.procedure.exception.ProcedureException;
import org.apache.iotdb.confignode.procedure.impl.StateMachineProcedure;
import org.apache.iotdb.confignode.procedure.state.CreateRegionGroupsState;
import org.apache.iotdb.confignode.procedure.store.ProcedureType;
import org.apache.iotdb.consensus.exception.ConsensusException;
+import org.apache.iotdb.rpc.TSStatusCode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -173,7 +177,11 @@ public class CreateRegionGroupsProcedure
}
}));
- env.persistRegionGroup(persistPlan);
+ final TSStatus persistStatus = env.persistRegionGroup(persistPlan);
+ if (persistStatus.getCode() !=
TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
+ setFailure(new ProcedureException(new
IoTDBException(persistStatus)));
+ return Flow.NO_MORE_STATE;
+ }
try {
env.getConfigManager().getConsensusManager().write(offerPlan);
} catch (final ConsensusException e) {