This is an automated email from the ASF dual-hosted git repository.
ashishkumar50 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 396e875943a HDDS-15138. Add EC DN safemode rule and control
RATIS/THREE background pipelines for EC-default clusters. (#10157)
396e875943a is described below
commit 396e875943ac9f421b31022dcfb1f7b5bc84d50a
Author: Aryan Gupta <[email protected]>
AuthorDate: Mon Jun 22 12:47:22 2026 +0530
HDDS-15138. Add EC DN safemode rule and control RATIS/THREE background
pipelines for EC-default clusters. (#10157)
---
.../org/apache/hadoop/hdds/scm/ScmConfigKeys.java | 12 ++
.../common/src/main/resources/ozone-default.xml | 11 ++
.../java/org/apache/hadoop/hdds/scm/ScmUtils.java | 23 ++++
.../scm/pipeline/BackgroundPipelineCreator.java | 91 ++++++++-----
.../scm/safemode/ECMinDataNodeSafeModeRule.java | 151 +++++++++++++++++++++
.../scm/safemode/HealthyPipelineSafeModeRule.java | 51 +++----
.../safemode/OneReplicaPipelineSafeModeRule.java | 19 +--
.../hdds/scm/safemode/SafeModeRuleFactory.java | 50 ++++++-
.../pipeline/TestBackgroundPipelineCreator.java | 116 ++++++++++++++++
.../safemode/TestECMinDataNodeSafeModeRule.java | 135 ++++++++++++++++++
.../TestOneReplicaPipelineSafeModeRule.java | 3 +-
.../hdds/scm/safemode/TestSCMSafeModeManager.java | 84 +++++++++++-
.../hdds/scm/safemode/TestSafeModeRuleFactory.java | 74 ++++++++--
.../safemode/TestSCMSafeModeWithPipelineRules.java | 68 +++++++++-
14 files changed, 798 insertions(+), 90 deletions(-)
diff --git
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
index 669e744189b..93fb3ec1933 100644
---
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
+++
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
@@ -459,6 +459,18 @@ public final class ScmConfigKeys {
public static final boolean
OZONE_SCM_PIPELINE_AUTO_CREATE_FACTOR_ONE_DEFAULT = true;
+ /**
+ * If true, BackgroundPipelineCreator will create RATIS/THREE pipelines even
+ * when the default replication is EC. This keeps RATIS write paths warm for
+ * mixed-workload clusters. If false, RATIS/THREE pipeline creation is
+ * skipped for EC-default clusters.
+ */
+ public static final String OZONE_SCM_PIPELINE_CREATE_RATIS_THREE =
+ "ozone.scm.pipeline.creation.ratis.three";
+
+ public static final boolean
+ OZONE_SCM_PIPELINE_CREATE_RATIS_THREE_DEFAULT = true;
+
public static final String
OZONE_SCM_BLOCK_DELETION_PER_DN_DISTRIBUTION_FACTOR =
"ozone.scm.block.deletion.per.dn.distribution.factor";
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml
b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 917eab5c30e..dd33a02b02a 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -1709,6 +1709,17 @@
If enabled, SCM will auto create RATIS factor ONE pipeline.
</description>
</property>
+ <property>
+ <name>ozone.scm.pipeline.creation.ratis.three</name>
+ <value>true</value>
+ <tag>OZONE, SCM, PIPELINE</tag>
+ <description>
+ When true, SCM creates RATIS/THREE pipelines in the background and
+ requires them during safemode. Applies only when the cluster default
+ replication type is EC. For RATIS-default clusters this flag has no
+ effect.
+ </description>
+ </property>
<property>
<name>hdds.scm.safemode.threshold.pct</name>
<value>0.99</value>
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
index 21685daebd6..a5465aa993d 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ScmUtils.java
@@ -45,6 +45,7 @@
import java.util.Optional;
import java.util.OptionalInt;
import java.util.concurrent.BlockingQueue;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
@@ -54,6 +55,7 @@
import
org.apache.hadoop.hdds.scm.server.SCMDatanodeHeartbeatDispatcher.ContainerReport;
import org.apache.hadoop.hdds.security.SecurityConfig;
import org.apache.hadoop.net.NetUtils;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.apache.hadoop.ozone.ha.ConfUtils;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
@@ -218,4 +220,25 @@ public static void checkIfCertSignRequestAllowed(
}
}
}
+
+ /**
+ * Returns default replication config, or null when configured values are
+ * invalid. Callers can decide whether to fallback or skip their operation.
+ */
+ public static ReplicationConfig getDefaultReplicationConfig(
+ ConfigurationSource conf, Logger logger, String componentName) {
+ try {
+ return ReplicationConfig.getDefault(conf);
+ } catch (IllegalArgumentException e) {
+ logger.warn("Ignoring invalid default replication config in {}: "
+ + "type={}, replication={}.",
+ componentName,
+ conf.get(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ OzoneConfigKeys.OZONE_REPLICATION_TYPE_DEFAULT),
+ conf.get(OzoneConfigKeys.OZONE_REPLICATION,
+ OzoneConfigKeys.OZONE_REPLICATION_DEFAULT),
+ e);
+ return null;
+ }
+ }
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreator.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreator.java
index 0aefbedbd43..97455d56d51 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreator.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreator.java
@@ -25,6 +25,7 @@
import static
org.apache.hadoop.hdds.scm.ha.SCMService.Event.PRE_CHECK_COMPLETED;
import static
org.apache.hadoop.hdds.scm.ha.SCMService.Event.UNHEALTHY_TO_HEALTHY_NODE_HANDLER_TRIGGERED;
+import com.google.common.annotations.VisibleForTesting;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.io.IOException;
import java.time.Clock;
@@ -43,9 +44,9 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.ScmUtils;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.ha.SCMService;
-import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -88,6 +89,7 @@ public class BackgroundPipelineCreator implements SCMService {
private final AtomicBoolean running = new AtomicBoolean(false);
private final long intervalInMillis;
private final Clock clock;
+ private final boolean createRatisThreeForEcDefault;
BackgroundPipelineCreator(PipelineManager pipelineManager,
ConfigurationSource conf, SCMContext scmContext, Clock clock) {
@@ -109,6 +111,9 @@ public class BackgroundPipelineCreator implements
SCMService {
ScmConfigKeys.OZONE_SCM_PIPELINE_CREATION_INTERVAL,
ScmConfigKeys.OZONE_SCM_PIPELINE_CREATION_INTERVAL_DEFAULT,
TimeUnit.MILLISECONDS);
+ this.createRatisThreeForEcDefault = conf.getBoolean(
+ ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE_DEFAULT);
threadName = scmContext.threadNamePrefix() + THREAD_NAME;
}
@@ -204,42 +209,18 @@ private boolean skipCreation(ReplicationConfig
replicationConfig,
}
private void createPipelines() throws RuntimeException {
- // TODO: #CLUTIL Different replication factor may need to be supported
- HddsProtos.ReplicationType type = HddsProtos.ReplicationType.valueOf(
- conf.get(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
- OzoneConfigKeys.OZONE_REPLICATION_TYPE_DEFAULT));
- boolean autoCreateFactorOne = conf.getBoolean(
- ScmConfigKeys.OZONE_SCM_PIPELINE_AUTO_CREATE_FACTOR_ONE,
+ boolean autoCreateFactorOne = conf.getBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_AUTO_CREATE_FACTOR_ONE,
ScmConfigKeys.OZONE_SCM_PIPELINE_AUTO_CREATE_FACTOR_ONE_DEFAULT);
- List<ReplicationConfig> list =
- new ArrayList<>();
- for (HddsProtos.ReplicationFactor factor : HddsProtos.ReplicationFactor
- .values()) {
- if (factor == ReplicationFactor.ZERO) {
- continue; // Ignore it.
- }
- final ReplicationConfig replicationConfig;
- if (type != EC) {
- replicationConfig =
- ReplicationConfig.fromProtoTypeAndFactor(type, factor);
- } else if (factor == ReplicationFactor.ONE) {
- replicationConfig =
- ReplicationConfig.fromProtoTypeAndFactor(RATIS, factor);
- } else {
- continue;
- }
- if (skipCreation(replicationConfig, autoCreateFactorOne)) {
- // Skip this iteration for creating pipeline
- continue;
- }
- list.add(replicationConfig);
+ List<ReplicationConfig> list = getReplicationConfigs(autoCreateFactorOne);
+ if (list.isEmpty()) {
+ LOG.debug("No replication configs selected for background pipeline creation.");
+ return;
}
LoopingIterator it = new LoopingIterator(list);
while (it.hasNext()) {
- ReplicationConfig replicationConfig =
- (ReplicationConfig) it.next();
+ ReplicationConfig replicationConfig = (ReplicationConfig) it.next();
try {
Pipeline pipeline = pipelineManager.createPipeline(replicationConfig);
@@ -255,6 +236,54 @@ private void createPipelines() throws RuntimeException {
LOG.debug("BackgroundPipelineCreator createPipelines finished.");
}
+ /**
+ * Returns replication configs eligible for background pipeline creation.
+ *
+ * <p>If the default replication config is invalid, this returns an empty
+ * list and skips pipeline creation to avoid guessing from raw config values.
+ * For EC-default clusters, this only returns RATIS/THREE when
+ * {@link ScmConfigKeys#OZONE_SCM_PIPELINE_CREATE_RATIS_THREE} is enabled.
+ */
+ @VisibleForTesting
+ List<ReplicationConfig> getReplicationConfigs(boolean autoCreateFactorOne) {
+ List<ReplicationConfig> list = new ArrayList<>();
+ ReplicationConfig defaultReplicationConfig = ScmUtils
+ .getDefaultReplicationConfig(conf, LOG,
+ BackgroundPipelineCreator.class.getSimpleName());
+ if (defaultReplicationConfig == null) {
+ LOG.warn("Skipping background pipeline creation: default replication "
+ + "config is invalid.");
+ return list;
+ }
+ // TODO: #CLUTIL Different replication factor may need to be supported
+ HddsProtos.ReplicationType type =
+ defaultReplicationConfig.getReplicationType();
+ if (type == EC && createRatisThreeForEcDefault) {
+ list.add(ReplicationConfig.fromProtoTypeAndFactor(RATIS,
+ ReplicationFactor.THREE));
+ }
+ if (type == EC) {
+ return list;
+ }
+
+ for (HddsProtos.ReplicationFactor factor
+ : HddsProtos.ReplicationFactor.values()) {
+ if (factor == ReplicationFactor.ZERO) {
+ continue; // Ignore it.
+ }
+ final ReplicationConfig replicationConfig =
+ ReplicationConfig.fromProtoTypeAndFactor(type, factor);
+ if (skipCreation(replicationConfig, autoCreateFactorOne)) {
+ // Skip this iteration for creating pipeline
+ continue;
+ }
+ if (!list.contains(replicationConfig)) {
+ list.add(replicationConfig);
+ }
+ }
+ return list;
+ }
+
@Override
public void notifyStatusChanged() {
serviceLock.lock();
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECMinDataNodeSafeModeRule.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECMinDataNodeSafeModeRule.java
new file mode 100644
index 00000000000..1add969c5d1
--- /dev/null
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECMinDataNodeSafeModeRule.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.safemode;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.hadoop.hdds.client.ECReplicationConfig;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.DatanodeID;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.ScmUtils;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.scm.node.NodeStatus;
+import
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.hdds.server.events.TypedEvent;
+
+/**
+ * Safe mode exit rule for EC-default clusters.
+ *
+ * <p>EC pipelines are ephemeral and created on demand. This rule ensures that
+ * at least {@code data + parity} healthy DataNodes are available before SCM
+ * exits safe mode for EC-default clusters.
+ *
+ * <p>For non-EC defaults this rule is a no-op.
+ */
+public class ECMinDataNodeSafeModeRule
+ extends SafeModeExitRule<NodeRegistrationContainerReport> {
+
+ private final boolean enabled;
+ private final int requiredDns;
+ private final String ecConfigLabel;
+ private final NodeManager nodeManager;
+ private final Set<DatanodeID> registeredDnSet;
+
+ public ECMinDataNodeSafeModeRule(EventQueue eventQueue,
+ ConfigurationSource conf,
+ NodeManager nodeManager,
+ SCMSafeModeManager safeModeManager) {
+ super(safeModeManager, eventQueue);
+ this.nodeManager = nodeManager;
+
+ ReplicationConfig defaultConfig = ScmUtils.getDefaultReplicationConfig(
+ conf, SCMSafeModeManager.getLogger(),
+ ECMinDataNodeSafeModeRule.class.getSimpleName());
+ if (defaultConfig != null
+ && defaultConfig.getReplicationType() == HddsProtos.ReplicationType.EC) {
+ ECReplicationConfig ecConfig = (ECReplicationConfig) defaultConfig;
+ this.requiredDns = ecConfig.getRequiredNodes();
+ this.ecConfigLabel = ecConfig.configFormat();
+ this.enabled = true;
+ this.registeredDnSet = new HashSet<>(Math.max(requiredDns * 2, 1));
+ SCMSafeModeManager.getLogger().info(
+ "ECMinDataNodeSafeModeRule enabled for default EC config {}. "
+ + "Required healthy DataNodes for safemode exit: {}.",
+ ecConfigLabel, requiredDns);
+ } else {
+ this.requiredDns = 0;
+ this.ecConfigLabel = "";
+ this.enabled = false;
+ this.registeredDnSet = new HashSet<>(0);
+ SCMSafeModeManager.getLogger().debug(
+ "ECMinDataNodeSafeModeRule disabled: default replication is not EC.");
+ }
+ }
+
+ @Override
+ protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
+ return SCMEvents.NODE_REGISTRATION_CONT_REPORT;
+ }
+
+ @Override
+ protected synchronized boolean validate() {
+ if (!enabled) {
+ return true;
+ }
+ if (validateBasedOnReportProcessing()) {
+ return getRegisteredDns() >= requiredDns;
+ }
+ return nodeManager.getNodes(NodeStatus.inServiceHealthy()).size() >= requiredDns;
+ }
+
+ @Override
+ protected synchronized void process(NodeRegistrationContainerReport report) {
+ if (!enabled) {
+ return;
+ }
+ DatanodeID dnId = report.getDatanodeDetails().getID();
+ if (registeredDnSet.add(dnId)) {
+ if (scmInSafeMode()) {
+ SCMSafeModeManager.getLogger().info(
+ "SCM in safe mode. EC rule progress: {} of {} required "
+ + "DataNodes registered for EC {}.",
+ getRegisteredDns(), requiredDns, ecConfigLabel);
+ }
+ }
+ }
+
+ @Override
+ protected synchronized void cleanup() {
+ registeredDnSet.clear();
+ }
+
+ @Override
+ public synchronized String getStatusText() {
+ if (!enabled) {
+ return "ECMinDataNodeSafeModeRule is not applicable "
+ + "(default replication is not EC)";
+ }
+ return String.format(
+ "EC (%s) safemode: registered DataNodes (=%d) >= required DataNodes (=%d)",
+ ecConfigLabel, getRegisteredDns(), requiredDns);
+ }
+
+ @Override
+ public void refresh(boolean forceRefresh) {
+ // Nothing to refresh from SCM DB for this rule.
+ }
+
+ @VisibleForTesting
+ int getRequiredDns() {
+ return requiredDns;
+ }
+
+ @VisibleForTesting
+ synchronized int getRegisteredDns() {
+ return registeredDnSet.size();
+ }
+
+ boolean isEnabled() {
+ return enabled;
+ }
+}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
index 3e590013c11..d6c301c32c5 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
@@ -31,6 +31,7 @@
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.node.NodeManager;
@@ -66,6 +67,12 @@ public class HealthyPipelineSafeModeRule extends
SafeModeExitRule<Pipeline> {
private final SCMContext scmContext;
private final Set<PipelineID> unProcessedPipelineSet = new HashSet<>();
private final NodeManager nodeManager;
+ private final RatisReplicationConfig targetReplicationConfig =
+ RatisReplicationConfig.getInstance(ReplicationFactor.THREE);
+ private final int targetRequiredNodes =
+ HddsProtos.ReplicationFactor.THREE_VALUE;
+ private final String targetReplicationLabel =
+ targetReplicationConfig.configFormat();
HealthyPipelineSafeModeRule(EventQueue eventQueue,
PipelineManager pipelineManager, SCMSafeModeManager manager,
@@ -80,7 +87,6 @@ public class HealthyPipelineSafeModeRule extends
SafeModeExitRule<Pipeline> {
HddsConfigKeys.
HDDS_SCM_SAFEMODE_HEALTHY_PIPELINE_THRESHOLD_PCT_DEFAULT);
- // We only care about THREE replica pipeline
minHealthyPipelines = getMinHealthyPipelines(configuration);
Preconditions.checkArgument(
@@ -97,7 +103,6 @@ private int getMinHealthyPipelines(ConfigurationSource
config) {
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
- // We only care about THREE replica pipeline
return minDatanodes / HddsProtos.ReplicationFactor.THREE_VALUE;
}
@@ -141,14 +146,12 @@ protected synchronized void process(Pipeline pipeline) {
// datanode can send pipeline report again, or SCMPipelineManager will
// create new pipelines.
- // Only handle RATIS + 3-replica pipelines.
- if (pipeline.getType() != HddsProtos.ReplicationType.RATIS ||
- ((RatisReplicationConfig) pipeline.getReplicationConfig()).getReplicationFactor() !=
- HddsProtos.ReplicationFactor.THREE) {
+ if (!targetReplicationConfig.equals(pipeline.getReplicationConfig())) {
Logger safeModeManagerLog = SCMSafeModeManager.getLogger();
if (safeModeManagerLog.isDebugEnabled()) {
- safeModeManagerLog.debug("Skipping pipeline safemode report processing as Replication type isn't RATIS " +
- "or replication factor isn't 3.");
+ safeModeManagerLog.debug("Skipping pipeline safemode report processing"
+ + " as replication config {} does not match target {}.",
+ pipeline.getReplicationConfig(), targetReplicationConfig);
}
return;
}
@@ -161,9 +164,10 @@ protected synchronized void process(Pipeline pipeline) {
}
List<DatanodeDetails> pipelineDns = pipeline.getNodes();
- if (pipelineDns.size() != 3) {
- LOG.warn("Only {} DNs reported this pipeline: {}, all 3 DNs should report the pipeline", pipelineDns.size(),
- pipeline.getId());
+ if (pipelineDns.size() != targetRequiredNodes) {
+ LOG.warn("Only {} DNs reported this pipeline: {}, all {} DNs should "
+ + "report the pipeline",
+ pipelineDns.size(), pipeline.getId(), targetRequiredNodes);
return;
}
@@ -218,8 +222,7 @@ public synchronized void refresh(boolean forceRefresh) {
private synchronized void initializeRule(boolean refresh) {
unProcessedPipelineSet.addAll(pipelineManager.getPipelines(
- RatisReplicationConfig.getInstance(
- HddsProtos.ReplicationFactor.THREE),
+ targetReplicationConfig,
Pipeline.PipelineState.OPEN).stream().map(Pipeline::getId)
.collect(Collectors.toSet()));
@@ -245,10 +248,11 @@ private synchronized void initializeRule(boolean refresh)
{
private boolean validateHealthyPipelineSafeModeRuleUsingPipelineManager() {
// Query PipelineManager directly for healthy pipeline count
List<Pipeline> openPipelines = pipelineManager.getPipelines(
- RatisReplicationConfig.getInstance(HddsProtos.ReplicationFactor.THREE),
+ targetReplicationConfig,
Pipeline.PipelineState.OPEN);
-
- LOG.debug("Found {} open RATIS/THREE pipelines", openPipelines.size());
+
+ LOG.debug("Found {} open {} pipelines", openPipelines.size(),
+ targetReplicationLabel);
int pipelineCount = openPipelines.size();
healthyPipelineThresholdCount = Math.max(minHealthyPipelines,
@@ -271,11 +275,11 @@ private boolean
validateHealthyPipelineSafeModeRuleUsingPipelineManager() {
}
boolean isPipelineHealthy(Pipeline pipeline) {
- // Verify pipeline has all 3 nodes
+ // Verify pipeline has all required nodes for target replication.
List<DatanodeDetails> nodes = pipeline.getNodes();
- if (nodes.size() != 3) {
- LOG.debug("Pipeline {} is not healthy: has {} nodes instead of 3",
- pipeline.getId(), nodes.size());
+ if (nodes.size() != targetRequiredNodes) {
+ LOG.debug("Pipeline {} is not healthy: has {} nodes instead of {}",
+ pipeline.getId(), nodes.size(), targetRequiredNodes);
return false;
}
@@ -316,8 +320,9 @@ public synchronized int getHealthyPipelineThresholdCount() {
@Override
public String getStatusText() {
String status = String.format(
- "healthy Ratis/THREE pipelines (=%d) >= healthyPipelineThresholdCount" +
- " (=%d)", getCurrentHealthyPipelineCount(),
+ "healthy %s pipelines (=%d) >= healthyPipelineThresholdCount" +
+ " (=%d)", targetReplicationLabel,
+ getCurrentHealthyPipelineCount(),
getHealthyPipelineThresholdCount());
status = updateStatusTextWithSamplePipelines(status);
return status;
@@ -327,7 +332,7 @@ private synchronized String
updateStatusTextWithSamplePipelines(
String status) {
if (validateBasedOnReportProcessing()) {
List<Pipeline> openPipelines = pipelineManager.getPipelines(
- RatisReplicationConfig.getInstance(HddsProtos.ReplicationFactor.THREE),
+ targetReplicationConfig,
Pipeline.PipelineState.OPEN);
Set<PipelineID> unhealthyPipelines = openPipelines.stream()
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
index 8b1fc593af3..227d7ddb47b 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
@@ -58,6 +58,8 @@ public class OneReplicaPipelineSafeModeRule extends
private int currentReportedPipelineCount = 0;
private PipelineManager pipelineManager;
private final double pipelinePercent;
+ private final RatisReplicationConfig targetReplicationConfig =
+ RatisReplicationConfig.getInstance(ReplicationFactor.THREE);
public OneReplicaPipelineSafeModeRule(EventQueue eventQueue, PipelineManager
pipelineManager,
SCMSafeModeManager safeModeManager, ConfigurationSource configuration) {
@@ -108,8 +110,7 @@ protected synchronized void
process(PipelineReportFromDatanode report) {
continue;
}
- if (RatisReplicationConfig
- .hasFactor(pipeline.getReplicationConfig(), ReplicationFactor.THREE)
+ if (targetReplicationConfig.equals(pipeline.getReplicationConfig())
&& pipeline.isOpen() &&
!reportedPipelineIDSet.contains(pipeline.getId())) {
if (oldPipelineIDSet.contains(pipeline.getId())) {
@@ -152,8 +153,10 @@ Set<PipelineID> getReportedPipelineIDSet() {
@Override
public String getStatusText() {
String status = String.format(
- "reported Ratis/THREE pipelines with at least one datanode (=%d) "
- + ">= threshold (=%d)", getCurrentReportedPipelineCount(),
+ "reported %s pipelines with at least one datanode (=%d) "
+ + ">= threshold (=%d)",
+ targetReplicationConfig.configFormat(),
+ getCurrentReportedPipelineCount(),
getThresholdCount());
status = updateStatusTextWithSamplePipelines(status);
return status;
@@ -184,11 +187,11 @@ public synchronized void refresh(boolean forceRefresh) {
}
private void updateReportedPipelineSet() {
- List<Pipeline> openRatisPipelines =
- pipelineManager.getPipelines(RatisReplicationConfig.getInstance(ReplicationFactor.THREE),
+ List<Pipeline> openTargetPipelines =
+ pipelineManager.getPipelines(targetReplicationConfig,
Pipeline.PipelineState.OPEN);
- for (Pipeline pipeline : openRatisPipelines) {
+ for (Pipeline pipeline : openTargetPipelines) {
PipelineID pipelineID = pipeline.getId();
if (!pipeline.getNodeSet().isEmpty()
&& oldPipelineIDSet.contains(pipelineID)
@@ -202,7 +205,7 @@ private void updateReportedPipelineSet() {
private void initializeRule(boolean refresh) {
oldPipelineIDSet = pipelineManager.getPipelines(
- RatisReplicationConfig.getInstance(ReplicationFactor.THREE),
+ targetReplicationConfig,
Pipeline.PipelineState.OPEN)
.stream().map(p -> p.getId()).collect(Collectors.toSet());
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
index 398eb19b56e..bb7056d2c3c 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
@@ -19,7 +19,10 @@
import java.util.ArrayList;
import java.util.List;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.ha.SCMHAManager;
@@ -75,10 +78,13 @@ private void loadRules(SCMSafeModeManager safeModeManager) {
config, containerManager, safeModeManager);
SafeModeExitRule<?> datanodeRule = new DataNodeSafeModeRule(eventQueue,
config, nodeManager, safeModeManager);
+ SafeModeExitRule<?> ecMinDnRule = new ECMinDataNodeSafeModeRule(eventQueue,
+ config, nodeManager, safeModeManager);
safeModeRules.add(ratisContainerRule);
safeModeRules.add(ecContainerRule);
safeModeRules.add(datanodeRule);
+ safeModeRules.add(ecMinDnRule);
preCheckRules.add(datanodeRule);
@@ -93,12 +99,48 @@ private void loadRules(SCMSafeModeManager safeModeManager) {
}
if (pipelineManager != null) {
- safeModeRules.add(new HealthyPipelineSafeModeRule(eventQueue, pipelineManager,
- safeModeManager, config, scmContext, nodeManager));
- safeModeRules.add(new OneReplicaPipelineSafeModeRule(eventQueue, pipelineManager,
- safeModeManager, config));
+ if (shouldEnableRatisThreePipelineRules()) {
+ safeModeRules.add(new HealthyPipelineSafeModeRule(eventQueue,
+ pipelineManager, safeModeManager, config, scmContext, nodeManager));
+ safeModeRules.add(new OneReplicaPipelineSafeModeRule(eventQueue, pipelineManager,
+ safeModeManager, config));
+ } else {
+ SCMSafeModeManager.getLogger().info(
+ "RATIS/THREE pipeline safemode rules are disabled because "
+ + "{} is false for an EC-default cluster or the default "
+ + "replication config is invalid.",
+ ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE);
+ }
+ }
+
+ }
+
+ /**
+ * Returns true when RATIS/THREE pipeline safemode rules should be active.
+ * For EC-default clusters, these rules are only meaningful when RATIS/THREE
+ * background pipeline creation is also enabled (same flag); if no
+ * RATIS/THREE pipelines are created, requiring them in safemode would block
+ * safemode exit.
+ */
+ private boolean shouldEnableRatisThreePipelineRules() {
+ ReplicationConfig defaultReplicationConfig;
+ try {
+ defaultReplicationConfig = ReplicationConfig.getDefault(config);
+ } catch (IllegalArgumentException e) {
+ SCMSafeModeManager.getLogger().warn(
+ "Disabling RATIS/THREE pipeline safemode rules because default "
+ + "replication config could not be parsed.",
+ e);
+ return false;
+ }
+
+ if (defaultReplicationConfig.getReplicationType()
+ != HddsProtos.ReplicationType.EC) {
+ return true;
}
+ return config.getBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE_DEFAULT);
}
public static synchronized SafeModeRuleFactory getInstance() {
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestBackgroundPipelineCreator.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestBackgroundPipelineCreator.java
new file mode 100644
index 00000000000..e297266cc28
--- /dev/null
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestBackgroundPipelineCreator.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.pipeline;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.Mockito.mock;
+
+import java.time.Clock;
+import java.util.List;
+import org.apache.hadoop.hdds.client.RatisReplicationConfig;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
+import org.apache.hadoop.hdds.scm.ha.SCMContext;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for BackgroundPipelineCreator replication config selection.
+ */
+public class TestBackgroundPipelineCreator {
+
+ @Test
+ public void testEcDefaultReplicationWithoutRatisThreeFlagCreatesNoPipelines()
+ throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ false);
+
+ BackgroundPipelineCreator creator = new BackgroundPipelineCreator(
+ mock(PipelineManager.class), conf, mock(SCMContext.class),
+ Clock.systemUTC());
+
+ List<ReplicationConfig> configs = creator.getReplicationConfigs(false);
+
+ assertTrue(configs.isEmpty());
+ }
+
+ @Test
+ public void testEcDefaultReplicationWithRatisThreeFlag() throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ true);
+
+ BackgroundPipelineCreator creator = new BackgroundPipelineCreator(
+ mock(PipelineManager.class), conf, mock(SCMContext.class),
+ Clock.systemUTC());
+
+ List<ReplicationConfig> configs = creator.getReplicationConfigs(false);
+
+ assertEquals(1, configs.size());
+ assertTrue(configs.stream()
+ .anyMatch(c -> RatisReplicationConfig.hasFactor(c,
+ HddsProtos.ReplicationFactor.THREE)));
+ assertFalse(configs.stream().anyMatch(c ->
+ c.getReplicationType() == HddsProtos.ReplicationType.EC));
+ }
+
+ @Test
+ public void testRatisDefaultReplicationBehaviorUnchanged() throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.RATIS.name());
+
+ BackgroundPipelineCreator creator = new BackgroundPipelineCreator(
+ mock(PipelineManager.class), conf, mock(SCMContext.class),
+ Clock.systemUTC());
+
+ List<ReplicationConfig> configs = creator.getReplicationConfigs(false);
+
+ assertEquals(1, configs.size());
+ assertTrue(RatisReplicationConfig.hasFactor(configs.get(0),
+ HddsProtos.ReplicationFactor.THREE));
+ }
+
+ @Test
+ public void testInvalidDefaultReplicationConfigCreatesNoPipelines() {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.RATIS.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "invalid-replication-value");
+
+ BackgroundPipelineCreator creator =
+ new BackgroundPipelineCreator(mock(PipelineManager.class), conf,
+ mock(SCMContext.class), Clock.systemUTC());
+
+ List<ReplicationConfig> configs = creator.getReplicationConfigs(false);
+
+ assertTrue(configs.isEmpty());
+ }
+
+}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestECMinDataNodeSafeModeRule.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestECMinDataNodeSafeModeRule.java
new file mode 100644
index 00000000000..fe3cc9fb8cd
--- /dev/null
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestECMinDataNodeSafeModeRule.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.safemode;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.DatanodeID;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for {@link ECMinDataNodeSafeModeRule}.
+ */
+public class TestECMinDataNodeSafeModeRule {
+
+ @Test
+ public void testDisabledForNonEcDefault() {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.RATIS.name());
+
+ NodeManager nodeManager = mock(NodeManager.class);
+ SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
+
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
+
+ ECMinDataNodeSafeModeRule rule = new ECMinDataNodeSafeModeRule(
+ new EventQueue(), conf, nodeManager, safeModeManager);
+
+ assertFalse(rule.isEnabled());
+ assertTrue(rule.validate());
+ }
+
+ @Test
+ public void testEnabledForEcDefaultAndUsesRequiredNodeCount() {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+
+ List<DatanodeDetails> enoughDns = new ArrayList<>();
+ List<DatanodeDetails> insufficientDns = new ArrayList<>();
+ for (int i = 0; i < 5; i++) {
+ enoughDns.add(mock(DatanodeDetails.class));
+ if (i < 4) {
+ insufficientDns.add(mock(DatanodeDetails.class));
+ }
+ }
+
+ NodeManager nodeManager = mock(NodeManager.class);
+ when(nodeManager.getNodes(any())).thenReturn(enoughDns, insufficientDns);
+ SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
+
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
+
+ ECMinDataNodeSafeModeRule rule = new ECMinDataNodeSafeModeRule(
+ new EventQueue(), conf, nodeManager, safeModeManager);
+ rule.setValidateBasedOnReportProcessing(false);
+
+ assertTrue(rule.isEnabled());
+ assertTrue(rule.validate());
+ assertFalse(rule.validate());
+ }
+
+ @Test
+ public void testProcessCountsAndDeduplicatesRegisteredDnsInReportMode() {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+
+ NodeManager nodeManager = mock(NodeManager.class);
+ SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
+
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
+
+ ECMinDataNodeSafeModeRule rule = new ECMinDataNodeSafeModeRule(
+ new EventQueue(), conf, nodeManager, safeModeManager);
+
+ assertTrue(rule.isEnabled());
+ assertFalse(rule.validate());
+ assertEquals(0, rule.getRegisteredDns());
+
+ NodeRegistrationContainerReport report1 = createNodeRegistrationReport();
+ NodeRegistrationContainerReport report2 = createNodeRegistrationReport();
+ NodeRegistrationContainerReport report3 = createNodeRegistrationReport();
+ NodeRegistrationContainerReport report4 = createNodeRegistrationReport();
+ NodeRegistrationContainerReport report5 = createNodeRegistrationReport();
+
+ rule.process(report1);
+ rule.process(report2);
+ rule.process(report3);
+ rule.process(report4);
+ rule.process(report5);
+ rule.process(report5);
+
+ assertEquals(5, rule.getRegisteredDns());
+ assertTrue(rule.validate());
+ }
+
+ private static NodeRegistrationContainerReport
createNodeRegistrationReport() {
+ NodeRegistrationContainerReport report =
+ mock(NodeRegistrationContainerReport.class);
+ DatanodeDetails dnDetails = mock(DatanodeDetails.class);
+ DatanodeID dnId = mock(DatanodeID.class);
+ when(dnDetails.getID()).thenReturn(dnId);
+ when(report.getDatanodeDetails()).thenReturn(dnDetails);
+ return report;
+ }
+}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
index e83b6e51a93..801b5351324 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
@@ -31,7 +31,6 @@
import java.util.Map;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.client.RatisReplicationConfig;
-import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
@@ -217,7 +216,7 @@ public void
testOneReplicaPipelineRuleWithReportProcessingFalse() {
java.util.Collections.singletonList(mock(DatanodeDetails.class))));
when(mockedPipelineManager.getPipelines(
- Mockito.any(ReplicationConfig.class),
+ Mockito.any(),
Mockito.eq(Pipeline.PipelineState.OPEN)))
.thenReturn(java.util.Collections.singletonList(mockedPipeline));
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index d99caa8708b..cf262873eb0 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -23,6 +23,7 @@
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.params.provider.Arguments.arguments;
+import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@@ -47,6 +48,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.scm.HddsTestUtils;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.container.ContainerManagerImpl;
@@ -72,6 +74,7 @@
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.apache.ozone.test.GenericTestUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
@@ -406,10 +409,10 @@ public void
testSafeModeExitRuleWithPipelineAvailabilityCheck(
assertEquals(1,
scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
if (healthyPipelinePercent > 0) {
validateRuleStatus("HealthyPipelineSafeModeRule",
- "healthy Ratis/THREE pipelines");
+ "healthy RATIS/THREE pipelines");
}
validateRuleStatus("OneReplicaPipelineSafeModeRule",
- "reported Ratis/THREE pipelines with at least one datanode");
+ "reported RATIS/THREE pipelines with at least one datanode");
testContainerThreshold(containers, 1.0);
@@ -481,7 +484,7 @@ private void validateRuleStatus(String safeModeRule, String
stringToMatch) {
if (entry.getKey().equals(safeModeRule)) {
Pair<Boolean, String> value = entry.getValue();
assertEquals(false, value.getLeft());
- assertThat(value.getRight()).contains(stringToMatch);
+ assertThat(value.getRight()).containsIgnoringCase(stringToMatch);
}
}
}
@@ -822,6 +825,81 @@ public void testSafeModePipelineExitRule() throws
Exception {
pipelineManager.close();
}
+ @Test
+ public void testEcDefaultDisablesHealthyPipelineRuleWhenRatisThreeDisabled()
{
+ OzoneConfiguration conf = new OzoneConfiguration(config);
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ false);
+
+ MockNodeManager mockNodeManager = new MockNodeManager(true, 5);
+ PipelineManager pipelineManager = mock(PipelineManager.class);
+ when(pipelineManager.getPipelines(any(), any()))
+ .thenReturn(Collections.emptyList());
+ when(pipelineManager.getPipelines())
+ .thenReturn(Collections.emptyList());
+
+ ContainerManager containerManager = mock(ContainerManager.class);
+ when(containerManager.getContainers(ReplicationType.RATIS))
+ .thenReturn(Collections.emptyList());
+ when(containerManager.getContainers(ReplicationType.EC))
+ .thenReturn(Collections.emptyList());
+ when(containerManager.getContainers())
+ .thenReturn(Collections.emptyList());
+
+ scmSafeModeManager = new SCMSafeModeManager(conf, mockNodeManager,
+ pipelineManager, containerManager, serviceManager, queue, scmContext);
+ scmSafeModeManager.start();
+
+ assertThat(SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(HealthyPipelineSafeModeRule.class)).isNull();
+ assertThat(SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(OneReplicaPipelineSafeModeRule.class)).isNull();
+ ECMinDataNodeSafeModeRule ecMinDnRule = SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(ECMinDataNodeSafeModeRule.class);
+ assertThat(ecMinDnRule).isNotNull();
+ assertThat(ecMinDnRule.isEnabled()).isTrue();
+ assertThat(ecMinDnRule.getRequiredDns()).isEqualTo(5);
+ }
+
+ @Test
+ public void testEcDefaultKeepsHealthyPipelineRuleWhenRatisThreeEnabled() {
+ OzoneConfiguration conf = new OzoneConfiguration(config);
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ true);
+
+ MockNodeManager mockNodeManager = new MockNodeManager(true, 5);
+ PipelineManager pipelineManager = mock(PipelineManager.class);
+ when(pipelineManager.getPipelines(any(), any()))
+ .thenReturn(Collections.emptyList());
+ when(pipelineManager.getPipelines())
+ .thenReturn(Collections.emptyList());
+
+ ContainerManager containerManager = mock(ContainerManager.class);
+ when(containerManager.getContainers(ReplicationType.RATIS))
+ .thenReturn(Collections.emptyList());
+ when(containerManager.getContainers(ReplicationType.EC))
+ .thenReturn(Collections.emptyList());
+ when(containerManager.getContainers())
+ .thenReturn(Collections.emptyList());
+
+ scmSafeModeManager = new SCMSafeModeManager(conf, mockNodeManager,
+ pipelineManager, containerManager, serviceManager, queue, scmContext);
+ scmSafeModeManager.start();
+
+ assertThat(SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(HealthyPipelineSafeModeRule.class)).isNotNull();
+ assertThat(SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(OneReplicaPipelineSafeModeRule.class)).isNotNull();
+ assertThat(SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(ECMinDataNodeSafeModeRule.class)).isNotNull();
+ }
+
@Test
public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
int numOfDns = 5;
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
index cdafe912c9f..3f0b8f415f2 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
@@ -25,30 +25,27 @@
import java.lang.reflect.Field;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.junit.jupiter.api.Test;
class TestSafeModeRuleFactory {
@Test
public void testIllegalState() {
- // If the initialization is already done by different test, we have to
reset it.
- try {
- final Field instance =
SafeModeRuleFactory.class.getDeclaredField("instance");
- instance.setAccessible(true);
- instance.set(null, null);
- } catch (Exception e) {
- throw new RuntimeException();
- }
+ resetInstance();
assertThrows(IllegalStateException.class,
SafeModeRuleFactory::getInstance);
}
@Test
public void testLoadedSafeModeRules() {
+ resetInstance();
SCMSafeModeManager safeModeManager = initializeSafeModeRuleFactory();
final SafeModeRuleFactory factory = SafeModeRuleFactory.getInstance();
factory.addSafeModeManager(safeModeManager);
@@ -57,13 +54,14 @@ public void testLoadedSafeModeRules() {
// as the rules are hardcoded in SafeModeRuleFactory.
// This will be fixed once we load rules using annotation.
- assertEquals(5, factory.getSafeModeRules().size(),
+ assertEquals(6, factory.getSafeModeRules().size(),
"The total safemode rules count doesn't match");
}
@Test
public void testLoadedPreCheckRules() {
+ resetInstance();
SCMSafeModeManager safeModeManager = initializeSafeModeRuleFactory();
final SafeModeRuleFactory factory = SafeModeRuleFactory.getInstance();
factory.addSafeModeManager(safeModeManager);
@@ -77,16 +75,68 @@ public void testLoadedPreCheckRules() {
}
+ @Test
+ public void testRuleCountForEcDefaultWithRatisThreeFlagDisabled() {
+ resetInstance();
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ false);
+
+ SCMSafeModeManager safeModeManager = initializeSafeModeRuleFactory(conf);
+ final SafeModeRuleFactory factory = SafeModeRuleFactory.getInstance();
+ factory.addSafeModeManager(safeModeManager);
+
+ assertEquals(4, factory.getSafeModeRules().size(),
+ "EC default with flag=false should skip RATIS/THREE pipeline rules");
+ }
+
+ @Test
+ public void testRuleCountForEcDefaultWithRatisThreeFlagEnabled() {
+ resetInstance();
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ true);
+
+ SCMSafeModeManager safeModeManager = initializeSafeModeRuleFactory(conf);
+ final SafeModeRuleFactory factory = SafeModeRuleFactory.getInstance();
+ factory.addSafeModeManager(safeModeManager);
+
+ assertEquals(6, factory.getSafeModeRules().size(),
+ "EC default with flag=true should include RATIS/THREE pipeline rules");
+ }
+
private SCMSafeModeManager initializeSafeModeRuleFactory() {
+ return initializeSafeModeRuleFactory(new OzoneConfiguration());
+ }
+
+ private SCMSafeModeManager initializeSafeModeRuleFactory(
+ OzoneConfiguration configuration) {
final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
- OzoneConfiguration conf = new OzoneConfiguration();
- conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
- SafeModeRuleFactory.initialize(conf,
+ configuration.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+ "0s");
+ SafeModeRuleFactory.initialize(configuration,
SCMContext.emptyContext(), new EventQueue(), mock(
PipelineManager.class),
mock(ContainerManager.class), mock(NodeManager.class));
return safeModeManager;
}
+ private static void resetInstance() {
+ try {
+ final Field instance = SafeModeRuleFactory.class.getDeclaredField(
+ "instance");
+ instance.setAccessible(true);
+ instance.set(null, null);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
}
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeWithPipelineRules.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeWithPipelineRules.java
index 135a8389c34..4a74d94a67b 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeWithPipelineRules.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeWithPipelineRules.java
@@ -17,6 +17,7 @@
package org.apache.hadoop.hdds.scm.safemode;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.concurrent.TimeUnit.SECONDS;
import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
@@ -31,9 +32,11 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.hdds.HddsConfigKeys;
+import org.apache.hadoop.hdds.client.ECReplicationConfig;
import org.apache.hadoop.hdds.client.RatisReplicationConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
@@ -43,6 +46,10 @@
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.ozone.MiniOzoneCluster;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.apache.hadoop.ozone.TestDataUtil;
+import org.apache.hadoop.ozone.client.OzoneBucket;
+import org.apache.hadoop.ozone.client.OzoneClient;
import org.apache.ozone.test.GenericTestUtils;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
@@ -58,6 +65,17 @@ public class TestSCMSafeModeWithPipelineRules {
public void setup(int numDatanodes) throws Exception {
OzoneConfiguration conf = new OzoneConfiguration();
+ configureTestCluster(conf);
+
+ cluster = MiniOzoneCluster.newBuilder(conf)
+ .setNumDatanodes(numDatanodes)
+ .build();
+ cluster.waitForClusterToBeReady();
+ StorageContainerManager scm = cluster.getStorageContainerManager();
+ pipelineManager = scm.getPipelineManager();
+ }
+
+ private static void configureTestCluster(OzoneConfiguration conf) {
conf.setTimeDuration(OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL,
100, TimeUnit.MILLISECONDS);
conf.set(HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT, "10s");
@@ -69,13 +87,6 @@ public void setup(int numDatanodes) throws Exception {
conf.setBoolean(ScmConfigKeys.OZONE_SCM_DATANODE_DISALLOW_SAME_PEERS,
true);
conf.setClass(ScmConfigKeys.OZONE_SCM_CONTAINER_PLACEMENT_IMPL_KEY,
SCMContainerPlacementCapacity.class, PlacementPolicy.class);
-
- cluster = MiniOzoneCluster.newBuilder(conf)
- .setNumDatanodes(numDatanodes)
- .build();
- cluster.waitForClusterToBeReady();
- StorageContainerManager scm = cluster.getStorageContainerManager();
- pipelineManager = scm.getPipelineManager();
}
@Test
@@ -156,6 +167,49 @@ void testScmSafeMode() throws Exception {
GenericTestUtils.waitFor(replicationManager::isRunning, 1000, 60000);
}
+ @Test
+ void testSafeModeExitAfterScmRestartWithMixedEcAndRatisThreeKeys()
+ throws Exception {
+ OzoneConfiguration conf = new OzoneConfiguration();
+ configureTestCluster(conf);
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION_TYPE,
+ HddsProtos.ReplicationType.EC.name());
+ conf.set(OzoneConfigKeys.OZONE_REPLICATION, "rs-3-2-1024k");
+ conf.setBoolean(ScmConfigKeys.OZONE_SCM_PIPELINE_CREATE_RATIS_THREE,
+ true);
+
+ cluster = MiniOzoneCluster.newBuilder(conf)
+ .setNumDatanodes(5)
+ .build();
+ cluster.waitForClusterToBeReady();
+ pipelineManager =
cluster.getStorageContainerManager().getPipelineManager();
+ waitForRatis3NodePipelines(1);
+
+ try (OzoneClient client = cluster.newClient()) {
+ OzoneBucket bucket = TestDataUtil.createVolumeAndBucket(client);
+ TestDataUtil.createKey(bucket, "ec-key",
+ new ECReplicationConfig("rs-3-2-1024k"),
+ "ec-data".getBytes(UTF_8));
+ TestDataUtil.createKey(bucket, "ratis3-key",
+ RatisReplicationConfig.getInstance(ReplicationFactor.THREE),
+ "ratis-data".getBytes(UTF_8));
+ }
+
+ cluster.restartStorageContainerManager(false);
+ SCMSafeModeManager scmSafeModeManager =
+ cluster.getStorageContainerManager().getScmSafeModeManager();
+ final ECMinDataNodeSafeModeRule ecRule = SafeModeRuleFactory.getInstance()
+ .getSafeModeRule(ECMinDataNodeSafeModeRule.class);
+
+ assertTrue(ecRule.isEnabled());
+ ecRule.setValidateBasedOnReportProcessing(false);
+ GenericTestUtils.waitFor(ecRule::validate, 1000, 60000);
+ GenericTestUtils.waitFor(() -> {
+ scmSafeModeManager.refreshAndValidate();
+ return !scmSafeModeManager.getInSafeMode();
+ }, 1000, 60000);
+ }
+
@AfterEach
public void tearDown() {
if (cluster != null) {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]