This is an automated email from the ASF dual-hosted git repository.
agupta pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new c7117dcc17 HDDS-11974. Split Container Safemode Rule into Ratis & EC
Container Safemode Rules (#7951)
c7117dcc17 is described below
commit c7117dcc1731a5f8a82fc6f06b99bda9cd6e01c0
Author: Peter Lee <[email protected]>
AuthorDate: Tue Apr 15 16:07:20 2025 +0800
HDDS-11974. Split Container Safemode Rule into Ratis & EC Container
Safemode Rules (#7951)
---
.../hdds/scm/safemode/ContainerSafeModeRule.java | 359 ---------------------
.../hdds/scm/safemode/ECContainerSafeModeRule.java | 249 ++++++++++++++
.../scm/safemode/RatisContainerSafeModeRule.java | 201 ++++++++++++
.../hdds/scm/safemode/SCMSafeModeManager.java | 18 +-
.../hdds/scm/safemode/SafeModeRuleFactory.java | 7 +-
.../hdds/scm/safemode/TestSCMSafeModeManager.java | 29 +-
.../hdds/scm/safemode/TestSafeModeRuleFactory.java | 2 +-
7 files changed, 485 insertions(+), 380 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
deleted file mode 100644
index 4fec2e4a17..0000000000
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hdds.scm.safemode;
-
-import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT;
-import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Sets;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.UUID;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.stream.Collectors;
-import org.apache.hadoop.hdds.client.ReplicationConfig;
-import org.apache.hadoop.hdds.conf.ConfigurationSource;
-import org.apache.hadoop.hdds.protocol.DatanodeDetails;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
-import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
-import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
-import org.apache.hadoop.hdds.scm.container.ContainerID;
-import org.apache.hadoop.hdds.scm.container.ContainerInfo;
-import org.apache.hadoop.hdds.scm.container.ContainerManager;
-import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
-import org.apache.hadoop.hdds.scm.events.SCMEvents;
-import
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
-import org.apache.hadoop.hdds.server.events.EventQueue;
-import org.apache.hadoop.hdds.server.events.TypedEvent;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Class defining Safe mode exit criteria for Containers.
- */
-public class ContainerSafeModeRule extends
- SafeModeExitRule<NodeRegistrationContainerReport> {
-
- private static final Logger LOG =
LoggerFactory.getLogger(ContainerSafeModeRule.class);
-
- private static final String NAME = "ContainerSafeModeRule";
-
- private final ContainerManager containerManager;
- // Required cutoff % for containers with at least 1 reported replica.
- private final double safeModeCutoff;
- // Containers read from scm db (excluding containers in ALLOCATED state).
- private final Set<Long> ratisContainers;
- private final Set<Long> ecContainers;
- private final Map<Long, Set<UUID>> ecContainerDNsMap;
- private final AtomicLong ratisContainerWithMinReplicas = new AtomicLong(0);
- private final AtomicLong ecContainerWithMinReplicas = new AtomicLong(0);
-
- private double ratisMaxContainer;
- private double ecMaxContainer;
-
- public ContainerSafeModeRule(final EventQueue eventQueue,
- final ConfigurationSource conf,
- final ContainerManager containerManager,
- final SCMSafeModeManager manager) {
- super(manager, NAME, eventQueue);
- this.safeModeCutoff = getSafeModeCutoff(conf);
- this.containerManager = containerManager;
- this.ratisContainers = new HashSet<>();
- this.ecContainers = new HashSet<>();
- this.ecContainerDNsMap = new ConcurrentHashMap<>();
- initializeRule();
- }
-
-
- private static double getSafeModeCutoff(ConfigurationSource conf) {
- final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
- HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
- Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0),
- HDDS_SCM_SAFEMODE_THRESHOLD_PCT +
- " value should be >= 0.0 and <= 1.0");
- return cutoff;
- }
-
- @Override
- protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
- return SCMEvents.CONTAINER_REGISTRATION_REPORT;
- }
-
- @Override
- protected synchronized boolean validate() {
- if (validateBasedOnReportProcessing()) {
- return (getCurrentContainerThreshold() >= safeModeCutoff) &&
- (getCurrentECContainerThreshold() >= safeModeCutoff);
- }
-
- // TODO: Split ContainerSafeModeRule into RatisContainerSafeModeRule and
- // ECContainerSafeModeRule
- final List<ContainerInfo> containers = containerManager.getContainers(
- ReplicationType.RATIS);
-
- return containers.stream()
- .filter(this::isClosed)
- .map(ContainerInfo::containerID)
- .noneMatch(this::isMissing);
- }
-
- /**
- * Checks if the container has any replica.
- */
- private boolean isMissing(ContainerID id) {
- try {
- return containerManager.getContainerReplicas(id).isEmpty();
- } catch (ContainerNotFoundException ex) {
- /*
- * This should never happen, in case this happens the container
- * somehow got removed from SCM.
- * Safemode rule doesn't have to log/fix this. We will just exclude this
- * from the rule validation.
- */
- return false;
-
- }
- }
-
- @VisibleForTesting
- public double getCurrentContainerThreshold() {
- return ratisMaxContainer == 0 ? 1 :
- (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer);
- }
-
- @VisibleForTesting
- public double getCurrentECContainerThreshold() {
- return ecMaxContainer == 0 ? 1 :
- (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer);
- }
-
-
- // TODO: Report processing logic will be removed in future. HDDS-11958.
- @Override
- protected synchronized void process(
- final NodeRegistrationContainerReport reportsProto) {
- final DatanodeDetails datanodeDetails = reportsProto.getDatanodeDetails();
- final UUID datanodeUUID = datanodeDetails.getUuid();
- StorageContainerDatanodeProtocolProtos.ContainerReportsProto report =
reportsProto.getReport();
-
- report.getReportsList().forEach(c -> {
- long containerID = c.getContainerID();
-
-
- // If it is a Ratis container.
- if (ratisContainers.contains(containerID)) {
- recordReportedContainer(containerID, Boolean.FALSE);
- ratisContainers.remove(containerID);
- }
-
- // If it is an EC container.
- if (ecContainers.contains(containerID)) {
- putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID);
- recordReportedContainer(containerID, Boolean.TRUE);
- }
- });
-
- if (scmInSafeMode()) {
- SCMSafeModeManager.getLogger().info(
- "SCM in safe mode. {} % containers [Ratis] have at least one"
- + " reported replica, {} % containers [EC] have at N reported
replica.",
- getCurrentContainerThreshold() * 100,
getCurrentECContainerThreshold() * 100);
- }
- }
-
- /**
- * Record the reported Container.
- *
- * We will differentiate and count according to the type of Container.
- *
- * @param containerID containerID
- * @param isEcContainer true, means ECContainer, false, means not
ECContainer.
- */
- private void recordReportedContainer(long containerID, boolean
isEcContainer) {
-
- int uuids = 1;
- if (isEcContainer && ecContainerDNsMap.containsKey(containerID)) {
- uuids = ecContainerDNsMap.get(containerID).size();
- }
-
- int minReplica = getMinReplica(containerID);
- if (uuids >= minReplica) {
- if (isEcContainer) {
- getSafeModeMetrics()
- .incCurrentContainersWithECDataReplicaReportedCount();
- ecContainerWithMinReplicas.getAndAdd(1);
- } else {
- ratisContainerWithMinReplicas.getAndAdd(1);
- getSafeModeMetrics()
- .incCurrentContainersWithOneReplicaReportedCount();
- }
- }
- }
-
- /**
- * Get the minimum replica.
- *
- * If it is a Ratis Contianer, the minimum copy is 1.
- * If it is an EC Container, the minimum copy will be the number of Data in
replicationConfig.
- *
- * @param pContainerID containerID
- * @return MinReplica.
- */
- private int getMinReplica(long pContainerID) {
-
- try {
- ContainerID containerID = ContainerID.valueOf(pContainerID);
- ContainerInfo container = containerManager.getContainer(containerID);
- ReplicationConfig replicationConfig = container.getReplicationConfig();
- return replicationConfig.getMinimumNodes();
- } catch (ContainerNotFoundException e) {
- LOG.error("containerId = {} not found.", pContainerID, e);
- } catch (Exception e) {
- LOG.error("containerId = {} not found.", pContainerID, e);
- }
-
- return 1;
- }
-
- private void putInContainerDNsMap(long containerID, Map<Long, Set<UUID>>
containerDNsMap,
- UUID datanodeUUID) {
- containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet());
- containerDNsMap.get(containerID).add(datanodeUUID);
- }
-
- @Override
- protected synchronized void cleanup() {
- ratisContainers.clear();
- ecContainers.clear();
- ecContainerDNsMap.clear();
- }
-
- @Override
- public String getStatusText() {
-
- // ratis container
- String status = String.format(
- "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported
replica (=%1.2f) >= " +
- "safeModeCutoff (=%1.2f);",
- getCurrentContainerThreshold() * 100,
- ratisContainerWithMinReplicas, (long) ratisMaxContainer,
- getCurrentContainerThreshold(), this.safeModeCutoff);
-
- Set<Long> sampleRatisContainers = ratisContainers.stream().
- limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
- collect(Collectors.toSet());
-
- if (!sampleRatisContainers.isEmpty()) {
- String sampleContainerText =
- "Sample Ratis Containers not satisfying the criteria : " +
sampleRatisContainers + ";";
- status = status.concat("\n").concat(sampleContainerText);
- }
-
- // ec container
- String ecStatus = String.format(
- "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica
(=%1.2f) >= " +
- "safeModeCutoff (=%1.2f);",
- getCurrentECContainerThreshold() * 100,
- ecContainerWithMinReplicas, (long) ecMaxContainer,
- getCurrentECContainerThreshold(), this.safeModeCutoff);
- status = status.concat("\n").concat(ecStatus);
-
- Set<Long> sampleEcContainers = ecContainerDNsMap.entrySet().stream().
- filter(entry -> {
- Long containerId = entry.getKey();
- int minReplica = getMinReplica(containerId);
- Set<UUID> allReplicas = entry.getValue();
- if (allReplicas.size() >= minReplica) {
- return false;
- }
- return true;
- }).
- map(Map.Entry::getKey).
- limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).
- collect(Collectors.toSet());
-
- if (!sampleEcContainers.isEmpty()) {
- String sampleECContainerText =
- "Sample EC Containers not satisfying the criteria : " +
sampleEcContainers + ";";
- status = status.concat("\n").concat(sampleECContainerText);
- }
-
- return status;
- }
-
-
- @Override
- public synchronized void refresh(boolean forceRefresh) {
- if (forceRefresh || !validate()) {
- initializeRule();
- }
- }
-
- private boolean isClosed(ContainerInfo container) {
- final LifeCycleState state = container.getState();
- return state == LifeCycleState.QUASI_CLOSED ||
- state == LifeCycleState.CLOSED;
- }
-
- private void initializeRule() {
- final List<ContainerInfo> containers = containerManager.getContainers();
- // Clean up the related data in the map.
- ratisContainers.clear();
- ecContainers.clear();
-
- // Iterate through the container list to
- // get the minimum replica count for each container.
- containers.forEach(container -> {
- // There can be containers in OPEN/CLOSING state which were never
- // created by the client. We are not considering these containers for
- // now. These containers can be handled by tracking pipelines.
-
- HddsProtos.ReplicationType replicationType =
container.getReplicationType();
-
- if (isClosed(container) && container.getNumberOfKeys() > 0) {
- // If it's of type Ratis
- if (replicationType.equals(HddsProtos.ReplicationType.RATIS)) {
- ratisContainers.add(container.getContainerID());
- }
-
- // If it's of type EC
- if (replicationType.equals(HddsProtos.ReplicationType.EC)) {
- ecContainers.add(container.getContainerID());
- }
- }
- });
-
- ratisMaxContainer = ratisContainers.size();
- ecMaxContainer = ecContainers.size();
-
- long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff);
- long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff);
-
-
getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff);
-
getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff);
-
- LOG.info("Refreshed Containers with one replica threshold count {}, " +
- "with ec n replica threshold count {}.", ratisCutOff, ecCutOff);
- }
-}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java
new file mode 100644
index 0000000000..a6c30aace9
--- /dev/null
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ECContainerSafeModeRule.java
@@ -0,0 +1,249 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.safemode;
+
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Sets;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import org.apache.hadoop.hdds.client.ReplicationConfig;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.hdds.server.events.TypedEvent;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Safe mode exit rule for EC containers.
+ *
+ * <p>The rule consumes datanode registration container reports and records,
+ * for every closed (CLOSED or QUASI_CLOSED) EC container holding at least one
+ * key, which datanodes have reported a replica. A container is counted exactly
+ * once, when the number of distinct reporting datanodes reaches the minimum
+ * node count of its replication config. While validation is based on report
+ * processing, the rule is satisfied when the fraction of counted containers
+ * reaches the configured safe mode cutoff; otherwise the replicas known to the
+ * {@link ContainerManager} are inspected directly.
+ *
+ * <p>All methods that read or modify the tracked container set are
+ * synchronized on this rule, because the set is a plain {@link HashSet}.
+ */
+public class ECContainerSafeModeRule extends SafeModeExitRule<NodeRegistrationContainerReport> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ECContainerSafeModeRule.class);
+  private static final String NAME = "ECContainerSafeModeRule";
+  // Fallback used when the replication config of a container cannot be read.
+  private static final int DEFAULT_MIN_REPLICA = 1;
+
+  private final ContainerManager containerManager;
+  // Required cutoff (fraction in [0.0, 1.0]) of containers with enough reported replicas.
+  private final double safeModeCutoff;
+  // Closed, non-empty EC containers that have not been counted yet.
+  private final Set<Long> ecContainers;
+  // Datanodes that reported a replica, keyed by container id.
+  private final Map<Long, Set<UUID>> ecContainerDNsMap;
+  // Number of containers that reached their minimum number of reporting datanodes.
+  private final AtomicLong ecContainerWithMinReplicas;
+  // Number of containers selected by the last initializeRule() call.
+  private double ecMaxContainer;
+
+  public ECContainerSafeModeRule(EventQueue eventQueue,
+      ConfigurationSource conf,
+      ContainerManager containerManager,
+      SCMSafeModeManager manager) {
+    super(manager, NAME, eventQueue);
+    this.safeModeCutoff = getSafeModeCutoff(conf);
+    this.containerManager = containerManager;
+    this.ecContainers = new HashSet<>();
+    this.ecContainerDNsMap = new ConcurrentHashMap<>();
+    this.ecContainerWithMinReplicas = new AtomicLong(0);
+    initializeRule();
+  }
+
+  /**
+   * Reads the safe mode cutoff from the configuration.
+   *
+   * @param conf configuration source
+   * @return the configured cutoff
+   * @throws IllegalArgumentException if the value is outside [0.0, 1.0]
+   */
+  private static double getSafeModeCutoff(ConfigurationSource conf) {
+    final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
+    Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0),
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0");
+    return cutoff;
+  }
+
+  @Override
+  protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
+    return SCMEvents.CONTAINER_REGISTRATION_REPORT;
+  }
+
+  @Override
+  protected synchronized boolean validate() {
+    if (validateBasedOnReportProcessing()) {
+      return getCurrentContainerThreshold() >= safeModeCutoff;
+    }
+
+    // Not report based: check the replicas currently known to the container manager.
+    final List<ContainerInfo> containers = containerManager.getContainers(
+        ReplicationType.EC);
+
+    return containers.stream()
+        .filter(this::isClosed)
+        .map(ContainerInfo::containerID)
+        .noneMatch(this::isMissing);
+  }
+
+  /**
+   * Checks if the container has fewer replicas than the minimum required.
+   */
+  private boolean isMissing(ContainerID id) {
+    try {
+      int minReplica = getMinReplica(id.getId());
+      return containerManager.getContainerReplicas(id).size() < minReplica;
+    } catch (ContainerNotFoundException ex) {
+      /*
+       * This should never happen, in case this happens the container
+       * somehow got removed from SCM.
+       * Safemode rule doesn't have to log/fix this. We will just exclude this
+       * from the rule validation.
+       */
+      return false;
+    }
+  }
+
+  /**
+   * Returns the fraction of tracked EC containers that reached their minimum
+   * number of reporting datanodes, or 1 when no container is tracked.
+   */
+  @VisibleForTesting
+  public double getCurrentContainerThreshold() {
+    return ecMaxContainer == 0 ? 1 : (ecContainerWithMinReplicas.doubleValue() / ecMaxContainer);
+  }
+
+  /**
+   * Get the minimum replica.
+   *
+   * @param pContainerID containerID
+   * @return the minimum node count of the container's replication config, or
+   *     {@link #DEFAULT_MIN_REPLICA} if the lookup fails
+   */
+  private int getMinReplica(long pContainerID) {
+    try {
+      ContainerID containerID = ContainerID.valueOf(pContainerID);
+      ContainerInfo container = containerManager.getContainer(containerID);
+      ReplicationConfig replicationConfig = container.getReplicationConfig();
+      return replicationConfig.getMinimumNodes();
+    } catch (Exception e) {
+      LOG.error("containerId = {} not found.", pContainerID, e);
+    }
+
+    return DEFAULT_MIN_REPLICA;
+  }
+
+  /**
+   * Records the reporting datanode for every tracked container in the report.
+   *
+   * <p>Synchronized, like {@link #validate()} and {@link #refresh(boolean)},
+   * because the tracked container set is not thread-safe.
+   *
+   * @param report registration container report of a datanode
+   */
+  @Override
+  protected synchronized void process(NodeRegistrationContainerReport report) {
+    DatanodeDetails datanodeDetails = report.getDatanodeDetails();
+    UUID datanodeUUID = datanodeDetails.getUuid();
+
+    report.getReport().getReportsList().forEach(c -> {
+      long containerID = c.getContainerID();
+      if (ecContainers.contains(containerID)) {
+        putInContainerDNsMap(containerID, ecContainerDNsMap, datanodeUUID);
+        recordReportedContainer(containerID);
+      }
+    });
+
+    if (scmInSafeMode()) {
+      SCMSafeModeManager.getLogger().info(
+          "SCM in safe mode. {} % containers [EC] have at N reported replica",
+          getCurrentContainerThreshold() * 100);
+    }
+  }
+
+  private void putInContainerDNsMap(long containerID,
+      Map<Long, Set<UUID>> containerDNsMap,
+      UUID datanodeUUID) {
+    containerDNsMap.computeIfAbsent(containerID, key -> Sets.newHashSet()).add(datanodeUUID);
+  }
+
+  /**
+   * Record the reported Container.
+   *
+   * <p>Once enough distinct datanodes have reported the container it is
+   * counted and removed from the tracked set, so that reports from further
+   * datanodes cannot count the same container again.
+   *
+   * @param containerID containerID
+   */
+  private void recordReportedContainer(long containerID) {
+
+    int uuids = 1;
+    if (ecContainerDNsMap.containsKey(containerID)) {
+      uuids = ecContainerDNsMap.get(containerID).size();
+    }
+
+    int minReplica = getMinReplica(containerID);
+    if (uuids >= minReplica) {
+      getSafeModeMetrics()
+          .incCurrentContainersWithECDataReplicaReportedCount();
+      ecContainerWithMinReplicas.getAndAdd(1);
+      // Stop tracking: the container has been counted exactly once.
+      ecContainers.remove(containerID);
+    }
+  }
+
+  /**
+   * Rebuilds the set of tracked containers from the container manager and
+   * publishes the resulting threshold count to the safe mode metrics.
+   */
+  private void initializeRule() {
+    ecContainers.clear();
+    ecContainerDNsMap.clear();
+    containerManager.getContainers(ReplicationType.EC).stream()
+        .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0)
+        .map(ContainerInfo::getContainerID).forEach(ecContainers::add);
+    ecMaxContainer = ecContainers.size();
+    long ecCutOff = (long) Math.ceil(ecMaxContainer * safeModeCutoff);
+    getSafeModeMetrics().setNumContainerWithECDataReplicaReportedThreshold(ecCutOff);
+
+    LOG.info("Refreshed Containers with ec n replica threshold count {}.", ecCutOff);
+  }
+
+  private boolean isClosed(ContainerInfo container) {
+    final LifeCycleState state = container.getState();
+    return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED;
+  }
+
+  @Override
+  public String getStatusText() {
+    String status = String.format(
+        "%1.2f%% of [EC] Containers(%s / %s) with at least N reported replica (=%1.2f) >= " +
+        "safeModeCutoff (=%1.2f);",
+        getCurrentContainerThreshold() * 100,
+        ecContainerWithMinReplicas, (long) ecMaxContainer,
+        getCurrentContainerThreshold(), this.safeModeCutoff);
+
+    // Sample of reported containers that still lack reporting datanodes.
+    Set<Long> sampleEcContainers = ecContainerDNsMap.entrySet().stream().filter(entry -> {
+      Long containerId = entry.getKey();
+      int minReplica = getMinReplica(containerId);
+      Set<UUID> allReplicas = entry.getValue();
+      return allReplicas.size() < minReplica;
+    }).map(Map.Entry::getKey).limit(SAMPLE_CONTAINER_DISPLAY_LIMIT).collect(Collectors.toSet());
+
+    if (!sampleEcContainers.isEmpty()) {
+      String sampleECContainerText = "Sample EC Containers not satisfying the criteria : " + sampleEcContainers + ";";
+      status = status.concat("\n").concat(sampleECContainerText);
+    }
+
+    return status;
+  }
+
+  @Override
+  public synchronized void refresh(boolean forceRefresh) {
+    if (forceRefresh || !validate()) {
+      initializeRule();
+    }
+  }
+
+  /**
+   * Drops all tracking state. Synchronized for the same reason as
+   * {@link #process(NodeRegistrationContainerReport)}.
+   */
+  @Override
+  protected synchronized void cleanup() {
+    ecContainers.clear();
+    ecContainerDNsMap.clear();
+  }
+}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java
new file mode 100644
index 0000000000..4015fd81b4
--- /dev/null
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/RatisContainerSafeModeRule.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdds.scm.safemode;
+
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
+import org.apache.hadoop.hdds.conf.ConfigurationSource;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.hdds.server.events.TypedEvent;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Class defining Safe mode exit criteria for Ratis Containers.
+ *
+ * <p>The rule consumes datanode registration container reports and counts
+ * every closed (CLOSED or QUASI_CLOSED) Ratis container holding at least one
+ * key the first time any datanode reports it. While validation is based on
+ * report processing, the rule is satisfied when the fraction of counted
+ * containers reaches the configured safe mode cutoff; otherwise the replicas
+ * known to the {@link ContainerManager} are inspected directly.
+ *
+ * <p>All methods that read or modify the tracked container set are
+ * synchronized on this rule, because the set is a plain {@link HashSet}.
+ */
+public class RatisContainerSafeModeRule extends SafeModeExitRule<NodeRegistrationContainerReport> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(RatisContainerSafeModeRule.class);
+  private static final String NAME = "RatisContainerSafeModeRule";
+
+  private final ContainerManager containerManager;
+  // Required cutoff % for containers with at least 1 reported replica.
+  private final double safeModeCutoff;
+  // Containers read from scm db (excluding containers in ALLOCATED state).
+  private final Set<Long> ratisContainers;
+  // Number of containers for which at least one replica has been reported.
+  private final AtomicLong ratisContainerWithMinReplicas;
+  // Number of containers selected by the last initializeRule() call.
+  private double ratisMaxContainer;
+
+  public RatisContainerSafeModeRule(EventQueue eventQueue,
+      ConfigurationSource conf,
+      ContainerManager containerManager,
+      SCMSafeModeManager manager) {
+    super(manager, NAME, eventQueue);
+    this.safeModeCutoff = getSafeModeCutoff(conf);
+    this.containerManager = containerManager;
+    this.ratisContainers = new HashSet<>();
+    this.ratisContainerWithMinReplicas = new AtomicLong(0);
+    initializeRule();
+  }
+
+  /**
+   * Reads the safe mode cutoff from the configuration.
+   *
+   * @param conf configuration source
+   * @return the configured cutoff
+   * @throws IllegalArgumentException if the value is outside [0.0, 1.0]
+   */
+  private static double getSafeModeCutoff(ConfigurationSource conf) {
+    final double cutoff = conf.getDouble(HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
+    Preconditions.checkArgument((cutoff >= 0.0 && cutoff <= 1.0),
+        HDDS_SCM_SAFEMODE_THRESHOLD_PCT + " value should be >= 0.0 and <= 1.0");
+    return cutoff;
+  }
+
+  @Override
+  protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
+    return SCMEvents.CONTAINER_REGISTRATION_REPORT;
+  }
+
+  @Override
+  protected synchronized boolean validate() {
+    if (validateBasedOnReportProcessing()) {
+      return (getCurrentContainerThreshold() >= safeModeCutoff);
+    }
+
+    // Not report based: check the replicas currently known to the container manager.
+    final List<ContainerInfo> containers = containerManager.getContainers(
+        ReplicationType.RATIS);
+
+    return containers.stream()
+        .filter(this::isClosed)
+        .map(ContainerInfo::containerID)
+        .noneMatch(this::isMissing);
+  }
+
+  /**
+   * Checks if the container has no replica at all.
+   */
+  private boolean isMissing(ContainerID id) {
+    try {
+      return containerManager.getContainerReplicas(id).isEmpty();
+    } catch (ContainerNotFoundException ex) {
+      /*
+       * This should never happen, in case this happens the container
+       * somehow got removed from SCM.
+       * Safemode rule doesn't have to log/fix this. We will just exclude this
+       * from the rule validation.
+       */
+      return false;
+    }
+  }
+
+  /**
+   * Returns the fraction of tracked Ratis containers with at least one
+   * reported replica, or 1 when no container is tracked.
+   */
+  @VisibleForTesting
+  public double getCurrentContainerThreshold() {
+    return ratisMaxContainer == 0 ? 1 : (ratisContainerWithMinReplicas.doubleValue() / ratisMaxContainer);
+  }
+
+  /**
+   * Counts every tracked container that appears in the report.
+   *
+   * <p>Synchronized, like {@link #validate()} and {@link #refresh(boolean)},
+   * because the tracked container set is not thread-safe.
+   *
+   * @param report registration container report of a datanode
+   */
+  @Override
+  protected synchronized void process(NodeRegistrationContainerReport report) {
+    report.getReport().getReportsList().forEach(c -> {
+      long containerID = c.getContainerID();
+      // remove() is true only while the container is still pending, so each
+      // container is counted once no matter how many datanodes report it.
+      if (ratisContainers.remove(containerID)) {
+        recordReportedContainer(containerID);
+      }
+    });
+
+    if (scmInSafeMode()) {
+      SCMSafeModeManager.getLogger().info(
+          "SCM in safe mode. {} % containers [Ratis] have at least one reported replica",
+          String.format("%.2f", getCurrentContainerThreshold() * 100));
+    }
+  }
+
+  /**
+   * Record the reported Container.
+   *
+   * @param containerID containerID
+   */
+  private void recordReportedContainer(long containerID) {
+    ratisContainerWithMinReplicas.getAndAdd(1);
+    getSafeModeMetrics()
+        .incCurrentContainersWithOneReplicaReportedCount();
+  }
+
+  /**
+   * Rebuilds the set of tracked containers from the container manager and
+   * publishes the resulting threshold count to the safe mode metrics.
+   */
+  private void initializeRule() {
+    ratisContainers.clear();
+    containerManager.getContainers(ReplicationType.RATIS).stream()
+        .filter(this::isClosed).filter(c -> c.getNumberOfKeys() > 0)
+        .map(ContainerInfo::getContainerID).forEach(ratisContainers::add);
+    ratisMaxContainer = ratisContainers.size();
+    long ratisCutOff = (long) Math.ceil(ratisMaxContainer * safeModeCutoff);
+    getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(ratisCutOff);
+
+    LOG.info("Refreshed Containers with one replica threshold count {}.", ratisCutOff);
+  }
+
+  private boolean isClosed(ContainerInfo container) {
+    final LifeCycleState state = container.getState();
+    return state == LifeCycleState.QUASI_CLOSED || state == LifeCycleState.CLOSED;
+  }
+
+  @Override
+  public String getStatusText() {
+    String status = String.format(
+        "%1.2f%% of [Ratis] Containers(%s / %s) with at least one reported replica (=%1.2f) >= " +
+        "safeModeCutoff (=%1.2f);",
+        getCurrentContainerThreshold() * 100,
+        ratisContainerWithMinReplicas, (long) ratisMaxContainer,
+        getCurrentContainerThreshold(), this.safeModeCutoff);
+
+    // Sample of containers for which no replica has been reported yet.
+    Set<Long> sampleRatisContainers = ratisContainers.stream().limit(SAMPLE_CONTAINER_DISPLAY_LIMIT)
+        .collect(Collectors.toSet());
+
+    if (!sampleRatisContainers.isEmpty()) {
+      String sampleContainerText = "Sample Ratis Containers not satisfying the criteria : " + sampleRatisContainers
+          + ";";
+      status = status.concat("\n").concat(sampleContainerText);
+    }
+
+    return status;
+  }
+
+  @Override
+  public synchronized void refresh(boolean forceRefresh) {
+    if (forceRefresh || !validate()) {
+      initializeRule();
+    }
+  }
+
+  /**
+   * Drops all tracking state. Synchronized for the same reason as
+   * {@link #process(NodeRegistrationContainerReport)}.
+   */
+  @Override
+  protected synchronized void cleanup() {
+    ratisContainers.clear();
+  }
+}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 6378459925..9dd79ca815 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -91,8 +91,9 @@ public class SCMSafeModeManager implements SafeModeManager {
private Map<String, SafeModeExitRule> exitRules = new HashMap<>(1);
private Set<String> preCheckRules = new HashSet<>(1);
private ConfigurationSource config;
+ private static final String RATIS_CONTAINER_EXIT_RULE =
"RatisContainerSafeModeRule";
+ private static final String EC_CONTAINER_EXIT_RULE =
"ECContainerSafeModeRule";
private static final String DN_EXIT_RULE = "DataNodeSafeModeRule";
- private static final String CONT_EXIT_RULE = "ContainerSafeModeRule";
private static final String HEALTHY_PIPELINE_EXIT_RULE =
"HealthyPipelineSafeModeRule";
private static final String ATLEAST_ONE_DATANODE_REPORTED_PIPELINE_EXIT_RULE =
@@ -319,19 +320,24 @@ public static Logger getLogger() {
@VisibleForTesting
public double getCurrentContainerThreshold() {
- return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE))
+ return ((RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE))
.getCurrentContainerThreshold();
}
@VisibleForTesting
public double getCurrentECContainerThreshold() {
- return ((ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE))
- .getCurrentECContainerThreshold();
+ return ((ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE))
+ .getCurrentContainerThreshold();
+ }
+
+ @VisibleForTesting
+ public RatisContainerSafeModeRule getRatisContainerSafeModeRule() {
+ return (RatisContainerSafeModeRule) exitRules.get(RATIS_CONTAINER_EXIT_RULE);
}
@VisibleForTesting
- public ContainerSafeModeRule getContainerSafeModeRule() {
- return (ContainerSafeModeRule) exitRules.get(CONT_EXIT_RULE);
+ public ECContainerSafeModeRule getECContainerSafeModeRule() {
+ return (ECContainerSafeModeRule) exitRules.get(EC_CONTAINER_EXIT_RULE);
}
@VisibleForTesting
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
index 65be38ae6e..96c22d06e7 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeRuleFactory.java
@@ -68,12 +68,15 @@ private SafeModeRuleFactory(final ConfigurationSource config,
private void loadRules() {
// TODO: Use annotation to load the rules. (HDDS-11730)
- SafeModeExitRule<?> containerRule = new ContainerSafeModeRule(eventQueue,
+ SafeModeExitRule<?> ratisContainerRule = new RatisContainerSafeModeRule(eventQueue,
+ config, containerManager, safeModeManager);
+ SafeModeExitRule<?> ecContainerRule = new ECContainerSafeModeRule(eventQueue,
config, containerManager, safeModeManager);
SafeModeExitRule<?> datanodeRule = new DataNodeSafeModeRule(eventQueue,
config, nodeManager, safeModeManager);
- safeModeRules.add(containerRule);
+ safeModeRules.add(ratisContainerRule);
+ safeModeRules.add(ecContainerRule);
safeModeRules.add(datanodeRule);
preCheckRules.add(datanodeRule);
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index f693df81dc..7a43792eb5 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -44,6 +44,7 @@
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationType;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.scm.HddsTestUtils;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
@@ -130,7 +131,7 @@ private void testSafeMode(int numContainers) throws Exception {
container.setNumberOfKeys(10);
}
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
config, containerManager, null, null, queue,
serviceManager, scmContext);
@@ -169,7 +170,7 @@ public void testSafeModeExitRule() throws Exception {
container.setNumberOfKeys(10);
}
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
config, containerManager, null, null, queue,
serviceManager, scmContext);
@@ -235,7 +236,7 @@ public void testHealthyPipelinePercentWithIncorrectValue(double healthyPercent,
serviceManager,
Clock.system(ZoneOffset.UTC));
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
IllegalArgumentException exception =
assertThrows(IllegalArgumentException.class,
() -> new SCMSafeModeManager(conf, containerManager,
pipelineManager, mockNodeManager, queue, serviceManager,
scmContext));
@@ -301,7 +302,7 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(
}
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
conf, containerManager, pipelineManager, mockNodeManager, queue,
@@ -438,8 +439,8 @@ public void testDisableSafeMode() {
conf.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED, false);
PipelineManager pipelineManager = mock(PipelineManager.class);
ContainerManager containerManager = mock(ContainerManager.class);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
NodeManager nodeManager = mock(SCMNodeManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
conf, containerManager, pipelineManager, nodeManager, queue,
serviceManager, scmContext);
@@ -480,7 +481,7 @@ public void testContainerSafeModeRule() throws Exception {
}
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
config, containerManager, null, null, queue, serviceManager,
scmContext);
@@ -572,16 +573,20 @@ public void testContainerSafeModeRuleEC(int data, int parity) throws Exception {
// the threshold will reach 100%.
testECContainerThreshold(containers.subList(10, 20), 1.0, data);
- ContainerSafeModeRule containerSafeModeRule =
- scmSafeModeManager.getContainerSafeModeRule();
- assertTrue(containerSafeModeRule.validate());
+ ECContainerSafeModeRule ecContainerSafeModeRule =
+ scmSafeModeManager.getECContainerSafeModeRule();
+ assertTrue(ecContainerSafeModeRule.validate());
+
+ RatisContainerSafeModeRule ratisContainerSafeModeRule =
+ scmSafeModeManager.getRatisContainerSafeModeRule();
+ assertTrue(ratisContainerSafeModeRule.validate());
}
private void testSafeModeDataNodes(int numOfDns) throws Exception {
OzoneConfiguration conf = new OzoneConfiguration(config);
conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns);
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
conf, containerManager, null, null, queue,
serviceManager, scmContext);
@@ -689,7 +694,7 @@ public void testSafeModePipelineExitRule() throws Exception {
pipeline = pipelineManager.getPipeline(pipeline.getId());
MockRatisPipelineProvider.markPipelineHealthy(pipeline);
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
config, containerManager, pipelineManager, nodeManager, queue,
@@ -738,7 +743,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
mockRatisProvider);
ContainerManager containerManager = mock(ContainerManager.class);
- when(containerManager.getContainers()).thenReturn(containers);
+ when(containerManager.getContainers(ReplicationType.RATIS)).thenReturn(containers);
scmSafeModeManager = new SCMSafeModeManager(
config, containerManager, pipelineManager, nodeManager, queue,
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
index 1868d9e3cb..35917039d5 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
@@ -55,7 +55,7 @@ public void testLoadedSafeModeRules() {
// as the rules are hardcoded in SafeModeRuleFactory.
// This will be fixed once we load rules using annotation.
- assertEquals(4, factory.getSafeModeRules().size(),
+ assertEquals(5, factory.getSafeModeRules().size(),
"The total safemode rules count doesn't match");
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]