rakeshadr commented on code in PR #10157:
URL: https://github.com/apache/ozone/pull/10157#discussion_r3371796166


##########
hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java:
##########
@@ -97,8 +101,7 @@ private int getMinHealthyPipelines(ConfigurationSource 
config) {
         HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
         HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
 
-    // We only care about THREE replica pipeline
-    return minDatanodes / HddsProtos.ReplicationFactor.THREE_VALUE;
+    return minDatanodes / targetRequiredNodes;

Review Comment:
   How abt to introduce new EC rule ?
   
     ```
   SafeModeRuleFactory.java
   
     SafeModeExitRule<?> datanodeRule = new DataNodeSafeModeRule(eventQueue,
           config, nodeManager, safeModeManager);
       ECMinDataNodeSafeModeRule ecMinDnRule = new ECMinDataNodeSafeModeRule(
           eventQueue, config, nodeManager, safeModeManager);
       safeModeRules.add(ratisContainerRule);
       safeModeRules.add(ecContainerRule);
       safeModeRules.add(datanodeRule);
       // Only adds a non-trivial gate when default replication is EC;
       // the rule is a no-op (validate() == true) for RATIS-default clusters.
       safeModeRules.add(ecMinDnRule);
   ```
   
   
   ```
   ECMinDataNodeSafeModeRule.java
   
   package org.apache.hadoop.hdds.scm.safemode;
   
   import java.util.HashSet;
   import java.util.Set;
   import org.apache.hadoop.hdds.client.ECReplicationConfig;
   import org.apache.hadoop.hdds.client.ReplicationConfig;
   import org.apache.hadoop.hdds.conf.ConfigurationSource;
   import org.apache.hadoop.hdds.protocol.DatanodeID;
   import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
   import org.apache.hadoop.hdds.scm.events.SCMEvents;
   import org.apache.hadoop.hdds.scm.node.NodeManager;
   import org.apache.hadoop.hdds.scm.node.NodeStatus;
   import 
org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
   import org.apache.hadoop.hdds.server.events.EventQueue;
   import org.apache.hadoop.hdds.server.events.TypedEvent;
   
   /**
    * Safemode exit rule for EC-default clusters.
    *
    * <p>EC pipelines are ephemeral — they are created on-demand and do not
    * survive SCM restarts. Therefore {@link HealthyPipelineSafeModeRule} cannot
    * reliably gate safemode exit for EC. Instead, this rule ensures that at 
least
    * {@code data + parity} healthy DataNodes are registered before SCM exits
    * safemode, guaranteeing that one full EC stripe can be written immediately
    * after exit.
    *
    * <p>The required node count is derived directly from the EC replication
    * config (e.g. RS-3-2 → 5 nodes, RS-6-3 → 9 nodes), independent of
    * {@code hdds.scm.safemode.min.datanode}.
    *
    * <p>When the cluster default replication is <em>not</em> EC, this rule is a
    * no-op ({@link #validate()} always returns {@code true}) and adds no 
overhead.
    */
   public class ECMinDataNodeSafeModeRule
       extends SafeModeExitRule<NodeRegistrationContainerReport> {
   
     private final boolean enabled;
     private final int requiredDns;
     private final String ecConfigLabel;
     private final NodeManager nodeManager;
   
     private final Set<DatanodeID> registeredDnSet;
     private int registeredDns = 0;
   
     public ECMinDataNodeSafeModeRule(EventQueue eventQueue,
         ConfigurationSource conf,
         NodeManager nodeManager,
         SCMSafeModeManager safeModeManager) {
       super(safeModeManager, eventQueue);
       this.nodeManager = nodeManager;
   
       ReplicationConfig defaultConfig = getDefaultReplicationConfig(conf);
       if (defaultConfig != null
           && defaultConfig.getReplicationType() == 
HddsProtos.ReplicationType.EC) {
         ECReplicationConfig ecConfig = (ECReplicationConfig) defaultConfig;
         this.requiredDns = ecConfig.getRequiredNodes(); // data + parity
         this.ecConfigLabel = ecConfig.configFormat();
         this.enabled = true;
         this.registeredDnSet = new HashSet<>(requiredDns * 2);
         SCMSafeModeManager.getLogger().info(
             "ECMinDataNodeSafeModeRule enabled: EC default replication is {}, "
                 + "requiring {} healthy DataNodes before safemode exit.",
             ecConfigLabel, requiredDns);
       } else {
         this.requiredDns = 0;
         this.ecConfigLabel = "";
         this.enabled = false;
         this.registeredDnSet = new HashSet<>(0);
         SCMSafeModeManager.getLogger().debug(
             "ECMinDataNodeSafeModeRule disabled: default replication is not 
EC.");
       }
     }
   
     @Override
     protected TypedEvent<NodeRegistrationContainerReport> getEventType() {
       return SCMEvents.NODE_REGISTRATION_CONT_REPORT;
     }
   
     @Override
     protected boolean validate() {
       if (!enabled) {
         return true;
       }
       if (validateBasedOnReportProcessing()) {
         return registeredDns >= requiredDns;
       }
       return nodeManager.getNodes(NodeStatus.inServiceHealthy()).size() >= 
requiredDns;
     }
   
     @Override
     protected void process(NodeRegistrationContainerReport report) {
       if (!enabled) {
         return;
       }
       DatanodeID dnId = report.getDatanodeDetails().getID();
       if (registeredDnSet.add(dnId)) {
         registeredDns = registeredDnSet.size();
         SCMSafeModeManager.getLogger().info(
             "ECMinDataNodeSafeModeRule: {} of {} required DataNodes registered 
"
                 + "for EC ({}) safemode exit.",
             registeredDns, requiredDns, ecConfigLabel);
       }
     }
   
     @Override
     protected void cleanup() {
       registeredDnSet.clear();
     }
   
     @Override
     public String getStatusText() {
       if (!enabled) {
         return "ECMinDataNodeSafeModeRule is not applicable (default 
replication is not EC)";
       }
       return String.format(
           "EC (%s) safemode: registered DataNodes (=%d) >= required DataNodes 
(=%d)",
           ecConfigLabel, registeredDns, requiredDns);
     }
   
     @Override
     public void refresh(boolean forceRefresh) {
       // Nothing to refresh from SCM DB for this rule.
     }
   
     public int getRequiredDns() {
       return requiredDns;
     }
   
     public int getRegisteredDns() {
       return registeredDns;
     }
   
     public boolean isEnabled() {
       return enabled;
     }
   
     private static ReplicationConfig getDefaultReplicationConfig(
         ConfigurationSource conf) {
       try {
         return ReplicationConfig.getDefault(conf);
       } catch (IllegalArgumentException e) {
         SCMSafeModeManager.getLogger().warn(
             "ECMinDataNodeSafeModeRule: could not parse default replication "
                 + "config, rule will be disabled. Error: {}", e.getMessage());
         return null;
       }
     }
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to