This is an automated email from the ASF dual-hosted git repository.

sumitagrawl pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 15216f2c7d5 HDDS-14868. refreshAndValidate ContainerSafemodeRule 
periodically, not on each applyTransaction (#9953)
15216f2c7d5 is described below

commit 15216f2c7d578ceb69a55e8071e0f3925d0c38e0
Author: Sadanand Shenoy <[email protected]>
AuthorDate: Tue Jun 16 09:15:18 2026 +0530

    HDDS-14868. refreshAndValidate ContainerSafemodeRule periodically, not on 
each applyTransaction (#9953)
---
 .../org/apache/hadoop/hdds/HddsConfigKeys.java     |  8 +++
 .../common/src/main/resources/ozone-default.xml    |  7 +++
 .../apache/hadoop/hdds/scm/ha/SCMStateMachine.java |  6 --
 .../hdds/scm/safemode/SCMSafeModeManager.java      | 38 ++++++++++++
 .../AbstractContainerSafeModeRuleTest.java         |  7 +++
 .../hdds/scm/safemode/TestSCMSafeModeManager.java  | 71 ++++++++++++++++++++++
 .../hdds/scm/safemode/TestSafeModeRuleFactory.java |  5 +-
 .../client/rpc/TestContainerStateMachine.java      |  2 +
 .../apache/hadoop/ozone/om/TestScmSafeMode.java    | 24 ++++++++
 9 files changed, 161 insertions(+), 7 deletions(-)

diff --git 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
index b321d5cc260..8e9d4e9e098 100644
--- 
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
+++ 
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
@@ -116,6 +116,14 @@ public final class HddsConfigKeys {
       "hdds.scm.safemode.log.interval";
   public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m";
 
+  /**
+   * Interval for the background refresh of safe mode rules. A value of 0
+   * disables the background refresh thread.
+   */
+  public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL =
+      "hdds.scm.safemode.rule.refresh.interval";
+  public static final String
+      HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s";
+
   // This configuration setting is used as a fallback location by all
   // Ozone/HDDS services for their metadata. It is useful as a single
   // config point for test/PoC clusters.
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml 
b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 7e4086d7e98..ac715a108a6 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -1717,6 +1717,13 @@
       reported replica before SCM comes out of safe mode.
     </description>
   </property>
+  <property>
+    <name>hdds.scm.safemode.rule.refresh.interval</name>
+    <value>5s</value>
+    <tag>HDDS,SCM,OPERATION</tag>
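+    <description>Interval at which SCM refreshes and validates its safe mode
+      rules in the background while it is in safe mode. A value of 0 disables
+      the background refresh thread.
+    </description>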
+  </property>
 
   <property>
     <name>hdds.scm.wait.time.after.safemode.exit</name>
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
index 267ddc6c5b5..f6295fa7a7c 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
@@ -172,12 +172,6 @@ public CompletableFuture<Message> applyTransaction(
         // Ratis client, leaving SCM intact.
         applyTransactionFuture.completeExceptionally(ex);
       }
-
-      // After previous term transactions are applied, still in safe mode,
-      // perform refreshAndValidate to update the safemode rule state.
-      if (scm.isInSafeMode() && isStateMachineReady.get()) {
-        scm.getScmSafeModeManager().refreshAndValidate();
-      }
       final TermIndex appliedTermIndex = TermIndex.valueOf(trx.getLogEntry());
       
transactionBuffer.updateLatestTrxInfo(TransactionInfo.valueOf(appliedTermIndex));
       updateLastAppliedTermIndex(appliedTermIndex);
diff --git 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 65e52ec4272..b185f3a37fc 100644
--- 
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ 
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -19,6 +19,8 @@
 
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED;
 import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED_DEFAULT;
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
 
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import java.util.HashMap;
@@ -89,6 +91,7 @@ public class SCMSafeModeManager implements SafeModeManager {
   private long safeModeLogIntervalMs;
   private ScheduledExecutorService safeModeLogExecutor;
   private ScheduledFuture<?> safeModeLogTask;
+  private final long refreshIntervalMs;
 
   /** Monotonic time when SCM entered safe mode; used to report exit duration. 
*/
   private long safeModeEnteredAtNanos = -1L;
@@ -121,6 +124,33 @@ public SCMSafeModeManager(final ConfigurationSource conf,
       status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
       emitSafeModeStatus();
     }
+
+    this.refreshIntervalMs = conf.getTimeDuration(
+        HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+        HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
+        TimeUnit.MILLISECONDS);
+  }
+
+  /**
+   * Starts a daemon thread that periodically refreshes and validates the
+   * safe mode rules for as long as SCM stays in safe mode.
+   * Does nothing when the configured refresh interval is not positive.
+   */
+  private void startRefresh() {
+    final boolean enabled = refreshIntervalMs > 0;
+    LOG.info("Container safe mode rule refresh: enabled? {}, {}={}ms",
+        enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMs);
+    if (!enabled) {
+      return;
+    }
+    final String name = "safemode-refresh-thread";
+    final Thread t = new Thread(() -> {
+      try {
+        // Sleep one interval, then refresh; the loop ends once safe mode
+        // has been exited.
+        while (getInSafeMode()) {
+          Thread.sleep(refreshIntervalMs);
+          runRefreshAndValidate();
+        }
+      } catch (InterruptedException e) {
+        // Restore the interrupt status so the interruption is not lost.
+        Thread.currentThread().interrupt();
+        LOG.info("Interrupted {}", name, e);
+      }
+    }, name);
+    t.setDaemon(true);
+    t.start();
   }
 
   public void start() {
@@ -129,6 +159,7 @@ public void start() {
     }
     emitSafeModeStatus();
     startSafeModePeriodicLogger();
+    startRefresh();
   }
 
   public void stop() {
@@ -216,6 +247,13 @@ public void refresh() {
    * Refresh Rule state and validate rules.
    */
   public void refreshAndValidate() {
+    if (refreshIntervalMs > 0) {
+      // Periodic refresh is enabled: the background thread created by
+      // startRefresh() is responsible for refreshing, so this is a no-op.
+      return;
+    }
+    // Periodic refresh is disabled: refresh and validate on the caller's thread.
+    runRefreshAndValidate();
+  }
+
+  private void runRefreshAndValidate() {
     if (getInSafeMode()) {
       exitRules.values().forEach(rule -> {
         rule.refresh(false);
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
index baf3c269d71..0a81ef71817 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
@@ -17,6 +17,8 @@
 
 package org.apache.hadoop.hdds.scm.safemode;
 
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -32,6 +34,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.DatanodeID;
@@ -73,6 +76,10 @@ public void setup() throws ContainerNotFoundException {
     safeModeMetrics = mock(SafeModeMetrics.class);
 
     when(safeModeManager.getSafeModeMetrics()).thenReturn(safeModeMetrics);
+    when(conf.getTimeDuration(
+        HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+        HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
+        TimeUnit.MILLISECONDS)).thenReturn(0L);
     containers = new ArrayList<>();
     
when(containerManager.getContainers(getReplicationType())).thenReturn(containers);
     
when(containerManager.getContainers(LifeCycleState.DELETED)).thenReturn(deletedContainers);
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index fdf38a7a67c..d99caa8708b 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -108,6 +108,7 @@ public void setUp() throws IOException {
         false);
     config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath());
     config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1);
+    config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
     scmMetadataStore = new SCMMetadataStoreImpl(config);
   }
 
@@ -168,6 +169,76 @@ private void testSafeMode(int numContainers) throws 
Exception {
 
   }
 
+  @Test
+  public void testSafeModeExitWithPeriodicContainerRuleRefresh() throws 
Exception {
+    /*
+     * Start SCM with 5 closed Ratis containers and a 100ms rule refresh interval.
+     * Mark 2 of the containers as deleted in the mocked ContainerManager.
+     * Wait until the rule's total drops from 5 to 3 (expected via the periodic refresh).
+     * Fire DN reports and check that safemode exits using the refreshed count.
+     */
+    config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, 
"100ms");
+
+    List<ContainerInfo> ratisContainers = new ArrayList<>();
+    List<ContainerInfo> deletedContainers = new ArrayList<>();
+    ratisContainers.addAll(HddsTestUtils.getContainerInfo(5));
+    for (ContainerInfo container : ratisContainers) {
+      container.setState(HddsProtos.LifeCycleState.CLOSED);
+      container.setNumberOfKeys(10);
+    }
+
+    ContainerManager containerManager = mock(ContainerManager.class);
+    when(containerManager.getContainers(ReplicationType.RATIS))
+        .thenAnswer(invocation -> new ArrayList<>(ratisContainers));
+    when(containerManager.getContainers(ReplicationType.EC))
+        .thenReturn(Collections.emptyList());
+    when(containerManager.getContainers(HddsProtos.LifeCycleState.DELETED))
+        .thenAnswer(invocation -> new ArrayList<>(deletedContainers));
+
+    scmSafeModeManager = new SCMSafeModeManager(config, null, null, 
containerManager,
+        serviceManager, queue, scmContext);
+    scmSafeModeManager.start();
+
+    assertTrue(scmSafeModeManager.getInSafeMode());
+
+    RatisContainerSafeModeRule ratisRule = SafeModeRuleFactory.getInstance()
+        .getSafeModeRule(RatisContainerSafeModeRule.class);
+    assertEquals(5, ratisRule.getTotalNumberOfContainers(),
+        "initial Ratis container count from ContainerManager");
+
+    for (int i = 3; i < ratisContainers.size(); i++) {
+      ratisContainers.get(i).setState(HddsProtos.LifeCycleState.DELETED);
+      ratisContainers.get(i).setNumberOfKeys(10);
+      deletedContainers.add(ratisContainers.get(i));
+    }
+
+    GenericTestUtils.waitFor(
+        () -> ratisRule.getTotalNumberOfContainers() == 3,
+        100,
+        15000);
+
+    SCMDatanodeProtocolServer.NodeRegistrationContainerReport report =
+        HddsTestUtils.createNodeRegistrationContainerReport(ratisContainers);
+    queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, report);
+    queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, report);
+
+    long cutOff = (long) Math.ceil(3 * config.getDouble(
+        HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+        HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT));
+
+    assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
+        .getNumContainerWithOneReplicaReportedThreshold().value());
+
+    GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
+        100, 1000 * 30);
+    GenericTestUtils.waitFor(() ->
+            scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() 
== 0,
+        100, 1000 * 5);
+
+    assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
+        .getCurrentContainersWithOneReplicaReportedCount().value());
+  }
+
   @Test
   public void testSafeModeExitRule() throws Exception {
     containers = new ArrayList<>();
diff --git 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
index f795a6c5762..cdafe912c9f 100644
--- 
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
+++ 
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
@@ -23,6 +23,7 @@
 import static org.mockito.Mockito.when;
 
 import java.lang.reflect.Field;
+import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.scm.container.ContainerManager;
 import org.apache.hadoop.hdds.scm.ha.SCMContext;
@@ -79,7 +80,9 @@ public void testLoadedPreCheckRules() {
   private SCMSafeModeManager initializeSafeModeRuleFactory() {
     final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
     
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
-    SafeModeRuleFactory.initialize(new OzoneConfiguration(),
+    OzoneConfiguration conf = new OzoneConfiguration();
+    conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
+    SafeModeRuleFactory.initialize(conf,
         SCMContext.emptyContext(), new EventQueue(), mock(
             PipelineManager.class),
         mock(ContainerManager.class), mock(NodeManager.class));
diff --git 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
index 68f6ec4c77c..508f530fc4a 100644
--- 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
+++ 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
@@ -34,6 +34,7 @@
 import java.util.List;
 import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.client.ReplicationConfig;
 import org.apache.hadoop.hdds.client.ReplicationFactor;
 import org.apache.hadoop.hdds.client.ReplicationType;
@@ -93,6 +94,7 @@ public void setup() throws Exception {
     conf.set(OzoneConfigKeys.OZONE_SCM_CLOSE_CONTAINER_WAIT_DURATION, "2s");
     conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_SCRUB_INTERVAL, "2s");
     conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, "5s");
+    conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
 
     OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
     clientConfig.setStreamBufferFlushDelay(false);
diff --git 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
index c5f30fdb895..00089c1cc26 100644
--- 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
+++ 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
@@ -19,6 +19,7 @@
 
 import static java.util.concurrent.TimeUnit.MILLISECONDS;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
+import static 
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
 import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
 import static org.apache.hadoop.hdds.client.ReplicationType.RATIS;
 import static 
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
@@ -196,6 +197,29 @@ void testIsScmInSafeModeAndForceExit() throws Exception {
 
   }
 
+  /**
+   * Verifies that SCM leaves safe mode after a restart when the safe mode
+   * rules are refreshed periodically (1s interval).
+   */
+  @Test
+  void testClusterExitsSafeModeWithPeriodicRuleRefresh() throws Exception {
+    // Rebuild the cluster so that SCM starts with the periodic refresh enabled.
+    cluster.shutdown();
+    conf.set(HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "1s");
+    builder = MiniOzoneCluster.newBuilder(conf).setStartDataNodes(true);
+    cluster = builder.build();
+    cluster.waitForClusterToBeReady();
+    final StorageContainerManager scm = cluster.getStorageContainerManager();
+    TestDataUtil.createKeys(cluster, 100);
+    GenericTestUtils.waitFor(() -> scm.getContainerManager().getContainers().size() >= 3,
+        100, 1000 * 30);
+
+    cluster.restartStorageContainerManager(false);
+
+    // Look the SCM up again after the restart: the reference captured above
+    // may point at the stopped instance rather than the one now running.
+    final StorageContainerManager restartedScm = cluster.getStorageContainerManager();
+    assertTrue(restartedScm.isInSafeMode(), "SCM should start in safe mode");
+    GenericTestUtils.waitFor(() -> restartedScm.getContainerManager().getContainers().size() >= 3,
+        100, 1000 * 15);
+
+    cluster.waitTobeOutOfSafeMode();
+
+    assertFalse(restartedScm.isInSafeMode(), "SCM should exit safe mode with periodic rule refresh enabled");
+  }
+
   @Test
   void testSCMSafeMode() throws Exception {
     // Test1: Test safe mode  when there are no containers in system.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to