This is an automated email from the ASF dual-hosted git repository.
sumitagrawl pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 15216f2c7d5 HDDS-14868. refreshAndValidate ContainerSafemodeRule
periodically, not on each applyTransaction (#9953)
15216f2c7d5 is described below
commit 15216f2c7d578ceb69a55e8071e0f3925d0c38e0
Author: Sadanand Shenoy <[email protected]>
AuthorDate: Tue Jun 16 09:15:18 2026 +0530
HDDS-14868. refreshAndValidate ContainerSafemodeRule periodically, not on
each applyTransaction (#9953)
---
.../org/apache/hadoop/hdds/HddsConfigKeys.java | 8 +++
.../common/src/main/resources/ozone-default.xml | 7 +++
.../apache/hadoop/hdds/scm/ha/SCMStateMachine.java | 6 --
.../hdds/scm/safemode/SCMSafeModeManager.java | 38 ++++++++++++
.../AbstractContainerSafeModeRuleTest.java | 7 +++
.../hdds/scm/safemode/TestSCMSafeModeManager.java | 71 ++++++++++++++++++++++
.../hdds/scm/safemode/TestSafeModeRuleFactory.java | 5 +-
.../client/rpc/TestContainerStateMachine.java | 2 +
.../apache/hadoop/ozone/om/TestScmSafeMode.java | 24 ++++++++
9 files changed, 161 insertions(+), 7 deletions(-)
diff --git
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
index b321d5cc260..8e9d4e9e098 100644
---
a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
+++
b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
@@ -116,6 +116,14 @@ public final class HddsConfigKeys {
"hdds.scm.safemode.log.interval";
public static final String HDDS_SCM_SAFEMODE_LOG_INTERVAL_DEFAULT = "1m";
+ /**
+ * Interval for background refresh of safeMode rules. 0 disables the
+ * background thread.
+ */
+ public static final String HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL =
+ "hdds.scm.safemode.rule.refresh.interval";
+ public static final String
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT = "5s";
+
// This configuration setting is used as a fallback location by all
// Ozone/HDDS services for their metadata. It is useful as a single
// config point for test/PoC clusters.
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml
b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 7e4086d7e98..ac715a108a6 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -1717,6 +1717,13 @@
reported replica before SCM comes out of safe mode.
</description>
</property>
+ <property>
+ <name>hdds.scm.safemode.rule.refresh.interval</name>
+ <value>5s</value>
+ <tag>HDDS,SCM,OPERATION</tag>
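+ <description>
+ Interval at which SCM refreshes and validates its safe mode rules in the
+ background while it is in safe mode. Set to 0 to disable the background
+ refresh thread.
+ </description>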
+ </property>
<property>
<name>hdds.scm.wait.time.after.safemode.exit</name>
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
index 267ddc6c5b5..f6295fa7a7c 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
@@ -172,12 +172,6 @@ public CompletableFuture<Message> applyTransaction(
// Ratis client, leaving SCM intact.
applyTransactionFuture.completeExceptionally(ex);
}
-
- // After previous term transactions are applied, still in safe mode,
- // perform refreshAndValidate to update the safemode rule state.
- if (scm.isInSafeMode() && isStateMachineReady.get()) {
- scm.getScmSafeModeManager().refreshAndValidate();
- }
final TermIndex appliedTermIndex = TermIndex.valueOf(trx.getLogEntry());
transactionBuffer.updateLatestTrxInfo(TransactionInfo.valueOf(appliedTermIndex));
updateLastAppliedTermIndex(appliedTermIndex);
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 65e52ec4272..b185f3a37fc 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -19,6 +19,8 @@
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED;
import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED_DEFAULT;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import java.util.HashMap;
@@ -89,6 +91,7 @@ public class SCMSafeModeManager implements SafeModeManager {
private long safeModeLogIntervalMs;
private ScheduledExecutorService safeModeLogExecutor;
private ScheduledFuture<?> safeModeLogTask;
+ private final long refreshIntervalMs;
/** Monotonic time when SCM entered safe mode; used to report exit duration.
*/
private long safeModeEnteredAtNanos = -1L;
@@ -121,6 +124,33 @@ public SCMSafeModeManager(final ConfigurationSource conf,
status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
emitSafeModeStatus();
}
+
+ this.refreshIntervalMs = conf.getTimeDuration(
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
+ TimeUnit.MILLISECONDS);
+ }
+
+ /**
+  * Starts the daemon thread that refreshes and validates the safe mode rules
+  * every {@code refreshIntervalMs} milliseconds for as long as SCM reports
+  * being in safe mode. Does nothing when the interval is not positive.
+  */
+ private void startRefresh() {
+   final boolean enabled = refreshIntervalMs > 0;
+   LOG.info("Container safe mode rule refresh: enabled? {}, {}={}ms",
+       enabled, HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, refreshIntervalMs);
+   if (!enabled) {
+     return;
+   }
+   final String name = "safemode-refresh-thread";
+   final Thread t = new Thread(() -> {
+     try {
+       while (getInSafeMode()) {
+         Thread.sleep(refreshIntervalMs);
+         try {
+           runRefreshAndValidate();
+         } catch (RuntimeException e) {
+           // refreshAndValidate() returns early while the interval is
+           // positive, so this loop is the only refresher: a single failed
+           // pass must not terminate it.
+           LOG.warn("Failed to refresh safe mode rules, retrying in {}ms",
+               refreshIntervalMs, e);
+         }
+       }
+     } catch (InterruptedException e) {
+       // Restore the interrupt status instead of silently consuming it.
+       Thread.currentThread().interrupt();
+       LOG.info("Interrupted {}", name, e);
+     }
+   }, name);
+   t.setDaemon(true);
+   t.start();
}
public void start() {
@@ -129,6 +159,7 @@ public void start() {
}
emitSafeModeStatus();
startSafeModePeriodicLogger();
+ startRefresh();
}
public void stop() {
@@ -216,6 +247,13 @@ public void refresh() {
* Refresh Rule state and validate rules.
*/
public void refreshAndValidate() {
+   // With a positive interval the safemode-refresh-thread launched from
+   // start() does this work periodically, so refresh inline only when the
+   // periodic refresh is disabled.
+   if (refreshIntervalMs <= 0) {
+     runRefreshAndValidate();
+   }
+ }
+
+ private void runRefreshAndValidate() {
if (getInSafeMode()) {
exitRules.values().forEach(rule -> {
rule.refresh(false);
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
index baf3c269d71..0a81ef71817 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
@@ -17,6 +17,8 @@
package org.apache.hadoop.hdds.scm.safemode;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -32,6 +34,7 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
+import java.util.concurrent.TimeUnit;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
import org.apache.hadoop.hdds.protocol.DatanodeID;
@@ -73,6 +76,10 @@ public void setup() throws ContainerNotFoundException {
safeModeMetrics = mock(SafeModeMetrics.class);
when(safeModeManager.getSafeModeMetrics()).thenReturn(safeModeMetrics);
+ when(conf.getTimeDuration(
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL,
+ HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL_DEFAULT,
+ TimeUnit.MILLISECONDS)).thenReturn(0L);
containers = new ArrayList<>();
when(containerManager.getContainers(getReplicationType())).thenReturn(containers);
when(containerManager.getContainers(LifeCycleState.DELETED)).thenReturn(deletedContainers);
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index fdf38a7a67c..d99caa8708b 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -108,6 +108,7 @@ public void setUp() throws IOException {
false);
config.set(HddsConfigKeys.OZONE_METADATA_DIRS, tempDir.getAbsolutePath());
config.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, 1);
+ config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
scmMetadataStore = new SCMMetadataStoreImpl(config);
}
@@ -168,6 +169,76 @@ private void testSafeMode(int numContainers) throws
Exception {
}
+ @Test
+ public void testSafeModeExitWithPeriodicContainerRuleRefresh() throws Exception {
+   /*
+    * Start SCM with 5 closed Ratis containers.
+    * Mark 2 containers as deleted in ContainerManager.
+    * Wait until the rule's total drops from 5 to 3.
+    * Fire DN reports and check that safemode exits using the refreshed count.
+    */
+   config.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "100ms");
+
+   List<ContainerInfo> ratisContainers = new ArrayList<>();
+   // Appended to below on the test thread while the mock answer may be
+   // copying it from the background refresh thread, hence synchronized.
+   List<ContainerInfo> deletedContainers =
+       Collections.synchronizedList(new ArrayList<>());
+   ratisContainers.addAll(HddsTestUtils.getContainerInfo(5));
+   for (ContainerInfo container : ratisContainers) {
+     container.setState(HddsProtos.LifeCycleState.CLOSED);
+     container.setNumberOfKeys(10);
+   }
+
+   // Answers return fresh copies so every refresh sees the current lists.
+   ContainerManager containerManager = mock(ContainerManager.class);
+   when(containerManager.getContainers(ReplicationType.RATIS))
+       .thenAnswer(invocation -> new ArrayList<>(ratisContainers));
+   when(containerManager.getContainers(ReplicationType.EC))
+       .thenReturn(Collections.emptyList());
+   when(containerManager.getContainers(HddsProtos.LifeCycleState.DELETED))
+       .thenAnswer(invocation -> new ArrayList<>(deletedContainers));
+
+   scmSafeModeManager = new SCMSafeModeManager(config, null, null, containerManager,
+       serviceManager, queue, scmContext);
+   scmSafeModeManager.start();
+
+   assertTrue(scmSafeModeManager.getInSafeMode());
+
+   RatisContainerSafeModeRule ratisRule = SafeModeRuleFactory.getInstance()
+       .getSafeModeRule(RatisContainerSafeModeRule.class);
+   assertEquals(5, ratisRule.getTotalNumberOfContainers(),
+       "initial Ratis container count from ContainerManager");
+
+   // Mark the last two containers as deleted.
+   for (int i = 3; i < ratisContainers.size(); i++) {
+     ratisContainers.get(i).setState(HddsProtos.LifeCycleState.DELETED);
+     ratisContainers.get(i).setNumberOfKeys(10);
+     deletedContainers.add(ratisContainers.get(i));
+   }
+
+   // Only the periodic refresh can bring the rule's total down to 3.
+   GenericTestUtils.waitFor(
+       () -> ratisRule.getTotalNumberOfContainers() == 3,
+       100,
+       15000);
+
+   SCMDatanodeProtocolServer.NodeRegistrationContainerReport report =
+       HddsTestUtils.createNodeRegistrationContainerReport(ratisContainers);
+   queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT, report);
+   queue.fireEvent(SCMEvents.CONTAINER_REGISTRATION_REPORT, report);
+
+   long cutOff = (long) Math.ceil(3 * config.getDouble(
+       HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
+       HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT));
+
+   assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
+       .getNumContainerWithOneReplicaReportedThreshold().value());
+
+   GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
+       100, 1000 * 30);
+   GenericTestUtils.waitFor(() ->
+       scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+       100, 1000 * 5);
+
+   assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
+       .getCurrentContainersWithOneReplicaReportedCount().value());
+ }
+
@Test
public void testSafeModeExitRule() throws Exception {
containers = new ArrayList<>();
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
index f795a6c5762..cdafe912c9f 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSafeModeRuleFactory.java
@@ -23,6 +23,7 @@
import static org.mockito.Mockito.when;
import java.lang.reflect.Field;
+import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
@@ -79,7 +80,9 @@ public void testLoadedPreCheckRules() {
private SCMSafeModeManager initializeSafeModeRuleFactory() {
final SCMSafeModeManager safeModeManager = mock(SCMSafeModeManager.class);
when(safeModeManager.getSafeModeMetrics()).thenReturn(mock(SafeModeMetrics.class));
- SafeModeRuleFactory.initialize(new OzoneConfiguration(),
+ OzoneConfiguration conf = new OzoneConfiguration();
+ conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
+ SafeModeRuleFactory.initialize(conf,
SCMContext.emptyContext(), new EventQueue(), mock(
PipelineManager.class),
mock(ContainerManager.class), mock(NodeManager.class));
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
index 68f6ec4c77c..508f530fc4a 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachine.java
@@ -34,6 +34,7 @@
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.client.ReplicationFactor;
import org.apache.hadoop.hdds.client.ReplicationType;
@@ -93,6 +94,7 @@ public void setup() throws Exception {
conf.set(OzoneConfigKeys.OZONE_SCM_CLOSE_CONTAINER_WAIT_DURATION, "2s");
conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_SCRUB_INTERVAL, "2s");
conf.set(ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, "5s");
+ conf.set(HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "0s");
OzoneClientConfig clientConfig = conf.getObject(OzoneClientConfig.class);
clientConfig.setStreamBufferFlushDelay(false);
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
index c5f30fdb895..00089c1cc26 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
@@ -19,6 +19,7 @@
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_HEARTBEAT_INTERVAL;
+import static
org.apache.hadoop.hdds.HddsConfigKeys.HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL;
import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
import static org.apache.hadoop.hdds.client.ReplicationType.RATIS;
import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
@@ -196,6 +197,29 @@ void testIsScmInSafeModeAndForceExit() throws Exception {
}
+ @Test
+ void testClusterExitsSafeModeWithPeriodicRuleRefresh() throws Exception {
+   // Rebuild the cluster so SCM runs with a 1s periodic rule refresh.
+   cluster.shutdown();
+   conf.set(HDDS_SCM_SAFEMODE_RULE_REFRESH_INTERVAL, "1s");
+   builder = MiniOzoneCluster.newBuilder(conf).setStartDataNodes(true);
+   cluster = builder.build();
+   cluster.waitForClusterToBeReady();
+   final StorageContainerManager scm = cluster.getStorageContainerManager();
+   TestDataUtil.createKeys(cluster, 100);
+   GenericTestUtils.waitFor(
+       () -> scm.getContainerManager().getContainers().size() >= 3,
+       100, 1000 * 30);
+
+   cluster.restartStorageContainerManager(false);
+
+   // Look the SCM up again: the reference taken before the restart may
+   // point at the stopped instance rather than the one now under test.
+   final StorageContainerManager restartedScm =
+       cluster.getStorageContainerManager();
+   assertTrue(restartedScm.isInSafeMode(), "SCM should start in safe mode");
+   GenericTestUtils.waitFor(
+       () -> restartedScm.getContainerManager().getContainers().size() >= 3,
+       100, 1000 * 15);
+
+   cluster.waitTobeOutOfSafeMode();
+
+   assertFalse(restartedScm.isInSafeMode(),
+       "SCM should exit safe mode with periodic rule refresh enabled");
+ }
+
@Test
void testSCMSafeMode() throws Exception {
// Test1: Test safe mode when there are no containers in system.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]