This is an automated email from the ASF dual-hosted git repository.
sadanand48 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new e07f8b5fd1c HDDS-14954. Add logging and metric for SCM safemode duration. (#10018)
e07f8b5fd1c is described below
commit e07f8b5fd1c6877cdbe043d204a1f1bb3c195d7d
Author: Sadanand Shenoy <[email protected]>
AuthorDate: Mon May 18 17:36:40 2026 +0530
HDDS-14954. Add logging and metric for SCM safemode duration. (#10018)
---
.../safemode/AbstractContainerSafeModeRule.java | 11 +-
.../hdds/scm/safemode/SCMSafeModeManager.java | 23 +++
.../hadoop/hdds/scm/safemode/SafeModeMetrics.java | 37 +++-
.../AbstractContainerSafeModeRuleTest.java | 40 +++-
.../grafana/dashboards/Ozone - SCM Safemode.json | 203 +++++++++++++++++++++
5 files changed, 310 insertions(+), 4 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java
index a5e3a4eed0f..9d13f951978 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java
@@ -26,6 +26,7 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
@@ -40,6 +41,7 @@
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
import org.apache.hadoop.hdds.server.events.EventQueue;
import org.apache.hadoop.hdds.server.events.TypedEvent;
+import org.apache.hadoop.util.Time;
/**
* Abstract class for Container Safe mode exit rule.
@@ -158,7 +160,14 @@ public double getCurrentContainerThreshold() {
@Override
public synchronized void refresh(boolean forceRefresh) {
if (forceRefresh || !validate()) {
- reinitializeRule();
+ final long startNanos = Time.monotonicNowNanos();
+ getSafeModeMetrics().incNumContainerSafeModeRuleRefreshes();
+ try {
+ reinitializeRule();
+ } finally {
+ long durationMs = TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - startNanos);
+ getSafeModeMetrics().setLastContainerSafeModeRuleRefreshDurationMs(getContainerType(), durationMs);
+ }
}
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 2c9173b2bf0..65e52ec4272 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -40,6 +40,7 @@
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -89,6 +90,9 @@ public class SCMSafeModeManager implements SafeModeManager {
private ScheduledExecutorService safeModeLogExecutor;
private ScheduledFuture<?> safeModeLogTask;
+ /** Monotonic time when SCM entered safe mode; used to report exit duration. */
+ private long safeModeEnteredAtNanos = -1L;
+
public SCMSafeModeManager(final ConfigurationSource conf,
final NodeManager nodeManager,
final PipelineManager pipelineManager,
@@ -120,6 +124,9 @@ public SCMSafeModeManager(final ConfigurationSource conf,
}
public void start() {
+ if (getInSafeMode()) {
+ safeModeEnteredAtNanos = Time.monotonicNowNanos();
+ }
emitSafeModeStatus();
startSafeModePeriodicLogger();
}
@@ -177,13 +184,18 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
LOG.info("ScmSafeModeManager, all rules are successfully validated");
LOG.info("SCM exiting safe mode.");
emitSafeModeStatus();
+ recordSafeModeExitDuration();
}
}
public void forceExitSafeMode() {
+ boolean wasInSafeMode = getInSafeMode();
LOG.info("SCM force-exiting safe mode.");
status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
emitSafeModeStatus();
+ if (wasInSafeMode) {
+ recordSafeModeExitDuration();
+ }
}
/**
@@ -308,6 +320,17 @@ private synchronized void logSafeModeStatus() {
}
}
+ private void recordSafeModeExitDuration() {
+ if (safeModeEnteredAtNanos < 0) {
+ return;
+ }
+ long durationMs =
+ TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - safeModeEnteredAtNanos);
+ safeModeEnteredAtNanos = -1;
+ safeModeMetrics.setScmSafeModeExitDurationMs(durationMs);
+ LOG.info("SCM safe mode exit duration {} ms (since start() while in safe mode)", durationMs);
+ }
+
/**
* Stops the periodic safe mode logger.
* Called when safe mode exits.
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
index 1f1daaae09b..d2cc94e261a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
@@ -59,6 +59,15 @@ public class SafeModeMetrics {
@Metric private MutableGaugeLong numRequiredDatanodesThreshold;
@Metric private MutableCounterLong currentRegisteredDatanodesCount;
+ @Metric("Wall-clock time (ms) SCM spent in safe mode for the last exit")
+ private MutableGaugeLong scmSafeModeExitDurationMs;
+ @Metric("Duration (ms) of the last Ratis container safe mode rule incremental refresh")
+ private MutableGaugeLong lastRatisContainerSafeModeRuleRefreshDurationMs;
+ @Metric("Duration (ms) of the last EC container safe mode rule incremental refresh")
+ private MutableGaugeLong lastEcContainerSafeModeRuleRefreshDurationMs;
+ @Metric("Number of refresh calls before exiting safemode")
+ private MutableCounterLong numContainerSafeModeRuleRefreshes;
+
public static SafeModeMetrics create() {
final MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
@@ -113,10 +122,32 @@ public void incCurrentContainersWithECDataReplicaReportedCount() {
this.currentContainersWithECDataReplicaReportedCount.incr();
}
+ public void incNumContainerSafeModeRuleRefreshes() {
+ this.numContainerSafeModeRuleRefreshes.incr();
+ }
+
public void incCurrentRegisteredDatanodesCount() {
this.currentRegisteredDatanodesCount.incr();
}
+ public void setScmSafeModeExitDurationMs(long durationMs) {
+ this.scmSafeModeExitDurationMs.set(durationMs);
+ }
+
+ public void setLastContainerSafeModeRuleRefreshDurationMs(
+ HddsProtos.ReplicationType type, long durationMs) {
+ switch (type) {
+ case RATIS:
+ this.lastRatisContainerSafeModeRuleRefreshDurationMs.set(durationMs);
+ break;
+ case EC:
+ this.lastEcContainerSafeModeRuleRefreshDurationMs.set(durationMs);
+ break;
+ default:
+ break;
+ }
+ }
+
MutableGaugeLong getNumHealthyPipelinesThreshold() {
return numHealthyPipelinesThreshold;
}
@@ -145,7 +176,11 @@ MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() {
MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() {
return currentContainersWithOneReplicaReportedCount;
}
-
+
+ public MutableCounterLong getNumContainerSafeModeRuleRefreshes() {
+ return numContainerSafeModeRuleRefreshes;
+ }
+
MutableCounterLong getCurrentRegisteredDatanodesCount() {
return currentRegisteredDatanodesCount;
}
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
index 77485e722b9..7f3fd432803 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
@@ -21,7 +21,12 @@
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyLong;
+import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.util.ArrayList;
@@ -44,6 +49,7 @@
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.EnumSource;
+import org.mockito.ArgumentCaptor;
/**
* Abstract base class for container safe mode rule tests.
@@ -54,6 +60,8 @@ public abstract class AbstractContainerSafeModeRuleTest {
private ConfigurationSource conf;
private ContainerManager containerManager;
private EventQueue eventQueue;
+ private AbstractContainerSafeModeRule safeModeRule;
+ private SafeModeMetrics safeModeMetrics;
@BeforeEach
public void setup() throws ContainerNotFoundException {
@@ -61,9 +69,9 @@ public void setup() throws ContainerNotFoundException {
conf = mock(ConfigurationSource.class);
eventQueue = mock(EventQueue.class);
safeModeManager = mock(SCMSafeModeManager.class);
- final SafeModeMetrics metrics = mock(SafeModeMetrics.class);
+ safeModeMetrics = mock(SafeModeMetrics.class);
- when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics);
+ when(safeModeManager.getSafeModeMetrics()).thenReturn(safeModeMetrics);
containers = new ArrayList<>();
when(containerManager.getContainers(getReplicationType())).thenReturn(containers);
when(containerManager.getContainer(any(ContainerID.class))).thenAnswer(invocation -> {
@@ -73,6 +81,9 @@ public void setup() throws ContainerNotFoundException {
.findFirst()
.orElseThrow(ContainerNotFoundException::new);
});
+
+ safeModeRule = createRule(eventQueue, conf, containerManager, safeModeManager);
+ safeModeRule.setValidateBasedOnReportProcessing(false);
}
@Test
@@ -171,6 +182,31 @@ public void testAllContainersOpen() {
assertTrue(rule.validate(), "Validate should return true when all containers are open");
}
+ @Test
+ public void testRefreshRecordsDurationAndIncrementsRefreshCount() {
+ containers.add(mockContainer(LifeCycleState.OPEN, 1L));
+ int count = 3;
+ for (int i = 0; i < count; i++) {
+ safeModeRule.refresh(true);
+ }
+
+ ArgumentCaptor<Long> durationCaptor = ArgumentCaptor.forClass(Long.class);
+ verify(safeModeMetrics, times(count)).incNumContainerSafeModeRuleRefreshes();
+ verify(safeModeMetrics, times(count)).setLastContainerSafeModeRuleRefreshDurationMs(
+ eq(getReplicationType()), durationCaptor.capture());
+ durationCaptor.getAllValues().forEach(durationMs -> assertTrue(durationMs >= 0L));
+ }
+
+ @Test
+ public void testRefreshSkippedWhenValidWithoutForce() {
+ containers.add(mockContainer(LifeCycleState.OPEN, 1L));
+
+ safeModeRule.refresh(false);
+
+ verify(safeModeMetrics, never()).incNumContainerSafeModeRuleRefreshes();
+ verify(safeModeMetrics, never()).setLastContainerSafeModeRuleRefreshDurationMs(any(), anyLong());
+ }
+
@Test
public void testDuplicateContainerIdsInReports() {
long containerId = 42L;
diff --git a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json
index ac0c291b83a..5cbc09a2fec 100644
--- a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json
+++ b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json
@@ -747,6 +747,209 @@
],
"title": "Registered DataNodes: Target vs Actual",
"type": "timeseries"
+ },
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 },
+ "id": 200,
+ "panels": [],
+ "title": "SCM Safemode: Durations",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "unit": "ms",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Duration",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
+ "id": 201,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "safe_mode_metrics_scm_safe_mode_exit_duration_ms",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "Last safe mode exit duration",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "unit": "ms",
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Duration",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
+ "id": 202,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "safe_mode_metrics_last_ratis_container_safe_mode_rule_refresh_duration_ms",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} Ratis",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "safe_mode_metrics_last_ec_container_safe_mode_rule_refresh_duration_ms",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} EC",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "Last container rule refresh duration",
+ "type": "timeseries"
}
],
"preload": false,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]