This is an automated email from the ASF dual-hosted git repository.

sadanand48 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new e07f8b5fd1c HDDS-14954. Add logging and metric for SCM safemode duration. (#10018)
e07f8b5fd1c is described below

commit e07f8b5fd1c6877cdbe043d204a1f1bb3c195d7d
Author: Sadanand Shenoy <[email protected]>
AuthorDate: Mon May 18 17:36:40 2026 +0530

    HDDS-14954. Add logging and metric for SCM safemode duration. (#10018)
---
 .../safemode/AbstractContainerSafeModeRule.java    |  11 +-
 .../hdds/scm/safemode/SCMSafeModeManager.java      |  23 +++
 .../hadoop/hdds/scm/safemode/SafeModeMetrics.java  |  37 +++-
 .../AbstractContainerSafeModeRuleTest.java         |  40 +++-
 .../grafana/dashboards/Ozone - SCM Safemode.json   | 203 +++++++++++++++++++++
 5 files changed, 310 insertions(+), 4 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java
index a5e3a4eed0f..9d13f951978 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRule.java
@@ -26,6 +26,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
@@ -40,6 +41,7 @@
 import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
 import org.apache.hadoop.hdds.server.events.EventQueue;
 import org.apache.hadoop.hdds.server.events.TypedEvent;
+import org.apache.hadoop.util.Time;
 
 /**
  * Abstract class for Container Safe mode exit rule.
@@ -158,7 +160,14 @@ public double getCurrentContainerThreshold() {
   @Override
   public synchronized void refresh(boolean forceRefresh) {
     if (forceRefresh || !validate()) {
-      reinitializeRule();
+      final long startNanos = Time.monotonicNowNanos();
+      getSafeModeMetrics().incNumContainerSafeModeRuleRefreshes();
+      try {
+        reinitializeRule();
+      } finally {
+        long durationMs = TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - startNanos);
+        getSafeModeMetrics().setLastContainerSafeModeRuleRefreshDurationMs(getContainerType(), durationMs);
+      }
     }
   }
 
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 2c9173b2bf0..65e52ec4272 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -40,6 +40,7 @@
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
 import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.apache.hadoop.util.Time;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -89,6 +90,9 @@ public class SCMSafeModeManager implements SafeModeManager {
   private ScheduledExecutorService safeModeLogExecutor;
   private ScheduledFuture<?> safeModeLogTask;
 
+  /** Monotonic time when SCM entered safe mode; used to report exit duration. */
+  private long safeModeEnteredAtNanos = -1L;
+
   public SCMSafeModeManager(final ConfigurationSource conf,
                             final NodeManager nodeManager,
                             final PipelineManager pipelineManager,
@@ -120,6 +124,9 @@ public SCMSafeModeManager(final ConfigurationSource conf,
   }
 
   public void start() {
+    if (getInSafeMode()) {
+      safeModeEnteredAtNanos = Time.monotonicNowNanos();
+    }
     emitSafeModeStatus();
     startSafeModePeriodicLogger();
   }
@@ -177,13 +184,18 @@ public synchronized void validateSafeModeExitRules(String ruleName) {
       LOG.info("ScmSafeModeManager, all rules are successfully validated");
       LOG.info("SCM exiting safe mode.");
       emitSafeModeStatus();
+      recordSafeModeExitDuration();
     }
   }
 
   public void forceExitSafeMode() {
+    boolean wasInSafeMode = getInSafeMode();
     LOG.info("SCM force-exiting safe mode.");
     status.set(SafeModeStatus.OUT_OF_SAFE_MODE);
     emitSafeModeStatus();
+    if (wasInSafeMode) {
+      recordSafeModeExitDuration();
+    }
   }
 
   /**
@@ -308,6 +320,17 @@ private synchronized void logSafeModeStatus() {
     }
   }
 
+  private void recordSafeModeExitDuration() {
+    if (safeModeEnteredAtNanos < 0) {
+      return;
+    }
+    long durationMs =
+        TimeUnit.NANOSECONDS.toMillis(Time.monotonicNowNanos() - safeModeEnteredAtNanos);
+    safeModeEnteredAtNanos = -1;
+    safeModeMetrics.setScmSafeModeExitDurationMs(durationMs);
+    LOG.info("SCM safe mode exit duration {} ms (since start() while in safe mode)", durationMs);
+  }
+
   /**
    * Stops the periodic safe mode logger.
    * Called when safe mode exits.
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
index 1f1daaae09b..d2cc94e261a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
@@ -59,6 +59,15 @@ public class SafeModeMetrics {
   @Metric private MutableGaugeLong numRequiredDatanodesThreshold;
   @Metric private MutableCounterLong currentRegisteredDatanodesCount;
 
+  @Metric("Wall-clock time (ms) SCM spent in safe mode for the last exit")
+  private MutableGaugeLong scmSafeModeExitDurationMs;
+  @Metric("Duration (ms) of the last Ratis container safe mode rule incremental refresh")
+  private MutableGaugeLong lastRatisContainerSafeModeRuleRefreshDurationMs;
+  @Metric("Duration (ms) of the last EC container safe mode rule incremental refresh")
+  private MutableGaugeLong lastEcContainerSafeModeRuleRefreshDurationMs;
+  @Metric("Number of refresh calls before exiting safemode")
+  private MutableCounterLong numContainerSafeModeRuleRefreshes;
+
   public static SafeModeMetrics create() {
     final MetricsSystem ms = DefaultMetricsSystem.instance();
     return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
@@ -113,10 +122,32 @@ public void incCurrentContainersWithECDataReplicaReportedCount() {
     this.currentContainersWithECDataReplicaReportedCount.incr();
   }
 
+  public void incNumContainerSafeModeRuleRefreshes() {
+    this.numContainerSafeModeRuleRefreshes.incr();
+  }
+
   public void incCurrentRegisteredDatanodesCount() {
     this.currentRegisteredDatanodesCount.incr();
   }
 
+  public void setScmSafeModeExitDurationMs(long durationMs) {
+    this.scmSafeModeExitDurationMs.set(durationMs);
+  }
+
+  public void setLastContainerSafeModeRuleRefreshDurationMs(
+      HddsProtos.ReplicationType type, long durationMs) {
+    switch (type) {
+    case RATIS:
+      this.lastRatisContainerSafeModeRuleRefreshDurationMs.set(durationMs);
+      break;
+    case EC:
+      this.lastEcContainerSafeModeRuleRefreshDurationMs.set(durationMs);
+      break;
+    default:
+      break;
+    }
+  }
+
   MutableGaugeLong getNumHealthyPipelinesThreshold() {
     return numHealthyPipelinesThreshold;
   }
@@ -145,7 +176,11 @@ MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() {
   MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() {
     return currentContainersWithOneReplicaReportedCount;
   }
-  
+
+  public MutableCounterLong getNumContainerSafeModeRuleRefreshes() {
+    return numContainerSafeModeRuleRefreshes;
+  }
+
   MutableCounterLong getCurrentRegisteredDatanodesCount() {
     return currentRegisteredDatanodesCount;
   }
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
index 77485e722b9..7f3fd432803 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/AbstractContainerSafeModeRuleTest.java
@@ -21,7 +21,12 @@
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyLong;
+import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
 import java.util.ArrayList;
@@ -44,6 +49,7 @@
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.EnumSource;
+import org.mockito.ArgumentCaptor;
 
 /**
  * Abstract base class for container safe mode rule tests.
@@ -54,6 +60,8 @@ public abstract class AbstractContainerSafeModeRuleTest {
   private ConfigurationSource conf;
   private ContainerManager containerManager;
   private EventQueue eventQueue;
+  private AbstractContainerSafeModeRule safeModeRule;
+  private SafeModeMetrics safeModeMetrics;
 
   @BeforeEach
   public void setup() throws ContainerNotFoundException {
@@ -61,9 +69,9 @@ public void setup() throws ContainerNotFoundException {
     conf = mock(ConfigurationSource.class);
     eventQueue = mock(EventQueue.class);
     safeModeManager = mock(SCMSafeModeManager.class);
-    final SafeModeMetrics metrics = mock(SafeModeMetrics.class);
+    safeModeMetrics = mock(SafeModeMetrics.class);
 
-    when(safeModeManager.getSafeModeMetrics()).thenReturn(metrics);
+    when(safeModeManager.getSafeModeMetrics()).thenReturn(safeModeMetrics);
     containers = new ArrayList<>();
     when(containerManager.getContainers(getReplicationType())).thenReturn(containers);
     when(containerManager.getContainer(any(ContainerID.class))).thenAnswer(invocation -> {
@@ -73,6 +81,9 @@ public void setup() throws ContainerNotFoundException {
           .findFirst()
           .orElseThrow(ContainerNotFoundException::new);
     });
+
+    safeModeRule = createRule(eventQueue, conf, containerManager, safeModeManager);
+    safeModeRule.setValidateBasedOnReportProcessing(false);
   }
 
   @Test
@@ -171,6 +182,31 @@ public void testAllContainersOpen() {
     assertTrue(rule.validate(), "Validate should return true when all containers are open");
   }
 
+  @Test
+  public void testRefreshRecordsDurationAndIncrementsRefreshCount() {
+    containers.add(mockContainer(LifeCycleState.OPEN, 1L));
+    int count = 3;
+    for (int i = 0; i < count; i++) {
+      safeModeRule.refresh(true);
+    }
+
+    ArgumentCaptor<Long> durationCaptor = ArgumentCaptor.forClass(Long.class);
+    verify(safeModeMetrics, times(count)).incNumContainerSafeModeRuleRefreshes();
+    verify(safeModeMetrics, times(count)).setLastContainerSafeModeRuleRefreshDurationMs(
+        eq(getReplicationType()), durationCaptor.capture());
+    durationCaptor.getAllValues().forEach(durationMs -> assertTrue(durationMs >= 0L));
+  }
+
+  @Test
+  public void testRefreshSkippedWhenValidWithoutForce() {
+    containers.add(mockContainer(LifeCycleState.OPEN, 1L));
+
+    safeModeRule.refresh(false);
+
+    verify(safeModeMetrics, never()).incNumContainerSafeModeRuleRefreshes();
+    verify(safeModeMetrics, never()).setLastContainerSafeModeRuleRefreshDurationMs(any(), anyLong());
+  }
+
   @Test
   public void testDuplicateContainerIdsInReports() {
     long containerId = 42L;
diff --git a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json
index ac0c291b83a..5cbc09a2fec 100644
--- a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json    
+++ b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json    
@@ -747,6 +747,209 @@
       ],
       "title": "Registered DataNodes: Target vs Actual",
       "type": "timeseries"
+    },
+    {
+      "collapsed": false,
+      "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 },
+      "id": 200,
+      "panels": [],
+      "title": "SCM Safemode: Durations",
+      "type": "row"
+    },
+    {
+      "datasource": {
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "min": 0,
+          "decimals": 0,
+          "unit": "ms",
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Duration",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 },
+      "id": 201,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "safe_mode_metrics_scm_safe_mode_exit_duration_ms",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "{{hostname}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Last safe mode exit duration",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {
+        "type": "prometheus"
+      },
+      "fieldConfig": {
+        "defaults": {
+          "min": 0,
+          "decimals": 0,
+          "unit": "ms",
+          "color": {
+            "mode": "palette-classic"
+          },
+          "custom": {
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "Duration",
+            "axisPlacement": "auto",
+            "barAlignment": 0,
+            "barWidthFactor": 0.6,
+            "drawStyle": "line",
+            "fillOpacity": 0,
+            "gradientMode": "none",
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "insertNulls": false,
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
+            "pointSize": 5,
+            "scaleDistribution": {
+              "type": "linear"
+            },
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
+            },
+            "thresholdsStyle": {
+              "mode": "off"
+            }
+          },
+          "mappings": [],
+          "thresholds": {
+            "mode": "absolute",
+            "steps": [
+              {
+                "color": "green",
+                "value": null
+              },
+              {
+                "color": "red",
+                "value": 80
+              }
+            ]
+          }
+        },
+        "overrides": []
+      },
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 },
+      "id": 202,
+      "options": {
+        "legend": {
+          "calcs": [],
+          "displayMode": "list",
+          "placement": "bottom",
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
+        }
+      },
+      "targets": [
+        {
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "safe_mode_metrics_last_ratis_container_safe_mode_rule_refresh_duration_ms",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "{{hostname}} Ratis",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        },
+        {
+          "disableTextWrap": false,
+          "editorMode": "code",
+          "expr": "safe_mode_metrics_last_ec_container_safe_mode_rule_refresh_duration_ms",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "legendFormat": "{{hostname}} EC",
+          "range": true,
+          "refId": "B",
+          "useBackend": false
+        }
+      ],
+      "title": "Last container rule refresh duration",
+      "type": "timeseries"
     }
   ],
   "preload": false,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to