This is an automated email from the ASF dual-hosted git repository.
weichiu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 7e4e5f34361 HDDS-14039. Create Grafana dashboard for Ozone SCM safemode rules and exit (#9400)
7e4e5f34361 is described below
commit 7e4e5f34361f34da49d41ba75161463248f5f0e7
Author: sreejasahithi <[email protected]>
AuthorDate: Tue Jan 13 10:33:48 2026 +0530
HDDS-14039. Create Grafana dashboard for Ozone SCM safemode rules and exit (#9400)
---
.../hdds/scm/safemode/DataNodeSafeModeRule.java | 8 +-
.../hdds/scm/safemode/SCMSafeModeManager.java | 1 +
.../hadoop/hdds/scm/safemode/SafeModeMetrics.java | 27 +
.../scm/safemode/TestDataNodeSafeModeRule.java | 8 +-
.../hdds/scm/safemode/TestSCMSafeModeManager.java | 32 +
.../grafana/dashboards/Ozone - SCM Safemode.json | 766 +++++++++++++++++++++
6 files changed, 840 insertions(+), 2 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java
index 63be485e028..0cd763413e8 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java
@@ -51,6 +51,7 @@ public DataNodeSafeModeRule(EventQueue eventQueue,
requiredDns = conf.getInt(
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
+ getSafeModeMetrics().setNumRequiredDatanodesThreshold(requiredDns);
registeredDnSet = new HashSet<>(requiredDns * 2);
this.nodeManager = nodeManager;
}
@@ -71,9 +72,14 @@ protected boolean validate() {
@Override
protected void process(NodeRegistrationContainerReport reportsProto) {
- registeredDnSet.add(reportsProto.getDatanodeDetails().getID());
+ DatanodeID dnId = reportsProto.getDatanodeDetails().getID();
+ boolean added = registeredDnSet.add(dnId);
registeredDns = registeredDnSet.size();
+ if (added) {
+ getSafeModeMetrics().incCurrentRegisteredDatanodesCount();
+ }
+
if (scmInSafeMode()) {
SCMSafeModeManager.getLogger().info(
"SCM in safe mode. {} DataNodes registered, {} required.",
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index bc2a26fbf91..67d47d101df 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -119,6 +119,7 @@ public SafeModeMetrics getSafeModeMetrics() {
private void emitSafeModeStatus() {
final SafeModeStatus safeModeStatus = status.get();
+ safeModeMetrics.setScmInSafeMode(safeModeStatus.isInSafeMode());
scmContext.updateSafeModeStatus(safeModeStatus);
// notify SCMServiceManager
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
index f5f4ce12992..ae65eafcb91 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeMetrics.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
/**
@@ -52,6 +53,12 @@ public class SafeModeMetrics {
private @Metric MutableCounterLong
currentPipelinesWithAtleastOneReplicaReportedCount;
+ @Metric("Metric will be set to 1 if SCM is in SafeMode, otherwise 0")
+ private MutableGaugeInt scmInSafeMode;
+
+ @Metric private MutableGaugeLong numRequiredDatanodesThreshold;
+ @Metric private MutableCounterLong currentRegisteredDatanodesCount;
+
public static SafeModeMetrics create() {
final MetricsSystem ms = DefaultMetricsSystem.instance();
return ms.register(SOURCE_NAME, "SCM Safemode Metrics", new SafeModeMetrics());
@@ -86,6 +93,14 @@ public void setNumContainerReportedThreshold(HddsProtos.ReplicationType type, lo
}
}
+ public void setScmInSafeMode(boolean inSafeMode) {
+ this.scmInSafeMode.set(inSafeMode ? 1 : 0);
+ }
+
+ public void setNumRequiredDatanodesThreshold(long val) {
+ this.numRequiredDatanodesThreshold.set(val);
+ }
+
public void incCurrentContainersWithOneReplicaReportedCount() {
this.currentContainersWithOneReplicaReportedCount.incr();
}
@@ -94,6 +109,10 @@ public void incCurrentContainersWithECDataReplicaReportedCount() {
this.currentContainersWithECDataReplicaReportedCount.incr();
}
+ public void incCurrentRegisteredDatanodesCount() {
+ this.currentRegisteredDatanodesCount.incr();
+ }
+
MutableGaugeLong getNumHealthyPipelinesThreshold() {
return numHealthyPipelinesThreshold;
}
@@ -122,6 +141,14 @@ MutableGaugeLong getNumContainerWithECDataReplicaReportedThreshold() {
MutableCounterLong getCurrentContainersWithOneReplicaReportedCount() {
return currentContainersWithOneReplicaReportedCount;
}
+
+ MutableCounterLong getCurrentRegisteredDatanodesCount() {
+ return currentRegisteredDatanodesCount;
+ }
+
+ MutableGaugeInt getScmInSafeMode() {
+ return scmInSafeMode;
+ }
public void unRegister() {
MetricsSystem ms = DefaultMetricsSystem.instance();
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java
index c62293e7648..011e97aac99 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestDataNodeSafeModeRule.java
@@ -17,6 +17,7 @@
package org.apache.hadoop.hdds.scm.safemode;
+import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -53,6 +54,7 @@ public class TestDataNodeSafeModeRule {
private EventQueue eventQueue;
private NodeManager nodeManager;
private SCMSafeModeManager mockSafeModeManager;
+ private SafeModeMetrics metrics;
private void setup(int requiredDns) throws Exception {
OzoneConfiguration ozoneConfiguration = new OzoneConfiguration();
@@ -65,6 +67,8 @@ private void setup(int requiredDns) throws Exception {
eventQueue = new EventQueue();
mockSafeModeManager = mock(SCMSafeModeManager.class);
+ metrics = SafeModeMetrics.create();
+ when(mockSafeModeManager.getSafeModeMetrics()).thenReturn(metrics);
rule = new DataNodeSafeModeRule(eventQueue, ozoneConfiguration,
nodeManager, mockSafeModeManager);
assertNotNull(rule);
@@ -94,6 +98,7 @@ public void testDataNodeSafeModeRuleWithNoNodes() throws Exception {
"SCM in safe mode. 1 DataNodes registered, 1 required."), 1000, 5000);
assertTrue(rule.validate());
+ assertEquals(1, metrics.getCurrentRegisteredDatanodesCount().value());
}
@Test
@@ -120,7 +125,7 @@ public void testDataNodeSafeModeRuleWithMultipleNodes() throws Exception {
"SCM in safe mode. 2 DataNodes registered, 3 required."), 1000, 5000);
assertFalse(rule.validate());
-
+ assertEquals(2, metrics.getCurrentRegisteredDatanodesCount().value());
DatanodeDetails dd = MockDatanodeDetails.randomDatanodeDetails();
NodeRegistrationContainerReport nodeReg =
new NodeRegistrationContainerReport(dd, null);
@@ -131,6 +136,7 @@ public void testDataNodeSafeModeRuleWithMultipleNodes() throws Exception {
"SCM in safe mode. 3 DataNodes registered, 3 required."), 1000, 5000);
assertTrue(rule.validate());
+ assertEquals(3, metrics.getCurrentRegisteredDatanodesCount().value());
}
@Test
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index 1cbd6bc3725..1ef531f8bf8 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -109,6 +109,9 @@ public void setUp() throws IOException {
@AfterEach
public void destroyDbStore() throws Exception {
+ if (scmSafeModeManager != null) {
+ scmSafeModeManager.getSafeModeMetrics().unRegister();
+ }
if (scmMetadataStore.getStore() != null) {
scmMetadataStore.getStore().close();
}
@@ -136,6 +139,7 @@ private void testSafeMode(int numContainers) throws Exception {
scmSafeModeManager.start();
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
validateRuleStatus("DatanodeSafeModeRule", "registered datanodes 0");
SCMDatanodeProtocolServer.NodeRegistrationContainerReport
nodeRegistrationContainerReport =
HddsTestUtils.createNodeRegistrationContainerReport(containers);
@@ -151,6 +155,9 @@ private void testSafeMode(int numContainers) throws Exception {
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
assertEquals(cutOff, scmSafeModeManager.getSafeModeMetrics()
.getCurrentContainersWithOneReplicaReportedCount().value());
@@ -182,6 +189,7 @@ public void testSafeModeExitRule() throws Exception {
.getNumContainerWithOneReplicaReportedThreshold().value());
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
validateRuleStatus("ContainerSafeModeRule",
"0.00% of [Ratis] Containers(0 / 100) with at least one reported");
testContainerThreshold(containers.subList(0, 25), 0.25);
@@ -202,6 +210,9 @@ public void testSafeModeExitRule() throws Exception {
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
}
private OzoneConfiguration createConf(double healthyPercent,
@@ -306,6 +317,7 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(
scmSafeModeManager.start();
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
if (healthyPipelinePercent > 0) {
validateRuleStatus("HealthyPipelineSafeModeRule",
"healthy Ratis/THREE pipelines");
@@ -367,6 +379,9 @@ public void testSafeModeExitRuleWithPipelineAvailabilityCheck(
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
}
/**
@@ -477,8 +492,10 @@ public void testContainerSafeModeRule() throws Exception {
scmSafeModeManager = new SCMSafeModeManager(config, null, null,
containerManager, serviceManager, queue, scmContext);
+ scmSafeModeManager.start();
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
// When 10 CLOSED containers are reported by DNs, the computed container
// threshold should be 10/20 as there are only 20 CLOSED NON-EMPTY
@@ -494,6 +511,9 @@ public void testContainerSafeModeRule() throws Exception {
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 5);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
}
// We simulate common EC types: EC-2-2-1024K, EC-3-2-1024K, EC-6-3-1024K.
@@ -584,6 +604,7 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {
// Assert SCM is in Safe mode.
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
// Register all DataNodes except last one and assert SCM is in safe mode.
for (int i = 0; i < numOfDns - 1; i++) {
@@ -606,6 +627,9 @@ private void testSafeModeDataNodes(int numOfDns) throws Exception {
HddsTestUtils.createNodeRegistrationContainerReport(containers));
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
10, 1000 * 10);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
}
private void testContainerThreshold(List<ContainerInfo> dnContainers,
@@ -700,11 +724,15 @@ public void testSafeModePipelineExitRule() throws Exception {
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
firePipelineEvent(pipelineManager, pipeline);
GenericTestUtils.waitFor(() -> !scmSafeModeManager.getInSafeMode(),
100, 1000 * 10);
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
pipelineManager.close();
}
@@ -744,6 +772,7 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
// Assert SCM is in Safe mode.
assertTrue(scmSafeModeManager.getInSafeMode());
+ assertEquals(1, scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value());
// stop background pipeline creator as we manually create
// pipeline below
@@ -781,5 +810,8 @@ public void testPipelinesNotCreatedUntilPreCheckPasses() throws Exception {
queue.processAll(5000);
assertTrue(scmSafeModeManager.getPreCheckComplete());
assertFalse(scmSafeModeManager.getInSafeMode());
+ GenericTestUtils.waitFor(() ->
+ scmSafeModeManager.getSafeModeMetrics().getScmInSafeMode().value() == 0,
+ 100, 1000 * 5);
}
}
diff --git a/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json
new file mode 100644
index 00000000000..ac0c291b83a
--- /dev/null
+++ b/hadoop-ozone/dist/src/main/compose/common/grafana/dashboards/Ozone - SCM Safemode.json
@@ -0,0 +1,766 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "prometheus"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": 1,
+ "links": [],
+ "panels": [
+ {
+ "collapsed": false,
+ "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
+ "id": 100,
+ "panels": [],
+ "title": "SCM Safemode: Summary",
+ "type": "row"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "unit": "short",
+ "decimals": 0,
+ "mappings": [
+ {
+ "options": {
+ "0": {
+ "color": "green",
+ "text": "Exited safemode"
+ },
+ "1": {
+ "color": "red",
+ "text": "In Safemode"
+ }
+ },
+ "type": "value"
+ }
+ ],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": 0
+ },
+ {
+ "color": "red",
+ "value": 1
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 },
+ "id": 101,
+ "options": {
+ "alignValue": "center",
+ "legend": {
+ "displayMode": "hidden",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "mergeValues": true,
+ "rowHeight": 0.9,
+ "showValue": "always",
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "safe_mode_metrics_scm_in_safe_mode",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}}",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ }
+ ],
+ "title": "SCM Safemode Status",
+ "type": "state-timeline"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Containers",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Target Threshold"
+ },
+ "properties": [
+ {
+ "id": "custom.lineStyle",
+ "value": {
+ "dash": [10, 10],
+ "fill": "dash"
+ }
+ },
+ {
+ "id": "custom.lineWidth",
+ "value": 3
+ },
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "green",
+ "mode": "fixed"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 },
+ "id": 102,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "max(safe_mode_metrics_num_container_with_one_replica_reported_threshold)",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "Target Threshold",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "safe_mode_metrics_current_containers_with_one_replica_reported_count",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} actual",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "Ratis Containers: Target vs Actual",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Containers",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Target Threshold"
+ },
+ "properties": [
+ {
+ "id": "custom.lineStyle",
+ "value": {
+ "dash": [10, 10],
+ "fill": "dash"
+ }
+ },
+ {
+ "id": "custom.lineWidth",
+ "value": 3
+ },
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "green",
+ "mode": "fixed"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 },
+ "id": 103,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "max(safe_mode_metrics_num_container_with_ec_data_replica_reported_threshold)",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "Target Threshold",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "safe_mode_metrics_current_containers_with_ec_data_replica_reported_count",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} actual",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "EC Containers: Target vs Actual",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Pipelines",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Target Threshold"
+ },
+ "properties": [
+ {
+ "id": "custom.lineStyle",
+ "value": {
+ "dash": [10, 10],
+ "fill": "dash"
+ }
+ },
+ {
+ "id": "custom.lineWidth",
+ "value": 3
+ },
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "green",
+ "mode": "fixed"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 },
+ "id": 104,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "max(safe_mode_metrics_num_healthy_pipelines_threshold)",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "Target Threshold",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "safe_mode_metrics_current_healthy_pipelines_count",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} actual",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "Healthy Pipelines: Target vs Actual",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Pipelines",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Target Threshold"
+ },
+ "properties": [
+ {
+ "id": "custom.lineStyle",
+ "value": {
+ "dash": [10, 10],
+ "fill": "dash"
+ }
+ },
+ {
+ "id": "custom.lineWidth",
+ "value": 3
+ },
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "green",
+ "mode": "fixed"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 },
+ "id": 105,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "max(safe_mode_metrics_num_pipelines_with_atleast_one_replica_reported_threshold)",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "Target Threshold",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "safe_mode_metrics_current_pipelines_with_atleast_one_replica_reported_count",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} actual",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "One-Replica Pipelines: Target vs Actual",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "min": 0,
+ "decimals": 0,
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisBorderShow": false,
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "DataNodes",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "barWidthFactor": 0.6,
+ "drawStyle": "line",
+ "fillOpacity": 0,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "insertNulls": false,
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "matcher": {
+ "id": "byName",
+ "options": "Target Threshold"
+ },
+ "properties": [
+ {
+ "id": "custom.lineStyle",
+ "value": {
+ "dash": [10, 10],
+ "fill": "dash"
+ }
+ },
+ {
+ "id": "custom.lineWidth",
+ "value": 3
+ },
+ {
+ "id": "color",
+ "value": {
+ "fixedColor": "green",
+ "mode": "fixed"
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 },
+ "id": 106,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "disableTextWrap": false,
+ "editorMode": "code",
+ "expr": "max(safe_mode_metrics_num_required_datanodes_threshold)",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "Target Threshold",
+ "range": true,
+ "refId": "A",
+ "useBackend": false
+ },
+ {
+ "disableTextWrap": false,
+ "editorMode": "builder",
+ "expr": "safe_mode_metrics_current_registered_datanodes_count",
+ "fullMetaSearch": false,
+ "includeNullMetadata": true,
+ "legendFormat": "{{hostname}} actual",
+ "range": true,
+ "refId": "B",
+ "useBackend": false
+ }
+ ],
+ "title": "Registered DataNodes: Target vs Actual",
+ "type": "timeseries"
+ }
+ ],
+ "preload": false,
+ "refresh": "45s",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-30m",
+ "to": "now"
+ },
+ "timepicker": {},
+ "timezone": "",
+ "title": "Ozone - SCM Safemode",
+ "weekStart": ""
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]