This is an automated email from the ASF dual-hosted git repository.
xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new ebea601e6 Adding a new metric to report number of partitions with
missing top state beyond threshold (#2381)
ebea601e6 is described below
commit ebea601e6d983708742b6e31f7b9a88c0a1e4ad2
Author: Rahul Rane <[email protected]>
AuthorDate: Mon Feb 27 15:37:27 2023 -0800
Adding a new metric to report number of partitions with missing top state
beyond threshold (#2381)
Adding a new metric to report number of partitions with missing top state
beyond threshold
---
.../controller/stages/MissingTopStateRecord.java | 1 +
.../stages/TopStateHandoffReportStage.java | 7 ++++--
.../helix/monitoring/mbeans/ResourceMonitor.java | 27 +++++++++++++++++++++-
.../mbeans/TestTopStateHandoffMetrics.java | 17 ++++++++++----
4 files changed, 45 insertions(+), 7 deletions(-)
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
b/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
index 119ef9c97..74605bce5 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
@@ -49,6 +49,7 @@ public class MissingTopStateRecord {
}
/* package */ void setFailed() {
+ // Mark missingTopStateRecord as failed if partition has missing top state
beyond threshold value.
failed = true;
}
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index 9225be632..a7778b6ac 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -206,7 +206,9 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
if (missingTopStateMap.containsKey(resourceName) &&
missingTopStateMap.get(resourceName)
.containsKey(partition.getPartitionName())) {
- // We previously recorded a top state missing, and it's not coming back
+ // We previously recorded a top state missing, and it's coming back.
+ // Note : Decrement missingTopStatePartitionsBeyondGuage in this code
path because this guage will be incremented
+ // only if we were able to record it in the first place.
reportTopStateComesBack(cache,
currentStateOutput.getCurrentStateMap(resourceName, partition),
resourceName, partition, clusterStatusMonitor, durationThreshold,
stateModelDef.getTopState());
@@ -304,7 +306,8 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
/**
* Check if the given partition of the given resource has a missing top
state duration larger
- * than the threshold, if so, report a top state transition failure
+ * than the threshold, if so, report a top state transition failure as well
increment a guage which reports number
+ * of partitions with missing top state.
*
* @param cache cluster data cache
* @param resourceName resource name
diff --git
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
index a7060f3c1..8811e0625 100644
---
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
+++
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
@@ -54,6 +54,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
// Gauges
private SimpleDynamicMetric<Long> _numOfPartitions;
+ private SimpleDynamicMetric<Long>
_missingTopStatePartitionsBeyondThresholdGauge;
private SimpleDynamicMetric<Long> _numOfPartitionsInExternalView;
private SimpleDynamicMetric<Long> _numOfErrorPartitions;
private SimpleDynamicMetric<Long> _numNonTopStatePartitions;
@@ -70,7 +71,16 @@ public class ResourceMonitor extends DynamicMBeanProvider {
// Counters
private SimpleDynamicMetric<Long> _successfulTopStateHandoffDurationCounter;
private SimpleDynamicMetric<Long> _successTopStateHandoffCounter;
+
+ // A new Gauage _missingTopStatePartitionsBeyondThresholdGauge for reporting
number of partitions with missing top
+ // state has been added. Reason of deprecating this two metrics is because
they are similar and doesn't tell about
+ // how many partitions are missing beyond threshold. The average duration of
different partitions hands-off would not
+ // be helpful and can be used to find that out. Please find more info at
https://github.com/apache/helix/pull/2381
+ @Deprecated
private SimpleDynamicMetric<Long> _failedTopStateHandoffCounter;
+ @Deprecated
+ private HistogramDynamicMetric
_partitionTopStateNonGracefulHandoffDurationGauge;
+
private SimpleDynamicMetric<Long> _maxSinglePartitionTopStateHandoffDuration;
@Deprecated
private SimpleDynamicMetric<Long> _totalMessageReceived; // This should be
counter since the value behavior is ever-increasing
@@ -78,7 +88,6 @@ public class ResourceMonitor extends DynamicMBeanProvider {
// Histograms
private HistogramDynamicMetric _partitionTopStateHandoffDurationGauge;
private HistogramDynamicMetric _partitionTopStateHandoffHelixLatencyGauge;
- private HistogramDynamicMetric
_partitionTopStateNonGracefulHandoffDurationGauge;
private SimpleDynamicMetric<String> _rebalanceState;
@@ -125,6 +134,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
_numLessMinActiveReplicaPartitions =
new SimpleDynamicMetric("MissingMinActiveReplicaPartitionGauge", 0L);
_numNonTopStatePartitions = new
SimpleDynamicMetric("MissingTopStatePartitionGauge", 0L);
+ _missingTopStatePartitionsBeyondThresholdGauge = new
SimpleDynamicMetric("MissingTopStatePartitionsBeyondThresholdGauge", 0L);
_numOfErrorPartitions = new SimpleDynamicMetric("ErrorPartitionGauge", 0L);
_numOfPartitionsInExternalView = new
SimpleDynamicMetric("ExternalViewPartitionGauge", 0L);
_numOfPartitions = new SimpleDynamicMetric("PartitionGauge", 0L);
@@ -173,6 +183,10 @@ public class ResourceMonitor extends DynamicMBeanProvider {
return _numNonTopStatePartitions.getValue();
}
+ public long getMissingTopStatePartitionsBeyondThresholdGuage() {
+ return _missingTopStatePartitionsBeyondThresholdGauge.getValue();
+ }
+
public long getDifferenceWithIdealStateGauge() {
return _externalViewIdealStateDiff.getValue();
}
@@ -193,6 +207,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
return _partitionTopStateHandoffDurationGauge;
}
+ @Deprecated
public HistogramDynamicMetric
getPartitionTopStateNonGracefulHandoffDurationGauge() {
return _partitionTopStateNonGracefulHandoffDurationGauge;
}
@@ -201,6 +216,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
return _partitionTopStateHandoffHelixLatencyGauge;
}
+ @Deprecated
public long getFailedTopStateHandoffCounter() {
return _failedTopStateHandoffCounter.getValue();
}
@@ -356,6 +372,8 @@ public class ResourceMonitor extends DynamicMBeanProvider {
_partitionTopStateHandoffDurationGauge.updateValue(totalDuration);
_partitionTopStateHandoffHelixLatencyGauge.updateValue(helixLatency);
} else {
+ // TODO: Deprecated. use MissingTopStateBeyondThresholdGuage to find
out number of partitions with missing top
+ // state.
_partitionTopStateNonGracefulHandoffDurationGauge.updateValue(totalDuration);
}
if (totalDuration >
_maxSinglePartitionTopStateHandoffDuration.getValue()) {
@@ -363,7 +381,12 @@ public class ResourceMonitor extends DynamicMBeanProvider {
_lastResetTime = System.currentTimeMillis();
}
} else {
+ // TODO: Deprecated. use MissingTopStateBeyondThresholdGuage to find
out number of partitions with missing top
+ // state.
_failedTopStateHandoffCounter.updateValue(_failedTopStateHandoffCounter.getValue()
+ 1);
+ _missingTopStatePartitionsBeyondThresholdGauge
+
.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue() + 1);
+ _lastResetTime = System.currentTimeMillis();
}
break;
default:
@@ -459,6 +482,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
public void resetMaxTopStateHandoffGauge() {
if (_lastResetTime + DEFAULT_RESET_INTERVAL_MS <=
System.currentTimeMillis()) {
_maxSinglePartitionTopStateHandoffDuration.updateValue(0L);
+ _missingTopStatePartitionsBeyondThresholdGauge.updateValue(0L);
_lastResetTime = System.currentTimeMillis();
}
}
@@ -469,6 +493,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
_numOfPartitionsInExternalView,
_numOfErrorPartitions,
_numNonTopStatePartitions,
+ _missingTopStatePartitionsBeyondThresholdGauge,
_numLessMinActiveReplicaPartitions,
_numLessReplicaPartitions,
_numPendingRecoveryRebalanceReplicas,
diff --git
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index 7833b1574..b5ec04412 100644
---
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -183,7 +183,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
liMap.remove("localhost_1");
cache.setLiveInstances(new ArrayList<>(liMap.values()));
}
- }, 1, 0,
+ }, 1, 0, 0, // threshold value not set hence no change in guage
expectedDuration,
DURATION_ZERO,
expectedDuration, expectedHelixLatency
@@ -212,7 +212,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
cache.getInstanceOfflineTimeMap().put("localhost_0",
lastOfflineTime);
cache.notifyDataChange(HelixConstants.ChangeType.LIVE_INSTANCE);
}
- }, 1, 0,
+ }, 1, 0, 0, // threshold value not set hence no change in guage.
DURATION_ZERO, // graceful handoff duration should be 0
expectedDuration, // we should have an record for non-graceful handoff
expectedDuration, // max handoff should be same as non-graceful handoff
@@ -222,6 +222,12 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
@Test(dataProvider = "failedCurrentStateInput")
public void testTopStateFailedHandoff(TestCaseConfig cfg) {
+ // There are two scenarios here :
+ // 1. localhost_0 looses top-state at 15000 and top state is recovered
on localhost_1 at 22000. This means that
+ // hand-off took 7000 which is greater than threshold (5000) hence
both failed counter as well
+ // missingTopStatePartitionsThresholdGuage will be set to 1.
+ // 2. localhost_0 looses top-state at 15000 and it's NEVER recovered. In
this scenario as well both failed counter
+ // as well as missingTopStatePartitionsThresholdGuage should be set
to 1.
ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
clusterConfig.setMissTopStateDurationThreshold(5000L);
setClusterConfig(clusterConfig);
@@ -255,6 +261,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
// helix latency will be less than the mocked helix latency
runStageAndVerify(Collections.EMPTY_MAP,
cfg.currentStateWithMissingTopState,
cfg.finalCurrentState, null, 1, 0,
+ 0, // No previous missingTopStateRecord so no change in guage
Range.closed(0L, helixLatency + userLatency),
DURATION_ZERO,
Range.closed(0L, helixLatency + userLatency),
@@ -312,7 +319,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
cache.cacheMessages(Collections.singletonList(message));
}
}
- }, 1, 0,
+ }, 1, 0, 0,
Range.closed(durationToVerify, durationToVerify),
DURATION_ZERO,
Range.closed(durationToVerify, durationToVerify),
@@ -355,7 +362,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
Range<Long> expectedHelixLatency =
cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) :
DURATION_ZERO;
runStageAndVerify(cfg.initialCurrentStates,
cfg.currentStateWithMissingTopState,
- cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0,
expectedDuration, expectedNonGracefulDuration,
+ cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0,
expectFail ? 1 : 0, expectedDuration, expectedNonGracefulDuration,
expectedDuration, expectedHelixLatency);
}
@@ -412,6 +419,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
MissingStatesDataCacheInject inject,
int successCnt,
int failCnt,
+ int missingTopStatesBeyondThresholdCnt,
Range<Long> expectedDuration,
Range<Long> expectedNonGracefulDuration,
Range<Long> expectedMaxDuration,
@@ -426,6 +434,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
Assert.assertEquals(monitor.getSucceededTopStateHandoffCounter(),
successCnt);
Assert.assertEquals(monitor.getFailedTopStateHandoffCounter(), failCnt);
+
Assert.assertEquals(monitor.getMissingTopStatePartitionsBeyondThresholdGuage(),
missingTopStatesBeyondThresholdCnt);
long graceful = monitor.getPartitionTopStateHandoffDurationGauge()
.getAttributeValue(GRACEFUL_HANDOFF_DURATION).longValue();