This is an automated email from the ASF dual-hosted git repository.
xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new 7a180c9d2 Adjusted decrementing the
missingTopStatePartitionsBeyondThresholdGuage when top-state Instance recovers
after the threshold failure. (#2591)
7a180c9d2 is described below
commit 7a180c9d277a7d246f1018bf1cd31386c8da7a98
Author: Himanshu Kandwal <[email protected]>
AuthorDate: Tue Aug 15 09:35:26 2023 -0700
Adjusted decrementing the missingTopStatePartitionsBeyondThresholdGuage
when top-state Instance recovers after the threshold failure. (#2591)
Adjusted decrementing the missingTopStatePartitionsBeyondThresholdGuage
when top-state Instance recovers after the threshold failure.
---------
Co-authored-by: Himanshu Kandwal <[email protected]>
---
.../stages/TopStateHandoffReportStage.java | 6 ++++
.../monitoring/mbeans/ClusterStatusMonitor.java | 8 +++++
.../helix/monitoring/mbeans/ResourceMonitor.java | 18 +++++++---
.../mbeans/TestTopStateHandoffMetrics.java | 42 ++++++++++++++++++----
.../test/resources/TestTopStateHandoffMetrics.json | 36 ++++++++++---------
5 files changed, 81 insertions(+), 29 deletions(-)
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index 6ca0771e2..8d5cf802a 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -491,6 +491,12 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
true);
}
}
+
+ // In case of recovery after failure, we should decrement the
missingTopStateBeyondThresholdGauge value.
+ if (clusterStatusMonitor != null && record.isFailed()) {
+
clusterStatusMonitor.decrementMissingTopStateBeyondThresholdGauge(resourceName);
+ }
+
removeFromStatsMap(missingTopStateMap, resourceName, partition);
}
diff --git
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
index bf0315d3a..c8e0d6ef3 100644
---
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
+++
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
@@ -583,6 +583,14 @@ public class ClusterStatusMonitor implements
ClusterStatusMonitorMBean {
}
}
+ public void decrementMissingTopStateBeyondThresholdGauge(String
resourceName) {
+ ResourceMonitor resourceMonitor = getOrCreateResourceMonitor(resourceName);
+
+ if (resourceMonitor != null) {
+ resourceMonitor.decrementMissingTopStateBeyondThresholdGauge();
+ }
+ }
+
public void updateRebalancerStats(String resourceName, long
numPendingRecoveryRebalancePartitions,
long numPendingLoadRebalancePartitions, long
numRecoveryRebalanceThrottledPartitions,
long numLoadRebalanceThrottledPartitions, boolean
rebalanceThrottledByErrorPartitions) {
diff --git
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
index 8811e0625..263d1923c 100644
---
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
+++
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
@@ -72,7 +72,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
private SimpleDynamicMetric<Long> _successfulTopStateHandoffDurationCounter;
private SimpleDynamicMetric<Long> _successTopStateHandoffCounter;
- // A new Gauage _missingTopStatePartitionsBeyondThresholdGauge for reporting
number of partitions with missing top
+ // A new Gauge _missingTopStatePartitionsBeyondThresholdGauge for reporting
number of partitions with missing top
// state has been added. Reason of deprecating this two metrics is because
they are similar and doesn't tell about
// how many partitions are missing beyond threshold. The average duration of
different partitions hands-off would not
// be helpful and can be used to find that out. Please find more info at
https://github.com/apache/helix/pull/2381
@@ -381,12 +381,10 @@ public class ResourceMonitor extends DynamicMBeanProvider
{
_lastResetTime = System.currentTimeMillis();
}
} else {
- // TODO: Deprecated. use MissingTopStateBeyondThresholdGuage to find
out number of partitions with missing top
+ // TODO: Deprecated. use MissingTopStateBeyondThresholdGauge to find
out number of partitions with missing top
// state.
_failedTopStateHandoffCounter.updateValue(_failedTopStateHandoffCounter.getValue()
+ 1);
- _missingTopStatePartitionsBeyondThresholdGauge
-
.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue() + 1);
- _lastResetTime = System.currentTimeMillis();
+ incrementMissingTopStateBeyondThresholdGauge();
}
break;
default:
@@ -487,6 +485,16 @@ public class ResourceMonitor extends DynamicMBeanProvider {
}
}
+ public void incrementMissingTopStateBeyondThresholdGauge() {
+
_missingTopStatePartitionsBeyondThresholdGauge.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue()
+ 1);
+ _lastResetTime = System.currentTimeMillis();
+ }
+
+ public void decrementMissingTopStateBeyondThresholdGauge() {
+
_missingTopStatePartitionsBeyondThresholdGauge.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue()
- 1);
+ _lastResetTime = System.currentTimeMillis();
+ }
+
private List<DynamicMetric<?, ?>> buildAttributeList() {
List<DynamicMetric<?, ?>> attributeList = Lists.newArrayList(
_numOfPartitions,
diff --git
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index ed5a0ae1b..5ad35d0cf 100644
---
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -121,18 +121,21 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
final List<TestCaseConfig> failed;
final List<TestCaseConfig> fast;
final List<TestCaseConfig> succeededNonGraceful;
+ final List<TestCaseConfig> failedWithoutRecovery;
@JsonCreator
public TestConfig(
@JsonProperty("succeeded") List<TestCaseConfig> succeededCfg,
@JsonProperty("failed") List<TestCaseConfig> failedCfg,
@JsonProperty("fast") List<TestCaseConfig> fastCfg,
- @JsonProperty("succeededNonGraceful") List<TestCaseConfig> nonGraceful
+ @JsonProperty("succeededNonGraceful") List<TestCaseConfig> nonGraceful,
+ @JsonProperty("failedWithoutRecovery") List<TestCaseConfig>
unrecoveredFailedCfg
) {
succeeded = succeededCfg;
failed = failedCfg;
fast = fastCfg;
succeededNonGraceful = nonGraceful;
+ failedWithoutRecovery = unrecoveredFailedCfg;
}
}
@@ -224,17 +227,37 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
public void testTopStateFailedHandoff(TestCaseConfig cfg) {
// There are two scenarios here :
// 1. localhost_0 looses top-state at 15000 and top state is recovered
on localhost_1 at 22000. This means that
- // hand-off took 7000 which is greater than threshold (5000) hence
both failed counter as well
- // missingTopStatePartitionsThresholdGuage will be set to 1.
- // 2. localhost_0 looses top-state at 15000 and it's NEVER recovered. In
this scenario as well both failed counter
- // as well as missingTopStatePartitionsThresholdGuage should be set
to 1.
- // 3. localhost_0 looses top-state at 15000 and recovers at 18000.
+ // hand-off took 7000 which is greater than threshold (5000).
+ // 2. localhost_0 looses top-state at 15000 and recovers at 18000.
+ // In both scenarios as recovery has been achieved so both failed counter
and missingTopStatePartitionsThresholdGauge
+ // will be set to 0.
ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
clusterConfig.setMissTopStateDurationThreshold(5000L);
setClusterConfig(clusterConfig);
runTestWithNoInjection(cfg, true);
}
+ @Test(dataProvider = "failedWithoutRecovery")
+ public void testTopStateFailedUnrecoveredHandoff(TestCaseConfig cfg) {
+ // Scenario : localhost_0 looses top-state at 15000 and it's NEVER
recovered.
+ // In this scenario missingTopStatePartitionsThresholdGauge value should
be set to 1.
+ ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
+ clusterConfig.setMissTopStateDurationThreshold(5000L);
+ setClusterConfig(clusterConfig);
+
+ preSetup();
+ Range<Long> duration = Range.closed(cfg.duration, cfg.duration);
+ Range<Long> expectedDuration = cfg.isGraceful ? duration : DURATION_ZERO;
+ Range<Long> expectedNonGracefulDuration = cfg.isGraceful ? DURATION_ZERO :
duration;
+ Range<Long> expectedHelixLatency = cfg.isGraceful ?
Range.closed(cfg.helixLatency, cfg.helixLatency) : DURATION_ZERO;
+
+ runStageAndVerify(cfg.initialCurrentStates,
cfg.currentStateWithMissingTopState,
+ cfg.finalCurrentState, null, 0,
+ 1, 1, // No recovered state observed so no change in gauge and
counter
+ expectedDuration, expectedNonGracefulDuration,
+ expectedDuration, expectedHelixLatency);
+ }
+
// Test success with no available clue about previous master.
// For example, controller is just changed to a new node.
@Test(
@@ -337,6 +360,11 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
return testCaseConfigListToObjectArray(config.failed);
}
+ @DataProvider(name = "failedWithoutRecovery")
+ public Object[][] failedWithoutRecovery() {
+ return testCaseConfigListToObjectArray(config.failedWithoutRecovery);
+ }
+
@DataProvider(name = "fastCurrentStateInput")
public Object[][] fastCurrentState() {
return testCaseConfigListToObjectArray(config.fast);
@@ -363,7 +391,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
Range<Long> expectedHelixLatency =
cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) :
DURATION_ZERO;
runStageAndVerify(cfg.initialCurrentStates,
cfg.currentStateWithMissingTopState,
- cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0,
expectFail ? 1 : 0, expectedDuration, expectedNonGracefulDuration,
+ cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0,
0, expectedDuration, expectedNonGracefulDuration,
expectedDuration, expectedHelixLatency);
}
diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
index a0e52f3d2..7e58b6f2e 100644
--- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
+++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
@@ -268,7 +268,7 @@
},
"localhost_2": {
"CurrentState": "SLAVE",
- "PreviousState": "MASTER",
+ "PreviousState": "OFFLINE",
"StartTime": 8000,
"EndTime": 10000
}
@@ -288,23 +288,23 @@
},
"localhost_2": {
"CurrentState": "SLAVE",
- "PreviousState": "MASTER",
+ "PreviousState": "OFFLINE",
"StartTime": 8000,
"EndTime": 10000
}
},
"HandoffCurrentStates": {
"localhost_0": {
- "CurrentState": "SLAVE",
- "PreviousState": "MASTER",
- "StartTime": 15000,
- "EndTime": 18000
+ "CurrentState": "MASTER",
+ "PreviousState": "SLAVE",
+ "StartTime": 20000,
+ "EndTime": 21000
},
"localhost_1": {
"CurrentState": "SLAVE",
"PreviousState": "OFFLINE",
- "StartTime": 20000,
- "EndTime": 22000
+ "StartTime": 8000,
+ "EndTime": 10000
},
"localhost_2": {
"CurrentState": "SLAVE",
@@ -316,7 +316,9 @@
"Duration": 0,
"HelixLatency": 0,
"IsGraceful": false
- },
+ }
+ ],
+ "failedWithoutRecovery": [
{
"InitialCurrentStates": {
"localhost_0": {
@@ -333,7 +335,7 @@
},
"localhost_2": {
"CurrentState": "SLAVE",
- "PreviousState": "OFFLINE",
+ "PreviousState": "MASTER",
"StartTime": 8000,
"EndTime": 10000
}
@@ -353,23 +355,23 @@
},
"localhost_2": {
"CurrentState": "SLAVE",
- "PreviousState": "OFFLINE",
+ "PreviousState": "MASTER",
"StartTime": 8000,
"EndTime": 10000
}
},
"HandoffCurrentStates": {
"localhost_0": {
- "CurrentState": "MASTER",
- "PreviousState": "SLAVE",
- "StartTime": 20000,
- "EndTime": 21000
+ "CurrentState": "SLAVE",
+ "PreviousState": "MASTER",
+ "StartTime": 15000,
+ "EndTime": 18000
},
"localhost_1": {
"CurrentState": "SLAVE",
"PreviousState": "OFFLINE",
- "StartTime": 8000,
- "EndTime": 10000
+ "StartTime": 20000,
+ "EndTime": 22000
},
"localhost_2": {
"CurrentState": "SLAVE",