This is an automated email from the ASF dual-hosted git repository.
jxue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new 2568f4a37 MissingTopState when the leader comes back on the same host
(#2514)
2568f4a37 is described below
commit 2568f4a37a497ade21d9d680d2d0e094600bebd2
Author: Komal Desai <[email protected]>
AuthorDate: Mon Jun 5 12:50:07 2023 -0700
MissingTopState when the leader comes back on the same host (#2514)
If you have a single replica and that goes offline and comes back on the
same host, we don't reset the counter. This fixes the behavior.
---
.../stages/TopStateHandoffReportStage.java | 15 ++---
.../mbeans/TestTopStateHandoffMetrics.java | 1 +
.../test/resources/TestTopStateHandoffMetrics.json | 65 ++++++++++++++++++++++
3 files changed, 71 insertions(+), 10 deletions(-)
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index a7778b6ac..9588d1801 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -67,10 +67,9 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
// TODO: remove this if-else after splitting controller
if (cache instanceof WorkflowControllerDataProvider) {
throw new StageException("TopStateHandoffReportStage can only be used in
resource pipeline");
- } else {
- updateTopStateStatus((ResourceControllerDataProvider) cache,
clusterStatusMonitor,
- resourceMap, currentStateOutput, lastPipelineFinishTimestamp);
- }
+ }
+ updateTopStateStatus((ResourceControllerDataProvider) cache,
clusterStatusMonitor,
+ resourceMap, currentStateOutput, lastPipelineFinishTimestamp);
}
private void updateTopStateStatus(ResourceControllerDataProvider cache,
@@ -212,8 +211,7 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
reportTopStateComesBack(cache,
currentStateOutput.getCurrentStateMap(resourceName, partition),
resourceName, partition, clusterStatusMonitor, durationThreshold,
stateModelDef.getTopState());
- } else if (lastTopStateInstance != null && !lastTopStateInstance
- .equals(currentTopStateInstance)) {
+ } else if (lastTopStateInstance != null) {
// With no missing top state record, but top state instance changed,
// we observed an entire top state handoff process
reportSingleTopStateHandoff(cache, lastTopStateInstance,
currentTopStateInstance,
@@ -242,9 +240,6 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
private void reportSingleTopStateHandoff(ResourceControllerDataProvider
cache, String lastTopStateInstance,
String curTopStateInstance, String resourceName, Partition partition,
ClusterStatusMonitor clusterStatusMonitor, long
lastPipelineFinishTimestamp) {
- if (curTopStateInstance.equals(lastTopStateInstance)) {
- return;
- }
// Current state output generation logic guarantees that current top state
instance
// must be a live instance
@@ -260,7 +255,7 @@ public class TopStateHandoffReportStage extends
AbstractBaseStage {
long fromTopStateUserLatency = DEFAULT_HANDOFF_USER_LATENCY;
// Make sure last top state instance has not bounced during cluster data
cache refresh
- if (cache.getLiveInstances().containsKey(lastTopStateInstance)) {
+ if (!curTopStateInstance.equals(lastTopStateInstance) &&
cache.getLiveInstances().containsKey(lastTopStateInstance)) {
String lastTopStateSession =
cache.getLiveInstances().get(lastTopStateInstance).getEphemeralOwner();
// We need this null check as there are test cases creating incomplete
current state
diff --git
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index b5ec04412..ed5a0ae1b 100644
---
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -228,6 +228,7 @@ public class TestTopStateHandoffMetrics extends
BaseStageTest {
// missingTopStatePartitionsThresholdGuage will be set to 1.
// 2. localhost_0 looses top-state at 15000 and it's NEVER recovered. In
this scenario as well both failed counter
// as well as missingTopStatePartitionsThresholdGuage should be set
to 1.
+ // 3. localhost_0 looses top-state at 15000 and recovers at 18000.
ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
clusterConfig.setMissTopStateDurationThreshold(5000L);
setClusterConfig(clusterConfig);
diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
index c1e579fbc..a0e52f3d2 100644
--- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
+++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
@@ -316,6 +316,71 @@
"Duration": 0,
"HelixLatency": 0,
"IsGraceful": false
+ },
+ {
+ "InitialCurrentStates": {
+ "localhost_0": {
+ "CurrentState": "MASTER",
+ "PreviousState": "SLAVE",
+ "StartTime": 10000,
+ "EndTime": 11000
+ },
+ "localhost_1": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "OFFLINE",
+ "StartTime": 8000,
+ "EndTime": 10000
+ },
+ "localhost_2": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "OFFLINE",
+ "StartTime": 8000,
+ "EndTime": 10000
+ }
+ },
+ "MissingTopStates": {
+ "localhost_0": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "MASTER",
+ "StartTime": 15000,
+ "EndTime": 18000
+ },
+ "localhost_1": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "OFFLINE",
+ "StartTime": 8000,
+ "EndTime": 10000
+ },
+ "localhost_2": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "OFFLINE",
+ "StartTime": 8000,
+ "EndTime": 10000
+ }
+ },
+ "HandoffCurrentStates": {
+ "localhost_0": {
+ "CurrentState": "MASTER",
+ "PreviousState": "SLAVE",
+ "StartTime": 20000,
+ "EndTime": 21000
+ },
+ "localhost_1": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "OFFLINE",
+ "StartTime": 8000,
+ "EndTime": 10000
+ },
+ "localhost_2": {
+ "CurrentState": "SLAVE",
+ "PreviousState": "OFFLINE",
+ "StartTime": 8000,
+ "EndTime": 10000
+ }
+ },
+ "Duration": 0,
+ "HelixLatency": 0,
+ "IsGraceful": false
}
],
"fast": [