This is an automated email from the ASF dual-hosted git repository.

jxue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new 2568f4a37 MissingTopState when the leader comes back on the same host 
(#2514)
2568f4a37 is described below

commit 2568f4a37a497ade21d9d680d2d0e094600bebd2
Author: Komal Desai <[email protected]>
AuthorDate: Mon Jun 5 12:50:07 2023 -0700

    MissingTopState when the leader comes back on the same host (#2514)
    
    If you have a single replica and that goes offline and comes back on the 
same host, we don't reset the counter. This fixes the behavior.
---
 .../stages/TopStateHandoffReportStage.java         | 15 ++---
 .../mbeans/TestTopStateHandoffMetrics.java         |  1 +
 .../test/resources/TestTopStateHandoffMetrics.json | 65 ++++++++++++++++++++++
 3 files changed, 71 insertions(+), 10 deletions(-)

diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index a7778b6ac..9588d1801 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -67,10 +67,9 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
     // TODO: remove this if-else after splitting controller
     if (cache instanceof WorkflowControllerDataProvider) {
       throw new StageException("TopStateHandoffReportStage can only be used in 
resource pipeline");
-    } else {
-      updateTopStateStatus((ResourceControllerDataProvider) cache, 
clusterStatusMonitor,
-          resourceMap, currentStateOutput, lastPipelineFinishTimestamp);
-    }
+    } 
+    updateTopStateStatus((ResourceControllerDataProvider) cache, 
clusterStatusMonitor,
+        resourceMap, currentStateOutput, lastPipelineFinishTimestamp);
   }
 
   private void updateTopStateStatus(ResourceControllerDataProvider cache,
@@ -212,8 +211,7 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
       reportTopStateComesBack(cache, 
currentStateOutput.getCurrentStateMap(resourceName, partition),
           resourceName, partition, clusterStatusMonitor, durationThreshold,
           stateModelDef.getTopState());
-    } else if (lastTopStateInstance != null && !lastTopStateInstance
-        .equals(currentTopStateInstance)) {
+    } else if (lastTopStateInstance != null) {
       // With no missing top state record, but top state instance changed,
       // we observed an entire top state handoff process
       reportSingleTopStateHandoff(cache, lastTopStateInstance, 
currentTopStateInstance,
@@ -242,9 +240,6 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
   private void reportSingleTopStateHandoff(ResourceControllerDataProvider 
cache, String lastTopStateInstance,
       String curTopStateInstance, String resourceName, Partition partition,
       ClusterStatusMonitor clusterStatusMonitor, long 
lastPipelineFinishTimestamp) {
-    if (curTopStateInstance.equals(lastTopStateInstance)) {
-      return;
-    }
 
     // Current state output generation logic guarantees that current top state 
instance
     // must be a live instance
@@ -260,7 +255,7 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
     long fromTopStateUserLatency = DEFAULT_HANDOFF_USER_LATENCY;
 
     // Make sure last top state instance has not bounced during cluster data 
cache refresh
-    if (cache.getLiveInstances().containsKey(lastTopStateInstance)) {
+    if (!curTopStateInstance.equals(lastTopStateInstance) && 
cache.getLiveInstances().containsKey(lastTopStateInstance)) {
       String lastTopStateSession =
           
cache.getLiveInstances().get(lastTopStateInstance).getEphemeralOwner();
       // We need this null check as there are test cases creating incomplete 
current state
diff --git 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index b5ec04412..ed5a0ae1b 100644
--- 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++ 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -228,6 +228,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     //       missingTopStatePartitionsThresholdGuage will be set to 1.
     //   2. localhost_0 looses top-state at 15000 and it's NEVER recovered. In 
this scenario as well both failed counter
     //      as well as missingTopStatePartitionsThresholdGuage should be set 
to 1.
+    //   3. localhost_0 looses top-state at 15000 and recovers at 18000.
     ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
     clusterConfig.setMissTopStateDurationThreshold(5000L);
     setClusterConfig(clusterConfig);
diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json 
b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
index c1e579fbc..a0e52f3d2 100644
--- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
+++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
@@ -316,6 +316,71 @@
       "Duration": 0,
       "HelixLatency": 0,
       "IsGraceful": false
+    },
+    {
+      "InitialCurrentStates": {
+        "localhost_0": {
+          "CurrentState": "MASTER",
+          "PreviousState": "SLAVE",
+          "StartTime": 10000,
+          "EndTime": 11000
+        },
+        "localhost_1": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "OFFLINE",
+          "StartTime": 8000,
+          "EndTime": 10000
+        },
+        "localhost_2": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "OFFLINE",
+          "StartTime": 8000,
+          "EndTime": 10000
+        }
+      },
+      "MissingTopStates": {
+        "localhost_0": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "MASTER",
+          "StartTime": 15000,
+          "EndTime": 18000
+        },
+        "localhost_1": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "OFFLINE",
+          "StartTime": 8000,
+          "EndTime": 10000
+        },
+        "localhost_2": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "OFFLINE",
+          "StartTime": 8000,
+          "EndTime": 10000
+        }
+      },
+      "HandoffCurrentStates": {
+        "localhost_0": {
+          "CurrentState": "MASTER",
+          "PreviousState": "SLAVE",
+          "StartTime": 20000,
+          "EndTime": 21000
+        },
+        "localhost_1": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "OFFLINE",
+          "StartTime": 8000,
+          "EndTime": 10000
+        },
+        "localhost_2": {
+          "CurrentState": "SLAVE",
+          "PreviousState": "OFFLINE",
+          "StartTime": 8000,
+          "EndTime": 10000
+        }
+      },
+      "Duration": 0,
+      "HelixLatency": 0,
+      "IsGraceful": false
     }
   ],
   "fast": [

Reply via email to