This is an automated email from the ASF dual-hosted git repository.

xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new 7a180c9d2 Adjusted decrementing the 
missingTopStatePartitionsBeyondThresholdGuage when top-state Instance recovers 
after the threshold failure. (#2591)
7a180c9d2 is described below

commit 7a180c9d277a7d246f1018bf1cd31386c8da7a98
Author: Himanshu Kandwal <[email protected]>
AuthorDate: Tue Aug 15 09:35:26 2023 -0700

    Adjusted decrementing the missingTopStatePartitionsBeyondThresholdGuage 
when top-state Instance recovers after the threshold failure. (#2591)
    
    Adjusted decrementing the missingTopStatePartitionsBeyondThresholdGuage 
when top-state Instance recovers after the threshold failure.
    ---------
    
    Co-authored-by: Himanshu Kandwal <[email protected]>
---
 .../stages/TopStateHandoffReportStage.java         |  6 ++++
 .../monitoring/mbeans/ClusterStatusMonitor.java    |  8 +++++
 .../helix/monitoring/mbeans/ResourceMonitor.java   | 18 +++++++---
 .../mbeans/TestTopStateHandoffMetrics.java         | 42 ++++++++++++++++++----
 .../test/resources/TestTopStateHandoffMetrics.json | 36 ++++++++++---------
 5 files changed, 81 insertions(+), 29 deletions(-)

diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index 6ca0771e2..8d5cf802a 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -491,6 +491,12 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
                 true);
       }
     }
+
+    // In case of recovery after failure, we should decrement the 
missingTopStateBeyondThresholdGauge value.
+    if (clusterStatusMonitor != null && record.isFailed()) {
+      
clusterStatusMonitor.decrementMissingTopStateBeyondThresholdGauge(resourceName);
+    }
+
     removeFromStatsMap(missingTopStateMap, resourceName, partition);
   }
 
diff --git 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
index bf0315d3a..c8e0d6ef3 100644
--- 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
+++ 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
@@ -583,6 +583,14 @@ public class ClusterStatusMonitor implements 
ClusterStatusMonitorMBean {
     }
   }
 
+  public void decrementMissingTopStateBeyondThresholdGauge(String 
resourceName) {
+    ResourceMonitor resourceMonitor = getOrCreateResourceMonitor(resourceName);
+
+    if (resourceMonitor != null) {
+      resourceMonitor.decrementMissingTopStateBeyondThresholdGauge();
+    }
+  }
+
   public void updateRebalancerStats(String resourceName, long 
numPendingRecoveryRebalancePartitions,
       long numPendingLoadRebalancePartitions, long 
numRecoveryRebalanceThrottledPartitions,
       long numLoadRebalanceThrottledPartitions, boolean 
rebalanceThrottledByErrorPartitions) {
diff --git 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
index 8811e0625..263d1923c 100644
--- 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
+++ 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
@@ -72,7 +72,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
   private SimpleDynamicMetric<Long> _successfulTopStateHandoffDurationCounter;
   private SimpleDynamicMetric<Long> _successTopStateHandoffCounter;
 
-  // A new Gauage _missingTopStatePartitionsBeyondThresholdGauge for reporting 
number of partitions with missing top
+  // A new Gauge _missingTopStatePartitionsBeyondThresholdGauge for reporting 
number of partitions with missing top
   // state has been added. Reason of deprecating this two metrics is because 
they are similar and doesn't tell about
   // how many partitions are missing beyond threshold. The average duration of 
different partitions hands-off would not
   // be helpful and can be used to find that out. Please find more info at 
https://github.com/apache/helix/pull/2381
@@ -381,12 +381,10 @@ public class ResourceMonitor extends DynamicMBeanProvider 
{
           _lastResetTime = System.currentTimeMillis();
         }
       } else {
-        // TODO: Deprecated. use MissingTopStateBeyondThresholdGuage to find 
out number of partitions with missing top
+        // TODO: Deprecated. use MissingTopStateBeyondThresholdGauge to find 
out number of partitions with missing top
         //  state.
         
_failedTopStateHandoffCounter.updateValue(_failedTopStateHandoffCounter.getValue()
 + 1);
-        _missingTopStatePartitionsBeyondThresholdGauge
-            
.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue() + 1);
-        _lastResetTime = System.currentTimeMillis();
+        incrementMissingTopStateBeyondThresholdGauge();
       }
       break;
     default:
@@ -487,6 +485,16 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     }
   }
 
+  public void incrementMissingTopStateBeyondThresholdGauge() {
+    
_missingTopStatePartitionsBeyondThresholdGauge.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue()
 + 1);
+    _lastResetTime = System.currentTimeMillis();
+  }
+
+  public void decrementMissingTopStateBeyondThresholdGauge() {
+    
_missingTopStatePartitionsBeyondThresholdGauge.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue()
 - 1);
+    _lastResetTime = System.currentTimeMillis();
+  }
+
   private List<DynamicMetric<?, ?>> buildAttributeList() {
     List<DynamicMetric<?, ?>> attributeList = Lists.newArrayList(
         _numOfPartitions,
diff --git 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index ed5a0ae1b..5ad35d0cf 100644
--- 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++ 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -121,18 +121,21 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     final List<TestCaseConfig> failed;
     final List<TestCaseConfig> fast;
     final List<TestCaseConfig> succeededNonGraceful;
+    final List<TestCaseConfig> failedWithoutRecovery;
 
     @JsonCreator
     public TestConfig(
         @JsonProperty("succeeded") List<TestCaseConfig> succeededCfg,
         @JsonProperty("failed") List<TestCaseConfig> failedCfg,
         @JsonProperty("fast") List<TestCaseConfig> fastCfg,
-        @JsonProperty("succeededNonGraceful") List<TestCaseConfig> nonGraceful
+        @JsonProperty("succeededNonGraceful") List<TestCaseConfig> nonGraceful,
+        @JsonProperty("failedWithoutRecovery") List<TestCaseConfig> 
unrecoveredFailedCfg
     ) {
       succeeded = succeededCfg;
       failed = failedCfg;
       fast = fastCfg;
       succeededNonGraceful = nonGraceful;
+      failedWithoutRecovery = unrecoveredFailedCfg;
     }
   }
 
@@ -224,17 +227,37 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
   public void testTopStateFailedHandoff(TestCaseConfig cfg) {
     // There are two scenarios here :
     //    1. localhost_0 looses top-state at 15000 and top state is recovered 
on localhost_1 at 22000. This means that
-    //       hand-off took 7000 which is greater than threshold (5000) hence 
both failed counter as well
-    //       missingTopStatePartitionsThresholdGuage will be set to 1.
-    //   2. localhost_0 looses top-state at 15000 and it's NEVER recovered. In 
this scenario as well both failed counter
-    //      as well as missingTopStatePartitionsThresholdGuage should be set 
to 1.
-    //   3. localhost_0 looses top-state at 15000 and recovers at 18000.
+    //       hand-off took 7000 which is greater than threshold (5000).
+    //    2. localhost_0 looses top-state at 15000 and recovers at 18000.
+    // In both scenarios as recovery has been achieved so both failed counter 
and missingTopStatePartitionsThresholdGauge
+    // will be set to 0.
     ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
     clusterConfig.setMissTopStateDurationThreshold(5000L);
     setClusterConfig(clusterConfig);
     runTestWithNoInjection(cfg, true);
   }
 
+  @Test(dataProvider = "failedWithoutRecovery")
+  public void testTopStateFailedUnrecoveredHandoff(TestCaseConfig cfg) {
+    // Scenario : localhost_0 looses top-state at 15000 and it's NEVER 
recovered.
+    // In this scenario missingTopStatePartitionsThresholdGauge value should 
be set to 1.
+    ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
+    clusterConfig.setMissTopStateDurationThreshold(5000L);
+    setClusterConfig(clusterConfig);
+
+    preSetup();
+    Range<Long> duration = Range.closed(cfg.duration, cfg.duration);
+    Range<Long> expectedDuration = cfg.isGraceful ? duration : DURATION_ZERO;
+    Range<Long> expectedNonGracefulDuration = cfg.isGraceful ? DURATION_ZERO : 
duration;
+    Range<Long> expectedHelixLatency = cfg.isGraceful ? 
Range.closed(cfg.helixLatency, cfg.helixLatency) : DURATION_ZERO;
+
+    runStageAndVerify(cfg.initialCurrentStates, 
cfg.currentStateWithMissingTopState,
+        cfg.finalCurrentState, null,  0,
+        1, 1,   // No recovered state observed so no change in gauge and 
counter
+        expectedDuration, expectedNonGracefulDuration,
+        expectedDuration, expectedHelixLatency);
+  }
+
   // Test success with no available clue about previous master.
   // For example, controller is just changed to a new node.
   @Test(
@@ -337,6 +360,11 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     return testCaseConfigListToObjectArray(config.failed);
   }
 
+  @DataProvider(name = "failedWithoutRecovery")
+  public Object[][] failedWithoutRecovery() {
+    return testCaseConfigListToObjectArray(config.failedWithoutRecovery);
+  }
+
   @DataProvider(name = "fastCurrentStateInput")
   public Object[][] fastCurrentState() {
     return testCaseConfigListToObjectArray(config.fast);
@@ -363,7 +391,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     Range<Long> expectedHelixLatency =
         cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) : 
DURATION_ZERO;
     runStageAndVerify(cfg.initialCurrentStates, 
cfg.currentStateWithMissingTopState,
-        cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, 
expectFail ? 1 : 0, expectedDuration, expectedNonGracefulDuration,
+        cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, 
0, expectedDuration, expectedNonGracefulDuration,
         expectedDuration, expectedHelixLatency);
   }
 
diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json 
b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
index a0e52f3d2..7e58b6f2e 100644
--- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
+++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
@@ -268,7 +268,7 @@
         },
         "localhost_2": {
           "CurrentState": "SLAVE",
-          "PreviousState": "MASTER",
+          "PreviousState": "OFFLINE",
           "StartTime": 8000,
           "EndTime": 10000
         }
@@ -288,23 +288,23 @@
         },
         "localhost_2": {
           "CurrentState": "SLAVE",
-          "PreviousState": "MASTER",
+          "PreviousState": "OFFLINE",
           "StartTime": 8000,
           "EndTime": 10000
         }
       },
       "HandoffCurrentStates": {
         "localhost_0": {
-          "CurrentState": "SLAVE",
-          "PreviousState": "MASTER",
-          "StartTime": 15000,
-          "EndTime": 18000
+          "CurrentState": "MASTER",
+          "PreviousState": "SLAVE",
+          "StartTime": 20000,
+          "EndTime": 21000
         },
         "localhost_1": {
           "CurrentState": "SLAVE",
           "PreviousState": "OFFLINE",
-          "StartTime": 20000,
-          "EndTime": 22000
+          "StartTime": 8000,
+          "EndTime": 10000
         },
         "localhost_2": {
           "CurrentState": "SLAVE",
@@ -316,7 +316,9 @@
       "Duration": 0,
       "HelixLatency": 0,
       "IsGraceful": false
-    },
+    }
+  ],
+  "failedWithoutRecovery": [
     {
       "InitialCurrentStates": {
         "localhost_0": {
@@ -333,7 +335,7 @@
         },
         "localhost_2": {
           "CurrentState": "SLAVE",
-          "PreviousState": "OFFLINE",
+          "PreviousState": "MASTER",
           "StartTime": 8000,
           "EndTime": 10000
         }
@@ -353,23 +355,23 @@
         },
         "localhost_2": {
           "CurrentState": "SLAVE",
-          "PreviousState": "OFFLINE",
+          "PreviousState": "MASTER",
           "StartTime": 8000,
           "EndTime": 10000
         }
       },
       "HandoffCurrentStates": {
         "localhost_0": {
-          "CurrentState": "MASTER",
-          "PreviousState": "SLAVE",
-          "StartTime": 20000,
-          "EndTime": 21000
+          "CurrentState": "SLAVE",
+          "PreviousState": "MASTER",
+          "StartTime": 15000,
+          "EndTime": 18000
         },
         "localhost_1": {
           "CurrentState": "SLAVE",
           "PreviousState": "OFFLINE",
-          "StartTime": 8000,
-          "EndTime": 10000
+          "StartTime": 20000,
+          "EndTime": 22000
         },
         "localhost_2": {
           "CurrentState": "SLAVE",

Reply via email to