This is an automated email from the ASF dual-hosted git repository.

xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new ebea601e6 Adding a new metric to report number of partitions with 
missing top state beyond threshold (#2381)
ebea601e6 is described below

commit ebea601e6d983708742b6e31f7b9a88c0a1e4ad2
Author: Rahul Rane <[email protected]>
AuthorDate: Mon Feb 27 15:37:27 2023 -0800

    Adding a new metric to report number of partitions with missing top state 
beyond threshold (#2381)
    
    Adding a new metric to report number of partitions with missing top state 
beyond threshold
---
 .../controller/stages/MissingTopStateRecord.java   |  1 +
 .../stages/TopStateHandoffReportStage.java         |  7 ++++--
 .../helix/monitoring/mbeans/ResourceMonitor.java   | 27 +++++++++++++++++++++-
 .../mbeans/TestTopStateHandoffMetrics.java         | 17 ++++++++++----
 4 files changed, 45 insertions(+), 7 deletions(-)

diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
index 119ef9c97..74605bce5 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/MissingTopStateRecord.java
@@ -49,6 +49,7 @@ public class MissingTopStateRecord {
   }
 
   /* package */ void setFailed() {
+    // Mark missingTopStateRecord as failed if partition has missing top state 
beyond threshold value.
     failed = true;
   }
 
diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index 9225be632..a7778b6ac 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -206,7 +206,9 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
 
     if (missingTopStateMap.containsKey(resourceName) && 
missingTopStateMap.get(resourceName)
         .containsKey(partition.getPartitionName())) {
-      // We previously recorded a top state missing, and it's not coming back
+      // We previously recorded a top state missing, and it's coming back.
+      // Note : Decrement missingTopStatePartitionsBeyondGuage in this code 
path because this guage will be incremented
+      //        only if we were able to record it in the first place.
       reportTopStateComesBack(cache, 
currentStateOutput.getCurrentStateMap(resourceName, partition),
           resourceName, partition, clusterStatusMonitor, durationThreshold,
           stateModelDef.getTopState());
@@ -304,7 +306,8 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
 
   /**
    * Check if the given partition of the given resource has a missing top 
state duration larger
-   * than the threshold, if so, report a top state transition failure
+   * than the threshold, if so, report a top state transition failure as well 
increment a guage which reports number
+   * of partitions with missing top state.
    *
    * @param cache cluster data cache
    * @param resourceName resource name
diff --git 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
index a7060f3c1..8811e0625 100644
--- 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
+++ 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
@@ -54,6 +54,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
 
   // Gauges
   private SimpleDynamicMetric<Long> _numOfPartitions;
+  private SimpleDynamicMetric<Long> 
_missingTopStatePartitionsBeyondThresholdGauge;
   private SimpleDynamicMetric<Long> _numOfPartitionsInExternalView;
   private SimpleDynamicMetric<Long> _numOfErrorPartitions;
   private SimpleDynamicMetric<Long> _numNonTopStatePartitions;
@@ -70,7 +71,16 @@ public class ResourceMonitor extends DynamicMBeanProvider {
   // Counters
   private SimpleDynamicMetric<Long> _successfulTopStateHandoffDurationCounter;
   private SimpleDynamicMetric<Long> _successTopStateHandoffCounter;
+
+  // A new Gauage _missingTopStatePartitionsBeyondThresholdGauge for reporting 
number of partitions with missing top
+  // state has been added. Reason of deprecating this two metrics is because 
they are similar and doesn't tell about
+  // how many partitions are missing beyond threshold. The average duration of 
different partitions hands-off would not
+  // be helpful and can be used to find that out. Please find more info at 
https://github.com/apache/helix/pull/2381
+  @Deprecated
   private SimpleDynamicMetric<Long> _failedTopStateHandoffCounter;
+  @Deprecated
+  private HistogramDynamicMetric 
_partitionTopStateNonGracefulHandoffDurationGauge;
+
   private SimpleDynamicMetric<Long> _maxSinglePartitionTopStateHandoffDuration;
   @Deprecated
   private SimpleDynamicMetric<Long> _totalMessageReceived; // This should be 
counter since the value behavior is ever-increasing
@@ -78,7 +88,6 @@ public class ResourceMonitor extends DynamicMBeanProvider {
   // Histograms
   private HistogramDynamicMetric _partitionTopStateHandoffDurationGauge;
   private HistogramDynamicMetric _partitionTopStateHandoffHelixLatencyGauge;
-  private HistogramDynamicMetric 
_partitionTopStateNonGracefulHandoffDurationGauge;
 
   private SimpleDynamicMetric<String> _rebalanceState;
 
@@ -125,6 +134,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     _numLessMinActiveReplicaPartitions =
         new SimpleDynamicMetric("MissingMinActiveReplicaPartitionGauge", 0L);
     _numNonTopStatePartitions = new 
SimpleDynamicMetric("MissingTopStatePartitionGauge", 0L);
+    _missingTopStatePartitionsBeyondThresholdGauge = new 
SimpleDynamicMetric("MissingTopStatePartitionsBeyondThresholdGauge", 0L);
     _numOfErrorPartitions = new SimpleDynamicMetric("ErrorPartitionGauge", 0L);
     _numOfPartitionsInExternalView = new 
SimpleDynamicMetric("ExternalViewPartitionGauge", 0L);
     _numOfPartitions = new SimpleDynamicMetric("PartitionGauge", 0L);
@@ -173,6 +183,10 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     return _numNonTopStatePartitions.getValue();
   }
 
+  public long getMissingTopStatePartitionsBeyondThresholdGuage() {
+    return _missingTopStatePartitionsBeyondThresholdGauge.getValue();
+  }
+
   public long getDifferenceWithIdealStateGauge() {
     return _externalViewIdealStateDiff.getValue();
   }
@@ -193,6 +207,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     return _partitionTopStateHandoffDurationGauge;
   }
 
+  @Deprecated
   public HistogramDynamicMetric 
getPartitionTopStateNonGracefulHandoffDurationGauge() {
     return _partitionTopStateNonGracefulHandoffDurationGauge;
   }
@@ -201,6 +216,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     return _partitionTopStateHandoffHelixLatencyGauge;
   }
 
+  @Deprecated
   public long getFailedTopStateHandoffCounter() {
     return _failedTopStateHandoffCounter.getValue();
   }
@@ -356,6 +372,8 @@ public class ResourceMonitor extends DynamicMBeanProvider {
           _partitionTopStateHandoffDurationGauge.updateValue(totalDuration);
           _partitionTopStateHandoffHelixLatencyGauge.updateValue(helixLatency);
         } else {
+          // TODO: Deprecated. use MissingTopStateBeyondThresholdGuage to find 
out number of partitions with missing top
+          //  state.
           
_partitionTopStateNonGracefulHandoffDurationGauge.updateValue(totalDuration);
         }
         if (totalDuration > 
_maxSinglePartitionTopStateHandoffDuration.getValue()) {
@@ -363,7 +381,12 @@ public class ResourceMonitor extends DynamicMBeanProvider {
           _lastResetTime = System.currentTimeMillis();
         }
       } else {
+        // TODO: Deprecated. use MissingTopStateBeyondThresholdGuage to find 
out number of partitions with missing top
+        //  state.
         
_failedTopStateHandoffCounter.updateValue(_failedTopStateHandoffCounter.getValue()
 + 1);
+        _missingTopStatePartitionsBeyondThresholdGauge
+            
.updateValue(_missingTopStatePartitionsBeyondThresholdGauge.getValue() + 1);
+        _lastResetTime = System.currentTimeMillis();
       }
       break;
     default:
@@ -459,6 +482,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
   public void resetMaxTopStateHandoffGauge() {
     if (_lastResetTime + DEFAULT_RESET_INTERVAL_MS <= 
System.currentTimeMillis()) {
       _maxSinglePartitionTopStateHandoffDuration.updateValue(0L);
+      _missingTopStatePartitionsBeyondThresholdGauge.updateValue(0L);
       _lastResetTime = System.currentTimeMillis();
     }
   }
@@ -469,6 +493,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
         _numOfPartitionsInExternalView,
         _numOfErrorPartitions,
         _numNonTopStatePartitions,
+        _missingTopStatePartitionsBeyondThresholdGauge,
         _numLessMinActiveReplicaPartitions,
         _numLessReplicaPartitions,
         _numPendingRecoveryRebalanceReplicas,
diff --git 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index 7833b1574..b5ec04412 100644
--- 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++ 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -183,7 +183,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
             liMap.remove("localhost_1");
             cache.setLiveInstances(new ArrayList<>(liMap.values()));
           }
-        }, 1, 0,
+        }, 1, 0, 0, // threshold value not set hence no change in guage
         expectedDuration,
         DURATION_ZERO,
         expectedDuration, expectedHelixLatency
@@ -212,7 +212,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
             cache.getInstanceOfflineTimeMap().put("localhost_0", 
lastOfflineTime);
             cache.notifyDataChange(HelixConstants.ChangeType.LIVE_INSTANCE);
           }
-        }, 1, 0,
+        }, 1, 0, 0, // threshold value not set hence no change in guage.
         DURATION_ZERO, // graceful handoff duration should be 0
         expectedDuration, // we should have an record for non-graceful handoff
         expectedDuration, // max handoff should be same as non-graceful handoff
@@ -222,6 +222,12 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
 
   @Test(dataProvider = "failedCurrentStateInput")
   public void testTopStateFailedHandoff(TestCaseConfig cfg) {
+    // There are two scenarios here :
+    //    1. localhost_0 looses top-state at 15000 and top state is recovered 
on localhost_1 at 22000. This means that
+    //       hand-off took 7000 which is greater than threshold (5000) hence 
both failed counter as well
+    //       missingTopStatePartitionsThresholdGuage will be set to 1.
+    //   2. localhost_0 looses top-state at 15000 and it's NEVER recovered. In 
this scenario as well both failed counter
+    //      as well as missingTopStatePartitionsThresholdGuage should be set 
to 1.
     ClusterConfig clusterConfig = new ClusterConfig(_clusterName);
     clusterConfig.setMissTopStateDurationThreshold(5000L);
     setClusterConfig(clusterConfig);
@@ -255,6 +261,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     // helix latency will be less than the mocked helix latency
     runStageAndVerify(Collections.EMPTY_MAP, 
cfg.currentStateWithMissingTopState,
         cfg.finalCurrentState, null, 1, 0,
+        0, // No previous missingTopStateRecord so no change in guage
         Range.closed(0L, helixLatency + userLatency),
         DURATION_ZERO,
         Range.closed(0L, helixLatency + userLatency),
@@ -312,7 +319,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
               cache.cacheMessages(Collections.singletonList(message));
             }
           }
-        }, 1, 0,
+        }, 1, 0, 0,
         Range.closed(durationToVerify, durationToVerify),
         DURATION_ZERO,
         Range.closed(durationToVerify, durationToVerify),
@@ -355,7 +362,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     Range<Long> expectedHelixLatency =
         cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) : 
DURATION_ZERO;
     runStageAndVerify(cfg.initialCurrentStates, 
cfg.currentStateWithMissingTopState,
-        cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, 
expectedDuration, expectedNonGracefulDuration,
+        cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, 
expectFail ? 1 : 0, expectedDuration, expectedNonGracefulDuration,
         expectedDuration, expectedHelixLatency);
   }
 
@@ -412,6 +419,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
       MissingStatesDataCacheInject inject,
       int successCnt,
       int failCnt,
+      int missingTopStatesBeyondThresholdCnt,
       Range<Long> expectedDuration,
       Range<Long> expectedNonGracefulDuration,
       Range<Long> expectedMaxDuration,
@@ -426,6 +434,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
 
     Assert.assertEquals(monitor.getSucceededTopStateHandoffCounter(), 
successCnt);
     Assert.assertEquals(monitor.getFailedTopStateHandoffCounter(), failCnt);
+    
Assert.assertEquals(monitor.getMissingTopStatePartitionsBeyondThresholdGuage(), 
missingTopStatesBeyondThresholdCnt);
 
     long graceful = monitor.getPartitionTopStateHandoffDurationGauge()
         .getAttributeValue(GRACEFUL_HANDOFF_DURATION).longValue();

Reply via email to