Repository: helix
Updated Branches:
  refs/heads/master 45009e2c4 -> ccf263c91


[HELIX-785] Record helix latency instead of user latency in top state handoff 
metrics


Project: http://git-wip-us.apache.org/repos/asf/helix/repo
Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/ccf263c9
Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/ccf263c9
Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/ccf263c9

Branch: refs/heads/master
Commit: ccf263c9110fa5af54f1c1cabd8b5a2af64d473e
Parents: 45009e2
Author: Harry Zhang <hrzh...@linkedin.com>
Authored: Fri Nov 2 11:30:15 2018 -0700
Committer: Harry Zhang <hrzh...@linkedin.com>
Committed: Fri Nov 2 13:52:15 2018 -0700

----------------------------------------------------------------------
 .../stages/TopStateHandoffReportStage.java      | 14 +++---
 .../monitoring/mbeans/ClusterStatusMonitor.java |  4 +-
 .../monitoring/mbeans/ResourceMonitor.java      | 16 +++---
 .../mbeans/TestTopStateHandoffMetrics.java      | 52 ++++++++++----------
 .../resources/TestTopStateHandoffMetrics.json   | 12 ++---
 5 files changed, 48 insertions(+), 50 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
----------------------------------------------------------------------
diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index 5bfd5e6..d86e430 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -287,13 +287,13 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
     }
 
     long duration = endTime - startTime;
-    long userLatency = fromTopStateUserLatency + toTopStateuserLatency;
+    long helixLatency = duration - fromTopStateUserLatency - 
toTopStateuserLatency;
     // We always treat such top state handoff as graceful as if top state 
handoff is triggered
     // by instance crash, we cannot observe the entire handoff process within 
1 pipeline run
-    logMissingTopStateInfo(duration, userLatency, true, 
partition.getPartitionName());
+    logMissingTopStateInfo(duration, helixLatency, true, 
partition.getPartitionName());
     if (clusterStatusMonitor != null) {
       clusterStatusMonitor
-          .updateMissingTopStateDurationStats(resourceName, duration, 
userLatency, true, true);
+          .updateMissingTopStateDurationStats(resourceName, duration, 
helixLatency, true, true);
     }
   }
 
@@ -468,17 +468,17 @@ public class TopStateHandoffReportStage extends 
AbstractBaseStage {
 
     if (handOffStartTime > 0 && handOffEndTime - handOffStartTime <= 
threshold) {
       long duration = handOffEndTime - handOffStartTime;
-      long userLatency = fromTopStateUserLatency + toTopStateUserLatency;
+      long helixLatency = duration - fromTopStateUserLatency - 
toTopStateUserLatency;
       // It is possible that during controller leader switch, we lost previous 
master information
       // and use current time to approximate missing top state start time. If 
we see the actual
       // user latency is larger than the duration we estimated, we use user 
latency to start with
-      duration = Math.max(duration, userLatency);
+      duration = Math.max(duration, helixLatency);
       boolean isGraceful = record.isGracefulHandoff();
-      logMissingTopStateInfo(duration, userLatency, isGraceful, 
partition.getPartitionName());
+      logMissingTopStateInfo(duration, helixLatency, isGraceful, 
partition.getPartitionName());
 
       if (clusterStatusMonitor != null) {
         clusterStatusMonitor
-            .updateMissingTopStateDurationStats(resourceName, duration, 
userLatency, isGraceful,
+            .updateMissingTopStateDurationStats(resourceName, duration, 
helixLatency, isGraceful,
                 true);
       }
     }

http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
----------------------------------------------------------------------
diff --git 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
index 803bd3c..a980d3c 100644
--- 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
+++ 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
@@ -467,12 +467,12 @@ public class ClusterStatusMonitor implements 
ClusterStatusMonitorMBean {
   }
 
   public synchronized void updateMissingTopStateDurationStats(String 
resourceName,
-      long totalDuration, long userLatency, boolean isGraceful, boolean 
succeeded) {
+      long totalDuration, long helixLatency, boolean isGraceful, boolean 
succeeded) {
     ResourceMonitor resourceMonitor = getOrCreateResourceMonitor(resourceName);
 
     if (resourceMonitor != null) {
       
resourceMonitor.updateStateHandoffStats(ResourceMonitor.MonitorState.TOP_STATE, 
totalDuration,
-          userLatency, isGraceful, succeeded);
+          helixLatency, isGraceful, succeeded);
     }
   }
 

http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
----------------------------------------------------------------------
diff --git 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
index fb9a779..c66ae2d 100644
--- 
a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
+++ 
b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
@@ -71,7 +71,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
 
   // Histograms
   private HistogramDynamicMetric _partitionTopStateHandoffDurationGauge;
-  private HistogramDynamicMetric _partitionTopStateHandoffUserLatencyGauge;
+  private HistogramDynamicMetric _partitionTopStateHandoffHelixLatencyGauge;
   private HistogramDynamicMetric 
_partitionTopStateNonGracefulHandoffDurationGauge;
 
   private SimpleDynamicMetric<String> _rebalanceState;
@@ -101,7 +101,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     attributeList.add(_failedTopStateHandoffCounter);
     attributeList.add(_maxSinglePartitionTopStateHandoffDuration);
     attributeList.add(_partitionTopStateHandoffDurationGauge);
-    attributeList.add(_partitionTopStateHandoffUserLatencyGauge);
+    attributeList.add(_partitionTopStateHandoffHelixLatencyGauge);
     attributeList.add(_partitionTopStateNonGracefulHandoffDurationGauge);
     attributeList.add(_totalMessageReceived);
     attributeList.add(_numPendingStateTransitions);
@@ -142,8 +142,8 @@ public class ResourceMonitor extends DynamicMBeanProvider {
         new HistogramDynamicMetric("PartitionTopStateHandoffDurationGauge", 
new Histogram(
             new SlidingTimeWindowArrayReservoir(DEFAULT_RESET_INTERVAL_MS, 
TimeUnit.MILLISECONDS)));
 
-    _partitionTopStateHandoffUserLatencyGauge =
-        new HistogramDynamicMetric("PartitionTopStateHandoffUserLatencyGauge", 
new Histogram(
+    _partitionTopStateHandoffHelixLatencyGauge =
+        new 
HistogramDynamicMetric("PartitionTopStateHandoffHelixLatencyGauge", new 
Histogram(
             new SlidingTimeWindowArrayReservoir(DEFAULT_RESET_INTERVAL_MS, 
TimeUnit.MILLISECONDS)));
     _partitionTopStateNonGracefulHandoffDurationGauge =
         new HistogramDynamicMetric("PartitionTopStateNonGracefulHandoffGauge", 
new Histogram(
@@ -202,8 +202,8 @@ public class ResourceMonitor extends DynamicMBeanProvider {
     return _partitionTopStateNonGracefulHandoffDurationGauge;
   }
 
-  public HistogramDynamicMetric getPartitionTopStateHandoffUserLatencyGauge() {
-    return _partitionTopStateHandoffUserLatencyGauge;
+  public HistogramDynamicMetric getPartitionTopStateHandoffHelixLatencyGauge() 
{
+    return _partitionTopStateHandoffHelixLatencyGauge;
   }
 
   public long getFailedTopStateHandoffCounter() {
@@ -345,7 +345,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
   }
 
   public void updateStateHandoffStats(MonitorState monitorState, long 
totalDuration,
-      long userLatency, boolean isGraceful, boolean succeeded) {
+      long helixLatency, boolean isGraceful, boolean succeeded) {
     switch (monitorState) {
     case TOP_STATE:
       if (succeeded) {
@@ -354,7 +354,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
             .updateValue(_successfulTopStateHandoffDurationCounter.getValue() 
+ totalDuration);
         if (isGraceful) {
           _partitionTopStateHandoffDurationGauge.updateValue(totalDuration);
-          _partitionTopStateHandoffUserLatencyGauge.updateValue(userLatency);
+          _partitionTopStateHandoffHelixLatencyGauge.updateValue(helixLatency);
         } else {
           
_partitionTopStateNonGracefulHandoffDurationGauge.updateValue(totalDuration);
         }

http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
----------------------------------------------------------------------
diff --git 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index c1cfce0..5550386 100644
--- 
a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++ 
b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -52,7 +52,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest 
{
   private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
   private static final String NON_GRACEFUL_HANDOFF_DURATION = 
"PartitionTopStateNonGracefulHandoffGauge.Max";
   private static final String GRACEFUL_HANDOFF_DURATION = 
"PartitionTopStateHandoffDurationGauge.Max";
-  private static final String HANDOFF_USER_LATENCY = 
"PartitionTopStateHandoffUserLatencyGauge.Max";
+  private static final String HANDOFF_HELIX_LATENCY = 
"PartitionTopStateHandoffHelixLatencyGauge.Max";
   private static final Range<Long> DURATION_ZERO = Range.closed(0L, 0L);
   private TestConfig config;
 
@@ -93,7 +93,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest 
{
     final Map<String, CurrentStateInfo> finalCurrentState;
     final long duration;
     final boolean isGraceful;
-    final long userLatency;
+    final long helixLatency;
 
     @JsonCreator
     public TestCaseConfig(
@@ -101,14 +101,14 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
         @JsonProperty("MissingTopStates") Map<String, CurrentStateInfo> 
missing,
         @JsonProperty("HandoffCurrentStates") Map<String, CurrentStateInfo> 
handoff,
         @JsonProperty("Duration") long d,
-        @JsonProperty("UserLatency") long user,
+        @JsonProperty("HelixLatency") long helix,
         @JsonProperty("IsGraceful") boolean graceful
     ) {
       initialCurrentStates = initial;
       currentStateWithMissingTopState = missing;
       finalCurrentState = handoff;
       duration = d;
-      userLatency = user;
+      helixLatency = helix;
       isGraceful = graceful;
     }
   }
@@ -170,7 +170,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     // since this is a single top state handoff observed within 1 pipeline, we 
treat it as graceful,
     // and only record user latency for transiting to master
     Range<Long> expectedDuration = Range.closed(1500L, 1500L);
-    Range<Long> expectedUserLatency = Range.closed(1000L, 1000L);
+    Range<Long> expectedHelixLatency = Range.closed(500L, 500L);
     runStageAndVerify(
         cfg.initialCurrentStates, cfg.currentStateWithMissingTopState, 
cfg.finalCurrentState,
         new MissingStatesDataCacheInject() {
@@ -181,8 +181,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
         }, 1, 0,
         expectedDuration,
         DURATION_ZERO,
-        expectedDuration,
-        expectedUserLatency
+        expectedDuration, expectedHelixLatency
     );
     event.addAttribute(AttributeName.LastRebalanceFinishTimeStamp.name(),
         TopStateHandoffReportStage.TIMESTAMP_NOT_RECORDED);
@@ -235,26 +234,24 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     // in such case, reportTopStateMissing will use current system time as 
missing top state
     // start time, and we assume it is a graceful handoff, and only "to top 
state" user latency
     // will be recorded
-    long userLatency = 0;
-    for (CurrentStateInfo info : cfg.currentStateWithMissingTopState.values()) 
{
-      if (info.previousState.equals("MASTER")) {
-        userLatency = cfg.userLatency - (info.endTime - info.startTime);
-      }
-      info.previousState = "OFFLINE";
-    }
+    long helixLatency = 1000;
+    long userLatency = 1000;
     for (CurrentStateInfo states : cfg.finalCurrentState.values()) {
       if (states.currentState.equals("MASTER")) {
-        states.endTime = System.currentTimeMillis();
-        states.startTime = System.currentTimeMillis() - userLatency;
+        states.endTime = System.currentTimeMillis() + helixLatency + 
userLatency;
+        states.startTime = System.currentTimeMillis() + helixLatency;
         break;
       }
     }
+
+    // actual timestamp when running the stage will be later than current 
time, so the expected
+    // helix latency will be less than the mocked helix latency
     runStageAndVerify(Collections.EMPTY_MAP, 
cfg.currentStateWithMissingTopState,
         cfg.finalCurrentState, null, 1, 0,
-        Range.closed(userLatency, userLatency),
+        Range.closed(0L, helixLatency + userLatency),
         DURATION_ZERO,
-        Range.closed(userLatency, userLatency),
-        Range.closed(userLatency, userLatency)
+        Range.closed(0L, helixLatency + userLatency),
+        Range.closed(0L, helixLatency)
     );
   }
 
@@ -277,6 +274,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
         userLatency = info.endTime - info.startTime;
       }
     }
+    long helixLatency = durationToVerify - userLatency;
 
     // No initialCurrentStates means no input can be used as the clue of the 
previous master.
     // in this case, we will treat the handoff as graceful and only to-master 
user latency
@@ -311,7 +309,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
         Range.closed(durationToVerify, durationToVerify),
         DURATION_ZERO,
         Range.closed(durationToVerify, durationToVerify),
-        Range.closed(userLatency, userLatency));
+        Range.closed(helixLatency, helixLatency));
   }
 
   @DataProvider(name = "successCurrentStateInput")
@@ -347,11 +345,11 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
     Range<Long> duration = Range.closed(cfg.duration, cfg.duration);
     Range<Long> expectedDuration = cfg.isGraceful ? duration : DURATION_ZERO;
     Range<Long> expectedNonGracefulDuration = cfg.isGraceful ? DURATION_ZERO : 
duration;
-    Range<Long> expectedUserLatency =
-        cfg.isGraceful ? Range.closed(cfg.userLatency, cfg.userLatency) : 
DURATION_ZERO;
+    Range<Long> expectedHelixLatency =
+        cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) : 
DURATION_ZERO;
     runStageAndVerify(cfg.initialCurrentStates, 
cfg.currentStateWithMissingTopState,
         cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, 
expectedDuration, expectedNonGracefulDuration,
-        expectedDuration, expectedUserLatency);
+        expectedDuration, expectedHelixLatency);
   }
 
   private Map<String, CurrentState> generateCurrentStateMap(
@@ -410,7 +408,7 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
       Range<Long> expectedDuration,
       Range<Long> expectedNonGracefulDuration,
       Range<Long> expectedMaxDuration,
-      Range<Long> expectedUserLatency
+      Range<Long> expectedHelixLatency
   ) {
     runPipeLine(initialCurrentStates, missingTopStates, handOffCurrentStates, 
inject);
     ClusterStatusMonitor clusterStatusMonitor =
@@ -424,12 +422,12 @@ public class TestTopStateHandoffMetrics extends 
BaseStageTest {
         .getAttributeValue(GRACEFUL_HANDOFF_DURATION).longValue();
     long nonGraceful = 
monitor.getPartitionTopStateNonGracefulHandoffDurationGauge()
         .getAttributeValue(NON_GRACEFUL_HANDOFF_DURATION).longValue();
-    long user = monitor.getPartitionTopStateHandoffUserLatencyGauge()
-        .getAttributeValue(HANDOFF_USER_LATENCY).longValue();
+    long helix = monitor.getPartitionTopStateHandoffHelixLatencyGauge()
+        .getAttributeValue(HANDOFF_HELIX_LATENCY).longValue();
     long max = monitor.getMaxSinglePartitionTopStateHandoffDurationGauge();
     Assert.assertTrue(expectedDuration.contains(graceful));
     Assert.assertTrue(expectedNonGracefulDuration.contains(nonGraceful));
-    Assert.assertTrue(expectedUserLatency.contains(user));
+    Assert.assertTrue(expectedHelixLatency.contains(helix));
     Assert.assertTrue(expectedMaxDuration.contains(max));
   }
 

http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
----------------------------------------------------------------------
diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json 
b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
index 0b39a1d..c1e579f 100644
--- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
+++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
@@ -62,7 +62,7 @@
         }
       },
       "Duration": 7000,
-      "UserLatency": 5000,
+      "HelixLatency": 2000,
       "IsGraceful": true
     },
     {
@@ -127,7 +127,7 @@
         }
       },
       "Duration": 14000,
-      "UserLatency": 4000,
+      "HelixLatency": 10000,
       "IsGraceful": true
     }
   ],
@@ -182,7 +182,7 @@
         }
       },
       "Duration": 7000,
-      "UserLatency": 5000,
+      "HelixLatency": 2000,
       "IsGraceful": false
     }
   ],
@@ -249,7 +249,7 @@
         }
       },
       "Duration": 0,
-      "UserLatency": 0,
+      "HelixLatency": 0,
       "IsGraceful": false
     },
     {
@@ -314,7 +314,7 @@
         }
       },
       "Duration": 0,
-      "UserLatency": 0,
+      "HelixLatency": 0,
       "IsGraceful": false
     }
   ],
@@ -364,7 +364,7 @@
 
       },
       "Duration": 3000,
-      "UserLatency": 2000,
+      "HelixLatency": 1000,
       "IsGraceful": true
     }
   ]

Reply via email to