Repository: helix Updated Branches: refs/heads/master 45009e2c4 -> ccf263c91
[HELIX-785] Record helix latency instead of user latency in top state handoff metrics Project: http://git-wip-us.apache.org/repos/asf/helix/repo Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/ccf263c9 Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/ccf263c9 Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/ccf263c9 Branch: refs/heads/master Commit: ccf263c9110fa5af54f1c1cabd8b5a2af64d473e Parents: 45009e2 Author: Harry Zhang <hrzh...@linkedin.com> Authored: Fri Nov 2 11:30:15 2018 -0700 Committer: Harry Zhang <hrzh...@linkedin.com> Committed: Fri Nov 2 13:52:15 2018 -0700 ---------------------------------------------------------------------- .../stages/TopStateHandoffReportStage.java | 14 +++--- .../monitoring/mbeans/ClusterStatusMonitor.java | 4 +- .../monitoring/mbeans/ResourceMonitor.java | 16 +++--- .../mbeans/TestTopStateHandoffMetrics.java | 52 ++++++++++---------- .../resources/TestTopStateHandoffMetrics.json | 12 ++--- 5 files changed, 48 insertions(+), 50 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java ---------------------------------------------------------------------- diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java index 5bfd5e6..d86e430 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java @@ -287,13 +287,13 @@ public class TopStateHandoffReportStage extends AbstractBaseStage { } long duration = endTime - startTime; - long userLatency = fromTopStateUserLatency + toTopStateuserLatency; + long helixLatency = duration - fromTopStateUserLatency - toTopStateuserLatency; // We always treat such top state handoff as graceful as if top state handoff is triggered // by instance crash, we cannot observe the entire handoff process within 1 pipeline run - logMissingTopStateInfo(duration, userLatency, true, partition.getPartitionName()); + logMissingTopStateInfo(duration, helixLatency, true, partition.getPartitionName()); if (clusterStatusMonitor != null) { clusterStatusMonitor - .updateMissingTopStateDurationStats(resourceName, duration, userLatency, true, true); + .updateMissingTopStateDurationStats(resourceName, duration, helixLatency, true, true); } } @@ -468,17 +468,17 @@ public class TopStateHandoffReportStage extends AbstractBaseStage { if (handOffStartTime > 0 && handOffEndTime - handOffStartTime <= threshold) { long duration = handOffEndTime - handOffStartTime; - long userLatency = fromTopStateUserLatency + toTopStateUserLatency; + long helixLatency = duration - fromTopStateUserLatency - toTopStateUserLatency; // It is possible that during controller leader switch, we lost previous master information // and use current time to approximate missing top state start time. If we see the actual // user latency is larger than the duration we estimated, we use user latency to start with - duration = Math.max(duration, userLatency); + duration = Math.max(duration, helixLatency); boolean isGraceful = record.isGracefulHandoff(); - logMissingTopStateInfo(duration, userLatency, isGraceful, partition.getPartitionName()); + logMissingTopStateInfo(duration, helixLatency, isGraceful, partition.getPartitionName()); if (clusterStatusMonitor != null) { clusterStatusMonitor - .updateMissingTopStateDurationStats(resourceName, duration, userLatency, isGraceful, + .updateMissingTopStateDurationStats(resourceName, duration, helixLatency, isGraceful, true); } } http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java ---------------------------------------------------------------------- diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java index 803bd3c..a980d3c 100644 --- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java +++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java @@ -467,12 +467,12 @@ public class ClusterStatusMonitor implements ClusterStatusMonitorMBean { } public synchronized void updateMissingTopStateDurationStats(String resourceName, - long totalDuration, long userLatency, boolean isGraceful, boolean succeeded) { + long totalDuration, long helixLatency, boolean isGraceful, boolean succeeded) { ResourceMonitor resourceMonitor = getOrCreateResourceMonitor(resourceName); if (resourceMonitor != null) { resourceMonitor.updateStateHandoffStats(ResourceMonitor.MonitorState.TOP_STATE, totalDuration, - userLatency, isGraceful, succeeded); + helixLatency, isGraceful, succeeded); } } http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java ---------------------------------------------------------------------- diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java index fb9a779..c66ae2d 100644 --- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java +++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java @@ -71,7 +71,7 @@ public class ResourceMonitor extends DynamicMBeanProvider { // Histograms private HistogramDynamicMetric _partitionTopStateHandoffDurationGauge; - private HistogramDynamicMetric _partitionTopStateHandoffUserLatencyGauge; + private HistogramDynamicMetric _partitionTopStateHandoffHelixLatencyGauge; private HistogramDynamicMetric _partitionTopStateNonGracefulHandoffDurationGauge; private SimpleDynamicMetric<String> _rebalanceState; @@ -101,7 +101,7 @@ public class ResourceMonitor extends DynamicMBeanProvider { attributeList.add(_failedTopStateHandoffCounter); attributeList.add(_maxSinglePartitionTopStateHandoffDuration); attributeList.add(_partitionTopStateHandoffDurationGauge); - attributeList.add(_partitionTopStateHandoffUserLatencyGauge); + attributeList.add(_partitionTopStateHandoffHelixLatencyGauge); attributeList.add(_partitionTopStateNonGracefulHandoffDurationGauge); attributeList.add(_totalMessageReceived); attributeList.add(_numPendingStateTransitions); @@ -142,8 +142,8 @@ public class ResourceMonitor extends DynamicMBeanProvider { new HistogramDynamicMetric("PartitionTopStateHandoffDurationGauge", new Histogram( new SlidingTimeWindowArrayReservoir(DEFAULT_RESET_INTERVAL_MS, TimeUnit.MILLISECONDS))); - _partitionTopStateHandoffUserLatencyGauge = - new HistogramDynamicMetric("PartitionTopStateHandoffUserLatencyGauge", new Histogram( + _partitionTopStateHandoffHelixLatencyGauge = + new HistogramDynamicMetric("PartitionTopStateHandoffHelixLatencyGauge", new Histogram( new SlidingTimeWindowArrayReservoir(DEFAULT_RESET_INTERVAL_MS, TimeUnit.MILLISECONDS))); _partitionTopStateNonGracefulHandoffDurationGauge = new HistogramDynamicMetric("PartitionTopStateNonGracefulHandoffGauge", new Histogram( @@ -202,8 +202,8 @@ public class ResourceMonitor extends DynamicMBeanProvider { return _partitionTopStateNonGracefulHandoffDurationGauge; } - public HistogramDynamicMetric getPartitionTopStateHandoffUserLatencyGauge() { - return _partitionTopStateHandoffUserLatencyGauge; + public HistogramDynamicMetric getPartitionTopStateHandoffHelixLatencyGauge() { + return _partitionTopStateHandoffHelixLatencyGauge; } public long getFailedTopStateHandoffCounter() { @@ -345,7 +345,7 @@ public class ResourceMonitor extends DynamicMBeanProvider { } public void updateStateHandoffStats(MonitorState monitorState, long totalDuration, - long userLatency, boolean isGraceful, boolean succeeded) { + long helixLatency, boolean isGraceful, boolean succeeded) { switch (monitorState) { case TOP_STATE: if (succeeded) { @@ -354,7 +354,7 @@ public class ResourceMonitor extends DynamicMBeanProvider { .updateValue(_successfulTopStateHandoffDurationCounter.getValue() + totalDuration); if (isGraceful) { _partitionTopStateHandoffDurationGauge.updateValue(totalDuration); - _partitionTopStateHandoffUserLatencyGauge.updateValue(userLatency); + _partitionTopStateHandoffHelixLatencyGauge.updateValue(helixLatency); } else { _partitionTopStateNonGracefulHandoffDurationGauge.updateValue(totalDuration); } http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java ---------------------------------------------------------------------- diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java index c1cfce0..5550386 100644 --- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java +++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java @@ -52,7 +52,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String NON_GRACEFUL_HANDOFF_DURATION = "PartitionTopStateNonGracefulHandoffGauge.Max"; private static final String GRACEFUL_HANDOFF_DURATION = "PartitionTopStateHandoffDurationGauge.Max"; - private static final String HANDOFF_USER_LATENCY = "PartitionTopStateHandoffUserLatencyGauge.Max"; + private static final String HANDOFF_HELIX_LATENCY = "PartitionTopStateHandoffHelixLatencyGauge.Max"; private static final Range<Long> DURATION_ZERO = Range.closed(0L, 0L); private TestConfig config; @@ -93,7 +93,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { final Map<String, CurrentStateInfo> finalCurrentState; final long duration; final boolean isGraceful; - final long userLatency; + final long helixLatency; @JsonCreator public TestCaseConfig( @@ -101,14 +101,14 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { @JsonProperty("MissingTopStates") Map<String, CurrentStateInfo> missing, @JsonProperty("HandoffCurrentStates") Map<String, CurrentStateInfo> handoff, @JsonProperty("Duration") long d, - @JsonProperty("UserLatency") long user, + @JsonProperty("HelixLatency") long helix, @JsonProperty("IsGraceful") boolean graceful ) { initialCurrentStates = initial; currentStateWithMissingTopState = missing; finalCurrentState = handoff; duration = d; - userLatency = user; + helixLatency = helix; isGraceful = graceful; } } @@ -170,7 +170,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { // since this is a single top state handoff observed within 1 pipeline, we treat it as graceful, // and only record user latency for transiting to master Range<Long> expectedDuration = Range.closed(1500L, 1500L); - Range<Long> expectedUserLatency = Range.closed(1000L, 1000L); + Range<Long> expectedHelixLatency = Range.closed(500L, 500L); runStageAndVerify( cfg.initialCurrentStates, cfg.currentStateWithMissingTopState, cfg.finalCurrentState, new MissingStatesDataCacheInject() { @@ -181,8 +181,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { }, 1, 0, expectedDuration, DURATION_ZERO, - expectedDuration, - expectedUserLatency + expectedDuration, expectedHelixLatency ); event.addAttribute(AttributeName.LastRebalanceFinishTimeStamp.name(), TopStateHandoffReportStage.TIMESTAMP_NOT_RECORDED); @@ -235,26 +234,24 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { // in such case, reportTopStateMissing will use current system time as missing top state // start time, and we assume it is a graceful handoff, and only "to top state" user latency // will be recorded - long userLatency = 0; - for (CurrentStateInfo info : cfg.currentStateWithMissingTopState.values()) { - if (info.previousState.equals("MASTER")) { - userLatency = cfg.userLatency - (info.endTime - info.startTime); - } - info.previousState = "OFFLINE"; - } + long helixLatency = 1000; + long userLatency = 1000; for (CurrentStateInfo states : cfg.finalCurrentState.values()) { if (states.currentState.equals("MASTER")) { - states.endTime = System.currentTimeMillis(); - states.startTime = System.currentTimeMillis() - userLatency; + states.endTime = System.currentTimeMillis() + helixLatency + userLatency; + states.startTime = System.currentTimeMillis() + helixLatency; break; } } + + // actual timestamp when running the stage will be later than current time, so the expected + // helix latency will be less than the mocked helix latency runStageAndVerify(Collections.EMPTY_MAP, cfg.currentStateWithMissingTopState, cfg.finalCurrentState, null, 1, 0, - Range.closed(userLatency, userLatency), + Range.closed(0L, helixLatency + userLatency), DURATION_ZERO, - Range.closed(userLatency, userLatency), - Range.closed(userLatency, userLatency) + Range.closed(0L, helixLatency + userLatency), + Range.closed(0L, helixLatency) ); } @@ -277,6 +274,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { userLatency = info.endTime - info.startTime; } } + long helixLatency = durationToVerify - userLatency; // No initialCurrentStates means no input can be used as the clue of the previous master. // in this case, we will treat the handoff as graceful and only to-master user latency @@ -311,7 +309,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { Range.closed(durationToVerify, durationToVerify), DURATION_ZERO, Range.closed(durationToVerify, durationToVerify), - Range.closed(userLatency, userLatency)); + Range.closed(helixLatency, helixLatency)); } @DataProvider(name = "successCurrentStateInput") @@ -347,11 +345,11 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { Range<Long> duration = Range.closed(cfg.duration, cfg.duration); Range<Long> expectedDuration = cfg.isGraceful ? duration : DURATION_ZERO; Range<Long> expectedNonGracefulDuration = cfg.isGraceful ? DURATION_ZERO : duration; - Range<Long> expectedUserLatency = - cfg.isGraceful ? Range.closed(cfg.userLatency, cfg.userLatency) : DURATION_ZERO; + Range<Long> expectedHelixLatency = + cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) : DURATION_ZERO; runStageAndVerify(cfg.initialCurrentStates, cfg.currentStateWithMissingTopState, cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, expectedDuration, expectedNonGracefulDuration, - expectedDuration, expectedUserLatency); + expectedDuration, expectedHelixLatency); } private Map<String, CurrentState> generateCurrentStateMap( @@ -410,7 +408,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { Range<Long> expectedDuration, Range<Long> expectedNonGracefulDuration, Range<Long> expectedMaxDuration, - Range<Long> expectedUserLatency + Range<Long> expectedHelixLatency ) { runPipeLine(initialCurrentStates, missingTopStates, handOffCurrentStates, inject); ClusterStatusMonitor clusterStatusMonitor = @@ -424,12 +422,12 @@ public class TestTopStateHandoffMetrics extends BaseStageTest { .getAttributeValue(GRACEFUL_HANDOFF_DURATION).longValue(); long nonGraceful = monitor.getPartitionTopStateNonGracefulHandoffDurationGauge() .getAttributeValue(NON_GRACEFUL_HANDOFF_DURATION).longValue(); - long user = monitor.getPartitionTopStateHandoffUserLatencyGauge() - .getAttributeValue(HANDOFF_USER_LATENCY).longValue(); + long helix = monitor.getPartitionTopStateHandoffHelixLatencyGauge() + .getAttributeValue(HANDOFF_HELIX_LATENCY).longValue(); long max = monitor.getMaxSinglePartitionTopStateHandoffDurationGauge(); Assert.assertTrue(expectedDuration.contains(graceful)); Assert.assertTrue(expectedNonGracefulDuration.contains(nonGraceful)); - Assert.assertTrue(expectedUserLatency.contains(user)); + Assert.assertTrue(expectedHelixLatency.contains(helix)); Assert.assertTrue(expectedMaxDuration.contains(max)); } http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/test/resources/TestTopStateHandoffMetrics.json ---------------------------------------------------------------------- diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json index 0b39a1d..c1e579f 100644 --- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json +++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json @@ -62,7 +62,7 @@ } }, "Duration": 7000, - "UserLatency": 5000, + "HelixLatency": 2000, "IsGraceful": true }, { @@ -127,7 +127,7 @@ } }, "Duration": 14000, - "UserLatency": 4000, + "HelixLatency": 10000, "IsGraceful": true } ], @@ -182,7 +182,7 @@ } }, "Duration": 7000, - "UserLatency": 5000, + "HelixLatency": 2000, "IsGraceful": false } ], @@ -249,7 +249,7 @@ } }, "Duration": 0, - "UserLatency": 0, + "HelixLatency": 0, "IsGraceful": false }, { @@ -314,7 +314,7 @@ } }, "Duration": 0, - "UserLatency": 0, + "HelixLatency": 0, "IsGraceful": false } ], @@ -364,7 +364,7 @@ }, "Duration": 3000, - "UserLatency": 2000, + "HelixLatency": 1000, "IsGraceful": true } ]