kaisun2000 commented on a change in pull request #365: Fix RoutingTableProvider statePropagationLatency metric reporting bug URL: https://github.com/apache/helix/pull/365#discussion_r310838708
########## File path: helix-core/src/main/java/org/apache/helix/spectator/RoutingTableProvider.java ########## @@ -558,23 +558,30 @@ protected void refresh(Map<String, Map<String, Map<String, CurrentState>>> curre private void resetRoutingTableAndNotify(long startTime, RoutingTable newRoutingTable) { _routingTableRef.set(newRoutingTable); - logger.info("Refresh the RoutingTable for cluster {}, takes {} ms.", - (_helixManager != null ? _helixManager.getClusterName() : null), + String clusterName = _helixManager != null ? _helixManager.getClusterName() : null; + logger.info("Refresh the RoutingTable for cluster {}, takes {} ms.", clusterName, (System.currentTimeMillis() - startTime)); - notifyRoutingTableChange(); + + // TODO: move the callback user code logic to separate thread upon routing table statePropagation latency + // integration test result. If the latency is more than 2 secs, we need to change this part. + notifyRoutingTableChange(clusterName); // Update timestamp for last refresh if (_isPeriodicRefreshEnabled) { _lastRefreshTimestamp = System.currentTimeMillis(); } } - private void notifyRoutingTableChange() { - for (Map.Entry<RoutingTableChangeListener, ListenerContext> entry : _routingTableChangeListenerMap - .entrySet()) { - entry.getKey().onRoutingTableChange(new RoutingTableSnapshot(_routingTableRef.get()), - entry.getValue().getContext()); + private void notifyRoutingTableChange(String clusterName) { + // This call back is called in the main event queue of RoutingTableProvider. We add log to record time spent + // here. Potentially, we should call this callback in a separate thread if this is a bottleneck. + long startTime = System.currentTimeMillis(); + for (Map.Entry<RoutingTableChangeListener, ListenerContext> entry : _routingTableChangeListenerMap.entrySet()) { + entry.getKey() + .onRoutingTableChange(new RoutingTableSnapshot(_routingTableRef.get()), entry.getValue().getContext()); } + logger.info("RoutingTableProvider callback [aka not helix time] for cluster {}, take {} ms.", clusterName, Review comment: Helix Time == time spent in Helix side. Will change to "RoutingTableProvider user callback time" No. they will not overwhelm the log. One RoutingTableProvider will be created by one cluster. The routingTableProvider change will not be that frequent, definitely should be a lot less than all the pipeline running log. So we should be safe here. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services