J-HowHuang commented on code in PR #15618: URL: https://github.com/apache/pinot/pull/15618#discussion_r2070889761
########## pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/TableRebalancer.java: ########## @@ -1303,47 +1303,75 @@ private IdealState waitForExternalViewToConverge(String tableNameWithType, boole long endTimeMs = System.currentTimeMillis() + externalViewStabilizationTimeoutInMs; IdealState idealState; - do { - tableRebalanceLogger.debug("Start to check if ExternalView converges to IdealStates"); - idealState = _helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().idealStates(tableNameWithType)); - // IdealState might be null if table got deleted, throwing exception to abort the rebalance - Preconditions.checkState(idealState != null, "Failed to find the IdealState"); - - ExternalView externalView = - _helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().externalView(tableNameWithType)); - // ExternalView might be null when table is just created, skipping check for this iteration - if (externalView != null) { - // Record external view and ideal state convergence status - TableRebalanceObserver.RebalanceContext rebalanceContext = new TableRebalanceObserver.RebalanceContext( - estimateAverageSegmentSizeInBytes, allSegmentsFromIdealState, segmentsToMonitor); - _tableRebalanceObserver.onTrigger( - TableRebalanceObserver.Trigger.EXTERNAL_VIEW_TO_IDEAL_STATE_CONVERGENCE_TRIGGER, - externalView.getRecord().getMapFields(), idealState.getRecord().getMapFields(), rebalanceContext); - // Update unique segment list as IS-EV trigger must have processed these - allSegmentsFromIdealState = idealState.getRecord().getMapFields().keySet(); - if (_tableRebalanceObserver.isStopped()) { - throw new RuntimeException( - String.format("Rebalance has already stopped with status: %s", _tableRebalanceObserver.getStopStatus())); + ExternalView externalView; + int previousRemainingSegments = -1; + while (true) { + do { + tableRebalanceLogger.debug("Start to check if ExternalView converges to IdealStates"); + idealState = 
_helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().idealStates(tableNameWithType)); + // IdealState might be null if table got deleted, throwing exception to abort the rebalance + Preconditions.checkState(idealState != null, "Failed to find the IdealState"); + + externalView = _helixDataAccessor.getProperty(_helixDataAccessor.keyBuilder().externalView(tableNameWithType)); + // ExternalView might be null when table is just created, skipping check for this iteration + if (externalView != null) { + // Record external view and ideal state convergence status + TableRebalanceObserver.RebalanceContext rebalanceContext = new TableRebalanceObserver.RebalanceContext( + estimateAverageSegmentSizeInBytes, allSegmentsFromIdealState, segmentsToMonitor); + _tableRebalanceObserver.onTrigger( + TableRebalanceObserver.Trigger.EXTERNAL_VIEW_TO_IDEAL_STATE_CONVERGENCE_TRIGGER, + externalView.getRecord().getMapFields(), idealState.getRecord().getMapFields(), rebalanceContext); + // Update unique segment list as IS-EV trigger must have processed these + allSegmentsFromIdealState = idealState.getRecord().getMapFields().keySet(); + if (_tableRebalanceObserver.isStopped()) { + throw new RuntimeException( + String.format("Rebalance has already stopped with status: %s", + _tableRebalanceObserver.getStopStatus())); + } + if (previousRemainingSegments < 0) { + // initialize previousRemainingSegments + previousRemainingSegments = getNumRemainingSegmentsToProcess(tableNameWithType, + externalView.getRecord().getMapFields(), idealState.getRecord().getMapFields(), lowDiskMode, + bestEfforts, segmentsToMonitor, tableRebalanceLogger, false); + if (previousRemainingSegments == 0) { + tableRebalanceLogger.info("ExternalView converged"); + return idealState; + } + } else if (isExternalViewConverged(tableNameWithType, externalView.getRecord().getMapFields(), + idealState.getRecord().getMapFields(), lowDiskMode, bestEfforts, segmentsToMonitor, + tableRebalanceLogger)) { + 
tableRebalanceLogger.info("ExternalView converged"); + return idealState; + } } - if (isExternalViewConverged(tableNameWithType, externalView.getRecord().getMapFields(), - idealState.getRecord().getMapFields(), lowDiskMode, bestEfforts, segmentsToMonitor, tableRebalanceLogger)) { - tableRebalanceLogger.info("ExternalView converged"); - return idealState; + tableRebalanceLogger.debug("ExternalView has not converged to IdealStates. Retry after: {}ms", + externalViewCheckIntervalInMs); + Thread.sleep(externalViewCheckIntervalInMs); + } while (System.currentTimeMillis() < endTimeMs); + if (bestEfforts) { + tableRebalanceLogger.warn( + "ExternalView has not converged within: {}ms, continuing the rebalance (best-efforts)", + externalViewStabilizationTimeoutInMs); + return idealState; + } + if (externalView != null) { + int currentRemainingSegments = getNumRemainingSegmentsToProcess(tableNameWithType, + externalView.getRecord().getMapFields(), idealState.getRecord().getMapFields(), lowDiskMode, bestEfforts, + segmentsToMonitor, tableRebalanceLogger, false); + if (currentRemainingSegments < previousRemainingSegments) { + tableRebalanceLogger.info( Review Comment: I think the previous number of remaining segments is already clear from the log; see this example from the manual test section in the PR description: ``` 2025/04/29 09:17:17.433 INFO [TableRebalancer-jhow_OFFLINE-324d0eb7-cf61-4ec9-be5f-704bce7c760d] [jersey-server-managed-async-executor-1] Extending EV stabilization timeout for another 15000ms, remaining 834 segments to be processed. 2025/04/29 09:17:32.620 INFO [TableRebalancer-jhow_OFFLINE-324d0eb7-cf61-4ec9-be5f-704bce7c760d] [jersey-server-managed-async-executor-1] Extending EV stabilization timeout for another 15000ms, remaining 629 segments to be processed. 
2025/04/29 09:17:47.822 INFO [TableRebalancer-jhow_OFFLINE-324d0eb7-cf61-4ec9-be5f-704bce7c760d] [jersey-server-managed-async-executor-1] Extending EV stabilization timeout for another 15000ms, remaining 423 segments to be processed. 2025/04/29 09:18:03.025 INFO [TableRebalancer-jhow_OFFLINE-324d0eb7-cf61-4ec9-be5f-704bce7c760d] [jersey-server-managed-async-executor-1] Extending EV stabilization timeout for another 15000ms, remaining 237 segments to be processed. 2025/04/29 09:18:18.238 INFO [TableRebalancer-jhow_OFFLINE-324d0eb7-cf61-4ec9-be5f-704bce7c760d] [jersey-server-managed-async-executor-1] Extending EV stabilization timeout for another 15000ms, remaining 26 segments to be processed. ``` As for the number of segments to monitor, what is its purpose here? It's not related to the timeout. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org