Improve Helix maintenance mode: (1) remove the exception thrown in the best-possible-state stage so that the pipeline can pass; (2) add event generation for maintenance mode changes.
Project: http://git-wip-us.apache.org/repos/asf/helix/repo Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/89089b45 Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/89089b45 Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/89089b45 Branch: refs/heads/master Commit: 89089b4523e91e356a87f5ad151ee9432b574cf8 Parents: ec7eaaa Author: Junkai Xue <[email protected]> Authored: Fri Dec 15 11:43:07 2017 -0800 Committer: Junkai Xue <[email protected]> Committed: Wed Jan 24 18:32:46 2018 -0800 ---------------------------------------------------------------------- .../controller/GenericHelixController.java | 47 +++++++++++++------- .../stages/BestPossibleStateCalcStage.java | 8 ++-- 2 files changed, 35 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/helix/blob/89089b45/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java ---------------------------------------------------------------------- diff --git a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java index 6d1af7c..2546bd2 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java +++ b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java @@ -73,6 +73,7 @@ import org.apache.helix.model.CurrentState; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.LiveInstance; +import org.apache.helix.model.MaintenanceSignal; import org.apache.helix.model.Message; import org.apache.helix.model.PauseSignal; import org.apache.helix.model.ResourceConfig; @@ -128,6 +129,7 @@ public class GenericHelixController implements IdealStateChangeListener, * will be no-op. Other event handling logic keeps the same when the flag is set. 
*/ private boolean _paused; + private boolean _inMaintenanceMode; /** * The timer that can periodically run the rebalancing pipeline. The timer will start if there is @@ -632,23 +634,10 @@ public class GenericHelixController implements IdealStateChangeListener, } PauseSignal pauseSignal = accessor.getProperty(keyBuilder.pause()); - if (pauseSignal != null) { - if (!_paused) { - _paused = true; - logger.info("controller is now paused"); - } - } else { - if (_paused) { - _paused = false; - logger.info("controller is now resumed"); - ClusterEvent event = new ClusterEvent(_clusterName, ClusterEventType.Resume); - event.addAttribute(AttributeName.changeContext.name(), changeContext); - event.addAttribute(AttributeName.helixmanager.name(), changeContext.getManager()); - event.addAttribute(AttributeName.eventData.name(), pauseSignal); - _eventQueue.put(event); - _taskEventQueue.put(event.clone()); - } - } + MaintenanceSignal maintenanceSignal = accessor.getProperty(keyBuilder.maintenance()); + _paused = updateControllerState(changeContext, pauseSignal, _paused); + _inMaintenanceMode = updateControllerState(changeContext, maintenanceSignal, _inMaintenanceMode); + synchronized (this) { if (_clusterStatusMonitor == null) { _clusterStatusMonitor = new ClusterStatusMonitor(changeContext.getManager().getClusterName()); @@ -758,6 +747,30 @@ public class GenericHelixController implements IdealStateChangeListener, } } + private boolean updateControllerState(NotificationContext changeContext, PauseSignal signal, + boolean statusFlag) { + if (signal != null) { + // This logic is used for recording first time entering PAUSE/MAINTENCE mode + if (!statusFlag) { + statusFlag = true; + logger.info(String.format("controller is now %s", + (signal instanceof MaintenanceSignal) ? 
"in maintenance mode" : "paused")); + } + } else { + if (statusFlag) { + statusFlag = false; + logger.info("controller is now resumed from paused state"); + ClusterEvent event = new ClusterEvent(_clusterName, ClusterEventType.Resume); + event.addAttribute(AttributeName.changeContext.name(), changeContext); + event.addAttribute(AttributeName.helixmanager.name(), changeContext.getManager()); + event.addAttribute(AttributeName.eventData.name(), signal); + _eventQueue.put(event); + _taskEventQueue.put(event.clone()); + } + } + return statusFlag; + } + // TODO: refactor this to use common/ClusterEventProcessor. private class ClusterEventProcessor extends Thread { http://git-wip-us.apache.org/repos/asf/helix/blob/89089b45/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java ---------------------------------------------------------------------- diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java index e96f0f3..9566f2c 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java @@ -180,15 +180,17 @@ public class BestPossibleStateCalcStage extends AbstractBaseStage { "Offline Instances count %d greater than allowed count %d. 
Stop rebalance pipeline and pause the cluster %s", offlineCount, maxOfflineInstancesAllowed, cache.getClusterName()); if (manager != null) { - manager.getClusterManagmentTool() - .enableMaintenanceMode(manager.getClusterName(), true, errMsg); + if (manager.getHelixDataAccessor() + .getProperty(manager.getHelixDataAccessor().keyBuilder().maintenance()) == null) { + manager.getClusterManagmentTool() + .enableMaintenanceMode(manager.getClusterName(), true, errMsg); + } } else { logger.error("Failed to pause cluster, HelixManager is not set!"); } if (!cache.isTaskCache()) { updateRebalanceStatus(true, manager, cache, clusterStatusMonitor, errMsg); } - throw new HelixException(errMsg); } } }
