[helix] branch master updated: Improve auto enter maintenance mode (#1650)

hzlu Mon, 22 Feb 2021 19:38:59 -0800

This is an automated email from the ASF dual-hosted git repository.

hzlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git



The following commit(s) were added to refs/heads/master by this push:
     new 3338e95  Improve auto enter maintenance mode (#1650)
3338e95 is described below

commit 3338e95f149b0fe686eabbf0f0063dd03b45c2a6
Author: Huizhi Lu <[email protected]>
AuthorDate: Mon Feb 22 19:38:42 2021 -0800

    Improve auto enter maintenance mode (#1650)
    
    Assume the threshold for entering maintenance mode is 5. If 20 nodes go
    down at the same time, the running pipeline creates the maintenance znode.
    However, instead of switching to the maintenance rebalancer immediately,
    that pipeline still continues with the normal rebalancer, which moves
    partitions onto the remaining online instances.
    
    This commit improves the auto-enter maintenance mode logic by enabling
    maintenance mode in the data cache, so that the best possible mapping is
    computed by the maintenance rebalancer immediately, in the same pipeline
    that detects the condition.
---
 .../dataproviders/BaseControllerDataProvider.java  |  9 +++
 .../stages/BestPossibleStateCalcStage.java         | 13 ++--
 .../stages/TestBestPossibleStateCalcStage.java     | 75 +++++++++++++++++++++-
 .../java/org/apache/helix/mock/MockManager.java    |  3 +-
 4 files changed, 92 insertions(+), 8 deletions(-)

diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java
 
b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java
index 070c70a..6ce25e2 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java
@@ -947,6 +947,15 @@ public class BaseControllerDataProvider implements 
ControlContextProvider {
     return _isMaintenanceModeEnabled;
   }
 
+  /**
+   * Mark the pipeline to enable maintenance mode. It is not supposed to be used anywhere
+   * except in the auto-enter maintenance mode logic when the offline instance limit is exceeded.
+   */
+  // TODO: refactor it to writable cache once read-only/writable caches are 
separated.
+  public void enableMaintenanceMode() {
+    _isMaintenanceModeEnabled = true;
+  }
+
   public boolean hasMaintenanceSignalChanged() {
     return _hasMaintenanceSignalChanged;
   }
diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java
index 7b346ec..7c1155d 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java
@@ -116,8 +116,8 @@ public class BestPossibleStateCalcStage extends 
AbstractBaseStage {
         event.getAttribute(AttributeName.clusterStatusMonitor.name());
     WagedRebalancer wagedRebalancer = 
event.getAttribute(AttributeName.STATEFUL_REBALANCER.name());
 
-    // Check whether the offline/disabled instance count in the cluster 
reaches the set limit,
-    // if yes, pause the rebalancer.
+    // Check whether the offline/disabled instance count in the cluster 
exceeds the set limit,
+    // if yes, put the cluster into maintenance mode.
     boolean isValid =
         validateOfflineInstancesLimit(cache, 
event.getAttribute(AttributeName.helixmanager.name()));
 
@@ -193,7 +193,7 @@ public class BestPossibleStateCalcStage extends 
AbstractBaseStage {
   }
 
   // Check whether the offline/disabled instance count in the cluster reaches 
the set limit,
-  // if yes, pause the rebalancer, and throw exception to terminate rebalance 
cycle.
+  // if yes, auto enable maintenance mode, and use the maintenance rebalancer 
for this pipeline.
   private boolean validateOfflineInstancesLimit(final 
ResourceControllerDataProvider cache,
       final HelixManager manager) {
     int maxOfflineInstancesAllowed = 
cache.getClusterConfig().getMaxOfflineInstancesAllowed();
@@ -201,7 +201,8 @@ public class BestPossibleStateCalcStage extends 
AbstractBaseStage {
       int offlineCount = cache.getAllInstances().size() - 
cache.getEnabledLiveInstances().size();
       if (offlineCount > maxOfflineInstancesAllowed) {
         String errMsg = String.format(
-            "Offline Instances count %d greater than allowed count %d. Stop 
rebalance and put the cluster %s into maintenance mode.",
+            "Offline Instances count %d greater than allowed count %d. Put 
cluster %s into "
+                + "maintenance mode.",
             offlineCount, maxOfflineInstancesAllowed, cache.getClusterName());
         if (manager != null) {
           if (manager.getHelixDataAccessor()
@@ -215,6 +216,10 @@ public class BestPossibleStateCalcStage extends 
AbstractBaseStage {
           LogUtil.logError(logger, _eventId, "Failed to put cluster " + 
cache.getClusterName()
               + " into maintenance mode, HelixManager is not set!");
         }
+
+        // Enable maintenance mode in cache so the maintenance rebalancer is 
used for this pipeline
+        cache.enableMaintenanceMode();
+
         return false;
       }
     }
diff --git 
a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java
 
b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java
index ba70606..7289dc3 100644
--- 
a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java
+++ 
b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java
@@ -20,14 +20,16 @@ package org.apache.helix.controller.stages;
  */
 
 import java.util.Date;
+import java.util.List;
 import java.util.Map;
 
 import 
org.apache.helix.controller.dataproviders.ResourceControllerDataProvider;
 import org.apache.helix.model.BuiltInStateModelDefinitions;
+import org.apache.helix.model.ClusterConfig;
 import org.apache.helix.model.IdealState.RebalanceMode;
 import org.apache.helix.model.Partition;
 import org.apache.helix.model.Resource;
-import org.testng.AssertJUnit;
+import org.testng.Assert;
 import org.testng.annotations.Test;
 
 public class TestBestPossibleStateCalcStage extends BaseStageTest {
@@ -66,10 +68,79 @@ public class TestBestPossibleStateCalcStage extends 
BaseStageTest {
         event.getAttribute(AttributeName.BEST_POSSIBLE_STATE.name());
     for (int p = 0; p < 5; p++) {
       Partition resource = new Partition("testResourceName_" + p);
-      AssertJUnit.assertEquals("MASTER", 
output.getInstanceStateMap("testResourceName", resource)
+      Assert.assertEquals("MASTER", 
output.getInstanceStateMap("testResourceName", resource)
           .get("localhost_" + (p + 1) % 5));
     }
     System.out.println("END TestBestPossibleStateCalcStage at "
         + new Date(System.currentTimeMillis()));
   }
+
+  /*
+   * Tests the pipeline detects offline instances exceed the threshold and 
auto enters maintenance,
+   * the maintenance rebalancer is used immediately. No bootstraps in the best 
possible output.
+   */
+  @Test
+  public void testAutoEnterMaintenanceWhenExceedingOfflineNodes() {
+    String[] resources = new String[]{"testResourceName"};
+    int numInstances = 3;
+    int numPartitions = 3;
+
+    setupIdealState(numInstances, resources, numPartitions, 1, 
RebalanceMode.FULL_AUTO,
+        BuiltInStateModelDefinitions.MasterSlave.name());
+    setupInstances(numInstances);
+    List<String> liveInstances = setupLiveInstances(numInstances);
+    setupStateModel();
+
+    // Set offline instances threshold
+    ClusterConfig clusterConfig = 
accessor.getProperty(accessor.keyBuilder().clusterConfig());
+    clusterConfig.setMaxOfflineInstancesAllowed(1);
+    setClusterConfig(clusterConfig);
+
+    Map<String, Resource> resourceMap =
+        getResourceMap(resources, numPartitions, 
BuiltInStateModelDefinitions.MasterSlave.name());
+    CurrentStateOutput currentStateOutput = new CurrentStateOutput();
+
+    for (int p = 0; p < numPartitions; p++) {
+      Partition partition = new Partition("testResourceName_" + p);
+      currentStateOutput
+          .setCurrentState("testResourceName", partition, "localhost_" + (p + 
1) % numInstances,
+              "MASTER");
+    }
+
+    // Disable 2 instances so the pipeline should enter maintenance
+    for (int i = 0; i < 2; i++) {
+      admin.enableInstance(_clusterName, liveInstances.get(i), false);
+    }
+
+    event.addAttribute(AttributeName.helixmanager.name(), manager);
+    event.addAttribute(AttributeName.RESOURCES.name(), resourceMap);
+    event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), 
resourceMap);
+    event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput);
+    event.addAttribute(AttributeName.ControllerDataProvider.name(),
+        new ResourceControllerDataProvider());
+
+    runStage(event, new ReadClusterDataStage());
+    runStage(event, new BestPossibleStateCalcStage());
+
+    BestPossibleStateOutput output = 
event.getAttribute(AttributeName.BEST_POSSIBLE_STATE.name());
+
+    // State on the disabled instances should be OFFLINE instead of DROPPED
+    // because of maintenance rebalancer.
+    Assert.assertEquals(
+        output.getInstanceStateMap("testResourceName", new 
Partition("testResourceName_2"))
+            .get("localhost_0"),
+        "OFFLINE",
+        "Actual state should not be DROPPED");
+
+    Assert.assertEquals(
+        output.getInstanceStateMap("testResourceName", new 
Partition("testResourceName_0"))
+            .get("localhost_1"),
+        "OFFLINE",
+        "Actual state should not be DROPPED");
+
+    // No state change for localhost_2 because the replica is already MASTER
+    Assert.assertNull(
+        output.getInstanceStateMap("testResourceName", new 
Partition("testResourceName_1"))
+            .get("localhost_2"));
+  }
 }
diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockManager.java 
b/helix-core/src/test/java/org/apache/helix/mock/MockManager.java
index cbaa149..afa8d98 100644
--- a/helix-core/src/test/java/org/apache/helix/mock/MockManager.java
+++ b/helix-core/src/test/java/org/apache/helix/mock/MockManager.java
@@ -259,8 +259,7 @@ public class MockManager implements HelixManager {
 
   @Override
   public HelixAdmin getClusterManagmentTool() {
-    // TODO Auto-generated method stub
-    return null;
+    return new MockHelixAdmin(this);
   }
 
   @Override

[helix] branch master updated: Improve auto enter maintenance mode (#1650)

Reply via email to