This is an automated email from the ASF dual-hosted git repository.

jxue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b33ffebe skip min active check if instance replica in unhealthy state 
(#3019)
2b33ffebe is described below

commit 2b33ffebea4cd485c4c1e23cdefc886d95686292
Author: Grant Paláu Spencer <gspen...@linkedin.com>
AuthorDate: Wed Apr 30 13:49:58 2025 -0700

    skip min active check if instance replica in unhealthy state (#3019)
    
    Stoppable API will skip the min active replica check for a replica if that 
replica is already in an unhealthy state. This is because stopping the instance 
will not affect that partition's availability as the replica is already in an 
unhealthy state (offline/error). Previously, this would prevent a node with an 
error state replica from being stopped if min_active for that partition was not 
satisfied. With this change, we will not consider the sister replicas and drop 
anyways.
---
 .../stages/ParticipantDeregistrationStage.java     |  4 +-
 .../apache/helix/util/InstanceValidationUtil.java  |  5 +++
 .../org/apache/helix/integration/TestWagedNPE.java |  3 --
 .../helix/util/TestInstanceValidationUtil.java     | 45 ++++++++++++++++++++++
 .../helix/rest/server/TestInstancesAccessor.java   |  2 +-
 5 files changed, 53 insertions(+), 6 deletions(-)

diff --git 
a/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
 
b/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
index ef950a185..1918dca76 100644
--- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
+++ 
b/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java
@@ -26,14 +26,14 @@ public class ParticipantDeregistrationStage extends 
AbstractAsyncBaseStage {
   @Override
   public void execute(ClusterEvent event) throws Exception {
     HelixManager manager = 
event.getAttribute(AttributeName.helixmanager.name());
-    ClusterConfig clusterConfig = 
manager.getConfigAccessor().getClusterConfig(manager.getClusterName());
+    ResourceControllerDataProvider cache = 
event.getAttribute(AttributeName.ControllerDataProvider.name());
+    ClusterConfig clusterConfig = cache.getClusterConfig();
     if (clusterConfig == null || 
!clusterConfig.isParticipantDeregistrationEnabled()) {
       LOG.debug("Cluster config is null or participant deregistration is not 
enabled. "
           + "Skipping participant deregistration.");
       return;
     }
 
-    ResourceControllerDataProvider cache = 
event.getAttribute(AttributeName.ControllerDataProvider.name());
     Map<String, Long> offlineTimeMap = cache.getInstanceOfflineTimeMap();
     long deregisterDelay = clusterConfig.getParticipantDeregistrationTimeout();
     long stageStartTime = System.currentTimeMillis();
diff --git 
a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java 
b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
index 5dea68334..981d43092 100644
--- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
+++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
@@ -449,6 +449,11 @@ public class InstanceValidationUtil {
         Map<String, String> stateByInstanceMap = 
externalView.getStateMap(partition);
         // found the resource hosted on the instance
         if (stateByInstanceMap.containsKey(instanceName)) {
+          // If this node's replica is in unhealthy state, skip the sibling 
check as removing this replica will not
+          // negatively affect availability.
+          if (unhealthyStates.contains(stateByInstanceMap.get(instanceName))) {
+            continue;
+          }
           int numHealthySiblings = 0;
           for (Map.Entry<String, String> entry : 
stateByInstanceMap.entrySet()) {
             String siblingInstanceName = entry.getKey();
diff --git 
a/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java 
b/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
index 57930fa71..111ea5fa6 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java
@@ -2,13 +2,10 @@ package org.apache.helix.integration;
 
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.List;
 import org.apache.helix.ConfigAccessor;
 import org.apache.helix.TestHelper;
 import org.apache.helix.common.ZkTestBase;
-import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer;
-import 
org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy;
 import org.apache.helix.controller.rebalancer.waged.WagedRebalancer;
 import org.apache.helix.integration.manager.ClusterControllerManager;
 import org.apache.helix.integration.manager.MockParticipantManager;
diff --git 
a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
 
b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
index 88dd05351..c95dbb707 100644
--- 
a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
+++ 
b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java
@@ -544,6 +544,51 @@ public class TestInstanceValidationUtil {
     InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor, 
TEST_INSTANCE);
   }
 
+  // Test that min active replica check only run on partitions where removing 
the instance's replica would reduce
+  // the partition's availability. This means min active check will not be 
performed on partitions where the instance's
+  // replica is in an unhealthy state
+  @Test
+  public void 
TestSiblingNodesActiveReplicaCheckSuccessWithReplicaInErrorState() {
+    String resource = "resource";
+    Mock mock = new Mock();
+    doReturn(ImmutableList.of(resource)).when(mock.dataAccessor)
+        .getChildNames(argThat(new 
PropertyKeyArgument(PropertyType.IDEALSTATES)));
+    // set ideal state
+    IdealState idealState = mock(IdealState.class);
+    when(idealState.isEnabled()).thenReturn(true);
+    when(idealState.isValid()).thenReturn(true);
+    when(idealState.getStateModelDefRef()).thenReturn("MasterSlave");
+    doReturn(idealState).when(mock.dataAccessor).getProperty(argThat(new 
PropertyKeyArgument(PropertyType.IDEALSTATES)));
+
+    ExternalView externalView = mock(ExternalView.class);
+    when(externalView.getMinActiveReplicas()).thenReturn(3);
+    when(externalView.getStateModelDefRef()).thenReturn("MasterSlave");
+    when(externalView.getPartitionSet()).thenReturn(ImmutableSet.of("db0"));
+    when(externalView.getStateMap("db0")).thenReturn(
+        ImmutableMap.of(TEST_INSTANCE, "ERROR", "instance1", "ERROR", 
"instance2", "ERROR"));
+    doReturn(externalView).when(mock.dataAccessor)
+        .getProperty(argThat(new 
PropertyKeyArgument(PropertyType.EXTERNALVIEW)));
+    StateModelDefinition stateModelDefinition = 
mock(StateModelDefinition.class);
+    when(stateModelDefinition.getInitialState()).thenReturn("OFFLINE");
+    doReturn(stateModelDefinition).when(mock.dataAccessor)
+        .getProperty(argThat(new 
PropertyKeyArgument(PropertyType.STATEMODELDEFS)));
+
+    // This min active replica check should pass as the instance's replica is 
in ERROR state, so it will not affect
+    // availability of the partition even though partition does not meet 
healthy min active replica count
+    boolean result =
+        
InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor, 
TEST_INSTANCE);
+    Assert.assertTrue(result);
+
+    // This min active replica check should fail as the instance is in a 
healthy state and removing it would reduce
+    // availability of the partition and partition would not meet the healthy 
min active replica count
+    when(externalView.getStateMap("db0")).thenReturn(
+        ImmutableMap.of(TEST_INSTANCE, "Master", "instance1", "ERROR", 
"instance2", "ERROR"));
+
+    result =
+        
InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor, 
TEST_INSTANCE);
+    Assert.assertFalse(result);
+  }
+
   private class Mock {
     HelixDataAccessor dataAccessor;
     ConfigAccessor configAccessor;
diff --git 
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
 
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
index 7fe4bae28..fc20d1d92 100644
--- 
a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
+++ 
b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
@@ -426,7 +426,7 @@ public class TestInstancesAccessor extends 
AbstractTestClass {
     JsonNode jsonResult = 
OBJECT_MAPPER.readTree(response.readEntity(String.class));
     Assert.assertFalse(jsonResult.get("stoppable").asBoolean());
     Assert.assertEquals(getStringSet(jsonResult, "failedChecks"),
-            
ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE","HELIX:MIN_ACTIVE_REPLICA_CHECK_FAILED"));
+            
ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE"));
 
     // Reenable instance0, it should passed the check
     
instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);

Reply via email to