This is an automated email from the ASF dual-hosted git repository. jxue pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push: new 2b33ffebe skip min active check if instance replica in unhealthy state (#3019) 2b33ffebe is described below commit 2b33ffebea4cd485c4c1e23cdefc886d95686292 Author: Grant Paláu Spencer <gspen...@linkedin.com> AuthorDate: Wed Apr 30 13:49:58 2025 -0700 skip min active check if instance replica in unhealthy state (#3019) Stoppable API will skip the min active replica check for a replica if that replica is already in an unhealthy state. This is because stopping the instance will not affect that partition's availability as the replica is already in an unhealthy state (offline/error). Previously, this would prevent a node with an error state replica from being stopped if min_active for that partition was not satisfied. With this change, we will not consider the sister replicas and drop anyways. --- .../stages/ParticipantDeregistrationStage.java | 4 +- .../apache/helix/util/InstanceValidationUtil.java | 5 +++ .../org/apache/helix/integration/TestWagedNPE.java | 3 -- .../helix/util/TestInstanceValidationUtil.java | 45 ++++++++++++++++++++++ .../helix/rest/server/TestInstancesAccessor.java | 2 +- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java index ef950a185..1918dca76 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ParticipantDeregistrationStage.java @@ -26,14 +26,14 @@ public class ParticipantDeregistrationStage extends AbstractAsyncBaseStage { @Override public void execute(ClusterEvent event) throws Exception { HelixManager manager = event.getAttribute(AttributeName.helixmanager.name()); - ClusterConfig clusterConfig = manager.getConfigAccessor().getClusterConfig(manager.getClusterName()); + ResourceControllerDataProvider cache = event.getAttribute(AttributeName.ControllerDataProvider.name()); + ClusterConfig clusterConfig = cache.getClusterConfig(); if (clusterConfig == null || !clusterConfig.isParticipantDeregistrationEnabled()) { LOG.debug("Cluster config is null or participant deregistration is not enabled. " + "Skipping participant deregistration."); return; } - ResourceControllerDataProvider cache = event.getAttribute(AttributeName.ControllerDataProvider.name()); Map<String, Long> offlineTimeMap = cache.getInstanceOfflineTimeMap(); long deregisterDelay = clusterConfig.getParticipantDeregistrationTimeout(); long stageStartTime = System.currentTimeMillis(); diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java index 5dea68334..981d43092 100644 --- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java +++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java @@ -449,6 +449,11 @@ public class InstanceValidationUtil { Map<String, String> stateByInstanceMap = externalView.getStateMap(partition); // found the resource hosted on the instance if (stateByInstanceMap.containsKey(instanceName)) { + // If this node's replica is in unhealthy state, skip the sibling check as removing this replica will not + // negatively affect availability. + if (unhealthyStates.contains(stateByInstanceMap.get(instanceName))) { + continue; + } int numHealthySiblings = 0; for (Map.Entry<String, String> entry : stateByInstanceMap.entrySet()) { String siblingInstanceName = entry.getKey(); diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java b/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java index 57930fa71..111ea5fa6 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestWagedNPE.java @@ -2,13 +2,10 @@ package org.apache.helix.integration; import java.util.ArrayList; import java.util.Collections; -import java.util.HashSet; import java.util.List; import org.apache.helix.ConfigAccessor; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; -import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer; -import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.waged.WagedRebalancer; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; diff --git a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java index 88dd05351..c95dbb707 100644 --- a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java +++ b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java @@ -544,6 +544,51 @@ public class TestInstanceValidationUtil { InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor, TEST_INSTANCE); } + // Test that min active replica check only run on partitions where removing the instance's replica would reduce + // the partition's availability. This means min active check will not be performed on partitions where the instance's + // replica is in an unhealthy state + @Test + public void TestSiblingNodesActiveReplicaCheckSuccessWithReplicaInErrorState() { + String resource = "resource"; + Mock mock = new Mock(); + doReturn(ImmutableList.of(resource)).when(mock.dataAccessor) + .getChildNames(argThat(new PropertyKeyArgument(PropertyType.IDEALSTATES))); + // set ideal state + IdealState idealState = mock(IdealState.class); + when(idealState.isEnabled()).thenReturn(true); + when(idealState.isValid()).thenReturn(true); + when(idealState.getStateModelDefRef()).thenReturn("MasterSlave"); + doReturn(idealState).when(mock.dataAccessor).getProperty(argThat(new PropertyKeyArgument(PropertyType.IDEALSTATES))); + + ExternalView externalView = mock(ExternalView.class); + when(externalView.getMinActiveReplicas()).thenReturn(3); + when(externalView.getStateModelDefRef()).thenReturn("MasterSlave"); + when(externalView.getPartitionSet()).thenReturn(ImmutableSet.of("db0")); + when(externalView.getStateMap("db0")).thenReturn( + ImmutableMap.of(TEST_INSTANCE, "ERROR", "instance1", "ERROR", "instance2", "ERROR")); + doReturn(externalView).when(mock.dataAccessor) + .getProperty(argThat(new PropertyKeyArgument(PropertyType.EXTERNALVIEW))); + StateModelDefinition stateModelDefinition = mock(StateModelDefinition.class); + when(stateModelDefinition.getInitialState()).thenReturn("OFFLINE"); + doReturn(stateModelDefinition).when(mock.dataAccessor) + .getProperty(argThat(new PropertyKeyArgument(PropertyType.STATEMODELDEFS))); + + // This min active replica check should pass as the instance's replica is in ERROR state, so it will not affect + // availability of the partition even though partition does not meet healthy min active replica count + boolean result = + InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor, TEST_INSTANCE); + Assert.assertTrue(result); + + // This min active replica check should fail as the instance is in a healthy state and removing it would reduce + // availability of the partition and partition would not meet the healthy min active replica count + when(externalView.getStateMap("db0")).thenReturn( + ImmutableMap.of(TEST_INSTANCE, "Master", "instance1", "ERROR", "instance2", "ERROR")); + + result = + InstanceValidationUtil.siblingNodesActiveReplicaCheck(mock.dataAccessor, TEST_INSTANCE); + Assert.assertFalse(result); + } + private class Mock { HelixDataAccessor dataAccessor; ConfigAccessor configAccessor; diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java index 7fe4bae28..fc20d1d92 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java @@ -426,7 +426,7 @@ public class TestInstancesAccessor extends AbstractTestClass { JsonNode jsonResult = OBJECT_MAPPER.readTree(response.readEntity(String.class)); Assert.assertFalse(jsonResult.get("stoppable").asBoolean()); Assert.assertEquals(getStringSet(jsonResult, "failedChecks"), - ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE","HELIX:MIN_ACTIVE_REPLICA_CHECK_FAILED")); + ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE")); // Reenable instance0, it should passed the check instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);