This is an automated email from the ASF dual-hosted git repository.
xyuanlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new 4738f6dd2 [apache/helix] -- Fixes #2638, Improve Hard Constraint
Failure Debuggability by adding details in the error message (#2639)
4738f6dd2 is described below
commit 4738f6dd225a8b8609c7cf479c695a050e7b127e
Author: Himanshu Kandwal <[email protected]>
AuthorDate: Fri Oct 20 00:00:50 2023 -0700
[apache/helix] -- Fixes #2638, Improve Hard Constraint Failure
Debuggability by adding details in the error message (#2639)
n this PR, we are logging the specific details of the hard constraint
validation failures, that improvs the issue debuggability.
---
.../constraints/FaultZoneAwareConstraint.java | 19 +++++--
.../waged/constraints/NodeCapacityConstraint.java | 8 ++-
.../NodeMaxPartitionLimitConstraint.java | 27 ++++++++--
.../constraints/ReplicaActivateConstraint.java | 16 ++++--
.../SamePartitionOnInstanceConstraint.java | 15 +++++-
.../waged/constraints/ValidGroupTagConstraint.java | 12 ++++-
.../constraints/TestConstraintBasedAlgorithm.java | 29 ++++++++++-
.../constraints/TestFaultZoneAwareConstraint.java | 5 +-
...int.java => TestReplicaActivateConstraint.java} | 59 ++++++++++------------
9 files changed, 138 insertions(+), 52 deletions(-)
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java
index cf3e32241..e0992b5d4 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/FaultZoneAwareConstraint.java
@@ -19,21 +19,34 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
* under the License.
*/
+import java.util.Set;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
class FaultZoneAwareConstraint extends HardConstraint {
+ private static final Logger LOG =
LoggerFactory.getLogger(FaultZoneAwareConstraint.class);
+
@Override
boolean isAssignmentValid(AssignableNode node, AssignableReplica replica,
ClusterContext clusterContext) {
if (!node.hasFaultZone()) {
return true;
}
- return !clusterContext
- .getPartitionsForResourceAndFaultZone(replica.getResourceName(),
node.getFaultZone())
- .contains(replica.getPartitionName());
+
+ Set<String> partitionsForResourceAndFaultZone =
+
clusterContext.getPartitionsForResourceAndFaultZone(replica.getResourceName(),
node.getFaultZone());
+
+ if
(partitionsForResourceAndFaultZone.contains(replica.getPartitionName())) {
+ LOG.debug("A fault zone cannot contain more than 1 replica of same
partition. Found replica for partition: {}",
+ replica.getPartitionName());
+ return false;
+ }
+ return true;
}
@Override
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java
index f18c2ffa5..056d7f842 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeCapacityConstraint.java
@@ -20,13 +20,17 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
*/
import java.util.Map;
-
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
class NodeCapacityConstraint extends HardConstraint {
+ private static final Logger LOG =
LoggerFactory.getLogger(NodeCapacityConstraint.class);
+
@Override
boolean isAssignmentValid(AssignableNode node, AssignableReplica replica,
ClusterContext clusterContext) {
@@ -36,6 +40,8 @@ class NodeCapacityConstraint extends HardConstraint {
for (String key : replicaCapacity.keySet()) {
if (nodeCapacity.containsKey(key)) {
if (nodeCapacity.get(key) < replicaCapacity.get(key)) {
+ LOG.debug("Node has insufficient capacity for: {}. Left available:
{}, Required: {}",
+ key, nodeCapacity.get(key), replicaCapacity.get(key));
return false;
}
}
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java
index a5e29e926..deedf4aa5 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/NodeMaxPartitionLimitConstraint.java
@@ -22,18 +22,37 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
class NodeMaxPartitionLimitConstraint extends HardConstraint {
+ private static final Logger LOG =
LoggerFactory.getLogger(NodeMaxPartitionLimitConstraint.class);
+
@Override
boolean isAssignmentValid(AssignableNode node, AssignableReplica replica,
ClusterContext clusterContext) {
boolean exceedMaxPartitionLimit =
node.getMaxPartition() < 0 || node.getAssignedReplicaCount() <
node.getMaxPartition();
- boolean exceedResourceMaxPartitionLimit =
replica.getResourceMaxPartitionsPerInstance() < 0
- ||
node.getAssignedPartitionsByResource(replica.getResourceName()).size() < replica
- .getResourceMaxPartitionsPerInstance();
- return exceedMaxPartitionLimit && exceedResourceMaxPartitionLimit;
+
+ if (!exceedMaxPartitionLimit) {
+ LOG.debug("Cannot exceed the max number of partitions ({}) limitation on
node. Assigned replica count: {}",
+ node.getMaxPartition(), node.getAssignedReplicaCount());
+ return false;
+ }
+
+ int resourceMaxPartitionsPerInstance =
replica.getResourceMaxPartitionsPerInstance();
+ int assignedPartitionsByResourceSize =
node.getAssignedPartitionsByResource(replica.getResourceName()).size();
+ boolean exceedResourceMaxPartitionLimit = resourceMaxPartitionsPerInstance
< 0
+ || assignedPartitionsByResourceSize < resourceMaxPartitionsPerInstance;
+
+ if (!exceedResourceMaxPartitionLimit) {
+ LOG.debug("Cannot exceed the max number of partitions per resource ({})
limitation on node. Assigned replica count: {}",
+ resourceMaxPartitionsPerInstance, assignedPartitionsByResourceSize);
+ return false;
+ }
+ return true;
}
@Override
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java
index e06fe5922..13e380a0a 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ReplicaActivateConstraint.java
@@ -24,14 +24,24 @@ import java.util.List;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
class ReplicaActivateConstraint extends HardConstraint {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(ReplicaActivateConstraint.class);
+
@Override
boolean isAssignmentValid(AssignableNode node, AssignableReplica replica,
ClusterContext clusterContext) {
- List<String> disabledPartitions =
- node.getDisabledPartitionsMap().get(replica.getResourceName());
- return disabledPartitions == null ||
!disabledPartitions.contains(replica.getPartitionName());
+ List<String> disabledPartitions =
node.getDisabledPartitionsMap().get(replica.getResourceName());
+
+ if (disabledPartitions != null &&
disabledPartitions.contains(replica.getPartitionName())) {
+ LOG.debug("Cannot assign the inactive replica: {}",
replica.getPartitionName());
+ return false;
+ }
+ return true;
}
@Override
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java
index 8f6b523b4..e6de9989c 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/SamePartitionOnInstanceConstraint.java
@@ -19,17 +19,28 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
* under the License.
*/
+import java.util.Set;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
class SamePartitionOnInstanceConstraint extends HardConstraint {
+ private static final Logger LOG =
LoggerFactory.getLogger(SamePartitionOnInstanceConstraint.class);
+
@Override
boolean isAssignmentValid(AssignableNode node, AssignableReplica replica,
ClusterContext clusterContext) {
- return !node.getAssignedPartitionsByResource(replica.getResourceName())
- .contains(replica.getPartitionName());
+ Set<String> assignedPartitionsByResource =
node.getAssignedPartitionsByResource(replica.getResourceName());
+
+ if (assignedPartitionsByResource.contains(replica.getPartitionName())) {
+ LOG.debug("Same partition ({}) of different states cannot co-exist in
one instance", replica.getPartitionName());
+ return false;
+ }
+ return true;
}
@Override
diff --git
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java
index 2fd3743f3..b1e257c87 100644
---
a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java
+++
b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/constraints/ValidGroupTagConstraint.java
@@ -22,8 +22,14 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
class ValidGroupTagConstraint extends HardConstraint {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(SamePartitionOnInstanceConstraint.class);
+
@Override
boolean isAssignmentValid(AssignableNode node, AssignableReplica replica,
ClusterContext clusterContext) {
@@ -31,7 +37,11 @@ class ValidGroupTagConstraint extends HardConstraint {
return true;
}
- return
node.getInstanceTags().contains(replica.getResourceInstanceGroupTag());
+ if
(!node.getInstanceTags().contains(replica.getResourceInstanceGroupTag())) {
+ LOG.debug("Instance doesn't have the tag of the replica ({})",
replica.getResourceInstanceGroupTag());
+ return false;
+ }
+ return true;
}
@Override
diff --git
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java
index 3ba92ef57..0a75002f1 100644
---
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java
+++
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestConstraintBasedAlgorithm.java
@@ -19,11 +19,14 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
* under the License.
*/
-import java.io.IOException;
-
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
import org.apache.helix.HelixRebalanceException;
+import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterModel;
import
org.apache.helix.controller.rebalancer.waged.model.ClusterModelTestHelper;
import org.apache.helix.controller.rebalancer.waged.model.OptimalAssignment;
@@ -118,4 +121,26 @@ public class TestConstraintBasedAlgorithm {
"The cluster does not have enough item1 capacity for all partitions.
Failure Type: FAILED_TO_CALCULATE");
}
}
+
+ @Test
+ public void testCalculateWithInvalidAssignmentForNodeCapacity() throws
IOException {
+ HardConstraint nodeCapacityConstraint = new NodeCapacityConstraint();
+ SoftConstraint soft1 = new MaxCapacityUsageInstanceConstraint();
+ SoftConstraint soft2 = new InstancePartitionsCountConstraint();
+ ConstraintBasedAlgorithm algorithm =
+ new ConstraintBasedAlgorithm(ImmutableList.of(nodeCapacityConstraint),
+ ImmutableMap.of(soft1, 1f, soft2, 1f));
+ ClusterModel clusterModel = new
ClusterModelTestHelper().getMultiNodeClusterModel();
+ // increase the ask capacity of item 3, which will trigger the capacity
constraint to fail.
+ Map<String, Set<AssignableReplica>> assignableReplicaMap = new
HashMap<>(clusterModel.getAssignableReplicaMap());
+ Set<AssignableReplica> resourceAssignableReplicas =
assignableReplicaMap.get("Resource3");
+ AssignableReplica replica = resourceAssignableReplicas.iterator().next();
+ replica.getCapacity().put("item3", 40); // available: 30, requested: 40.
+
+ try {
+ algorithm.calculate(clusterModel);
+ } catch (HelixRebalanceException ex) {
+ Assert.assertEquals(ex.getFailureType(),
HelixRebalanceException.Type.FAILED_TO_CALCULATE);
+ }
+ }
}
diff --git
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
index f5b34e3d6..2de5af93a 100644
---
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
+++
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
@@ -19,8 +19,7 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
* under the License.
*/
-import static org.mockito.Mockito.when;
-
+import com.google.common.collect.ImmutableSet;
import java.util.Collections;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
@@ -31,7 +30,7 @@ import org.testng.Assert;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
-import com.google.common.collect.ImmutableSet;
+import static org.mockito.Mockito.*;
public class TestFaultZoneAwareConstraint {
private static final String TEST_PARTITION = "testPartition";
diff --git
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestReplicaActivateConstraint.java
similarity index 56%
copy from
helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
copy to
helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestReplicaActivateConstraint.java
index f5b34e3d6..359398280 100644
---
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestFaultZoneAwareConstraint.java
+++
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/constraints/TestReplicaActivateConstraint.java
@@ -19,61 +19,54 @@ package
org.apache.helix.controller.rebalancer.waged.constraints;
* under the License.
*/
-import static org.mockito.Mockito.when;
-
+import java.util.ArrayList;
import java.util.Collections;
-
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
import org.apache.helix.controller.rebalancer.waged.model.AssignableNode;
import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica;
import org.apache.helix.controller.rebalancer.waged.model.ClusterContext;
import org.mockito.Mockito;
import org.testng.Assert;
-import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
-import com.google.common.collect.ImmutableSet;
+import static org.mockito.Mockito.*;
+
+
+public class TestReplicaActivateConstraint {
-public class TestFaultZoneAwareConstraint {
- private static final String TEST_PARTITION = "testPartition";
- private static final String TEST_ZONE = "testZone";
private static final String TEST_RESOURCE = "testResource";
+ private static final String TEST_PARTITION = "testPartition";
+
private final AssignableReplica _testReplica =
Mockito.mock(AssignableReplica.class);
private final AssignableNode _testNode = Mockito.mock(AssignableNode.class);
private final ClusterContext _clusterContext =
Mockito.mock(ClusterContext.class);
- private final HardConstraint _faultZoneAwareConstraint = new
FaultZoneAwareConstraint();
+ private final HardConstraint _faultZoneAwareConstraint = new
ReplicaActivateConstraint();
+
+ @Test
+ public void validWhenEmptyDisabledReplicaMap() {
+ Map<String, List<String>> disabledReplicaMap = new HashMap<>();
+ disabledReplicaMap.put(TEST_RESOURCE, new ArrayList<>());
- @BeforeMethod
- public void init() {
when(_testReplica.getResourceName()).thenReturn(TEST_RESOURCE);
when(_testReplica.getPartitionName()).thenReturn(TEST_PARTITION);
- when(_testNode.getFaultZone()).thenReturn(TEST_ZONE);
- }
-
- @Test
- public void inValidWhenFaultZoneAlreadyAssigned() {
- when(_testNode.hasFaultZone()).thenReturn(true);
- when(_clusterContext.getPartitionsForResourceAndFaultZone(TEST_RESOURCE,
TEST_ZONE)).thenReturn(
- ImmutableSet.of(TEST_PARTITION));
+ when(_testNode.getDisabledPartitionsMap()).thenReturn(disabledReplicaMap);
- Assert.assertFalse(
- _faultZoneAwareConstraint.isAssignmentValid(_testNode, _testReplica,
_clusterContext));
+ Assert.assertTrue(_faultZoneAwareConstraint.isAssignmentValid(_testNode,
_testReplica, _clusterContext));
}
@Test
- public void validWhenEmptyAssignment() {
- when(_testNode.hasFaultZone()).thenReturn(true);
- when(_clusterContext.getPartitionsForResourceAndFaultZone(TEST_RESOURCE,
TEST_ZONE)).thenReturn(Collections.emptySet());
-
- Assert.assertTrue(
- _faultZoneAwareConstraint.isAssignmentValid(_testNode, _testReplica,
_clusterContext));
- }
+ public void invalidWhenPartitionIsDisabled() {
+ Map<String, List<String>> disabledReplicaMap = new HashMap<>();
+ disabledReplicaMap.put(TEST_RESOURCE,
Collections.singletonList(TEST_PARTITION));
- @Test
- public void validWhenNoFaultZone() {
- when(_testNode.hasFaultZone()).thenReturn(false);
+ when(_testReplica.getResourceName()).thenReturn(TEST_RESOURCE);
+ when(_testReplica.getPartitionName()).thenReturn(TEST_PARTITION);
+ when(_testNode.getDisabledPartitionsMap()).thenReturn(disabledReplicaMap);
- Assert.assertTrue(
- _faultZoneAwareConstraint.isAssignmentValid(_testNode, _testReplica,
_clusterContext));
+ Assert.assertFalse(_faultZoneAwareConstraint.isAssignmentValid(_testNode,
_testReplica, _clusterContext));
}
+
}