This is an automated email from the ASF dual-hosted git repository.
alizamus pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git
The following commit(s) were added to refs/heads/master by this push:
new 80de34b Fix TestCrushAutoRebalanceNonRack.testLackEnoughInstances
unstable issue (#1630) (#1631)
80de34b is described below
commit 80de34b6facac640e50cc3692fd589bbcfd8caeb
Author: kaisun2000 <[email protected]>
AuthorDate: Fri Feb 5 15:25:43 2021 -0800
Fix TestCrushAutoRebalanceNonRack.testLackEnoughInstances unstable issue
(#1630) (#1631)
This test still fails in production due to a race condition: the
controller sends messages asynchronously
relative to the test dropping an instance. Thus, when there is a message on
the queue, dropping the instance throws
an exception. Here the test is stabilized by stopping the controller as a
workaround to avoid this race condition.
---
.../CrushRebalancers/TestCrushAutoRebalanceNonRack.java | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git
a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestCrushAutoRebalanceNonRack.java
b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestCrushAutoRebalanceNonRack.java
index 3db6909..2b3c91b 100644
---
a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestCrushAutoRebalanceNonRack.java
+++
b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestCrushAutoRebalanceNonRack.java
@@ -264,6 +264,12 @@ public class TestCrushAutoRebalanceNonRack extends
ZkStandAloneCMTestBase {
System.out.println("TestLackEnoughInstances " + rebalanceStrategyName);
enablePersistBestPossibleAssignment(_gZkClient, CLUSTER_NAME, true);
+ // Dropping an instance via admin tools and the controller sending messages to the
same instance are
+ // fundamentally async. The race condition can also happen in production.
For now we stabilize
+ // the test by disabling the controller and re-enabling it afterwards to eliminate
this race condition as
+ // a workaround. A new design is needed to fundamentally resolve the exposed
issue.
+ _controller.syncStop();
+
// shutdown participants, keep only two left
HelixDataAccessor helixDataAccessor =
new ZKHelixDataAccessor(CLUSTER_NAME, InstanceType.PARTICIPANT,
_baseAccessor);
@@ -281,6 +287,10 @@ public class TestCrushAutoRebalanceNonRack extends
ZkStandAloneCMTestBase {
_gSetupTool.dropInstanceFromCluster(CLUSTER_NAME, p.getInstanceName());
}
+ String controllerName = CONTROLLER_PREFIX + "_0";
+ _controller = new ClusterControllerManager(ZK_ADDR, CLUSTER_NAME,
controllerName);
+ _controller.syncStart();
+
int j = 0;
for (String stateModel : _testModels) {
String db = "Test-DB-" + rebalanceStrategyName + "-" + j++;