This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch branch_10x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_10x by this push:
new 9ed0264875b fix ci flakiness leaderelection test (#4388)
9ed0264875b is described below
commit 9ed0264875bd8ece2bff37e85bc6e2c8a6d1c8af
Author: Eric Pugh <[email protected]>
AuthorDate: Fri May 8 17:52:30 2026 -0400
fix ci flakiness leaderelection test (#4388)
Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: epugh <[email protected]>
(cherry picked from commit 89c5413a8be368b687ee43abca0b1a62d770a553)
---
.../solr/cloud/LeaderElectionIntegrationTest.java | 48 +++++++++++-----------
.../test/org/apache/solr/cloud/ZkFailoverTest.java | 19 +++------
.../solr/cloud/ZkShardTermsRecoveryTest.java | 10 ++++-
3 files changed, 38 insertions(+), 39 deletions(-)
diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
index 57c84aaf41b..2e190f60f89 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
@@ -19,7 +19,6 @@ package org.apache.solr.cloud;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.ZkNodeProps;
@@ -61,7 +60,10 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
String collection = "collection1";
createCollection(collection);
- cluster.waitForActiveCollection(collection, 10, TimeUnit.SECONDS, 2, 6);
+ waitForState(
+ "Timeout waiting for collection to become active",
+ collection,
+ clusterShape(2, NUM_REPLICAS_OF_SHARD1 + 1));
List<JettySolrRunner> stoppedRunners = new ArrayList<>();
for (int i = 0; i < 4; i++) {
// who is the leader?
@@ -107,15 +109,20 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
assertNotNull(jetty);
cluster.expireZkSession(jetty);
- for (int i = 0; i < 60; i++) { // wait till leader is changed
- if (jetty != getRunner(getLeader(collection))) {
- break;
- }
- Thread.sleep(100);
- }
-
- // make sure we have waited long enough for the first leader to have come back
- Thread.sleep(ZkTestServer.TICK_TIME * 2 + 100);
+ // Wait until leadership has moved away from the expired-session node
+ waitForState(
+ "Expected leader to move away after expiring zk session",
+ collection,
+ c -> {
+ var l = c.getLeader("shard1");
+ return l != null && !jetty.getNodeName().equals(l.getNodeName());
+ });
+
+ // Wait until the expired-session node is live again before stopping others
+ waitForState(
+ "Expected expired-session node to rejoin live nodes",
+ collection,
+ (liveNodes, c) -> liveNodes.contains(jetty.getNodeName()));
// kill everyone but the first leader that should have reconnected by now
for (JettySolrRunner jetty2 : cluster.getJettySolrRunners()) {
@@ -124,18 +131,13 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
}
}
- for (int i = 0; i < 320; i++) { // wait till leader is changed
- try {
- if (jetty == getRunner(getLeader(collection))) {
- break;
- }
- Thread.sleep(100);
- } catch (Exception e) {
- continue;
- }
- }
-
- assertEquals(jetty, getRunner(getLeader(collection)));
+ waitForState(
+ "Expected original node to become leader after others stopped",
+ collection,
+ c -> {
+ var l = c.getLeader("shard1");
+ return l != null && jetty.getNodeName().equals(l.getNodeName());
+ });
}
private JettySolrRunner getRunner(String nodeName) {
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
index d97fdfe84be..36febd0495c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
@@ -23,9 +23,7 @@ import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.embedded.JettySolrRunner;
-import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.slf4j.Logger;
@@ -60,12 +58,15 @@ public class ZkFailoverTest extends SolrCloudTestCase {
// This attempt will fail since it will time out after 1 second
System.setProperty("solr.cloud.wait.for.zk.seconds", "1");
restartSolrAndZk();
- waitForLiveNodes(0);
+ waitForState("Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> liveNodes.isEmpty());
// This attempt will succeed since there will be enough time to connect
System.setProperty("solr.cloud.wait.for.zk.seconds", "20");
restartSolrAndZk();
- waitForLiveNodes(cluster.getJettySolrRunners().size());
+ waitForState(
+ "Timeout waiting for all nodes to come up",
+ coll,
+ (liveNodes, c) -> liveNodes.size() == cluster.getJettySolrRunners().size());
waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 2));
QueryResponse rsp =
new QueryRequest(new SolrQuery("*:*")).process(cluster.getSolrClient(), coll);
@@ -99,14 +100,4 @@ public class ZkFailoverTest extends SolrCloudTestCase {
thread.join();
}
}
-
- private void waitForLiveNodes(int numNodes) throws InterruptedException, KeeperException {
- ZkStateReader zkStateReader = cluster.getZkStateReader();
- for (int i = 0; i < 100; i++) {
- zkStateReader.updateLiveNodes();
- if (zkStateReader.getClusterState().getLiveNodes().size() == numNodes) return;
- Thread.sleep(200);
- }
- fail("Timeout waiting for number of live nodes = " + numNodes);
- }
}
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java
index f52b0fce8db..ac1e9177fd0 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java
@@ -52,14 +52,20 @@ public class ZkShardTermsRecoveryTest extends SolrCloudTestCase {
CollectionAdminRequest.createCollection(COLLECTION, "conf", NUM_SHARDS, NUM_REPLICAS)
.process(cluster.getSolrClient())
.getStatus());
- cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS);
+ waitForState(
+ "Timeout waiting for collection to be active after creation",
+ COLLECTION,
+ clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
}
@Before
public void waitForActiveState() throws Exception {
CollectionAdminRequest.modifyCollection(COLLECTION, Map.of("readOnly", false))
.process(cluster.getSolrClient());
- cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS);
+ waitForState(
+ "Timeout waiting for active collection",
+ COLLECTION,
+ clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
}
@Test