This is an automated email from the ASF dual-hosted git repository.

epugh pushed a commit to branch branch_10x
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_10x by this push:
     new 9ed0264875b fix ci flakiness leaderelection test (#4388)
9ed0264875b is described below

commit 9ed0264875bd8ece2bff37e85bc6e2c8a6d1c8af
Author: Eric Pugh <[email protected]>
AuthorDate: Fri May 8 17:52:30 2026 -0400

    fix ci flakiness leaderelection test (#4388)
    
    Co-authored-by: copilot-swe-agent[bot] <[email protected]>
    Co-authored-by: epugh <[email protected]>
    (cherry picked from commit 89c5413a8be368b687ee43abca0b1a62d770a553)
---
 .../solr/cloud/LeaderElectionIntegrationTest.java  | 48 +++++++++++-----------
 .../test/org/apache/solr/cloud/ZkFailoverTest.java | 19 +++------
 .../solr/cloud/ZkShardTermsRecoveryTest.java       | 10 ++++-
 3 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
index 57c84aaf41b..2e190f60f89 100644
--- a/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/LeaderElectionIntegrationTest.java
@@ -19,7 +19,6 @@ package org.apache.solr.cloud;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.TimeUnit;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.common.cloud.ZkNodeProps;
@@ -61,7 +60,10 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
     String collection = "collection1";
     createCollection(collection);
 
-    cluster.waitForActiveCollection(collection, 10, TimeUnit.SECONDS, 2, 6);
+    waitForState(
+        "Timeout waiting for collection to become active",
+        collection,
+        clusterShape(2, NUM_REPLICAS_OF_SHARD1 + 1));
     List<JettySolrRunner> stoppedRunners = new ArrayList<>();
     for (int i = 0; i < 4; i++) {
       // who is the leader?
@@ -107,15 +109,20 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
     assertNotNull(jetty);
     cluster.expireZkSession(jetty);
 
-    for (int i = 0; i < 60; i++) { // wait till leader is changed
-      if (jetty != getRunner(getLeader(collection))) {
-        break;
-      }
-      Thread.sleep(100);
-    }
-
-    // make sure we have waited long enough for the first leader to have come 
back
-    Thread.sleep(ZkTestServer.TICK_TIME * 2 + 100);
+    // Wait until leadership has moved away from the expired-session node
+    waitForState(
+        "Expected leader to move away after expiring zk session",
+        collection,
+        c -> {
+          var l = c.getLeader("shard1");
+          return l != null && !jetty.getNodeName().equals(l.getNodeName());
+        });
+
+    // Wait until the expired-session node is live again before stopping others
+    waitForState(
+        "Expected expired-session node to rejoin live nodes",
+        collection,
+        (liveNodes, c) -> liveNodes.contains(jetty.getNodeName()));
 
     // kill everyone but the first leader that should have reconnected by now
     for (JettySolrRunner jetty2 : cluster.getJettySolrRunners()) {
@@ -124,18 +131,13 @@ public class LeaderElectionIntegrationTest extends SolrCloudTestCase {
       }
     }
 
-    for (int i = 0; i < 320; i++) { // wait till leader is changed
-      try {
-        if (jetty == getRunner(getLeader(collection))) {
-          break;
-        }
-        Thread.sleep(100);
-      } catch (Exception e) {
-        continue;
-      }
-    }
-
-    assertEquals(jetty, getRunner(getLeader(collection)));
+    waitForState(
+        "Expected original node to become leader after others stopped",
+        collection,
+        c -> {
+          var l = c.getLeader("shard1");
+          return l != null && jetty.getNodeName().equals(l.getNodeName());
+        });
   }
 
   private JettySolrRunner getRunner(String nodeName) {
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
index d97fdfe84be..36febd0495c 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkFailoverTest.java
@@ -23,9 +23,7 @@ import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.SolrQuery;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.SolrInputDocument;
-import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.embedded.JettySolrRunner;
-import org.apache.zookeeper.KeeperException;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.slf4j.Logger;
@@ -60,12 +58,15 @@ public class ZkFailoverTest extends SolrCloudTestCase {
     // This attempt will fail since it will time out after 1 second
     System.setProperty("solr.cloud.wait.for.zk.seconds", "1");
     restartSolrAndZk();
-    waitForLiveNodes(0);
+    waitForState("Timeout waiting for 0 live nodes", coll, (liveNodes, c) -> 
liveNodes.isEmpty());
 
     // This attempt will succeed since there will be enough time to connect
     System.setProperty("solr.cloud.wait.for.zk.seconds", "20");
     restartSolrAndZk();
-    waitForLiveNodes(cluster.getJettySolrRunners().size());
+    waitForState(
+        "Timeout waiting for all nodes to come up",
+        coll,
+        (liveNodes, c) -> liveNodes.size() == 
cluster.getJettySolrRunners().size());
     waitForState("Timeout waiting for " + coll, coll, clusterShape(2, 2));
     QueryResponse rsp =
         new QueryRequest(new 
SolrQuery("*:*")).process(cluster.getSolrClient(), coll);
@@ -99,14 +100,4 @@ public class ZkFailoverTest extends SolrCloudTestCase {
       thread.join();
     }
   }
-
-  private void waitForLiveNodes(int numNodes) throws InterruptedException, 
KeeperException {
-    ZkStateReader zkStateReader = cluster.getZkStateReader();
-    for (int i = 0; i < 100; i++) {
-      zkStateReader.updateLiveNodes();
-      if (zkStateReader.getClusterState().getLiveNodes().size() == numNodes) 
return;
-      Thread.sleep(200);
-    }
-    fail("Timeout waiting for number of live nodes = " + numNodes);
-  }
 }
diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java
index f52b0fce8db..ac1e9177fd0 100644
--- a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java
@@ -52,14 +52,20 @@ public class ZkShardTermsRecoveryTest extends SolrCloudTestCase {
         CollectionAdminRequest.createCollection(COLLECTION, "conf", 
NUM_SHARDS, NUM_REPLICAS)
             .process(cluster.getSolrClient())
             .getStatus());
-    cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, 
NUM_SHARDS * NUM_REPLICAS);
+    waitForState(
+        "Timeout waiting for collection to be active after creation",
+        COLLECTION,
+        clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
   }
 
   @Before
   public void waitForActiveState() throws Exception {
     CollectionAdminRequest.modifyCollection(COLLECTION, Map.of("readOnly", 
false))
         .process(cluster.getSolrClient());
-    cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, 
NUM_SHARDS * NUM_REPLICAS);
+    waitForState(
+        "Timeout waiting for active collection",
+        COLLECTION,
+        clusterShape(NUM_SHARDS, NUM_SHARDS * NUM_REPLICAS));
   }
 
   @Test

Reply via email to