dsmiley commented on code in PR #2737:
URL: https://github.com/apache/solr/pull/2737#discussion_r1798764375


##########
solr/core/src/test/org/apache/solr/cloud/TestRebalanceLeaders.java:
##########
@@ -604,74 +572,61 @@ private void forceUpdateCollectionStatus() {
 
   // Since we have to restart jettys, we don't want to try re-balancing etc. 
until we're sure all
   // jettys that should be up are and all replicas are active.
-  private void checkReplicasInactive(List<JettySolrRunner> downJettys) throws 
InterruptedException {
-    TimeOut timeout = new TimeOut(timeoutMs, TimeUnit.MILLISECONDS, 
TimeSource.NANO_TIME);
-    DocCollection docCollection = null;
-    Set<String> liveNodes = null;
+  private void checkReplicasInactive(List<JettySolrRunner> downJettys) {
 
     Set<String> downJettyNodes = new TreeSet<>();
     for (JettySolrRunner jetty : downJettys) {
       downJettyNodes.add(
           jetty.getBaseUrl().getHost() + ":" + jetty.getBaseUrl().getPort() + 
"_solr");
     }
-    while (timeout.hasTimedOut() == false) {
-      forceUpdateCollectionStatus();
-      docCollection = 
cluster.getSolrClient().getClusterState().getCollection(COLLECTION_NAME);
-      liveNodes = cluster.getSolrClient().getClusterState().getLiveNodes();
-      boolean expectedInactive = true;
-
-      for (Slice slice : docCollection.getSlices()) {
-        for (Replica rep : slice.getReplicas()) {
-          if (downJettyNodes.contains(rep.getNodeName()) == false) {
-            continue; // We are on a live node
-          }
-          // A replica on an allegedly down node is reported as active.
-          if (rep.isActive(liveNodes)) {
-            expectedInactive = false;
+
+    waitForState(
+        "Waiting for all replicas to become inactive",
+        COLLECTION_NAME,
+        (liveNodes, docCollection) -> {
+          boolean expectedInactive = true;
+
+          for (Slice slice : docCollection.getSlices()) {
+            for (Replica rep : slice.getReplicas()) {
+              if (!downJettyNodes.contains(rep.getNodeName())) {

Review Comment:
   I don't recommend flipping this wherever you see it, but you are quite 
welcome to use either style.  Over 10 years ago, Lucene (which then included 
Solr) developers started preferring the higher clarity of == false over an 
exclamation mark — I recall it was due to the visual subtlety of a single 
character.  Nevertheless, there's no code standard on this.



##########
solr/core/src/java/org/apache/solr/cloud/api/collections/CreateCollectionCmd.java:
##########
@@ -221,24 +223,19 @@ public void call(ClusterState clusterState, ZkNodeProps 
message, NamedList<Objec
         }
 
         // wait for a while until we see the collection
-        TimeOut waitUntil =
-            new TimeOut(30, TimeUnit.SECONDS, 
ccc.getSolrCloudManager().getTimeSource());
-        boolean created = false;
-        while (!waitUntil.hasTimedOut()) {
-          waitUntil.sleep(100);
-          created = 
ccc.getSolrCloudManager().getClusterState().hasCollection(collectionName);
-          if (created) break;
-        }
-        if (!created) {
+        try {
+          newColl =
+              zkStateReader.waitForState(collectionName, 30, TimeUnit.SECONDS, 
Objects::nonNull);
+        } catch (TimeoutException e) {
           throw new SolrException(
               SolrException.ErrorCode.SERVER_ERROR,
-              "Could not fully create collection: " + collectionName);
+              "Could not fully create collection: " + collectionName,
+              e);
         }
 
         // refresh cluster state (value read below comes from Zookeeper watch 
firing following the
         // update done previously, be it by Overseer or by this thread when 
updates are distributed)
         clusterState = ccc.getSolrCloudManager().getClusterState();
-        newColl = clusterState.getCollection(collectionName);

Review Comment:
   Why remove this line?  I think it's supposed to be here.



##########
solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java:
##########
@@ -242,45 +240,15 @@ public static void waitForCollectionToDisappear(
     log.info("Collection has disappeared - collection:{}", collection);
   }
 
-  static void waitForNewLeader(
-      CloudSolrClient cloudClient, String shardName, Replica oldLeader, 
TimeOut timeOut)
+  static void waitForNewLeader(CloudSolrClient cloudClient, String shardName, 
Replica oldLeader)
       throws Exception {
-    log.info("Will wait for a node to become leader for {} secs", 
timeOut.timeLeft(SECONDS));
+    log.info("Will wait for a node to become leader for 15 secs");
     ZkStateReader zkStateReader = ZkStateReader.from(cloudClient);
-    zkStateReader.forceUpdateCollection(DEFAULT_COLLECTION);
-
-    for (; ; ) {
-      ClusterState clusterState = zkStateReader.getClusterState();
-      DocCollection coll = clusterState.getCollection("collection1");
-      Slice slice = coll.getSlice(shardName);
-      if (slice.getLeader() != null
-          && !slice.getLeader().equals(oldLeader)
-          && slice.getLeader().getState() == Replica.State.ACTIVE) {
-        if (log.isInfoEnabled()) {
-          log.info(
-              "Old leader {}, new leader {}. New leader got elected in {} ms",
-              oldLeader,
-              slice.getLeader(),
-              timeOut.timeElapsed(MILLISECONDS));
-        }
-        break;
-      }
-
-      if (timeOut.hasTimedOut()) {

Review Comment:
   The loss of the diagnostics seems unfortunate; we could retain them if we 
time out.



##########
solr/core/src/java/org/apache/solr/cloud/api/collections/ReindexCollectionCmd.java:
##########
@@ -360,22 +362,17 @@ public void call(ClusterState clusterState, ZkNodeProps 
message, NamedList<Objec
       CollectionHandlingUtils.checkResults(
           "creating checkpoint collection " + chkCollection, cmdResults, true);
       // wait for a while until we see both collections
-      TimeOut waitUntil =
-          new TimeOut(30, TimeUnit.SECONDS, 
ccc.getSolrCloudManager().getTimeSource());
-      boolean created = false;
-      while (!waitUntil.hasTimedOut()) {
-        waitUntil.sleep(100);
-        // this also refreshes our local var clusterState
-        clusterState = ccc.getSolrCloudManager().getClusterState();
-        created =
-            clusterState.hasCollection(targetCollection)
-                && clusterState.hasCollection(chkCollection);
-        if (created) break;
-      }
-      if (!created) {
+      try {
+        for (String col : List.of(targetCollection, chkCollection)) {
+          ccc.getZkStateReader().waitForState(col, 30, TimeUnit.SECONDS, 
Objects::nonNull);

Review Comment:
   so much nicer :-)



##########
solr/modules/hdfs/src/test/org/apache/solr/hdfs/cloud/SharedFileSystemAutoReplicaFailoverTest.java:
##########
@@ -31,6 +31,7 @@
 import java.util.concurrent.Future;
 import java.util.concurrent.SynchronousQueue;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;

Review Comment:
   autoAddReplica was ported to work without the autoscaling framework; it 
survived that deletion.  I recommend not removing the test right now; it 
deserves a separate discussion.



##########
solr/test-framework/src/java/org/apache/solr/cloud/AbstractDistribZkTestBase.java:
##########
@@ -242,45 +240,15 @@ public static void waitForCollectionToDisappear(
     log.info("Collection has disappeared - collection:{}", collection);
   }
 
-  static void waitForNewLeader(
-      CloudSolrClient cloudClient, String shardName, Replica oldLeader, 
TimeOut timeOut)
+  static void waitForNewLeader(CloudSolrClient cloudClient, String shardName, 
Replica oldLeader)
       throws Exception {
-    log.info("Will wait for a node to become leader for {} secs", 
timeOut.timeLeft(SECONDS));
+    log.info("Will wait for a node to become leader for 15 secs");
     ZkStateReader zkStateReader = ZkStateReader.from(cloudClient);
-    zkStateReader.forceUpdateCollection(DEFAULT_COLLECTION);
-
-    for (; ; ) {
-      ClusterState clusterState = zkStateReader.getClusterState();
-      DocCollection coll = clusterState.getCollection("collection1");
-      Slice slice = coll.getSlice(shardName);
-      if (slice.getLeader() != null
-          && !slice.getLeader().equals(oldLeader)
-          && slice.getLeader().getState() == Replica.State.ACTIVE) {
-        if (log.isInfoEnabled()) {
-          log.info(
-              "Old leader {}, new leader {}. New leader got elected in {} ms",
-              oldLeader,
-              slice.getLeader(),
-              timeOut.timeElapsed(MILLISECONDS));
-        }
-        break;
-      }
-
-      if (timeOut.hasTimedOut()) {
-        Diagnostics.logThreadDumps("Could not find new leader in specified 
timeout");
-        zkStateReader.getZkClient().printLayoutToStream(System.out);
-        fail(
-            "Could not find new leader even after waiting for "
-                + timeOut.timeElapsed(MILLISECONDS)
-                + "ms");
-      }
-
-      Thread.sleep(100);
-    }
 
+    long start = System.nanoTime();

Review Comment:
   I suggest naming this var "startNs" so its name conveys its units.



##########
solr/test-framework/src/java/org/apache/solr/cloud/AbstractFullDistribZkTestBase.java:
##########
@@ -2217,83 +2218,60 @@ public static SolrInputDocument getDoc(Object... 
fields) throws Exception {
     return sdoc(fields);
   }
 
-  private String checkCollectionExpectations(
-      String collectionName,
-      List<Integer> numShardsNumReplicaList,
-      List<String> nodesAllowedToRunShards)
-      throws IOException {
-    getCommonCloudSolrClient();
-    ClusterState clusterState = cloudClient.getClusterState();
-    int expectedSlices = numShardsNumReplicaList.get(0);
-    // The Math.min thing is here, because we expect replication-factor to be 
reduced to if there
-    // are not enough live nodes to spread all shards of a collection over 
different nodes
-    int expectedShardsPerSlice = numShardsNumReplicaList.get(1);
-    int expectedTotalShards = expectedSlices * expectedShardsPerSlice;
-
-    //      Map<String,DocCollection> collections = clusterState
-    //          .getCollectionStates();
-    if (clusterState.hasCollection(collectionName)) {
-      Map<String, Slice> slices = 
clusterState.getCollection(collectionName).getSlicesMap();
-      // did we find expectedSlices slices/shards?
-      if (slices.size() != expectedSlices) {
-        return "Found new collection "
-            + collectionName
-            + ", but mismatch on number of slices. Expected: "
-            + expectedSlices
-            + ", actual: "
-            + slices.size();
-      }
-      int totalShards = 0;
-      for (String sliceName : slices.keySet()) {
-        for (Replica replica : slices.get(sliceName).getReplicas()) {
-          if (nodesAllowedToRunShards != null
-              && 
!nodesAllowedToRunShards.contains(replica.getStr(ZkStateReader.NODE_NAME_PROP)))
 {
-            return "Shard "
-                + replica.getName()
-                + " created on node "
-                + replica.getNodeName()
-                + " not allowed to run shards for the created collection "
-                + collectionName;
-          }
-        }
-        totalShards += slices.get(sliceName).getReplicas().size();
-      }
-      if (totalShards != expectedTotalShards) {
-        return "Found new collection "
-            + collectionName
-            + " with correct number of slices, but mismatch on number of 
shards. Expected: "
-            + expectedTotalShards
-            + ", actual: "
-            + totalShards;
-      }
-      return null;
-    } else {
-      return "Could not find new collection " + collectionName;
-    }
-  }
-
-  protected void checkForCollection(
-      String collectionName,
-      List<Integer> numShardsNumReplicaList,
-      List<String> nodesAllowedToRunShards)
+  protected void checkForCollection(String collectionName, List<Integer> 
numShardsNumReplicaList)
       throws Exception {
     // check for an expectedSlices new collection - we poll the state
-    final TimeOut timeout = new TimeOut(120, TimeUnit.SECONDS, 
TimeSource.NANO_TIME);
-    boolean success = false;
-    String checkResult = "Didnt get to perform a single check";
-    while (!timeout.hasTimedOut()) {
-      checkResult =
-          checkCollectionExpectations(
-              collectionName, numShardsNumReplicaList, 
nodesAllowedToRunShards);
-      if (checkResult == null) {
-        success = true;
-        break;
-      }
-      Thread.sleep(500);
-    }
-    if (!success) {
-      super.printLayout();
-      fail(checkResult);
+    ZkStateReader reader = ZkStateReader.from(cloudClient);
+
+    AtomicReference<String> message = new AtomicReference<>();
+    try {
+      reader.waitForState(
+          collectionName,
+          120,
+          TimeUnit.SECONDS,
+          c -> {
+            int expectedSlices = numShardsNumReplicaList.get(0);
+            // The Math.min thing is here, because we expect 
replication-factor to be reduced to if
+            // there

Review Comment:
   Please reflow this comment so the sentence isn't split awkwardly across lines.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to