This is an automated email from the ASF dual-hosted git repository.

noble pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/main by this push:
     new 6988eec5c8a Shard split when NRT and pull replica & prs enabled leads 
to inactive replica (#1350)
6988eec5c8a is described below

commit 6988eec5c8a2d9736cf0a1cc2c8800ea4bcdc2ac
Author: Hitesh Khamesra <[email protected]>
AuthorDate: Sun Feb 19 03:06:30 2023 -0800

    Shard split when NRT and pull replica & prs enabled leads to inactive 
replica (#1350)
---
 solr/CHANGES.txt                                   |  2 +
 .../apache/solr/cloud/overseer/ReplicaMutator.java | 11 ---
 .../apache/solr/cloud/overseer/ZkStateWriter.java  |  8 --
 .../apache/solr/cloud/overseer/ZkWriteCommand.java | 10 ++-
 .../solr/cloud/SplitShardWithNodeRoleTest.java     | 98 ++++++++++++++++++++++
 .../solr/cloud/overseer/ZkStateReaderTest.java     | 12 +--
 .../solr/common/cloud/PerReplicaStatesOps.java     | 26 ------
 .../cloud/PerReplicaStatesIntegrationTest.java     | 10 +--
 8 files changed, 119 insertions(+), 58 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index a879006bd0c..ee5e0737055 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -213,6 +213,8 @@ Bug Fixes
 
 * SOLR-9698: Fix start/stop wait time and RMI_PORT on Windows (Colvin Cowie)
 
+SOLR-16653: Shard split on PRS collections with NRT + PULL replicas lead to 
down replicas (Hitesh Khamesra via noble)
+
 Build
 ---------------------
 * Upgrade forbiddenapis to 3.4 (Uwe Schindler)
diff --git 
a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java 
b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
index 9f5fd99a2dd..b030346777f 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ReplicaMutator.java
@@ -29,7 +29,6 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.NoSuchElementException;
-import java.util.Objects;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import org.apache.commons.lang3.StringUtils;
@@ -438,19 +437,9 @@ public class ReplicaMutator {
 
     DocCollection newCollection = 
CollectionMutator.updateSlice(collectionName, collection, slice);
     log.debug("Collection is now: {}", newCollection);
-    if (collection.isPerReplicaState() && oldReplica != null) {
-      if (!isAnyPropertyChanged(replica, oldReplica)) return 
ZkWriteCommand.NO_OP;
-    }
     return new ZkWriteCommand(collectionName, newCollection);
   }
 
-  private boolean isAnyPropertyChanged(Replica replica, Replica oldReplica) {
-    if (!Objects.equals(replica.getBaseUrl(), oldReplica.getBaseUrl())) return 
true;
-    if (!Objects.equals(replica.getCoreName(), oldReplica.getCoreName())) 
return true;
-    if (!Objects.equals(replica.getNodeName(), oldReplica.getNodeName())) 
return true;
-    return false;
-  }
-
   private DocCollection checkAndCompleteShardSplit(
       ClusterState prevState,
       DocCollection collection,
diff --git 
a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java 
b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
index 9cd2089e881..b3ad7bf61fb 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkStateWriter.java
@@ -31,7 +31,6 @@ import org.apache.solr.cloud.Stats;
 import org.apache.solr.common.cloud.ClusterState;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.PerReplicaStatesFetcher;
-import org.apache.solr.common.cloud.PerReplicaStatesOps;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.util.Compressor;
 import org.apache.solr.common.util.Utils;
@@ -318,14 +317,7 @@ public class ZkStateWriter {
             }
           }
 
-          // When dealing with a per replica collection that did not do any 
update to the per
-          // replica states znodes but did update state.json, we add then 
remove a dummy node to
-          // change the cversion of the parent znode. This is not needed by 
Solr, there's no code
-          // watching the children and not watching the state.json node 
itself. It would be useful
-          // for external code watching the collection's Zookeeper state.json 
node children but not
-          // the node itself.
           if (cmd.ops == null && cmd.isPerReplicaStateCollection) {
-            PerReplicaStatesOps.touchChildren().persist(path, 
reader.getZkClient());
             DocCollection currentCollState = 
clusterState.getCollection(cmd.name);
             if (currentCollState != null) {
               clusterState =
diff --git 
a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkWriteCommand.java 
b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkWriteCommand.java
index f2d5466f760..6275b885325 100644
--- a/solr/core/src/java/org/apache/solr/cloud/overseer/ZkWriteCommand.java
+++ b/solr/core/src/java/org/apache/solr/cloud/overseer/ZkWriteCommand.java
@@ -50,6 +50,14 @@ public class ZkWriteCommand {
 
   @Override
   public String toString() {
-    return getClass().getSimpleName() + ": " + (this == NO_OP ? "no-op" : name 
+ "=" + collection);
+    return getClass().getSimpleName()
+        + ": "
+        + (this == NO_OP ? "no-op" : name + "=" + this.name)
+        + " pjs: "
+        + this.persistJsonState
+        + ", prs: "
+        + this.isPerReplicaStateCollection
+        + " , ops:"
+        + this.ops;
   }
 }
diff --git 
a/solr/core/src/test/org/apache/solr/cloud/SplitShardWithNodeRoleTest.java 
b/solr/core/src/test/org/apache/solr/cloud/SplitShardWithNodeRoleTest.java
new file mode 100644
index 00000000000..7e7074a1df6
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/cloud/SplitShardWithNodeRoleTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import java.lang.invoke.MethodHandles;
+import java.util.HashSet;
+import java.util.Set;
+import org.apache.solr.client.solrj.impl.CloudSolrClient;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.core.NodeRoles;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class SplitShardWithNodeRoleTest extends SolrCloudTestCase {
+  private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    configureCluster(2).addConfig("conf", 
configset("cloud-minimal")).configure();
+    System.setProperty(NodeRoles.NODE_ROLES_PROP, "data:off,coordinator:on");
+
+    try {
+      cluster.startJettySolrRunner();
+      cluster.startJettySolrRunner();
+    } finally {
+      System.clearProperty(NodeRoles.NODE_ROLES_PROP);
+    }
+
+    Set<String> overseerNodes = new HashSet<>();
+    System.setProperty(NodeRoles.NODE_ROLES_PROP, 
"data:off,overseer:preferred");
+    try {
+      overseerNodes.add(cluster.startJettySolrRunner().getNodeName());
+      overseerNodes.add(cluster.startJettySolrRunner().getNodeName());
+    } finally {
+      System.clearProperty(NodeRoles.NODE_ROLES_PROP);
+    }
+    OverseerRolesTest.waitForNewOverseer(10, overseerNodes::contains, false);
+  }
+
+  @Test
+  public void testSolrClusterWithNodeRoleWithSingleReplica() throws Exception {
+    doSplit("coll_NO_HA", 1, 1, 0);
+  }
+
+  @Test
+  public void testSolrClusterWithNodeRoleWithHA() throws Exception {
+    doSplit("coll_HA", 1, 1, 1);
+  }
+
+  public void doSplit(String collName, int shard, int nrtReplica, int 
pullReplica)
+      throws Exception {
+    CloudSolrClient client = cluster.getSolrClient();
+    CollectionAdminRequest.createCollection(collName, "conf", shard, 
nrtReplica, 0, pullReplica)
+        .setPerReplicaState(true)
+        .process(cluster.getSolrClient());
+    cluster.waitForActiveCollection(collName, shard, nrtReplica + pullReplica);
+    UpdateRequest ur = new UpdateRequest();
+    for (int i = 0; i < 10; i++) {
+      SolrInputDocument doc2 = new SolrInputDocument();
+      doc2.addField("id", "" + i);
+      ur.add(doc2);
+    }
+
+    ur.commit(client, collName);
+
+    CollectionAdminRequest.SplitShard splitShard =
+        CollectionAdminRequest.splitShard(collName).setShardName("shard1");
+    splitShard.process(cluster.getSolrClient());
+    waitForState(
+        "Timed out waiting for sub shards to be active. Number of active 
shards="
+            + cluster
+                .getSolrClient()
+                .getClusterState()
+                .getCollection(collName)
+                .getActiveSlices()
+                .size(),
+        collName,
+        activeClusterShape(shard + 1, 3 * (nrtReplica + pullReplica)));
+  }
+}
diff --git 
a/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateReaderTest.java 
b/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateReaderTest.java
index 38886c5e35f..b2b98fa1ea4 100644
--- a/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateReaderTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/overseer/ZkStateReaderTest.java
@@ -287,8 +287,8 @@ public class ZkStateReaderTest extends SolrTestCaseJ4 {
     ClusterState.CollectionRef ref = 
reader.getClusterState().getCollectionRef("c1");
     assertFalse(ref.isLazilyLoaded());
     assertEquals(0, ref.get().getZNodeVersion());
-    // dummy node created +1 and deleted +1 so 2
-    assertEquals(2, ref.get().getChildNodesVersion());
+    // no more dummy node
+    assertEquals(0, ref.get().getChildNodesVersion());
 
     DocCollection collection = ref.get();
     PerReplicaStates prs =
@@ -307,7 +307,7 @@ public class ZkStateReaderTest extends SolrTestCaseJ4 {
 
     ref = reader.getClusterState().getCollectionRef("c1");
     assertEquals(0, ref.get().getZNodeVersion()); // no change in Znode version
-    assertEquals(3, ref.get().getChildNodesVersion()); // but child version 
should be 1 now
+    assertEquals(1, ref.get().getChildNodesVersion()); // but child version 
should be 1 now
 
     prs = ref.get().getPerReplicaStates();
     PerReplicaStatesOps.flipState("r1", Replica.State.ACTIVE, prs)
@@ -321,7 +321,7 @@ public class ZkStateReaderTest extends SolrTestCaseJ4 {
     ref = reader.getClusterState().getCollectionRef("c1");
     assertEquals(0, ref.get().getZNodeVersion()); // no change in Znode version
     // but child version should be 3 now (1 del + 1 add)
-    assertEquals(5, ref.get().getChildNodesVersion());
+    assertEquals(3, ref.get().getChildNodesVersion());
 
     // now delete the collection
     wc = new ZkWriteCommand("c1", null);
@@ -346,7 +346,7 @@ public class ZkStateReaderTest extends SolrTestCaseJ4 {
     ref = reader.getClusterState().getCollectionRef("c1");
     assertFalse(ref.isLazilyLoaded());
     assertEquals(0, ref.get().getZNodeVersion());
-    assertEquals(2, ref.get().getChildNodesVersion()); // child node version 
is reset
+    assertEquals(0, ref.get().getChildNodesVersion()); // child node version 
is reset
 
     // re-add PRS
     collection = ref.get();
@@ -367,7 +367,7 @@ public class ZkStateReaderTest extends SolrTestCaseJ4 {
     ref = reader.getClusterState().getCollectionRef("c1");
 
     // child version should be reset since the state.json node was deleted and 
re-created
-    assertEquals(3, ref.get().getChildNodesVersion());
+    assertEquals(1, ref.get().getChildNodesVersion());
   }
 
   public void testForciblyRefreshAllClusterState() throws Exception {
diff --git 
a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java
 
b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java
index a21d3be9387..e85646d7207 100644
--- 
a/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java
+++ 
b/solr/solrj-zookeeper/src/java/org/apache/solr/common/cloud/PerReplicaStatesOps.java
@@ -29,7 +29,6 @@ import java.util.function.Function;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.Op;
-import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -299,31 +298,6 @@ public class PerReplicaStatesOps {
         .init(rs);
   }
 
-  /**
-   * Just creates and deletes a dummy entry so that the {@link 
Stat#getCversion()} of state.json is
-   * updated
-   */
-  public static PerReplicaStatesOps touchChildren() {
-    PerReplicaStatesOps result =
-        new PerReplicaStatesOps(
-            prs -> {
-              List<PerReplicaStates.Operation> operations = new ArrayList<>(2);
-              PerReplicaStates.State st =
-                  new PerReplicaStates.State(
-                      ".dummy." + System.nanoTime(), Replica.State.DOWN, 
Boolean.FALSE, 0);
-              operations.add(
-                  new 
PerReplicaStates.Operation(PerReplicaStates.Operation.Type.ADD, st));
-              operations.add(
-                  new 
PerReplicaStates.Operation(PerReplicaStates.Operation.Type.DELETE, st));
-              if (log.isDebugEnabled()) {
-                log.debug("touchChildren {}", operations);
-              }
-              return operations;
-            });
-    result.ops = result.refresh(null);
-    return result;
-  }
-
   PerReplicaStatesOps init(PerReplicaStates rs) {
     if (rs == null) return null;
     get(rs);
diff --git 
a/solr/solrj/src/test/org/apache/solr/common/cloud/PerReplicaStatesIntegrationTest.java
 
b/solr/solrj/src/test/org/apache/solr/common/cloud/PerReplicaStatesIntegrationTest.java
index be010da984a..bc085e7d0e6 100644
--- 
a/solr/solrj/src/test/org/apache/solr/common/cloud/PerReplicaStatesIntegrationTest.java
+++ 
b/solr/solrj/src/test/org/apache/solr/common/cloud/PerReplicaStatesIntegrationTest.java
@@ -300,7 +300,7 @@ public class PerReplicaStatesIntegrationTest extends 
SolrCloudTestCase {
           .process(cluster.getSolrClient());
       stat = 
cluster.getZkClient().exists(DocCollection.getCollectionPath(PRS_COLL), null, 
true);
       // +1 after all replica are added with on state.json write to 
CreateCollectionCmd.setData()
-      assertEquals(1, stat.getVersion());
+      assertEquals(11, stat.getVersion());
       // For each replica:
       // +1 for ZkController#preRegister, in ZkController#publish, direct 
write PRS to down
       // +2 for runLeaderProcess, flip the replica to leader
@@ -320,8 +320,7 @@ public class PerReplicaStatesIntegrationTest extends 
SolrCloudTestCase {
       // +1 for ZkController#preRegister, in ZkController#publish, direct 
write PRS to down
       // +2 for RecoveryStrategy#doRecovery, since this is no longer a new 
collection, new replica
       // will go through recovery, direct write PRS to RECOVERING
-      // +2 for ZkController#register, in ZkController#publish, direct write 
PRS to active
-      assertEquals(57, stat.getCversion());
+      assertEquals(55, stat.getCversion());
 
       String addedCore = 
response.getCollectionCoresStatus().entrySet().iterator().next().getKey();
       Replica addedReplica =
@@ -337,9 +336,8 @@ public class PerReplicaStatesIntegrationTest extends 
SolrCloudTestCase {
       stat = 
cluster.getZkClient().exists(DocCollection.getCollectionPath(PRS_COLL), null, 
true);
       // For replica deletion
       // +1 for ZkController#unregister, which delete the PRS entry from data 
node
-      // +2 for state.json overseer writes, even though there's no longer PRS 
updates from
       // overseer, current code would still do a "TOUCH" on the PRS entry
-      assertEquals(60, stat.getCversion());
+      assertEquals(56, stat.getCversion());
 
       for (JettySolrRunner j : cluster.getJettySolrRunners()) {
         j.stop();
@@ -347,7 +345,7 @@ public class PerReplicaStatesIntegrationTest extends 
SolrCloudTestCase {
         stat = 
cluster.getZkClient().exists(DocCollection.getCollectionPath(PRS_COLL), null, 
true);
         // ensure restart does not update the state.json, after 
addReplica/deleteReplica, 2 more
         // updates hence at version 3 on state.json version
-        assertEquals(3, stat.getVersion());
+        assertEquals(14, stat.getVersion());
       }
     } finally {
       cluster.shutdown();

Reply via email to