This is an automated email from the ASF dual-hosted git repository.
broustant pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 6d68598cce1 SOLR-16473: Fix race condition in shard split when a
sub-shard is put in recovery state.
6d68598cce1 is described below
commit 6d68598cce17f691f020b4e99b1a9f775bb71b7f
Author: Bruno Roustant <[email protected]>
AuthorDate: Mon Dec 5 18:03:13 2022 +0100
SOLR-16473: Fix race condition in shard split when a sub-shard is put in
recovery state.
Co-authored-by: Andy Vuong <[email protected]>
---
solr/CHANGES.txt | 1 +
.../apache/solr/cloud/api/collections/SplitShardCmd.java | 15 +++++++++++++++
2 files changed, 16 insertions(+)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 01eb61d6a2d..e6310b6ee45 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -113,6 +113,7 @@ Bug Fixes
* SOLR-16165: Rare Deadlock in SlotAcc initialization (Justin Sweeney, noble)
+* SOLR-16473: Fix race condition in shard split when a sub-shard is put in recovery state. (Andy Vuong via Bruno Roustant)
* SOLR-10458: Fix followRedirect property on HttpSolrClient not set when using
Builder pattern. (Eric Pugh)
diff --git
a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
index 087e2bb6970..23473694b59 100644
---
a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
+++
b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java
@@ -39,6 +39,7 @@ import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.solr.client.solrj.cloud.DistribStateManager;
@@ -785,6 +786,20 @@ public class SplitShardCmd implements
CollApiCmds.CollectionApiCommand {
} else {
ccc.offerStateUpdate(m);
}
+ // Wait for the sub-shards to change to the RECOVERY state before creating the replica
+ // cores. Otherwise, there is a race condition and some recovery updates may be lost.
+ zkStateReader.waitForState(
+ collectionName,
+ 60,
+ TimeUnit.SECONDS,
+ (collectionState) -> {
+ for (String subSlice : subSlices) {
+ if
(!collectionState.getSlice(subSlice).getState().equals(Slice.State.RECOVERY)) {
+ return false;
+ }
+ }
+ return true;
+ });
}
t = timings.sub("createCoresForReplicas");