This is an automated email from the ASF dual-hosted git repository.
dsmiley pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/main by this push:
new e377ebbdaff SOLR-11191: SolrIndexSplitter: Routing child docs by
_root_ when available (#2799)
e377ebbdaff is described below
commit e377ebbdaff5ae784b3f4e4315afa4d6705d1d71
Author: Zack Kendall <[email protected]>
AuthorDate: Tue Oct 29 18:29:41 2024 -0700
SOLR-11191: SolrIndexSplitter: Routing child docs by _root_ when available
(#2799)
Splitting shards now supports child / nested docs. Works based on
splitting based on the _root_ field.
Co-authored-by: zkendall <[email protected]>
---
solr/CHANGES.txt | 2 +
.../org/apache/solr/update/SolrIndexSplitter.java | 15 ++-
.../solr/configsets/cloud-minimal/conf/schema.xml | 2 +
.../test/org/apache/solr/cloud/SplitShardTest.java | 71 ++++++++---
.../apache/solr/update/SolrIndexSplitterTest.java | 131 ++++++++++++++++++++-
.../pages/collection-management.adoc | 1 +
.../modules/deployment-guide/pages/node-roles.adoc | 2 +-
.../pages/solrcloud-shards-indexing.adoc | 2 +-
.../pages/indexing-nested-documents.adoc | 6 +-
9 files changed, 208 insertions(+), 24 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index ac00cf68152..db44d16aa74 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -189,6 +189,8 @@ Bug Fixes
* SOLR-17413: Fixed UpdateLog replay bug that shared thread-unsafe
SolrQueryRequest objects across threads (Jason Gerlowski, David Smiley, Houston
Putman)
+* SOLR-11191: Splitting shards now routes child-docs with their _root_ field
when available so they maintain parent relationship. (Zack Kendall)
+
Dependency Upgrades
---------------------
(No changes)
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
index 61ba505ddd5..fa12d7b51d4 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
@@ -69,6 +69,7 @@ import org.apache.solr.core.DirectoryFactory;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.IndexFetcher;
import org.apache.solr.handler.SnapShooter;
+import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.BitsFilteredPostingsEnum;
import org.apache.solr.search.SolrIndexSearcher;
@@ -125,17 +126,29 @@ public class SolrIndexSplitter {
numPieces = cmd.ranges.size();
rangesArr = cmd.ranges.toArray(new DocRouter.Range[0]);
}
+
if (cmd.routeFieldName == null) {
- field = searcher.getSchema().getUniqueKeyField();
+ // To support routing child documents, use the root field if it exists
(which would be
+ // populated with unique field), otherwise use the unique key field
+ if (searcher.getSchema().isUsableForChildDocs()) {
+ field = searcher.getSchema().getField(IndexSchema.ROOT_FIELD_NAME);
+ } else {
+ field = searcher.getSchema().getUniqueKeyField();
+ }
} else {
+ // Custom routing
+ // If child docs are used, users must ensure that the whole nested
document tree has a
+ // consistent routeField value
field = searcher.getSchema().getField(cmd.routeFieldName);
}
+
if (cmd.splitKey == null) {
splitKey = null;
} else {
checkRouterSupportsSplitKey(hashRouter, cmd.splitKey);
splitKey = ((CompositeIdRouter)
hashRouter).getRouteKeyNoSuffix(cmd.splitKey);
}
+
if (cmd.cores == null) {
this.splitMethod = SplitMethod.REWRITE;
} else {
diff --git
a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
index fc23706512e..3c25f27405a 100644
--- a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
+++ b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
@@ -19,10 +19,12 @@
<fieldType name="string" class="solr.StrField"/>
<fieldType name="int" class="${solr.tests.IntegerFieldType}"
docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true"
positionIncrementGap="0" uninvertible="true"/>
<fieldType name="long" class="${solr.tests.LongFieldType}"
docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true"
positionIncrementGap="0" uninvertible="true"/>
+ <fieldType name="_nest_path_" class="solr.NestPathField"/>
<dynamicField name="*" type="string" indexed="true" stored="true"/>
<!-- for versioning -->
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="_root_" type="string" indexed="true" stored="true"
multiValued="false" required="false"/>
+ <field name="_nest_path_" type="_nest_path_"/>`
<field name="id" type="string" indexed="true" stored="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<uniqueKey>id</uniqueKey>
diff --git a/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
b/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
index 132dfcfaaa2..ab194b291da 100644
--- a/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
@@ -21,6 +21,7 @@ import static org.hamcrest.core.StringContains.containsString;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -35,11 +36,14 @@ import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
@@ -88,16 +92,7 @@ public class SplitShardTest extends SolrCloudTestCase {
.setNumSubShards(5)
.setShardName("shard1");
splitShard.process(cluster.getSolrClient());
- waitForState(
- "Timed out waiting for sub shards to be active. Number of active
shards="
- + cluster
- .getSolrClient()
- .getClusterState()
- .getCollection(COLLECTION_NAME)
- .getActiveSlices()
- .size(),
- COLLECTION_NAME,
- activeClusterShape(6, 6));
+ waitForState("Waiting for 6 shards after split", COLLECTION_NAME,
activeClusterShape(6, 6));
try {
splitShard =
@@ -153,15 +148,7 @@ public class SplitShardTest extends SolrCloudTestCase {
CollectionAdminRequest.splitShard(collectionName).setSplitFuzz(0.5f).setShardName("shard1");
splitShard.process(cluster.getSolrClient());
waitForState(
- "Timed out waiting for sub shards to be active. Number of active
shards="
- + cluster
- .getSolrClient()
- .getClusterState()
- .getCollection(collectionName)
- .getActiveSlices()
- .size(),
- collectionName,
- activeClusterShape(3, 3));
+ "Waiting for 3 active shards after split", collectionName,
activeClusterShape(3, 3));
DocCollection coll =
cluster.getSolrClient().getClusterState().getCollection(collectionName);
Slice s1_0 = coll.getSlice("shard1_0");
Slice s1_1 = coll.getSlice("shard1_1");
@@ -174,6 +161,52 @@ public class SplitShardTest extends SolrCloudTestCase {
assertEquals("wrong range in s1_1", expected1, delta1);
}
+ @Test
+ public void testWithChildDocuments() throws Exception {
+ CloudSolrClient solrClient = cluster.getSolrClient();
+ CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 1,
1).process(solrClient);
+
+ cluster.waitForActiveCollection(COLLECTION_NAME, 1, 1);
+
+ List<SolrInputDocument> inputDocs = new ArrayList<>();
+ for (int idx = 0; idx < 20; ) { // 10 parents + 10 children
+ SolrInputDocument parent = new SolrInputDocument();
+ parent.addField("id", ++idx);
+ parent.addField("type_s", "parent");
+
+ // Child has different ID, so could be routed differently if used for
routing.
+ SolrInputDocument child = new SolrInputDocument();
+ child.addField("id", ++idx);
+ child.addField("expected_parent_s", parent.getField("id"));
+
+ parent.addField("myChild", List.of(child));
+
+ inputDocs.add(parent);
+ }
+ solrClient.add(COLLECTION_NAME, inputDocs);
+
+ CollectionAdminRequest.SplitShard splitShard =
+ CollectionAdminRequest.splitShard(COLLECTION_NAME)
+ .setNumSubShards(2)
+ .setShardName("shard1");
+ splitShard.setWaitForFinalState(true);
+ splitShard.process(solrClient);
+ waitForState(
+ "Waiting for 2 active shards after split", COLLECTION_NAME,
activeClusterShape(2, 2));
+
+ QueryRequest req =
+ new QueryRequest(new SolrQuery("type_s:parent").setFields("*",
"[child]", "[shard]"));
+ QueryResponse rsp = req.process(solrClient, COLLECTION_NAME);
+ assertEquals(10, rsp.getResults().getNumFound());
+ for (SolrDocument doc : rsp.getResults()) {
+ @SuppressWarnings("unchecked")
+ List<SolrDocument> children = (List<SolrDocument>) doc.get("myChild");
+ assertEquals("only 1 child per parent", 1, children.size());
+ SolrDocument child = children.get(0);
+ assertEquals("parent-child match", doc.get("id"),
child.get("expected_parent_s"));
+ }
+ }
+
private CloudSolrClient createCollection(String collectionName, int
repFactor) throws Exception {
CollectionAdminRequest.createCollection(collectionName, "conf", 1,
repFactor)
diff --git
a/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
b/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
index 2796233ea0b..78906e9b8e0 100644
--- a/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
+++ b/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
@@ -28,6 +28,7 @@ import org.apache.lucene.store.Directory;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.CompositeIdRouter;
import org.apache.solr.common.cloud.DocRouter;
import org.apache.solr.common.cloud.PlainIdRouter;
@@ -485,19 +486,147 @@ public class SolrIndexSplitterTest extends
SolrTestCaseJ4 {
}
}
+ @Test
+ public void testSplitWithChildDocs() throws Exception {
+ doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod.REWRITE);
+ }
+
+ @Test
+ public void testSplitWithChildDocsLink() throws Exception {
+ doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod.LINK);
+ }
+
+ public void doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod
splitMethod) throws Exception {
+ // Overall test/split pattern copied from doTestSplitByCores
+ File indexDir = createTempDir().toFile();
+
+ CompositeIdRouter r1 = new CompositeIdRouter();
+ String routeKeyBase = "sea-line!";
+
+ List<DocRouter.Range> twoRanges = r1.partitionRange(2,
r1.keyHashRange(routeKeyBase));
+
+ // Insert other doc for contrast
+ String otherDocId = routeKeyBase + "6"; // Will hash into first range
+ assertU(adoc("id", otherDocId));
+
+ // Insert child doc
+ String parentId = routeKeyBase + "1"; // Will hash into second range
+ String childId = routeKeyBase + "5"; // Will hash into first range
+ SolrInputDocument doc =
+ sdoc("id", parentId, "myChild", sdocs(sdoc("id", childId, "child_s",
"child")));
+ assertU(adoc(doc));
+
+ assertU(commit());
+ assertJQ(req("q", "*:*"), "/response/numFound==3");
+
+ // Able to query child.
+ assertJQ(
+ req("q", "id:" + parentId, "fl", "*, [child]"),
+ "/response/numFound==1",
+ "/response/docs/[0]/myChild/[0]/id=='" + childId + "'",
+ "/response/docs/[0]/myChild/[0]/_root_=='"
+ + parentId
+ + "'" // Child has parent root to route with
+ );
+
+ SolrCore core1 = null, core2 = null;
+ try {
+ core1 =
+ h.getCoreContainer()
+ .create(
+ "split1",
+ Map.of("dataDir", indexDir1.getAbsolutePath(), "configSet",
"cloud-minimal"));
+ core2 =
+ h.getCoreContainer()
+ .create(
+ "split2",
+ Map.of("dataDir", indexDir2.getAbsolutePath(), "configSet",
"cloud-minimal"));
+
+ LocalSolrQueryRequest request = null;
+ try {
+ request = lrf.makeRequest("q", "dummy");
+ SolrQueryResponse rsp = new SolrQueryResponse();
+ SplitIndexCommand command =
+ new SplitIndexCommand(
+ request,
+ rsp,
+ null,
+ List.of(core1, core2),
+ twoRanges,
+ new CompositeIdRouter(),
+ null,
+ null,
+ splitMethod);
+ doSplit(command);
+ } finally {
+ if (request != null) request.close();
+ }
+ @SuppressWarnings("resource")
+ final EmbeddedSolrServer server1 = new
EmbeddedSolrServer(h.getCoreContainer(), "split1");
+ @SuppressWarnings("resource")
+ final EmbeddedSolrServer server2 = new
EmbeddedSolrServer(h.getCoreContainer(), "split2");
+ server1.commit(true, true);
+ server2.commit(true, true);
+ assertEquals(
+ "should be 2 docs in index2",
+ 2,
+ server2.query(new SolrQuery("*:*")).getResults().getNumFound());
+ assertEquals(
+ "parent doc should be in index2",
+ 1,
+ server2.query(new SolrQuery("id:" +
parentId)).getResults().getNumFound());
+ assertEquals(
+ "child doc should be in index2",
+ 1,
+ server2.query(new SolrQuery("id:" +
childId)).getResults().getNumFound());
+ assertEquals(
+ "other doc should be in index1",
+ 1,
+ server1.query(new SolrQuery("id:" +
otherDocId)).getResults().getNumFound());
+ } finally {
+ h.getCoreContainer().unload("split2");
+ h.getCoreContainer().unload("split1");
+ }
+ }
+
+ /** Utility method to find Ids that hash into which ranges. Uncomment @Test
to print. */
+ // @Test
+ public void printCompositeHashSandbox() {
+ CompositeIdRouter r1 = new CompositeIdRouter();
+ String routeBase = "sea-line!";
+ DocRouter.Range routeBaseRange = r1.keyHashRange(routeBase);
+ List<DocRouter.Range> twoRanges = r1.partitionRange(2, routeBaseRange);
+ System.out.println("splitKeyRange = " + twoRanges);
+
+ // Hash some values and print which range they fall into
+ for (int i = 0; i < 10; i++) {
+ String key = routeBase + i;
+ int hash = r1.sliceHash(key, null, null, null);
+ boolean inA = twoRanges.get(0).includes(hash);
+ boolean inB = twoRanges.get(1).includes(hash);
+
+ // Print which range the key is in
+ System.out.println(key + " in " + (inA ? twoRanges.get(0) :
twoRanges.get(1)));
+ }
+ }
+
+ /** Creates a range encompassing the two ids, then splits it in two. Uses
PlainIdRouter */
private List<DocRouter.Range> getRanges(String id1, String id2) {
// find minHash/maxHash hash ranges
byte[] bytes = id1.getBytes(StandardCharsets.UTF_8);
int minHash = Hash.murmurhash3_x86_32(bytes, 0, bytes.length, 0);
bytes = id2.getBytes(StandardCharsets.UTF_8);
int maxHash = Hash.murmurhash3_x86_32(bytes, 0, bytes.length, 0);
-
if (minHash > maxHash) {
int temp = maxHash;
maxHash = minHash;
minHash = temp;
}
+ if (maxHash - minHash < 2) {
+ throw new RuntimeException("The range is too small to split");
+ }
+
PlainIdRouter router = new PlainIdRouter();
DocRouter.Range fullRange = new DocRouter.Range(minHash, maxHash);
return router.partitionRange(2, fullRange);
diff --git
a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
index f1906e455f7..f4811158ef7 100644
---
a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
+++
b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
@@ -228,6 +228,7 @@ When such a collection is deleted, its autocreated
configset will be deleted by
+
If this parameter is specified, the router will look at the value of the field
in an input document to compute the hash and identify a shard instead of
looking at the `uniqueKey` field.
If the field specified is null in the document, the document will be rejected.
+For nested documents, the route field must match among all the documents in
the hierarchy.
+
Please note that xref:configuration-guide:realtime-get.adoc[] or retrieval by
document ID would also require the parameter `\_route_` (or `shard.keys`) to
avoid a distributed search.
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
b/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
index a261d23e70c..07dab503bdd 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
@@ -73,7 +73,7 @@ A node with this role can perform duties of an overseer node
(unless mode is `di
A node with this role can act as if it has replicas of all collections in the
cluster when a query is performed. The workflow is as follows
-If the cluster has collections with very large no:of shards, performing
distributed requests in your _`data node`_ will lead to
+If the cluster has collections with very large no:of shards, performing
distributed requests in your _data node_ will lead to
* large heap utilization
* frequent GC pauses
diff --git
a/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
b/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
index 21b4027fdfa..d01ce96f3b9 100644
---
a/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
+++
b/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
@@ -138,7 +138,7 @@ At query time, you include the prefix(es) along with the
number of bits into you
If you do not want to influence how documents are stored, you don't need to
specify a prefix in your document ID.
If you created the collection and defined the "implicit" router at the time of
creation, you can additionally define a `router.field` parameter to use a field
from each document to identify a shard where the document belongs.
-If the field specified is missing in the document, however, the document will
be rejected.
+If the field specified is missing in the document, then the document will be
rejected.
You could also use the `\_route_` parameter to name a specific shard.
== Shard Splitting
diff --git
a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
index 939f8f2adc5..fedb60d7e3e 100644
---
a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
+++
b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
@@ -283,7 +283,7 @@ This makes it much easier to apply
xref:partial-document-updates.adoc#updating-child-documents[atomic updates] to
individual child documents.
====
-== Maintaining Integrity with Updates and Deletes
+== Maintaining Integrity with Updates, Deletes, and Shard Splits
Nested document trees can be modified with
xref:partial-document-updates.adoc#atomic-updates[atomic updates] to
@@ -304,6 +304,10 @@ Instead, use delete-by-query (most efficient) or
xref:partial-document-updates.a
If you use Solr's delete-by-query APIs, you *MUST* be careful to ensure that
any deletion query is structured to ensure no descendent children remain of any
documents that are being deleted.
*_Doing otherwise will violate integrity assumptions that Solr expects._*
+When xref:deployment-guide:shard-management.adoc#splitshard[splitting shards],
child documents get transmitted and routed independently.
+If you use the default `router.field`, this is handled correctly for you.
+If your collection has a custom `router.field`, then you must ensure that all
documents in a hierarchy have the same value for that field.
+
== Indexing Anonymous Children
Although not recommended, it is also possible to index child documents
"anonymously":