This is an automated email from the ASF dual-hosted git repository. dsmiley pushed a commit to branch branch_9x in repository https://gitbox.apache.org/repos/asf/solr.git
commit fbcf6b3ecb2d20d584fbf6d7290974f9b41c191f Author: Zack Kendall <[email protected]> AuthorDate: Tue Oct 29 21:29:41 2024 -0400 SOLR-11191: SolrIndexSplitter: Routing child docs by _root_ when available (#2799) Splitting shards now supports child / nested docs. Works based on splitting based on the _root_ field. Co-authored-by: zkendall <[email protected]> (cherry picked from commit e377ebbdaff5ae784b3f4e4315afa4d6705d1d71) --- solr/CHANGES.txt | 2 + .../org/apache/solr/update/SolrIndexSplitter.java | 15 ++- .../solr/configsets/cloud-minimal/conf/schema.xml | 2 + .../test/org/apache/solr/cloud/SplitShardTest.java | 72 ++++++++--- .../apache/solr/update/SolrIndexSplitterTest.java | 131 ++++++++++++++++++++- .../pages/collection-management.adoc | 1 + .../modules/deployment-guide/pages/node-roles.adoc | 2 +- .../pages/solrcloud-shards-indexing.adoc | 2 +- .../pages/indexing-nested-documents.adoc | 6 +- 9 files changed, 209 insertions(+), 24 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 7c5e0b99b24..6a5eb63c1a9 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -85,6 +85,8 @@ Bug Fixes * SOLR-17529: Clean up error message in PostTool when you attempt to post to a basic auth secured Solr. (Eric Pugh) +* SOLR-11191: Splitting shards now routes child-docs with their _root_ field when available so they maintain parent relationship. 
(Zack Kendall) + Dependency Upgrades --------------------- (No changes) diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java index 61ba505ddd5..fa12d7b51d4 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java @@ -69,6 +69,7 @@ import org.apache.solr.core.DirectoryFactory; import org.apache.solr.core.SolrCore; import org.apache.solr.handler.IndexFetcher; import org.apache.solr.handler.SnapShooter; +import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.solr.search.BitsFilteredPostingsEnum; import org.apache.solr.search.SolrIndexSearcher; @@ -125,17 +126,29 @@ public class SolrIndexSplitter { numPieces = cmd.ranges.size(); rangesArr = cmd.ranges.toArray(new DocRouter.Range[0]); } + if (cmd.routeFieldName == null) { - field = searcher.getSchema().getUniqueKeyField(); + // To support routing child documents, use the root field if it exists (which would be + // populated with unique field), otherwise use the unique key field + if (searcher.getSchema().isUsableForChildDocs()) { + field = searcher.getSchema().getField(IndexSchema.ROOT_FIELD_NAME); + } else { + field = searcher.getSchema().getUniqueKeyField(); + } } else { + // Custom routing + // If child docs are used, users must ensure that the whole nested document tree has a + // consistent routeField value field = searcher.getSchema().getField(cmd.routeFieldName); } + if (cmd.splitKey == null) { splitKey = null; } else { checkRouterSupportsSplitKey(hashRouter, cmd.splitKey); splitKey = ((CompositeIdRouter) hashRouter).getRouteKeyNoSuffix(cmd.splitKey); } + if (cmd.cores == null) { this.splitMethod = SplitMethod.REWRITE; } else { diff --git a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml 
index fc23706512e..3c25f27405a 100644 --- a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml +++ b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml @@ -19,10 +19,12 @@ <fieldType name="string" class="solr.StrField"/> <fieldType name="int" class="${solr.tests.IntegerFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0" uninvertible="true"/> <fieldType name="long" class="${solr.tests.LongFieldType}" docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" positionIncrementGap="0" uninvertible="true"/> + <fieldType name="_nest_path_" class="solr.NestPathField"/> <dynamicField name="*" type="string" indexed="true" stored="true"/> <!-- for versioning --> <field name="_version_" type="long" indexed="true" stored="true"/> <field name="_root_" type="string" indexed="true" stored="true" multiValued="false" required="false"/> + <field name="_nest_path_" type="_nest_path_"/> <field name="id" type="string" indexed="true" stored="true"/> <dynamicField name="*_s" type="string" indexed="true" stored="true" /> <uniqueKey>id</uniqueKey> diff --git a/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java b/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java index dddc089ce55..e69c9a08529 100644 --- a/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java @@ -21,6 +21,7 @@ import static org.hamcrest.core.StringContains.containsString; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; @@ -36,11 +37,14 @@ import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.impl.BaseHttpSolrClient; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; +import 
org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.client.solrj.request.UpdateRequest; +import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; +import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.DocCollection; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; @@ -89,16 +93,7 @@ public class SplitShardTest extends SolrCloudTestCase { .setNumSubShards(5) .setShardName("shard1"); splitShard.process(cluster.getSolrClient()); - waitForState( - "Timed out waiting for sub shards to be active. Number of active shards=" - + cluster - .getSolrClient() - .getClusterState() - .getCollection(COLLECTION_NAME) - .getActiveSlices() - .size(), - COLLECTION_NAME, - activeClusterShape(6, 7)); + waitForState("Waiting for 6 shards after split", COLLECTION_NAME, activeClusterShape(6, 7)); try { splitShard = @@ -154,15 +149,7 @@ public class SplitShardTest extends SolrCloudTestCase { CollectionAdminRequest.splitShard(collectionName).setSplitFuzz(0.5f).setShardName("shard1"); splitShard.process(cluster.getSolrClient()); waitForState( - "Timed out waiting for sub shards to be active. 
Number of active shards=" - + cluster - .getSolrClient() - .getClusterState() - .getCollection(collectionName) - .getActiveSlices() - .size(), - collectionName, - activeClusterShape(3, 4)); + "Waiting for 3 active shards after split", collectionName, activeClusterShape(3, 4)); DocCollection coll = cluster.getSolrClient().getClusterState().getCollection(collectionName); Slice s1_0 = coll.getSlice("shard1_0"); Slice s1_1 = coll.getSlice("shard1_1"); @@ -175,6 +162,53 @@ public class SplitShardTest extends SolrCloudTestCase { assertEquals("wrong range in s1_1", expected1, delta1); } + @Test + public void testWithChildDocuments() throws Exception { + CloudSolrClient solrClient = cluster.getSolrClient(); + CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 1, 1).process(solrClient); + + cluster.waitForActiveCollection(COLLECTION_NAME, 1, 1); + + List<SolrInputDocument> inputDocs = new ArrayList<>(); + for (int idx = 0; idx < 20; ) { // 10 parents + 10 children + SolrInputDocument parent = new SolrInputDocument(); + parent.addField("id", ++idx); + parent.addField("type_s", "parent"); + + // Child has different ID, so could be routed differently if used for routing. 
+ SolrInputDocument child = new SolrInputDocument(); + child.addField("id", ++idx); + child.addField("expected_parent_s", parent.getField("id")); + + parent.addField("myChild", List.of(child)); + + inputDocs.add(parent); + } + solrClient.add(COLLECTION_NAME, inputDocs); + + CollectionAdminRequest.SplitShard splitShard = + CollectionAdminRequest.splitShard(COLLECTION_NAME) + .setNumSubShards(2) + .setShardName("shard1"); + splitShard.setWaitForFinalState(true); + splitShard.process(solrClient); + CollectionAdminRequest.deleteShard(COLLECTION_NAME, "shard1").process(solrClient); + waitForState( + "Waiting for 2 active shards after split", COLLECTION_NAME, activeClusterShape(2, 2)); + + QueryRequest req = + new QueryRequest(new SolrQuery("type_s:parent").setFields("*", "[child]", "[shard]")); + QueryResponse rsp = req.process(solrClient, COLLECTION_NAME); + assertEquals(10, rsp.getResults().getNumFound()); + for (SolrDocument doc : rsp.getResults()) { + @SuppressWarnings("unchecked") + List<SolrDocument> children = (List<SolrDocument>) doc.get("myChild"); + assertEquals("only 1 child per parent", 1, children.size()); + SolrDocument child = children.get(0); + assertEquals("parent-child match", doc.get("id"), child.get("expected_parent_s")); + } + } + private CloudSolrClient createCollection(String collectionName, int repFactor) throws Exception { CollectionAdminRequest.createCollection(collectionName, "conf", 1, repFactor) diff --git a/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java b/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java index 2796233ea0b..78906e9b8e0 100644 --- a/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java +++ b/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java @@ -28,6 +28,7 @@ import org.apache.lucene.store.Directory; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer; 
+import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.CompositeIdRouter; import org.apache.solr.common.cloud.DocRouter; import org.apache.solr.common.cloud.PlainIdRouter; @@ -485,19 +486,147 @@ public class SolrIndexSplitterTest extends SolrTestCaseJ4 { } } + @Test + public void testSplitWithChildDocs() throws Exception { + doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod.REWRITE); + } + + @Test + public void testSplitWithChildDocsLink() throws Exception { + doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod.LINK); + } + + public void doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod splitMethod) throws Exception { + // Overall test/split pattern copied from doTestSplitByCores + File indexDir = createTempDir().toFile(); + + CompositeIdRouter r1 = new CompositeIdRouter(); + String routeKeyBase = "sea-line!"; + + List<DocRouter.Range> twoRanges = r1.partitionRange(2, r1.keyHashRange(routeKeyBase)); + + // Insert other doc for contrast + String otherDocId = routeKeyBase + "6"; // Will hash into first range + assertU(adoc("id", otherDocId)); + + // Insert child doc + String parentId = routeKeyBase + "1"; // Will hash into second range + String childId = routeKeyBase + "5"; // Will hash into first range + SolrInputDocument doc = + sdoc("id", parentId, "myChild", sdocs(sdoc("id", childId, "child_s", "child"))); + assertU(adoc(doc)); + + assertU(commit()); + assertJQ(req("q", "*:*"), "/response/numFound==3"); + + // Able to query child. 
+ assertJQ( + req("q", "id:" + parentId, "fl", "*, [child]"), + "/response/numFound==1", + "/response/docs/[0]/myChild/[0]/id=='" + childId + "'", + "/response/docs/[0]/myChild/[0]/_root_=='" + + parentId + + "'" // Child has parent root to route with + ); + + SolrCore core1 = null, core2 = null; + try { + core1 = + h.getCoreContainer() + .create( + "split1", + Map.of("dataDir", indexDir1.getAbsolutePath(), "configSet", "cloud-minimal")); + core2 = + h.getCoreContainer() + .create( + "split2", + Map.of("dataDir", indexDir2.getAbsolutePath(), "configSet", "cloud-minimal")); + + LocalSolrQueryRequest request = null; + try { + request = lrf.makeRequest("q", "dummy"); + SolrQueryResponse rsp = new SolrQueryResponse(); + SplitIndexCommand command = + new SplitIndexCommand( + request, + rsp, + null, + List.of(core1, core2), + twoRanges, + new CompositeIdRouter(), + null, + null, + splitMethod); + doSplit(command); + } finally { + if (request != null) request.close(); + } + @SuppressWarnings("resource") + final EmbeddedSolrServer server1 = new EmbeddedSolrServer(h.getCoreContainer(), "split1"); + @SuppressWarnings("resource") + final EmbeddedSolrServer server2 = new EmbeddedSolrServer(h.getCoreContainer(), "split2"); + server1.commit(true, true); + server2.commit(true, true); + assertEquals( + "should be 2 docs in index2", + 2, + server2.query(new SolrQuery("*:*")).getResults().getNumFound()); + assertEquals( + "parent doc should be in index2", + 1, + server2.query(new SolrQuery("id:" + parentId)).getResults().getNumFound()); + assertEquals( + "child doc should be in index2", + 1, + server2.query(new SolrQuery("id:" + childId)).getResults().getNumFound()); + assertEquals( + "other doc should be in index1", + 1, + server1.query(new SolrQuery("id:" + otherDocId)).getResults().getNumFound()); + } finally { + h.getCoreContainer().unload("split2"); + h.getCoreContainer().unload("split1"); + } + } + + /** Utility method to find Ids that hash into which ranges. 
Uncomment @Test to print. */ + // @Test + public void printCompositeHashSandbox() { + CompositeIdRouter r1 = new CompositeIdRouter(); + String routeBase = "sea-line!"; + DocRouter.Range routeBaseRange = r1.keyHashRange(routeBase); + List<DocRouter.Range> twoRanges = r1.partitionRange(2, routeBaseRange); + System.out.println("splitKeyRange = " + twoRanges); + + // Hash some values and print which range they fall into + for (int i = 0; i < 10; i++) { + String key = routeBase + i; + int hash = r1.sliceHash(key, null, null, null); + boolean inA = twoRanges.get(0).includes(hash); + boolean inB = twoRanges.get(1).includes(hash); + + // Print which range the key is in + System.out.println(key + " in " + (inA ? twoRanges.get(0) : twoRanges.get(1))); + } + } + + /** Creates a range encompassing the two ids, then splits it in two. Uses PlainIdRouter */ private List<DocRouter.Range> getRanges(String id1, String id2) { // find minHash/maxHash hash ranges byte[] bytes = id1.getBytes(StandardCharsets.UTF_8); int minHash = Hash.murmurhash3_x86_32(bytes, 0, bytes.length, 0); bytes = id2.getBytes(StandardCharsets.UTF_8); int maxHash = Hash.murmurhash3_x86_32(bytes, 0, bytes.length, 0); - if (minHash > maxHash) { int temp = maxHash; maxHash = minHash; minHash = temp; } + if (maxHash - minHash < 2) { + throw new RuntimeException("The range is too small to split"); + } + PlainIdRouter router = new PlainIdRouter(); DocRouter.Range fullRange = new DocRouter.Range(minHash, maxHash); return router.partitionRange(2, fullRange); diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc index 62f43f6d54e..27517115afb 100644 --- a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc +++ b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc @@ -228,6 +228,7 @@ When such a collection is deleted, its autocreated configset will be 
deleted by + If this parameter is specified, the router will look at the value of the field in an input document to compute the hash and identify a shard instead of looking at the `uniqueKey` field. If the field specified is null in the document, the document will be rejected. +For nested documents, the route field must match among all the documents in the hierarchy. + Please note that xref:configuration-guide:realtime-get.adoc[] or retrieval by document ID would also require the parameter `\_route_` (or `shard.keys`) to avoid a distributed search. diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc index a261d23e70c..07dab503bdd 100644 --- a/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc +++ b/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc @@ -73,7 +73,7 @@ A node with this role can perform duties of an overseer node (unless mode is `di A node with this role can act as if it has replicas of all collections in the cluster when a query is performed. 
The workflow is as follows -If the cluster has collections with very large no:of shards, performing distributed requests in your _`data node`_ will lead to +If the cluster has collections with a very large number of shards, performing distributed requests in your _data node_ will lead to * large heap utilization * frequent GC pauses diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc b/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc index cbc20ca9aa2..fb5ff6f1606 100644 --- a/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc +++ b/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc @@ -146,7 +146,7 @@ At query time, you include the prefix(es) along with the number of bits into you If you do not want to influence how documents are stored, you don't need to specify a prefix in your document ID. If you created the collection and defined the "implicit" router at the time of creation, you can additionally define a `router.field` parameter to use a field from each document to identify a shard where the document belongs. -If the field specified is missing in the document, however, the document will be rejected. +If the field specified is missing in the document, then the document will be rejected. You could also use the `\_route_` parameter to name a specific shard. == Shard Splitting diff --git a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc index 939f8f2adc5..fedb60d7e3e 100644 --- a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc +++ b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc @@ -283,7 +283,7 @@ This makes it much easier to apply xref:partial-document-updates.adoc#updating-child-documents[atomic updates] to individual child documents. 
==== -== Maintaining Integrity with Updates and Deletes +== Maintaining Integrity with Updates, Deletes, and Shard Splits Nested document trees can be modified with xref:partial-document-updates.adoc#atomic-updates[atomic updates] to @@ -304,6 +304,10 @@ Instead, use delete-by-query (most efficient) or xref:partial-document-updates.a If you use Solr's delete-by-query APIs, you *MUST* be careful to ensure that any deletion query is structured to ensure no descendent children remain of any documents that are being deleted. *_Doing otherwise will violate integrity assumptions that Solr expects._* +When xref:deployment-guide:shard-management.adoc#splitshard[splitting shards], child documents get transmitted and routed independently. +If you have not set a custom `router.field` (the default), this is handled correctly for you: the split routes each child document by its `\_root_` value. +If your collection has a custom `router.field`, then you must ensure that all documents in a hierarchy have the same value for that field. + == Indexing Anonymous Children Although not recommended, it is also possible to index child documents "anonymously":
