(solr) branch main updated: SOLR-11191: SolrIndexSplitter: Routing child docs by _root_ when available (#2799)

dsmiley Tue, 29 Oct 2024 18:30:49 -0700

This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr.git



The following commit(s) were added to refs/heads/main by this push:
     new e377ebbdaff SOLR-11191: SolrIndexSplitter: Routing child docs by 
_root_ when available (#2799)
e377ebbdaff is described below

commit e377ebbdaff5ae784b3f4e4315afa4d6705d1d71
Author: Zack Kendall <[email protected]>
AuthorDate: Tue Oct 29 18:29:41 2024 -0700

    SOLR-11191: SolrIndexSplitter: Routing child docs by _root_ when available 
(#2799)
    
    Splitting shards now supports child / nested docs.  Works based on 
splitting based on the _root_ field.
    
    
    Co-authored-by: zkendall <[email protected]>
---
 solr/CHANGES.txt                                   |   2 +
 .../org/apache/solr/update/SolrIndexSplitter.java  |  15 ++-
 .../solr/configsets/cloud-minimal/conf/schema.xml  |   2 +
 .../test/org/apache/solr/cloud/SplitShardTest.java |  71 ++++++++---
 .../apache/solr/update/SolrIndexSplitterTest.java  | 131 ++++++++++++++++++++-
 .../pages/collection-management.adoc               |   1 +
 .../modules/deployment-guide/pages/node-roles.adoc |   2 +-
 .../pages/solrcloud-shards-indexing.adoc           |   2 +-
 .../pages/indexing-nested-documents.adoc           |   6 +-
 9 files changed, 208 insertions(+), 24 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index ac00cf68152..db44d16aa74 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -189,6 +189,8 @@ Bug Fixes
 
 * SOLR-17413: Fixed UpdateLog replay bug that shared thread-unsafe 
SolrQueryRequest objects across threads (Jason Gerlowski, David Smiley, Houston 
Putman)
 
+* SOLR-11191: Splitting shards now routes child-docs with their _root_ field 
when available so they maintain parent relationship. (Zack Kendall)
+
 Dependency Upgrades
 ---------------------
 (No changes)
diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java 
b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
index 61ba505ddd5..fa12d7b51d4 100644
--- a/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
+++ b/solr/core/src/java/org/apache/solr/update/SolrIndexSplitter.java
@@ -69,6 +69,7 @@ import org.apache.solr.core.DirectoryFactory;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.handler.IndexFetcher;
 import org.apache.solr.handler.SnapShooter;
+import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.BitsFilteredPostingsEnum;
 import org.apache.solr.search.SolrIndexSearcher;
@@ -125,17 +126,29 @@ public class SolrIndexSplitter {
       numPieces = cmd.ranges.size();
       rangesArr = cmd.ranges.toArray(new DocRouter.Range[0]);
     }
+
     if (cmd.routeFieldName == null) {
-      field = searcher.getSchema().getUniqueKeyField();
+      // To support routing child documents, use the root field if it exists 
(which would be
+      // populated with unique field), otherwise use the unique key field
+      if (searcher.getSchema().isUsableForChildDocs()) {
+        field = searcher.getSchema().getField(IndexSchema.ROOT_FIELD_NAME);
+      } else {
+        field = searcher.getSchema().getUniqueKeyField();
+      }
     } else {
+      // Custom routing
+      // If child docs are used, users must ensure that the whole nested 
document tree has a
+      // consistent routeField value
       field = searcher.getSchema().getField(cmd.routeFieldName);
     }
+
     if (cmd.splitKey == null) {
       splitKey = null;
     } else {
       checkRouterSupportsSplitKey(hashRouter, cmd.splitKey);
       splitKey = ((CompositeIdRouter) 
hashRouter).getRouteKeyNoSuffix(cmd.splitKey);
     }
+
     if (cmd.cores == null) {
       this.splitMethod = SplitMethod.REWRITE;
     } else {
diff --git 
a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml 
b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
index fc23706512e..3c25f27405a 100644
--- a/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
+++ b/solr/core/src/test-files/solr/configsets/cloud-minimal/conf/schema.xml
@@ -19,10 +19,12 @@
   <fieldType name="string" class="solr.StrField"/>
   <fieldType name="int" class="${solr.tests.IntegerFieldType}" 
docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" 
positionIncrementGap="0" uninvertible="true"/>
   <fieldType name="long" class="${solr.tests.LongFieldType}" 
docValues="${solr.tests.numeric.dv}" precisionStep="0" omitNorms="true" 
positionIncrementGap="0" uninvertible="true"/>
+  <fieldType name="_nest_path_" class="solr.NestPathField"/>
   <dynamicField name="*" type="string" indexed="true" stored="true"/>
   <!-- for versioning -->
   <field name="_version_" type="long" indexed="true" stored="true"/>
   <field name="_root_" type="string" indexed="true" stored="true" 
multiValued="false" required="false"/>
+  <field name="_nest_path_" type="_nest_path_"/>`
   <field name="id" type="string" indexed="true" stored="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true" />
   <uniqueKey>id</uniqueKey>
diff --git a/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java 
b/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
index 132dfcfaaa2..ab194b291da 100644
--- a/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/SplitShardTest.java
@@ -21,6 +21,7 @@ import static org.hamcrest.core.StringContains.containsString;
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -35,11 +36,14 @@ import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.request.UpdateRequest;
+import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.UpdateResponse;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
@@ -88,16 +92,7 @@ public class SplitShardTest extends SolrCloudTestCase {
             .setNumSubShards(5)
             .setShardName("shard1");
     splitShard.process(cluster.getSolrClient());
-    waitForState(
-        "Timed out waiting for sub shards to be active. Number of active 
shards="
-            + cluster
-                .getSolrClient()
-                .getClusterState()
-                .getCollection(COLLECTION_NAME)
-                .getActiveSlices()
-                .size(),
-        COLLECTION_NAME,
-        activeClusterShape(6, 6));
+    waitForState("Waiting for 6 shards after split", COLLECTION_NAME, 
activeClusterShape(6, 6));
 
     try {
       splitShard =
@@ -153,15 +148,7 @@ public class SplitShardTest extends SolrCloudTestCase {
         
CollectionAdminRequest.splitShard(collectionName).setSplitFuzz(0.5f).setShardName("shard1");
     splitShard.process(cluster.getSolrClient());
     waitForState(
-        "Timed out waiting for sub shards to be active. Number of active 
shards="
-            + cluster
-                .getSolrClient()
-                .getClusterState()
-                .getCollection(collectionName)
-                .getActiveSlices()
-                .size(),
-        collectionName,
-        activeClusterShape(3, 3));
+        "Waiting for 3 active shards after split", collectionName, 
activeClusterShape(3, 3));
     DocCollection coll = 
cluster.getSolrClient().getClusterState().getCollection(collectionName);
     Slice s1_0 = coll.getSlice("shard1_0");
     Slice s1_1 = coll.getSlice("shard1_1");
@@ -174,6 +161,52 @@ public class SplitShardTest extends SolrCloudTestCase {
     assertEquals("wrong range in s1_1", expected1, delta1);
   }
 
+  @Test
+  public void testWithChildDocuments() throws Exception {
+    CloudSolrClient solrClient = cluster.getSolrClient();
+    CollectionAdminRequest.createCollection(COLLECTION_NAME, "conf", 1, 
1).process(solrClient);
+
+    cluster.waitForActiveCollection(COLLECTION_NAME, 1, 1);
+
+    List<SolrInputDocument> inputDocs = new ArrayList<>();
+    for (int idx = 0; idx < 20; ) { // 10 parents + 10 children
+      SolrInputDocument parent = new SolrInputDocument();
+      parent.addField("id", ++idx);
+      parent.addField("type_s", "parent");
+
+      // Child has different ID, so could be routed differently if used for 
routing.
+      SolrInputDocument child = new SolrInputDocument();
+      child.addField("id", ++idx);
+      child.addField("expected_parent_s", parent.getField("id"));
+
+      parent.addField("myChild", List.of(child));
+
+      inputDocs.add(parent);
+    }
+    solrClient.add(COLLECTION_NAME, inputDocs);
+
+    CollectionAdminRequest.SplitShard splitShard =
+        CollectionAdminRequest.splitShard(COLLECTION_NAME)
+            .setNumSubShards(2)
+            .setShardName("shard1");
+    splitShard.setWaitForFinalState(true);
+    splitShard.process(solrClient);
+    waitForState(
+        "Waiting for 2 active shards after split", COLLECTION_NAME, 
activeClusterShape(2, 2));
+
+    QueryRequest req =
+        new QueryRequest(new SolrQuery("type_s:parent").setFields("*", 
"[child]", "[shard]"));
+    QueryResponse rsp = req.process(solrClient, COLLECTION_NAME);
+    assertEquals(10, rsp.getResults().getNumFound());
+    for (SolrDocument doc : rsp.getResults()) {
+      @SuppressWarnings("unchecked")
+      List<SolrDocument> children = (List<SolrDocument>) doc.get("myChild");
+      assertEquals("only 1 child per parent", 1, children.size());
+      SolrDocument child = children.get(0);
+      assertEquals("parent-child match", doc.get("id"), 
child.get("expected_parent_s"));
+    }
+  }
+
   private CloudSolrClient createCollection(String collectionName, int 
repFactor) throws Exception {
 
     CollectionAdminRequest.createCollection(collectionName, "conf", 1, 
repFactor)
diff --git 
a/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java 
b/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
index 2796233ea0b..78906e9b8e0 100644
--- a/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
+++ b/solr/core/src/test/org/apache/solr/update/SolrIndexSplitterTest.java
@@ -28,6 +28,7 @@ import org.apache.lucene.store.Directory;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.CompositeIdRouter;
 import org.apache.solr.common.cloud.DocRouter;
 import org.apache.solr.common.cloud.PlainIdRouter;
@@ -485,19 +486,147 @@ public class SolrIndexSplitterTest extends 
SolrTestCaseJ4 {
     }
   }
 
+  @Test
+  public void testSplitWithChildDocs() throws Exception {
+    doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod.REWRITE);
+  }
+
+  @Test
+  public void testSplitWithChildDocsLink() throws Exception {
+    doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod.LINK);
+  }
+
+  public void doTestSplitWithChildDocs(SolrIndexSplitter.SplitMethod 
splitMethod) throws Exception {
+    // Overall test/split pattern copied from doTestSplitByCores
+    File indexDir = createTempDir().toFile();
+
+    CompositeIdRouter r1 = new CompositeIdRouter();
+    String routeKeyBase = "sea-line!";
+
+    List<DocRouter.Range> twoRanges = r1.partitionRange(2, 
r1.keyHashRange(routeKeyBase));
+
+    // Insert other doc for contrast
+    String otherDocId = routeKeyBase + "6"; // Will hash into first range
+    assertU(adoc("id", otherDocId));
+
+    // Insert child doc
+    String parentId = routeKeyBase + "1"; // Will hash into second range
+    String childId = routeKeyBase + "5"; // Will hash into first range
+    SolrInputDocument doc =
+        sdoc("id", parentId, "myChild", sdocs(sdoc("id", childId, "child_s", 
"child")));
+    assertU(adoc(doc));
+
+    assertU(commit());
+    assertJQ(req("q", "*:*"), "/response/numFound==3");
+
+    // Able to query child.
+    assertJQ(
+        req("q", "id:" + parentId, "fl", "*, [child]"),
+        "/response/numFound==1",
+        "/response/docs/[0]/myChild/[0]/id=='" + childId + "'",
+        "/response/docs/[0]/myChild/[0]/_root_=='"
+            + parentId
+            + "'" // Child has parent root to route with
+        );
+
+    SolrCore core1 = null, core2 = null;
+    try {
+      core1 =
+          h.getCoreContainer()
+              .create(
+                  "split1",
+                  Map.of("dataDir", indexDir1.getAbsolutePath(), "configSet", 
"cloud-minimal"));
+      core2 =
+          h.getCoreContainer()
+              .create(
+                  "split2",
+                  Map.of("dataDir", indexDir2.getAbsolutePath(), "configSet", 
"cloud-minimal"));
+
+      LocalSolrQueryRequest request = null;
+      try {
+        request = lrf.makeRequest("q", "dummy");
+        SolrQueryResponse rsp = new SolrQueryResponse();
+        SplitIndexCommand command =
+            new SplitIndexCommand(
+                request,
+                rsp,
+                null,
+                List.of(core1, core2),
+                twoRanges,
+                new CompositeIdRouter(),
+                null,
+                null,
+                splitMethod);
+        doSplit(command);
+      } finally {
+        if (request != null) request.close();
+      }
+      @SuppressWarnings("resource")
+      final EmbeddedSolrServer server1 = new 
EmbeddedSolrServer(h.getCoreContainer(), "split1");
+      @SuppressWarnings("resource")
+      final EmbeddedSolrServer server2 = new 
EmbeddedSolrServer(h.getCoreContainer(), "split2");
+      server1.commit(true, true);
+      server2.commit(true, true);
+      assertEquals(
+          "should be  2 docs in index2",
+          2,
+          server2.query(new SolrQuery("*:*")).getResults().getNumFound());
+      assertEquals(
+          "parent doc should be in index2",
+          1,
+          server2.query(new SolrQuery("id:" + 
parentId)).getResults().getNumFound());
+      assertEquals(
+          "child doc should be in index2",
+          1,
+          server2.query(new SolrQuery("id:" + 
childId)).getResults().getNumFound());
+      assertEquals(
+          "other doc should be in index1",
+          1,
+          server1.query(new SolrQuery("id:" + 
otherDocId)).getResults().getNumFound());
+    } finally {
+      h.getCoreContainer().unload("split2");
+      h.getCoreContainer().unload("split1");
+    }
+  }
+
+  /** Utility method to find Ids that hash into which ranges. Uncomment @Test 
to print. */
+  //  @Test
+  public void printCompositeHashSandbox() {
+    CompositeIdRouter r1 = new CompositeIdRouter();
+    String routeBase = "sea-line!";
+    DocRouter.Range routeBaseRange = r1.keyHashRange(routeBase);
+    List<DocRouter.Range> twoRanges = r1.partitionRange(2, routeBaseRange);
+    System.out.println("splitKeyRange = " + twoRanges);
+
+    // Hash some values and print which range they fall into
+    for (int i = 0; i < 10; i++) {
+      String key = routeBase + i;
+      int hash = r1.sliceHash(key, null, null, null);
+      boolean inA = twoRanges.get(0).includes(hash);
+      boolean inB = twoRanges.get(1).includes(hash);
+
+      // Print which range the key is in
+      System.out.println(key + " in " + (inA ? twoRanges.get(0) : 
twoRanges.get(1)));
+    }
+  }
+
+  /** Creates a range encompassing the two ids, then splits it in two. Uses 
PlainIdRouter */
   private List<DocRouter.Range> getRanges(String id1, String id2) {
     // find minHash/maxHash hash ranges
     byte[] bytes = id1.getBytes(StandardCharsets.UTF_8);
     int minHash = Hash.murmurhash3_x86_32(bytes, 0, bytes.length, 0);
     bytes = id2.getBytes(StandardCharsets.UTF_8);
     int maxHash = Hash.murmurhash3_x86_32(bytes, 0, bytes.length, 0);
-
     if (minHash > maxHash) {
       int temp = maxHash;
       maxHash = minHash;
       minHash = temp;
     }
 
+    if (maxHash - minHash < 2) {
+      throw new RuntimeException("The range is too small to split");
+    }
+
     PlainIdRouter router = new PlainIdRouter();
     DocRouter.Range fullRange = new DocRouter.Range(minHash, maxHash);
     return router.partitionRange(2, fullRange);
diff --git 
a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc 
b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
index f1906e455f7..f4811158ef7 100644
--- 
a/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
+++ 
b/solr/solr-ref-guide/modules/deployment-guide/pages/collection-management.adoc
@@ -228,6 +228,7 @@ When such a collection is deleted, its autocreated 
configset will be deleted by
 +
 If this parameter is specified, the router will look at the value of the field 
in an input document to compute the hash and identify a shard instead of 
looking at the `uniqueKey` field.
 If the field specified is null in the document, the document will be rejected.
+For nested documents, the route field must match among all the documents in 
the hierarchy.
 +
 Please note that xref:configuration-guide:realtime-get.adoc[] or retrieval by 
document ID would also require the parameter `\_route_` (or `shard.keys`) to 
avoid a distributed search.
 
diff --git a/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc 
b/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
index a261d23e70c..07dab503bdd 100644
--- a/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
+++ b/solr/solr-ref-guide/modules/deployment-guide/pages/node-roles.adoc
@@ -73,7 +73,7 @@ A node with this role can perform duties of an overseer node 
(unless mode is `di
 
 A node with this role can act as if it has replicas of all collections in the 
cluster when a query is performed. The workflow is as follows
 
-If the cluster has collections with very large no:of shards, performing 
distributed requests in your _`data node`_ will lead to
+If the cluster has collections with very large no:of shards, performing 
distributed requests in your _data node_ will lead to
 
 * large heap utilization
 * frequent GC pauses
diff --git 
a/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
 
b/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
index 21b4027fdfa..d01ce96f3b9 100644
--- 
a/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
+++ 
b/solr/solr-ref-guide/modules/deployment-guide/pages/solrcloud-shards-indexing.adoc
@@ -138,7 +138,7 @@ At query time, you include the prefix(es) along with the 
number of bits into you
 If you do not want to influence how documents are stored, you don't need to 
specify a prefix in your document ID.
 
 If you created the collection and defined the "implicit" router at the time of 
creation, you can additionally define a `router.field` parameter to use a field 
from each document to identify a shard where the document belongs.
-If the field specified is missing in the document, however, the document will 
be rejected.
+If the field specified is missing in the document, then the document will be 
rejected.
 You could also use the `\_route_` parameter to name a specific shard.
 
 == Shard Splitting
diff --git 
a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
 
b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
index 939f8f2adc5..fedb60d7e3e 100644
--- 
a/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
+++ 
b/solr/solr-ref-guide/modules/indexing-guide/pages/indexing-nested-documents.adoc
@@ -283,7 +283,7 @@ This makes it much easier to apply
 xref:partial-document-updates.adoc#updating-child-documents[atomic updates] to 
individual child documents.
 ====
 
-== Maintaining Integrity with Updates and Deletes
+== Maintaining Integrity with Updates, Deletes, and Shard Splits
 
 Nested document trees can be modified with
 xref:partial-document-updates.adoc#atomic-updates[atomic updates] to
@@ -304,6 +304,10 @@ Instead, use delete-by-query (most efficient) or 
xref:partial-document-updates.a
 If you use Solr's delete-by-query APIs, you *MUST* be careful to ensure that 
any deletion query is structured to ensure no descendent children remain of any 
documents that are being deleted.
 *_Doing otherwise will violate integrity assumptions that Solr expects._*
 
+When xref:deployment-guide:shard-management.adoc#splitshard[splitting shards], 
child documents get transmitted and routed independently.
+If you use the default `router.field`, this is handled correctly for you.
+If your collection has a custom `router.field`, then you must ensure that all 
documents in a hierarchy have the same value for that field.
+
 == Indexing Anonymous Children
 
 Although not recommended, it is also possible to index child documents 
"anonymously":

(solr) branch main updated: SOLR-11191: SolrIndexSplitter: Routing child docs by _root_ when available (#2799)

Reply via email to