(jackrabbit-oak) branch trunk updated: OAK-10671- [Indexing Job] Improve Mongo regex query: remove condition on non-indexed _path field to speed-up traversal (#1331)

nfsantos Thu, 29 Feb 2024 02:44:00 -0800

This is an automated email from the ASF dual-hosted git repository.

nfsantos pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git



The following commit(s) were added to refs/heads/trunk by this push:
     new 2ac4819b9b OAK-10671- [Indexing Job] Improve Mongo regex query: remove 
condition on non-indexed _path field to speed-up traversal (#1331)
2ac4819b9b is described below

commit 2ac4819b9b59019fff0f574be982e43b6f51d82d
Author: Nuno Santos <[email protected]>
AuthorDate: Thu Feb 29 11:42:18 2024 +0100

    OAK-10671- [Indexing Job] Improve Mongo regex query: remove condition on 
non-indexed _path field to speed-up traversal (#1331)
    
    * Change filter on Mongo to apply conditions only on the _modified and _id 
fields, so that the filter condition can be evaluated only with the contents of 
an index on (_modified, _id).
---
 .../pipelined/PipelinedMongoDownloadTask.java      | 102 ++++++--------
 .../document/flatfile/pipelined/PipelinedIT.java   |  34 +++--
 .../pipelined/PipelinedMongoDownloadTaskTest.java  | 147 +++++----------------
 3 files changed, 99 insertions(+), 184 deletions(-)

diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
index aeac8ba348..d9276d9a69 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
@@ -147,15 +147,29 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
      * @param mongoFilterPaths          The paths to be included/excluded in 
the filter. These define subtrees to be included or excluded.
      *                                  (see {@link MongoFilterPaths} for 
details)
      * @param customExcludeEntriesRegex Documents with paths matching this 
regex are excluded from download
-     * @param queryUsesIndexTraversal   Whether the query will use an index to 
traverse the documents.
      * @return The filter to be used in the Mongo query, or null if no filter 
is required
      */
-    static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths 
mongoFilterPaths, String customExcludeEntriesRegex, boolean 
queryUsesIndexTraversal) {
+    static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths 
mongoFilterPaths, String customExcludeEntriesRegex) {
         var filters = new ArrayList<Bson>();
 
-        Bson includedFilter = descendantsFilter(mongoFilterPaths.included, 
queryUsesIndexTraversal);
-        if (includedFilter != null) {
-            filters.add(includedFilter);
+        List<Pattern> includedPatterns = 
toFilterPatterns(mongoFilterPaths.included);
+        if (!includedPatterns.isEmpty()) {
+            // The conditions above on the _id field are not enough to match 
all JCR nodes in the given paths because nodes
+            // with paths longer than a certain threshold are represented by 
Mongo documents where the _id field is replaced
+            // by a hash and the full path is stored in an additional field 
_path. To retrieve these long path documents,
+            // we could add a condition on the _path field, but this would 
substantially slow down scanning the DB, because
+            // the _path field is not part of the index used by this query 
(it's an index on _modified, _id). Therefore,
+            // Mongo would have to retrieve every document from the column 
store to evaluate the filter condition. So instead
+            // we add below a condition to download all the long path 
documents. These documents can be identified by the
+            // format of the _id field (<n>:h<hash>), so it is possible to 
identify them using only the index.
+            // This might download documents for nodes that are not in the 
included paths, but those documents will anyway
+            // be filtered in the transform stage. And in most repositories, 
the number of long path documents is very small,
+            // often there are none, so the extra documents downloaded will 
not slow down the download by much. However, the
+            // performance gains of evaluating the filter of the query using 
only the index are very significant, especially
+            // when the index requires only a small number of nodes.
+            var patternsWithLongPathInclude = new 
ArrayList<>(includedPatterns);
+            patternsWithLongPathInclude.add(LONG_PATH_ID_PATTERN);
+            filters.add(Filters.in(NodeDocument.ID, 
patternsWithLongPathInclude));
         }
 
         // The Mongo filter returned here will download the top level path of 
each excluded subtree, which in theory
@@ -164,15 +178,13 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         // This is done because excluding also the top level path would add 
extra complexity to the filter and
         // would not have any measurable impact on performance because it only 
downloads a few extra documents, one
         // for each excluded subtree. The transform stage will anyway filter 
out these paths.
-        Bson excludedFilter = descendantsFilter(mongoFilterPaths.excluded, 
queryUsesIndexTraversal);
-        if (excludedFilter != null) {
-            filters.add(Filters.nor(excludedFilter));
-        }
-
+        ArrayList<Pattern> excludedPatterns = new ArrayList<>();
+        excludedPatterns.addAll(toFilterPatterns(mongoFilterPaths.excluded));
         // Custom regex filter to exclude paths
-        Bson customExcludedPathsFilter = 
createCustomExcludedEntriesFilter(customExcludeEntriesRegex, 
queryUsesIndexTraversal);
-        if (customExcludedPathsFilter != null) {
-            filters.add(customExcludedPathsFilter);
+        
excludedPatterns.addAll(customExcludedPatterns(customExcludeEntriesRegex));
+
+        if (!excludedPatterns.isEmpty()) {
+            filters.add(Filters.nin(NodeDocument.ID, excludedPatterns));
         }
 
         if (filters.isEmpty()) {
@@ -184,65 +196,31 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         }
     }
 
-    static Bson createCustomExcludedEntriesFilter(String customRegexPattern, 
boolean queryUsesIndexTraversal) {
-        if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) 
{
-            LOG.info("Mongo custom regex is disabled");
-            return null;
-        } else {
-            LOG.info("Excluding nodes with paths matching regex: {}", 
customRegexPattern);
-            var pattern = Pattern.compile(customRegexPattern);
-            Bson pathFilter = createPathFilter(List.of(pattern), 
queryUsesIndexTraversal);
-            return Filters.nor(Filters.regex(NodeDocument.ID, pattern), 
pathFilter);
-        }
-    }
-
-    private static Bson descendantsFilter(List<String> paths, boolean 
queryUsesIndexTraversal) {
+    private static List<Pattern> toFilterPatterns(List<String> paths) {
         if (paths.isEmpty()) {
-            return null;
+            return List.of();
         }
         if (paths.size() == 1 && paths.get(0).equals("/")) {
-            return null;
+            return List.of();
         }
-
-        // The filter for descendants of a list of paths is a series of or 
conditions. For each path, we have to build
-        // two conditions in two different fields of the documents:
-        // _ _id   - for non-long paths - In this case, the _id is of the form 
"2:/foo/bar"
-        // _ _path - for long paths - In this case, the _id is a hash and the 
document contains an additional _path
-        //      field with the path of the document.
-        // We use the $in operator with a regular expression to match the 
paths.
-        //  
https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression
-        ArrayList<Pattern> pathPatterns = new ArrayList<>();
-        ArrayList<Pattern> idPatterns = new ArrayList<>();
-
+        ArrayList<Pattern> patterns = new ArrayList<>();
         for (String path : paths) {
             if (!path.endsWith("/")) {
                 path = path + "/";
             }
             String quotedPath = Pattern.quote(path);
-            idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + 
".*$"));
-            pathPatterns.add(Pattern.compile("^" + quotedPath + ".*$"));
+            patterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$"));
         }
-
-        Bson pathFilter = createPathFilter(pathPatterns, 
queryUsesIndexTraversal);
-        return Filters.or(Filters.in(NodeDocument.ID, idPatterns), pathFilter);
+        return patterns;
     }
 
-    private static Bson createPathFilter(List<Pattern> pattern, boolean 
queryUsesIndexTraversal) {
-        // If a document has a long path, the _id is replaced by a hash and 
the path is stored in an additional _path field.
-        // When doing an index scan, it may be more efficient to check that 
the _id is in the format of a long path id
-        // (that is, numeric prefix followed by ":h") first, before checking 
the _path field. The _id
-        // is available from the index while the _path field is only available 
on the document itself, so checking the
-        // _path will force an expensive retrieval of the full document. It is 
not guaranteed that Mongo will implement
-        // this optimization, but it is adding this additional check to allow 
MongoDB to apply this optimization.
-        // If the query does a column scan, then Mongo retrieves the full 
document from the column store, so we can
-        // check the _path directly, which simplifies a bit the query.
-        if (queryUsesIndexTraversal) {
-            return Filters.and(
-                    Filters.regex(NodeDocument.ID, LONG_PATH_ID_PATTERN),
-                    Filters.in(NodeDocument.PATH, pattern)
-            );
+    static List<Pattern> customExcludedPatterns(String customRegexPattern) {
+        if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) 
{
+            LOG.info("Mongo custom regex is disabled");
+            return List.of();
         } else {
-            return Filters.in(NodeDocument.PATH, pattern);
+            LOG.info("Excluding nodes with paths matching regex: {}", 
customRegexPattern);
+            return List.of(Pattern.compile(customRegexPattern));
         }
     }
 
@@ -387,7 +365,7 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
                     .build();
             MetricsUtils.addMetric(statisticsProvider, reporter, 
PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_DURATION_SECONDS, 
durationMillis / 1000);
             MetricsUtils.addMetric(statisticsProvider, reporter, 
PipelinedMetrics.OAK_INDEXER_PIPELINED_DOCUMENTS_DOWNLOADED_TOTAL, 
documentsDownloadedTotal);
-            MetricsUtils.addMetric(statisticsProvider, reporter,  
PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_ENQUEUE_DELAY_PERCENTAGE,
+            MetricsUtils.addMetric(statisticsProvider, reporter, 
PipelinedMetrics.OAK_INDEXER_PIPELINED_MONGO_DOWNLOAD_ENQUEUE_DELAY_PERCENTAGE,
                     PipelinedUtils.toPercentage(totalEnqueueWaitTimeMillis, 
durationMillis)
             );
             MetricsUtils.addMetricByteSize(statisticsProvider, reporter, 
PipelinedMetrics.OAK_INDEXER_PIPELINED_DOCUMENTS_DOWNLOADED_TOTAL_BYTES,
@@ -421,7 +399,7 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         // That is, download "/", "/content", "/content/dam" for a base path 
of "/content/dam". These nodes will not be
         // matched by the regex used in the Mongo query, which assumes a 
prefix of "???:/content/dam"
         MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering();
-        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, 
customExcludeEntriesRegex, true);
+        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, 
customExcludeEntriesRegex);
         if (mongoFilter == null) {
             LOG.info("Downloading full repository");
         } else {
@@ -516,7 +494,7 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         // We are downloading potentially a large fraction of the repository, 
so using an index scan will be
         // inefficient. So we pass the natural hint to force MongoDB to use 
natural ordering, that is, column scan
         MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering();
-        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, 
customExcludeEntriesRegex, false);
+        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, 
customExcludeEntriesRegex);
         if (mongoFilter == null) {
             LOG.info("Downloading full repository from Mongo with natural 
order");
             FindIterable<NodeDocument> mongoIterable = dbCollection
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
index b7f0f7811d..30ab0a3d8a 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
@@ -32,6 +32,7 @@ import org.apache.jackrabbit.oak.plugins.document.MongoUtils;
 import org.apache.jackrabbit.oak.plugins.document.RevisionVector;
 import org.apache.jackrabbit.oak.plugins.document.mongo.MongoDocumentStore;
 import org.apache.jackrabbit.oak.plugins.document.util.MongoConnection;
+import org.apache.jackrabbit.oak.plugins.document.util.Utils;
 import org.apache.jackrabbit.oak.plugins.index.ConsoleIndexingReporter;
 import org.apache.jackrabbit.oak.plugins.metric.MetricStatisticsProvider;
 import org.apache.jackrabbit.oak.spi.blob.MemoryBlobStore;
@@ -186,7 +187,7 @@ public class PipelinedIT {
                 "/content/dam/2023|{\"p2\":\"v2023\"}",
                 "/content/dam/2023/01|{\"p1\":\"v202301\"}",
                 "/content/dam/2023/02|{}"
-        ));
+        ), true);
     }
 
     @Test
@@ -214,7 +215,7 @@ public class PipelinedIT {
                 "/content/dam/2022/02|{\"p1\":\"v202202\"}",
                 "/content/dam/2022/03|{\"p1\":\"v202203\"}",
                 "/content/dam/2022/04|{\"p1\":\"v202204\"}"
-        ));
+        ), true);
     }
 
 
@@ -234,7 +235,7 @@ public class PipelinedIT {
                 "/etc|{}",
                 "/home|{}",
                 "/jcr:system|{}"
-        ));
+        ), true);
     }
 
     @Test
@@ -253,7 +254,7 @@ public class PipelinedIT {
                 "/etc|{}",
                 "/home|{}",
                 "/jcr:system|{}"
-        ));
+        ), true);
     }
 
     @Test
@@ -283,8 +284,7 @@ public class PipelinedIT {
                 "/content/dam/2022/02/04|{\"p1\":\"v20220204\"}",
                 "/content/dam/2022/03|{\"p1\":\"v202203\"}",
                 "/content/dam/2022/04|{\"p1\":\"v202204\"}"
-
-        ));
+        ), true);
     }
 
     @Test
@@ -305,7 +305,7 @@ public class PipelinedIT {
                 "/content/dam/2023/01|{\"p1\":\"v202301\"}",
                 "/content/dam/2023/02|{}",
                 "/content/dam/2023/02/28|{\"p1\":\"v20230228\"}"
-        ));
+        ), true);
     }
 
     @Test
@@ -344,7 +344,7 @@ public class PipelinedIT {
         // The list above has the longest paths first, reverse it to match the 
order in the FFS
         Collections.reverse(expected);
 
-        testSuccessfulDownload(pathPredicate, pathFilters, expected);
+        testSuccessfulDownload(pathPredicate, pathFilters, expected, false);
     }
 
 
@@ -454,10 +454,10 @@ public class PipelinedIT {
 
     private void testSuccessfulDownload(Predicate<String> pathPredicate, 
List<PathFilter> pathFilters)
             throws CommitFailedException, IOException {
-        testSuccessfulDownload(pathPredicate, pathFilters, EXPECTED_FFS);
+        testSuccessfulDownload(pathPredicate, pathFilters, EXPECTED_FFS, 
false);
     }
 
-    private void testSuccessfulDownload(Predicate<String> pathPredicate, 
List<PathFilter> pathFilters, List<String> expected)
+    private void testSuccessfulDownload(Predicate<String> pathPredicate, 
List<PathFilter> pathFilters, List<String> expected, boolean ignoreLongPaths)
             throws CommitFailedException, IOException {
         Backend rwStore = createNodeStore(false);
         createContent(rwStore.documentNodeStore);
@@ -468,7 +468,19 @@ public class PipelinedIT {
 
         File file = pipelinedStrategy.createSortedStoreFile();
         assertTrue(file.exists());
-        assertEquals(expected, Files.readAllLines(file.toPath()));
+        List<String> result = Files.readAllLines(file.toPath());
+        if (ignoreLongPaths) {
+            // Remove the long paths from the result. The filter on Mongo is 
best-effort: it will download long path
+            // documents even if they do not match the includedPaths.
+            result = result.stream()
+                    .filter(s -> {
+                        var name = s.split("\\|")[0];
+                        return name.length() < Utils.PATH_LONG;
+                    })
+                    .collect(Collectors.toList());
+
+        }
+        assertEquals(expected, result);
         assertMetrics();
     }
 
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
index 46060b7985..3e87830560 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
@@ -51,8 +51,10 @@ import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
+import static 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.mock;
@@ -297,22 +299,14 @@ public class PipelinedMongoDownloadTaskTest {
 
     @Test
     public void createCustomExcludeEntriesFilter() {
-        
assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter(null, 
true));
-        
assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("", 
true));
+        
assertTrue(PipelinedMongoDownloadTask.customExcludedPatterns(null).isEmpty());
+        
assertTrue(PipelinedMongoDownloadTask.customExcludedPatterns("").isEmpty());
 
         Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
-        var expectedBson = Filters.nor(
-                Filters.regex(NodeDocument.ID, p),
-                Filters.and(
-                        Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                        Filters.in(NodeDocument.PATH, p)
-                )
-        );
+        var actualListOfPatterns = 
PipelinedMongoDownloadTask.customExcludedPatterns("^[0-9]{1,3}:/a/b.*$");
+        assertEquals(1, actualListOfPatterns.size());
 
-
-        var actualBson = 
PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("^[0-9]{1,3}:/a/b.*$",
 true);
-
-        assertBsonEquals(expectedBson, actualBson);
+        assertEquals(p.toString(), actualListOfPatterns.get(0).toString());
     }
 
     @Test
@@ -321,8 +315,7 @@ public class PipelinedMongoDownloadTaskTest {
         assertNull(
                 PipelinedMongoDownloadTask.computeMongoQueryFilter(
                         MongoFilterPaths.DOWNLOAD_ALL,
-                        null,
-                        true
+                        null
                 )
         );
     }
@@ -332,23 +325,14 @@ public class PipelinedMongoDownloadTaskTest {
         // Path filter but no exclude filter
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/"), List.of("/excluded1", 
"/content/excluded2")),
-                null,
-                true
+                null
         );
         // The generated filter should not include any condition to include 
the descendants of /
-        var expected = Filters.nor(
-                Filters.or(
-                        Filters.in(NodeDocument.ID,
-                                Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/excluded1/") + ".*$"),
-                                Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/content/excluded2/") + ".*$")),
-                        Filters.and(
-                                Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                Filters.in(NodeDocument.PATH,
-                                        Pattern.compile("^" + 
Pattern.quote("/excluded1/") + ".*$"),
-                                        Pattern.compile("^" + 
Pattern.quote("/content/excluded2/") + ".*$"))
-                        )
-                )
-        );
+        var expected =
+                Filters.nin(NodeDocument.ID,
+                        Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/excluded1/") + ".*$"),
+                        Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/content/excluded2/") + ".*$")
+                );
         assertBsonEquals(expected, actual);
     }
 
@@ -358,15 +342,11 @@ public class PipelinedMongoDownloadTaskTest {
         // Path filter but no exclude filter
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/parent"), List.of()),
-                null,
-                true
+                null
         );
-        var expected = Filters.or(
-                Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/parent/") + ".*$")),
-                Filters.and(
-                        Filters.in(NodeDocument.PATH, Pattern.compile("^" + 
Pattern.quote("/parent/") + ".*$")),
-                        Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN)
-                )
+        var expected = Filters.in(NodeDocument.ID,
+                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + 
".*$"),
+                LONG_PATH_ID_PATTERN
         );
         assertBsonEquals(expected, actual);
     }
@@ -375,47 +355,24 @@ public class PipelinedMongoDownloadTaskTest {
     public void computeMongoQueryFilterNoPathFilterWithExcludeFilter() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 MongoFilterPaths.DOWNLOAD_ALL,
-                "^[0-9]{1,3}:/a/b.*$",
-                true
-        );
-        Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
-        assertBsonEquals(
-                Filters.nor(
-                        Filters.regex(NodeDocument.ID, excludePattern),
-                        Filters.and(
-                                Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                Filters.in(NodeDocument.PATH, excludePattern)
-                        )
-                ),
-                actual
+                "^[0-9]{1,3}:/a/b.*$"
         );
+        Bson expectedFilter = Filters.nin(NodeDocument.ID, 
Pattern.compile("^[0-9]{1,3}:/a/b.*$"));
+        assertBsonEquals(expectedFilter, actual);
     }
 
     @Test
     public void computeMongoQueryFilterWithPathFilterWithExcludeFilter() {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/parent"), List.of()),
-                "^[0-9]{1,3}:/a/b.*$",
-                true
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludesPattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
         var expected =
                 Filters.and(
-                        Filters.or(
-                                Filters.in(NodeDocument.ID, 
Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")),
-                                Filters.and(
-                                        Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                        Filters.in(NodeDocument.PATH, 
Pattern.compile("^" + Pattern.quote("/parent/") + ".*$"))
-                                )
-                        ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, 
excludesPattern),
-                                Filters.and(
-                                        Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                        Filters.in(NodeDocument.PATH, 
excludesPattern)
-                                )
-                        )
+                        Filters.in(NodeDocument.ID, 
Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$"), 
LONG_PATH_ID_PATTERN),
+                        Filters.nin(NodeDocument.ID, excludesPattern)
                 );
         assertBsonEquals(expected, actual);
     }
@@ -424,21 +381,17 @@ public class PipelinedMongoDownloadTaskTest {
     public void 
computeMongoQueryFilterWithPathFilterWithExcludeFilterAndNaturalOrderTraversal()
 {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/parent"), List.of()),
-                "^[0-9]{1,3}:/a/b.*$",
-                false
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
         var expected =
                 Filters.and(
-                        Filters.or(
-                                Filters.in(NodeDocument.ID, 
Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")),
-                                Filters.in(NodeDocument.PATH, 
Pattern.compile("^" + Pattern.quote("/parent/") + ".*$"))
+                        Filters.in(NodeDocument.ID,
+                                Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/parent/") + ".*$"),
+                                LONG_PATH_ID_PATTERN
                         ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludePattern),
-                                Filters.in(NodeDocument.PATH, excludePattern)
-                        )
+                        Filters.nin(NodeDocument.ID, excludePattern)
                 );
         assertBsonEquals(expected, actual);
     }
@@ -447,24 +400,12 @@ public class PipelinedMongoDownloadTaskTest {
     public void 
computeMongoQueryFilterWithPathFilterWithExcludeFilterAndNaturalColumnTraversal()
 {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/"), List.of("/excluded")),
-                "^[0-9]{1,3}:/a/b.*$",
-                false
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
         var expected =
-                Filters.and(
-                        Filters.nor(
-                                Filters.or(
-                                        Filters.in(NodeDocument.ID, 
Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + ".*$")),
-                                        Filters.in(NodeDocument.PATH, 
Pattern.compile("^" + Pattern.quote("/excluded/") + ".*$"))
-                                )
-                        ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludePattern),
-                                Filters.in(NodeDocument.PATH, excludePattern)
-                        )
-                );
+                Filters.nin(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + 
Pattern.quote("/excluded/") + ".*$"), excludePattern);
         assertBsonEquals(expected, actual);
     }
 
@@ -472,30 +413,14 @@ public class PipelinedMongoDownloadTaskTest {
     public void 
computeMongoQueryFilterWithPathFilterWithExcludeFilterAndNaturalIndexTraversal()
 {
         var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
                 new MongoFilterPaths(List.of("/"), List.of("/excluded")),
-                "^[0-9]{1,3}:/a/b.*$",
-                true
+                "^[0-9]{1,3}:/a/b.*$"
         );
 
         Pattern excludePattern = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
-        var expected =
-                Filters.and(
-                        Filters.nor(
-                                Filters.or(
-                                        Filters.in(NodeDocument.ID, 
Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + ".*$")),
-                                        Filters.and(
-                                                Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                                Filters.in(NodeDocument.PATH, 
Pattern.compile("^" + Pattern.quote("/excluded/") + ".*$"))
-                                        )
-                                )
-                        ),
-                        Filters.nor(
-                                Filters.regex(NodeDocument.ID, excludePattern),
-                                Filters.and(
-                                        Filters.regex(NodeDocument.ID, 
PipelinedMongoDownloadTask.LONG_PATH_ID_PATTERN),
-                                        Filters.in(NodeDocument.PATH, 
excludePattern)
-                                )
-                        )
-                );
+        var expected = Filters.nin(NodeDocument.ID,
+                Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/excluded/") + 
".*$"),
+                excludePattern
+        );
         assertBsonEquals(expected, actual);
     }

(jackrabbit-oak) branch trunk updated: OAK-10671- [Indexing Job] Improve Mongo regex query: remove condition on non-indexed _path field to speed-up traversal (#1331)

Reply via email to