This is an automated email from the ASF dual-hosted git repository.

daim pushed a commit to branch DetailedGC/OAK-10199
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git

commit 7c6066a974131330d65dfd71789c5d6c2a4aa2a1
Author: Nuno Santos <[email protected]>
AuthorDate: Mon Jan 8 14:30:24 2024 +0100

    OAK-10589 - Improve regex path filtering to also handle cases where 
excludedPaths are defined (#1258)
---
 .../pipelined/MongoRegexPathFilterFactory.java     | 149 +++++++++++++++++
 .../pipelined/PipelinedMongoDownloadTask.java      | 186 +++++++++++----------
 .../document/flatfile/pipelined/PipelinedIT.java   | 119 ++++++++++++-
 .../pipelined/PipelinedMongoDownloadTaskTest.java  | 145 +++++++++++++---
 4 files changed, 490 insertions(+), 109 deletions(-)

diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java
new file mode 100644
index 0000000000..299c7b19c8
--- /dev/null
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined;
+
+import org.apache.jackrabbit.oak.commons.PathUtils;
+import org.apache.jackrabbit.oak.spi.filter.PathFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.TreeSet;
+
+public class MongoRegexPathFilterFactory {
+
+    public static class MongoFilterPaths {
+        // Special value that means "download everything". This is used when 
regex path filtering is disabled or when
+        // the path filters would require downloading everything. When this is 
returned, it is preferable to do not
+        // use any filter in the Mongo query, even though using the values in 
this object as filters would also result
+        // in downloading everything
+        public static final MongoFilterPaths DOWNLOAD_ALL = new 
MongoFilterPaths(List.of("/"), List.of());
+
+        final List<String> included;
+        final List<String> excluded;
+
+        public MongoFilterPaths(List<String> included, List<String> excluded) {
+            this.included = included;
+            this.excluded = excluded;
+        }
+
+        @Override
+        public String toString() {
+            return "MongoFilterPaths{" +
+                    "included=" + included +
+                    ", excluded=" + excluded +
+                    '}';
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            MongoFilterPaths that = (MongoFilterPaths) o;
+            return Objects.equals(included, that.included) && 
Objects.equals(excluded, that.excluded);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(included, excluded);
+        }
+    }
+
+    private final static Logger LOG = 
LoggerFactory.getLogger(MongoRegexPathFilterFactory.class);
+
+    private final int maxNumberOfPaths;
+
+    public MongoRegexPathFilterFactory(int maxNumberOfPaths) {
+        this.maxNumberOfPaths = maxNumberOfPaths;
+    }
+
+    public MongoFilterPaths buildMongoFilter(List<PathFilter> pathFilters) {
+        if (pathFilters == null || pathFilters.isEmpty()) {
+            return MongoFilterPaths.DOWNLOAD_ALL;
+        }
+        // Merge and deduplicate the included and excluded paths of all 
filters.
+        // The included paths will also be further de-deuplicated, by removing 
paths that are children of other path in
+        // the list, that is, by keeping only the top level paths. To make it 
easy to de-duplicate, we sort the paths
+        // alphabetically.
+        // The list of excluded paths does not need to be sorted, but we do it 
anyway for consistency when logging the
+        // list of paths.
+        TreeSet<String> sortedCandidateIncludedPaths = new TreeSet<>();
+        TreeSet<String> candidateExcludedPaths = new TreeSet<>();
+        for (PathFilter pathFilter : pathFilters) {
+            sortedCandidateIncludedPaths.addAll(pathFilter.getIncludedPaths());
+            candidateExcludedPaths.addAll(pathFilter.getExcludedPaths());
+        }
+        // Keep only unique included paths. That is, if paths "/a/b" and 
"/a/b/c" are both in the list, keep only "/a/b"
+        List<String> finalIncludedPathsRoots = new ArrayList<>();
+        LOG.debug("Candidate included paths: {}, candidate excluded paths: 
{}", sortedCandidateIncludedPaths, candidateExcludedPaths);
+        // The quadratic algorithm below could easily be replaced by linear 
time algorithm, but it's not worth the added
+        // complexity because the list of paths should never be more than a 
few tens
+        // Idea for linear time algorithm: after adding a path to the 
includedPathsRoots, advance in the list of candidates
+        // skipping all paths that are children of the path just added. Since 
the list is sorted alphabetically, the children
+        // of a path will be next to it in the list
+        for (String candidateIncludedPath : sortedCandidateIncludedPaths) {
+            if (finalIncludedPathsRoots.stream().noneMatch(ancestor -> 
PathUtils.isAncestor(ancestor, candidateIncludedPath))) {
+                finalIncludedPathsRoots.add(candidateIncludedPath);
+            }
+        }
+
+        List<String> finalExcludedPaths = new ArrayList<>();
+        // Keep an excluded path only if it is not needed by any filter.
+        for (String candidateExcludedPath : candidateExcludedPaths) {
+            boolean notNeeded = pathFilters.stream().allMatch(pathFilter -> 
pathFilter.filter(candidateExcludedPath) == PathFilter.Result.EXCLUDE);
+            if (notNeeded) {
+                finalExcludedPaths.add(candidateExcludedPath);
+            }
+        }
+
+        // Do not filter if there are too many includedPaths. The official 
Mongo documentation recommends
+        // not using more than tens of values in the $in filter to avoid 
performance degradation.
+        // https://www.mongodb.com/docs/manual/reference/operator/query/in/
+        if (finalIncludedPathsRoots.size() > maxNumberOfPaths) {
+            LOG.info("Number of includedPaths ({}) exceed the maximum allowed 
({}), downloading everything",
+                    finalIncludedPathsRoots.size(), maxNumberOfPaths);
+            return MongoFilterPaths.DOWNLOAD_ALL;
+        }
+
+        // Also check the number of excluded paths and disable filtering by 
them if there are too many.
+        // We do not need to revert to download the full repository in the 
case where there is a reasonable number of
+        // included paths but too many excluded paths. The excluded paths are 
always children of the included paths, so
+        // if we do not filter on excludedPaths, we simply download everything 
inside the included paths. This might be
+        // much less than the full repository.
+        if (finalExcludedPaths.size() > maxNumberOfPaths) {
+            LOG.info("Number of excludedPaths ({}) exceed the maximum allowed 
({}), disabling filtering of excludedPaths",
+                    finalExcludedPaths.size(), maxNumberOfPaths);
+            finalExcludedPaths = List.of();
+        }
+
+        if (finalExcludedPaths.isEmpty()) {
+            // Only included paths
+            if (finalIncludedPathsRoots.contains("/")) {
+                return MongoFilterPaths.DOWNLOAD_ALL;
+            } else {
+                return new MongoFilterPaths(finalIncludedPathsRoots, 
List.of());
+            }
+        } else {
+            return new MongoFilterPaths(finalIncludedPathsRoots, 
finalExcludedPaths);
+        }
+    }
+
+}
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
index a490d5a7a9..e0f8adf497 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
@@ -32,6 +32,7 @@ import org.apache.jackrabbit.guava.common.base.Preconditions;
 import org.apache.jackrabbit.guava.common.base.Stopwatch;
 import org.apache.jackrabbit.oak.commons.IOUtils;
 import org.apache.jackrabbit.oak.commons.PathUtils;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths;
 import org.apache.jackrabbit.oak.plugins.document.Collection;
 import org.apache.jackrabbit.oak.plugins.document.NodeDocument;
 import org.apache.jackrabbit.oak.plugins.document.mongo.MongoDocumentStore;
@@ -51,22 +52,20 @@ import org.slf4j.LoggerFactory;
 import java.time.Duration;
 import java.time.Instant;
 import java.time.temporal.ChronoUnit;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.Set;
+import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Callable;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.TimeoutException;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 
-import static com.mongodb.client.model.Filters.regex;
 import static com.mongodb.client.model.Sorts.ascending;
 
 public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownloadTask.Result> {
@@ -101,12 +100,22 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
      * filtering in the indexing job. This feature may significantly reduce 
the number of documents downloaded from
      * Mongo.
      * The performance gains may not be proportional to the reduction in the 
number of documents downloaded because Mongo
-     * still has to traverse all the documents. This is the case because the 
regex expression used for path filtering
+     * still has to traverse all the documents. This is required because the 
regex expression used for path filtering
      * starts with a wildcard (because the _id starts with the depth of the 
path, so the regex expression must ignore
      * this part). Because of the wildcard at the start, Mongo cannot use of 
the index on _id.
      */
     public static final String 
OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = 
"oak.indexer.pipelined.mongoRegexPathFiltering";
     public static final boolean 
DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = false;
+
+    /**
+     * Maximum number of elements in the included/excluded paths list used for 
regex path filtering. If after
+     * merging and de-deduplication of the paths of all the path filters the 
number of included or excluded paths exceeds
+     * this value, then disable path filtering to avoid creating Mongo queries 
with large number of filters
+     */
+    public static final String 
OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS = 
"oak.indexer.pipelined.mongoRegexPathFilteringMaxPaths";
+    public static final int 
DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS = 20;
+
+
     // Use a short initial retry interval. In most cases if the connection to 
a replica fails, there will be other
     // replicas available so a reconnection attempt will succeed immediately.
     private final static long retryInitialIntervalMillis = 100;
@@ -132,6 +141,7 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
     private final Stopwatch downloadStartWatch = Stopwatch.createUnstarted();
     private final int maxBatchSizeBytes;
     private final StatisticsProvider statisticsProvider;
+    private final MongoRegexPathFilterFactory regexPathFilterFactory;
 
     private long totalEnqueueWaitTimeMillis = 0;
     private Instant lastDelayedEnqueueWarningMessageLoggedTimestamp = 
Instant.now();
@@ -173,6 +183,10 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         this.regexPathFiltering = ConfigHelper.getSystemPropertyAsBoolean(
                 OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING,
                 DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING);
+        int regexPathFilteringMaxNumberOfPaths = 
ConfigHelper.getSystemPropertyAsInt(
+                OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS,
+                
DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS);
+        this.regexPathFilterFactory = new 
MongoRegexPathFilterFactory(regexPathFilteringMaxNumberOfPaths);
 
         //TODO This may lead to reads being routed to secondary depending on 
MongoURI
         //So caller must ensure that its safe to read from secondary
@@ -245,18 +259,20 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         // If regex filtering is enabled, start by downloading the ancestors 
of the path used for filtering.
         // That is, download "/", "/content", "/content/dam" for a base path 
of "/content/dam". These nodes will not be
         // matched by the regex used in the Mongo query, which assumes a 
prefix of "???:/content/dam"
-        Set<String> regexBasePaths = getPathsForRegexFiltering();
+        MongoFilterPaths mongoFilterPathsDefinition = 
getPathsForRegexFiltering();
         Bson childrenFilter;
-        if (regexBasePaths.isEmpty()) {
+        if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) {
+            LOG.info("Downloading full repository");
             childrenFilter = null;
         } else {
             // Regex path filtering is enabled
             // Download the ancestors in a separate query. No retrials done on 
this query, as it will take only a few
             // seconds and is done at the start of the job, so if it fails, 
the job can be retried without losing much work
-            downloadAncestors(regexBasePaths);
+            downloadAncestors(mongoFilterPathsDefinition.included);
 
             // Filter to apply to the main query
-            childrenFilter = descendantsFilter(regexBasePaths);
+            childrenFilter = descendantsFilter(mongoFilterPathsDefinition);
+            LOG.info("Downloading from Mongo using filter: {}", 
childrenFilter);
         }
 
         Instant failuresStartTimestamp = null; // When the last series of 
failures started
@@ -325,7 +341,10 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         download(mongoIterable);
     }
 
-    private void downloadAncestors(Set<String> basePath) throws 
InterruptedException, TimeoutException {
+    private void downloadAncestors(List<String> basePath) throws 
InterruptedException, TimeoutException {
+        if (basePath.size() == 1 && basePath.get(0).equals("/")) {
+            return; // No need to download ancestors of root, the root will be 
downloaded as part of the normal traversal
+        }
         Bson ancestorQuery = ancestorsFilter(basePath);
         LOG.info("Downloading ancestors of: {}, Query: {}.", basePath, 
ancestorQuery);
         FindIterable<NodeDocument> ancestorsIterable = dbCollection
@@ -339,8 +358,8 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
     private void downloadWithNaturalOrdering() throws InterruptedException, 
TimeoutException {
         // We are downloading potentially a large fraction of the repository, 
so using an index scan will be
         // inefficient. So we pass the natural hint to force MongoDB to use 
natural ordering, that is, column scan
-        Set<String> regexBasePath = getPathsForRegexFiltering();
-        if (regexBasePath.isEmpty()) {
+        MongoFilterPaths regexBasePath = getPathsForRegexFiltering();
+        if (regexBasePath == MongoFilterPaths.DOWNLOAD_ALL) {
             LOG.info("Downloading full repository using natural order");
             FindIterable<NodeDocument> mongoIterable = dbCollection
                     .withReadPreference(readPreference)
@@ -349,95 +368,88 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
             download(mongoIterable);
 
         } else {
-            downloadAncestors(regexBasePath);
-
-            Bson childrenQuery = descendantsFilter(regexBasePath);
-            LOG.info("Downloading using regex path filtering. Downloading 
children: {}.", childrenQuery);
-            FindIterable<NodeDocument> childrenIterable = dbCollection
-                    .withReadPreference(readPreference)
-                    .find(childrenQuery)
-                    .hint(NATURAL_HINT);
-            download(childrenIterable);
+            downloadAncestors(regexBasePath.included);
+
+            Bson childrenFilter = descendantsFilter(regexBasePath);
+            FindIterable<NodeDocument> findIterable;
+            if (childrenFilter == null) {
+                LOG.info("Downloading full repository using natural order");
+                findIterable = dbCollection
+                        .withReadPreference(readPreference)
+                        .find();
+            } else {
+                LOG.info("Downloading from Mongo using filter: {}", 
childrenFilter);
+                findIterable = dbCollection
+                        .withReadPreference(readPreference)
+                        .find(childrenFilter);
+            }
+            download(findIterable.hint(NATURAL_HINT));
         }
     }
 
-    private Set<String> getPathsForRegexFiltering() {
+    private MongoFilterPaths getPathsForRegexFiltering() {
         if (!regexPathFiltering) {
             LOG.info("Regex path filtering disabled.");
-            return Set.of();
+            return MongoFilterPaths.DOWNLOAD_ALL;
         }
-        return extractIncludedPaths(pathFilters);
+        LOG.info("Computing included/excluded paths for Mongo regex path 
filtering. PathFilters: {}", pathFilters.stream()
+                .map(pf -> "PF{includedPaths=" + pf.getIncludedPaths() + ", 
excludedPaths=" + pf.getExcludedPaths() + "}")
+                .collect(Collectors.joining(", ")));
+        MongoFilterPaths mongoFilterPaths = 
this.regexPathFilterFactory.buildMongoFilter(pathFilters);
+        LOG.info("Paths used for regex filtering on Mongo: {}", 
mongoFilterPaths);
+        return mongoFilterPaths;
     }
 
-    /**
-     * Aggregates the included paths from the path filters. The final list 
will not contain duplicates or overlapping
-     * paths (i.e., /a and /a/b).
-     *
-     * @param pathFilters Empty set if path filtering should be disabled, 
otherwise the paths that should be included
-     *                    in the Mongo query filters
-     */
-    // Package private for testing
-    static Set<String> extractIncludedPaths(List<PathFilter> pathFilters) {
-        // Path filtering is enabled only if there are no excludedPaths.
-        if (pathFilters == null) {
-            return Set.of();
+    private Bson descendantsFilter(MongoFilterPaths 
mongoFilterPathsDefinition) {
+        if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) {
+            return null;
         }
-        Set<String> includedPaths = new HashSet<>();
-        Set<String> excludedPaths = new HashSet<>();
-        for (PathFilter pathFilter : pathFilters) {
-            includedPaths.addAll(pathFilter.getIncludedPaths());
-            excludedPaths.addAll(pathFilter.getExcludedPaths());
+        Bson pathFilter = 
descendantsFilter(mongoFilterPathsDefinition.included);
+        if (mongoFilterPathsDefinition.excluded.isEmpty()) {
+            return pathFilter;
+        } else {
+            // The Mongo filter returned here will download the top level path 
of each excluded subtree, which in theory
+            // should be excluded. That is, if the tree /a/b/c is excluded, 
the filter will download /a/b/c but none of
+            // its descendants.
+            // This is done because excluding also the top level path would 
add extra complexity to the filter and
+            // would not have any measurable impact on performance because it 
only downloads a few extra documents, one
+            // for each excluded subtree. The transform stage will anyway 
filter out these paths.
+            Bson excludedFilter = 
descendantsFilter(mongoFilterPathsDefinition.excluded);
+            return Filters.and(pathFilter, Filters.nor(excludedFilter));
         }
-        // Sort by natural order, so that parent paths appear before any 
children. This makes it easier to compute the
-        // common ancestors of included paths
-        List<String> sortedIncludedPaths = includedPaths.stream()
-                .sorted()
-                .collect(Collectors.toList());
+    }
 
-        LOG.info("Paths considered for regex filtering. IncludedPaths: {}, 
excludedPaths: {}", sortedIncludedPaths, excludedPaths);
-        if (sortedIncludedPaths.contains("/")) {
-            LOG.info("Disabling regex path filtering because root path is in 
the includedPaths: {}", sortedIncludedPaths);
-            return Set.of();
-        }
-        if (!excludedPaths.isEmpty()) {
-            LOG.info("Disabling regex path filtering because there are 
excluded paths: {}", excludedPaths);
-            return Set.of();
+    private static Bson descendantsFilter(List<String> paths) {
+        if (paths.isEmpty()) {
+            return null;
         }
-
-        // Keep only unique included paths. That is, if paths "/a/b" and 
"/a/b/c" are both in the list, keep only "/a/b"
-        HashSet<String> includedPathsRoots = new HashSet<>();
-        for (String path : sortedIncludedPaths) {
-            if (includedPathsRoots.stream().noneMatch(ancestor -> 
PathUtils.isAncestor(ancestor, path))) {
-                includedPathsRoots.add(path);
+        // The filter for descendants of a list of paths is a series of or 
conditions. For each path, we have to build
+        // two conditions in two different fields of the documents:
+        // _ _id   - for non-long paths - In this case, the _id is of the form 
"2:/foo/bar"
+        // _ _path - for long paths - In this case, the _id is n hash and the 
document contains an additional _path
+        //      field with the path of the document.
+        // We use the $in operator with a regular expression to match the 
paths.
+        //  
https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression
+
+        ArrayList<Pattern> pathPatterns = new ArrayList<>();
+        ArrayList<Pattern> idPatterns = new ArrayList<>();
+
+        for (String path : paths) {
+            if (!path.endsWith("/")) {
+                path = path + "/";
             }
+            String quotedPath = Pattern.quote(path);
+            idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + 
".*$"));
+            pathPatterns.add(Pattern.compile(quotedPath + ".*$"));
         }
-        return includedPathsRoots;
-    }
-
-    private static Bson descendantsFilter(Set<String> path) {
-        List<Bson> filters = path.stream()
-                .flatMap(PipelinedMongoDownloadTask::descendantsFilter)
-                .collect(Collectors.toList());
-        return Filters.or(filters);
-    }
-
-    private static Stream<Bson> descendantsFilter(String path) {
-        if (!path.endsWith("/")) {
-            path = path + "/";
-        }
-        String quotedPath = Pattern.quote(path);
-        // For entries with path sizes above a certain threshold, the _id 
field contains a hash instead of the path of
-        // the entry. The path is stored instead in the _path field. 
Therefore, we have to include in the filter also
-        // the documents with matching _path.
-        return Stream.of(
-                regex(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + 
quotedPath + ".*$")),
-                regex(NodeDocument.PATH, Pattern.compile(quotedPath + ".*$")
-                ));
+        return Filters.or(
+                Filters.in(NodeDocument.ID, idPatterns),
+                Filters.in(NodeDocument.PATH, pathPatterns)
+        );
     }
 
-
-    static Set<String> getAncestors(Set<String> paths) {
-        Set<String> ancestors = new HashSet<>();
+    static List<String> getAncestors(List<String> paths) {
+        TreeSet<String> ancestors = new TreeSet<>();
         for (String child : paths) {
             String parent = child;
             while (true) {
@@ -448,11 +460,11 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
                 parent = PathUtils.getParentPath(parent);
             }
         }
-        return ancestors;
+        return new ArrayList<>(ancestors);
     }
 
 
-    static Bson ancestorsFilter(Set<String> paths) {
+    static Bson ancestorsFilter(List<String> paths) {
         List<String> parentFilters = getAncestors(paths).stream()
                 .map(Utils::getIdFromPath)
                 .collect(Collectors.toList());
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
index 122342f5d2..12d47ab47b 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
@@ -164,6 +164,103 @@ public class PipelinedIT {
         testSuccessfulDownload(pathPredicate, pathFilters);
     }
 
+    @Test
+    public void createFFS_mongoFiltering_include_excludes() throws Exception {
+        System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, 
"false");
+        System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, 
"true");
+
+        Predicate<String> pathPredicate = s -> true;
+        List<PathFilter> pathFilters = List.of(new 
PathFilter(List.of("/content/dam/2023"), List.of("/content/dam/2023/02")));
+
+        testSuccessfulDownload(pathPredicate, pathFilters,List.of(
+                "/|{}",
+                "/content|{}",
+                "/content/dam|{}",
+                "/content/dam/2023|{\"p2\":\"v2023\"}",
+                "/content/dam/2023/01|{\"p1\":\"v202301\"}",
+                "/content/dam/2023/02|{}"
+        ));
+    }
+
+    @Test
+    public void createFFS_mongoFiltering_include_excludes2() throws Exception {
+        System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, 
"false");
+        System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, 
"true");
+
+        Predicate<String> pathPredicate = s -> true;
+
+        // NOTE: If a path /a/b is in the excluded paths, the descendants of 
/a/b will not be downloaded but /a/b will
+        // be downloaded. This is an intentional limitation of the logic to 
compute the Mongo filter which was done
+        // to avoid the extra complexity of also filtering the root of the 
excluded tree. The transform stage would anyway
+        // filter out these additional documents.
+        List<PathFilter> pathFilters = List.of(new 
PathFilter(List.of("/content/dam/1000", "/content/dam/2022"), 
List.of("/content/dam/2022/02", "/content/dam/2022/04")));
+
+        testSuccessfulDownload(pathPredicate, pathFilters,List.of(
+        "/|{}",
+        "/content|{}",
+        "/content/dam|{}",
+        "/content/dam/1000|{}",
+        "/content/dam/1000/12|{\"p1\":\"v100012\"}",
+        "/content/dam/2022|{}",
+        "/content/dam/2022/01|{\"p1\":\"v202201\"}",
+        "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}",
+        "/content/dam/2022/02|{\"p1\":\"v202202\"}",
+        "/content/dam/2022/03|{\"p1\":\"v202203\"}",
+        "/content/dam/2022/04|{\"p1\":\"v202204\"}"
+        ));
+    }
+
+
+    @Test
+    public void createFFS_mongoFiltering_include_excludes3() throws Exception {
+        System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, 
"false");
+        System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, 
"true");
+
+        Predicate<String> pathPredicate = s -> true;
+
+        List<PathFilter> pathFilters = List.of(new PathFilter(List.of("/"), 
List.of("/content/dam", "/etc", "/home", "/jcr:system")));
+
+        testSuccessfulDownload(pathPredicate, pathFilters, List.of(
+                "/|{}",
+                "/content|{}",
+                "/content/dam|{}",
+                "/etc|{}",
+                "/home|{}",
+                "/jcr:system|{}"
+        ));
+    }
+
+    @Test
+    public void createFFS_mongoFiltering_include_excludes4() throws Exception {
+        System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, 
"false");
+        System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, 
"true");
+
+        Predicate<String> pathPredicate = s -> true;
+
+        List<PathFilter> pathFilters = List.of(
+                new PathFilter(List.of("/content/dam/1000"), List.of()),
+                new PathFilter(List.of("/content/dam/2022"), 
List.of("/content/dam/2022/01"))
+        );
+
+        testSuccessfulDownload(pathPredicate, pathFilters, List.of(
+                "/|{}",
+                "/content|{}",
+                "/content/dam|{}",
+                "/content/dam/1000|{}",
+                "/content/dam/1000/12|{\"p1\":\"v100012\"}",
+                "/content/dam/2022|{}",
+                "/content/dam/2022/01|{\"p1\":\"v202201\"}",
+                "/content/dam/2022/02|{\"p1\":\"v202202\"}",
+                "/content/dam/2022/02/01|{\"p1\":\"v20220201\"}",
+                "/content/dam/2022/02/02|{\"p1\":\"v20220202\"}",
+                "/content/dam/2022/02/03|{\"p1\":\"v20220203\"}",
+                "/content/dam/2022/02/04|{\"p1\":\"v20220204\"}",
+                "/content/dam/2022/03|{\"p1\":\"v202203\"}",
+                "/content/dam/2022/04|{\"p1\":\"v202204\"}"
+
+        ));
+    }
+
     @Test
     public void createFFS_mongoFiltering_multipleIndexes() throws Exception {
         System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, 
"true");
@@ -418,11 +515,19 @@ public class PipelinedIT {
     private void createContent(NodeStore rwNodeStore) throws 
CommitFailedException {
         @NotNull NodeBuilder rootBuilder = rwNodeStore.getRoot().builder();
         @NotNull NodeBuilder contentDamBuilder = 
rootBuilder.child("content").child("dam");
-        contentDamBuilder.child("2023").child("01").setProperty("p1", 
"v202301");
-        contentDamBuilder.child("2022").child("02").setProperty("p1", 
"v202202");
-        contentDamBuilder.child("2023").child("01").setProperty("p1", 
"v202301");
         contentDamBuilder.child("1000").child("12").setProperty("p1", 
"v100012");
+        contentDamBuilder.child("2022").child("01").setProperty("p1", 
"v202201");
+        
contentDamBuilder.child("2022").child("01").child("01").setProperty("p1", 
"v20220101");
+        contentDamBuilder.child("2022").child("02").setProperty("p1", 
"v202202");
+        
contentDamBuilder.child("2022").child("02").child("01").setProperty("p1", 
"v20220201");
+        
contentDamBuilder.child("2022").child("02").child("02").setProperty("p1", 
"v20220202");
+        
contentDamBuilder.child("2022").child("02").child("03").setProperty("p1", 
"v20220203");
+        
contentDamBuilder.child("2022").child("02").child("04").setProperty("p1", 
"v20220204");
+        contentDamBuilder.child("2022").child("03").setProperty("p1", 
"v202203");
+        contentDamBuilder.child("2022").child("04").setProperty("p1", 
"v202204");
         contentDamBuilder.child("2023").setProperty("p2", "v2023");
+        contentDamBuilder.child("2023").child("01").setProperty("p1", 
"v202301");
+        contentDamBuilder.child("2023").child("01").setProperty("p1", 
"v202301");
         
contentDamBuilder.child("2023").child("02").child("28").setProperty("p1", 
"v20230228");
 
         // Node with very long name
@@ -451,7 +556,15 @@ public class PipelinedIT {
             "/content/dam/1000|{}",
             "/content/dam/1000/12|{\"p1\":\"v100012\"}",
             "/content/dam/2022|{}",
+            "/content/dam/2022/01|{\"p1\":\"v202201\"}",
+            "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}",
             "/content/dam/2022/02|{\"p1\":\"v202202\"}",
+            "/content/dam/2022/02/01|{\"p1\":\"v20220201\"}",
+            "/content/dam/2022/02/02|{\"p1\":\"v20220202\"}",
+            "/content/dam/2022/02/03|{\"p1\":\"v20220203\"}",
+            "/content/dam/2022/02/04|{\"p1\":\"v20220204\"}",
+            "/content/dam/2022/03|{\"p1\":\"v202203\"}",
+            "/content/dam/2022/04|{\"p1\":\"v202204\"}",
             "/content/dam/2023|{\"p2\":\"v2023\"}",
             "/content/dam/2023/01|{\"p1\":\"v202301\"}",
             "/content/dam/2023/02|{}",
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
index ae8f2ec5db..c1f05fe26b 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
@@ -24,6 +24,7 @@ import com.mongodb.client.FindIterable;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoCursor;
 import com.mongodb.client.MongoDatabase;
+import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths;
 import org.apache.jackrabbit.oak.plugins.document.Collection;
 import org.apache.jackrabbit.oak.plugins.document.DocumentStore;
 import org.apache.jackrabbit.oak.plugins.document.NodeDocument;
@@ -32,6 +33,7 @@ import 
org.apache.jackrabbit.oak.plugins.metric.MetricStatisticsProvider;
 import org.apache.jackrabbit.oak.spi.filter.PathFilter;
 import org.bson.BsonDocument;
 import org.bson.conversions.Bson;
+import org.junit.Before;
 import org.junit.Test;
 
 import java.util.ArrayList;
@@ -43,6 +45,7 @@ import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 
 import static org.junit.Assert.assertEquals;
 import static org.mockito.ArgumentMatchers.any;
@@ -53,6 +56,8 @@ import static org.mockito.Mockito.when;
 
 public class PipelinedMongoDownloadTaskTest {
 
+    private MongoRegexPathFilterFactory regexFilterBuilder;
+
     private NodeDocument newBasicDBObject(String id, long modified, 
DocumentStore docStore) {
         NodeDocument obj = Collection.NODES.newDocument(docStore);
         obj.put(NodeDocument.ID, "3:/content/dam/asset" + id);
@@ -61,6 +66,11 @@ public class PipelinedMongoDownloadTaskTest {
         return obj;
     }
 
+    @Before
+    public void setUp() {
+        this.regexFilterBuilder = new 
MongoRegexPathFilterFactory(PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS);
+    }
+
     @Test
     public void connectionToMongoFailure() throws Exception {
         @SuppressWarnings("unchecked")
@@ -142,42 +152,139 @@ public class PipelinedMongoDownloadTaskTest {
 
     @Test
     public void ancestorsFilters() {
-        assertEquals(Set.of(), 
PipelinedMongoDownloadTask.getAncestors(Set.of()));
-        assertEquals(Set.of("/"), 
PipelinedMongoDownloadTask.getAncestors(Set.of("/")));
-        assertEquals(Set.of("/", "/a"), 
PipelinedMongoDownloadTask.getAncestors(Set.of("/a")));
-        assertEquals(Set.of("/", "/a", "/b", "/c", "/c/c1", "/c/c1/c2", 
"/c/c1/c2/c3", "/c/c1/c2/c4"),
-                PipelinedMongoDownloadTask.getAncestors(Set.of("/a", "/b", 
"/c/c1/c2/c3", "/c/c1/c2/c4"))
+        assertEquals(List.of(), 
PipelinedMongoDownloadTask.getAncestors(List.of()));
+        assertEquals(List.of("/"), 
PipelinedMongoDownloadTask.getAncestors(List.of("/")));
+        assertEquals(List.of("/", "/a"), 
PipelinedMongoDownloadTask.getAncestors(List.of("/a")));
+        assertEquals(List.of("/", "/a", "/b", "/c", "/c/c1", "/c/c1/c2", 
"/c/c1/c2/c3", "/c/c1/c2/c4"),
+                PipelinedMongoDownloadTask.getAncestors(List.of("/a", "/b", 
"/c/c1/c2/c3", "/c/c1/c2/c4"))
         );
     }
 
     @Test
-    public void regexFilters() {
-        assertEquals(Set.of(), 
PipelinedMongoDownloadTask.extractIncludedPaths(null));
+    public void regexFiltersIncludedPathsOnly() {
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(null));
 
-        assertEquals(Set.of(), 
PipelinedMongoDownloadTask.extractIncludedPaths(List.of()));
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(List.of()));
 
         List<PathFilter> singlePathFilter = List.of(
                 new PathFilter(List.of("/content/dam"), List.of())
         );
-        assertEquals(Set.of("/content/dam"), 
PipelinedMongoDownloadTask.extractIncludedPaths(singlePathFilter));
+        assertEquals(new MongoFilterPaths(List.of("/content/dam"), List.of()), 
regexFilterBuilder.buildMongoFilter(singlePathFilter));
 
         List<PathFilter> multipleIncludeFilters = 
createIncludedPathFilters("/content/dam", "/content/dam");
-        assertEquals(Set.of("/content/dam"), 
PipelinedMongoDownloadTask.extractIncludedPaths(multipleIncludeFilters));
+        assertEquals(new MongoFilterPaths(List.of("/content/dam"), List.of()), 
regexFilterBuilder.buildMongoFilter(multipleIncludeFilters));
 
-        List<PathFilter> includesRoot = List.of(
-                new PathFilter(List.of("/"), List.of())
-        );
-        assertEquals(Set.of(), 
PipelinedMongoDownloadTask.extractIncludedPaths(includesRoot));
+        List<PathFilter> includesRoot = createIncludedPathFilters("/", 
"/a/a1/a2");
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(includesRoot));
 
         List<PathFilter> multipleIncludeFiltersDifferent = 
createIncludedPathFilters("/a/a1", "/a/a1/a2");
-        assertEquals(Set.of("/a/a1"), 
PipelinedMongoDownloadTask.extractIncludedPaths(multipleIncludeFiltersDifferent));
+        assertEquals(new MongoFilterPaths(List.of("/a/a1"), List.of()), 
regexFilterBuilder.buildMongoFilter(multipleIncludeFiltersDifferent));
 
         List<PathFilter> multipleIncludeFiltersDifferent2 = 
createIncludedPathFilters("/a/a1/a2", "/a/a1", "/b", "/c", "/cc");
-        assertEquals(Set.of("/a/a1", "/b", "/c", "/cc"), 
PipelinedMongoDownloadTask.extractIncludedPaths(multipleIncludeFiltersDifferent2));
+        assertEquals(new MongoFilterPaths(List.of("/a/a1", "/b", "/c", "/cc"), 
List.of()), 
regexFilterBuilder.buildMongoFilter(multipleIncludeFiltersDifferent2));
+    }
+
+    @Test
+    public void regexFiltersIncludedAndExcludedPaths() {
+        // Excludes is not empty
+
+        // Exclude paths is inside the included path tree
+        List<PathFilter> withExcludeFilter1 = List.of(
+                new PathFilter(List.of("/a"), List.of("/a/b"))
+        );
+        assertEquals(new MongoFilterPaths(List.of("/a"), List.of("/a/b")), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter1));
+
+        // includedPath contains the root
+        List<PathFilter> withExcludeFilter2 = List.of(
+                new PathFilter(List.of("/"), List.of("/a"))
+        );
+        assertEquals(new MongoFilterPaths(List.of("/"), List.of("/a")), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter2));
+
+        // One of the filters excludes a directory that is included by another 
filter. The path should still be included.
+        List<PathFilter> withExcludeFilter3_a = List.of(
+                new PathFilter(List.of("/"), List.of("/a")),
+                new PathFilter(List.of("/"), List.of())
+        );
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(withExcludeFilter3_a));
+
+        List<PathFilter> withExcludeFilter3_b = List.of(
+                new PathFilter(List.of("/a"), List.of("/a/a_excluded")),
+                new PathFilter(List.of("/a"), List.of())
+        );
+        assertEquals(new MongoFilterPaths(List.of("/a"), List.of()), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter3_b));
+
+        List<PathFilter> withExcludeFilter4 = List.of(
+                new PathFilter(List.of("/"), List.of("/exc_a")),
+                new PathFilter(List.of("/"), List.of("/exc_b"))
+        );
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(withExcludeFilter4));
 
-        List<PathFilter> withExcludeFilter = List.of(
-                new PathFilter(List.of("/"), List.of("/var"))
+        List<PathFilter> withExcludeFilter5 = List.of(
+                new PathFilter(List.of("/a"), List.of("/a/a_excluded")),
+                new PathFilter(List.of("/b"), List.of("/b/b_excluded"))
         );
-        assertEquals(Set.of(), 
PipelinedMongoDownloadTask.extractIncludedPaths(withExcludeFilter));
+        assertEquals(new MongoFilterPaths(List.of("/a", "/b"), 
List.of("/a/a_excluded", "/b/b_excluded")), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter5));
+
+        List<PathFilter> withExcludeFilter6 = List.of(
+                new PathFilter(List.of("/"), List.of("/a", "/b", "/c")),
+                new PathFilter(List.of("/"), List.of("/b", "/c", "/d"))
+        );
+        assertEquals(new MongoFilterPaths(List.of("/"), List.of("/b", "/c")), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter6));
+
+        List<PathFilter> withExcludeFilter7 = List.of(
+                new PathFilter(List.of("/a"), List.of("/a/b")),
+                new PathFilter(List.of("/a"), List.of("/a/b/c"))
+        );
+        assertEquals(new MongoFilterPaths(List.of("/"), List.of("/b", "/c")), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter6));
+        assertEquals(new MongoFilterPaths(List.of("/a"), List.of("/a/b/c")), 
regexFilterBuilder.buildMongoFilter(withExcludeFilter7));
+
+        List<PathFilter> withExcludeFilter8 = List.of(
+                new PathFilter(
+                        List.of("/p1", "/p2", "/p3", "/p4", "/p5", "/p6", 
"/p7", "/p8", "/p9"),
+                        List.of("/p10", "/p5/p5_s1", "/p5/p5_s2/p5_s3", "/p11")
+                )
+        );
+        assertEquals(new MongoFilterPaths(
+                        List.of("/p1", "/p2", "/p3", "/p4", "/p5", "/p6", 
"/p7", "/p8", "/p9"),
+                        List.of("/p5/p5_s1", "/p5/p5_s2/p5_s3")),
+                regexFilterBuilder.buildMongoFilter(withExcludeFilter8));
+    }
+
+
+    @Test
+    public void regexFiltersLargeNumberOfIncludedPaths1() {
+        // Test a single path filter with many included paths.
+        var maximumPaths = IntStream.range(0, 
PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS)
+                .mapToObj(i -> "/p" + i)
+                .collect(Collectors.toList());
+        var pathFilter1 = List.of(new PathFilter(maximumPaths, List.of()));
+        assertEquals(new 
MongoFilterPaths(maximumPaths.stream().sorted().collect(Collectors.toList()), 
List.of()),
+                regexFilterBuilder.buildMongoFilter(pathFilter1));
+
+
+        var tooManyPaths = IntStream.range(0, 
PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS
 + 1)
+                .mapToObj(i -> "/p" + i)
+                .collect(Collectors.toList());
+        var pathFilter2 = List.of(new PathFilter(tooManyPaths, List.of()));
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(pathFilter2));
+    }
+
+    @Test
+    public void regexFiltersLargeNumberOfIncludedPaths2() {
+        // Test many path filters each with a single included paths.
+        var pathFilters = IntStream.range(0, 
PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS
 + 1)
+                .mapToObj(i -> new PathFilter(List.of("/p" + i), List.of()))
+                .collect(Collectors.toList());
+        assertEquals(MongoFilterPaths.DOWNLOAD_ALL, 
regexFilterBuilder.buildMongoFilter(pathFilters));
+    }
+
+    @Test
+    public void regexFiltersLargeNumberOfExcludedPaths() {
+        // When there are too many excluded paths, do not filter on excluded 
paths but still filter on included paths.
+        var excludedPaths = IntStream.range(0, 
PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS
 + 1)
+                .mapToObj(i -> "/parent/p" + i)
+                .collect(Collectors.toList());
+        var pathFilter = List.of(new PathFilter(List.of("/parent"), 
excludedPaths));
+        assertEquals(new MongoFilterPaths(List.of("/parent"), List.of()), 
regexFilterBuilder.buildMongoFilter(pathFilter));
     }
 }
\ No newline at end of file


Reply via email to