(jackrabbit-oak) 22/50: OAK-10592 - Add support to specify a custom regex to exclude documents from being downloaded from Mongo during indexing (#1267)

daim Thu, 25 Jan 2024 11:16:30 -0800

This is an automated email from the ASF dual-hosted git repository.

daim pushed a commit to branch DetailedGC/OAK-10199
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


commit 0d23236438099cabde8439088507d68d05f61238
Author: Nuno Santos <[email protected]>
AuthorDate: Thu Jan 18 09:17:09 2024 +0100

    OAK-10592 - Add support to specify a custom regex to exclude documents from 
being downloaded from Mongo during indexing (#1267)
---
 .../document/flatfile/pipelined/ConfigHelper.java  |   6 +
 .../pipelined/PipelinedMongoDownloadTask.java      | 267 ++++++++++++---------
 .../document/flatfile/pipelined/PipelinedIT.java   |  64 ++++-
 .../pipelined/PipelinedMongoDownloadTaskTest.java  |  90 ++++++-
 4 files changed, 305 insertions(+), 122 deletions(-)

diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java
index e0e4e6fef2..9374af70b2 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java
@@ -30,6 +30,12 @@ public class ConfigHelper {
         return result;
     }
 
+    /**
+     * Reads a system property as a string and logs the value that will be used.
+     *
+     * @param name         name of the system property to read
+     * @param defaultValue value to use when the system property is not set
+     * @return the value of the system property, or {@code defaultValue} if it is not set
+     */
+    public static String getSystemPropertyAsString(String name, String defaultValue) {
+        final String value = System.getProperty(name, defaultValue);
+        LOG.info("Config {}={}", name, value);
+        return value;
+    }
+
     public static boolean getSystemPropertyAsBoolean(String name, boolean 
defaultValue) {
         String sysPropValue = System.getProperty(name);
         boolean value;
diff --git 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
index e0f8adf497..851bbc2fd9 100644
--- 
a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
+++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
@@ -46,6 +46,7 @@ import org.bson.BsonDocument;
 import org.bson.codecs.configuration.CodecRegistries;
 import org.bson.codecs.configuration.CodecRegistry;
 import org.bson.conversions.Bson;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -69,6 +70,7 @@ import java.util.stream.Collectors;
 import static com.mongodb.client.model.Sorts.ascending;
 
 public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownloadTask.Result> {
+
     public static class Result {
         private final long documentsDownloaded;
 
@@ -106,6 +108,14 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
      */
     public static final String 
OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = 
"oak.indexer.pipelined.mongoRegexPathFiltering";
     public static final boolean 
DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = false;
+    /**
+     * Any document with a path that matches this regex pattern will not be downloaded. This pattern is included
+     * in the Mongo query, that is, the filtering is done server-side by Mongo, which avoids downloading the documents
+     * matching this pattern. This is typically a _suffix_, for example "/metadata.xml$|/renditions/.*.jpg$".
+     * To exclude subtrees such as /content/abc, use mongoFilterPaths instead.
+     */
+    public static final String 
OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX = 
"oak.indexer.pipelined.mongoCustomExcludeEntriesRegex";
+    public static final String 
DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX = "";
 
     /**
      * Maximum number of elements in the included/excluded paths list used for 
regex path filtering. If after
@@ -129,6 +139,117 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
 
     private static final String THREAD_NAME = "mongo-dump";
 
+    /**
+     * Creates the filter to be used in the Mongo query.
+     *
+     * @param mongoFilterPaths          The paths to be included/excluded in the filter. These define subtrees to be
+     *                                  included or excluded (see {@link MongoFilterPaths} for details).
+     * @param customExcludeEntriesRegex Documents with paths matching this regex are excluded from download. A null
+     *                                  or blank value disables this exclusion.
+     * @return The filter to be used in the Mongo query, or null if no filter is required
+     */
+    static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, String customExcludeEntriesRegex) {
+        var filters = new ArrayList<Bson>(4);
+        if (mongoFilterPaths != MongoFilterPaths.DOWNLOAD_ALL) {
+            // descendantsFilter returns null for an empty list of paths. A null must never be added to the list of
+            // filters, otherwise it would be handed to Filters.and() when combined with the custom exclude filter.
+            Bson includedFilter = descendantsFilter(mongoFilterPaths.included);
+            if (includedFilter != null) {
+                filters.add(includedFilter);
+            }
+            // The Mongo filter built here will download the top level path of each excluded subtree, which in theory
+            // should be excluded. That is, if the tree /a/b/c is excluded, the filter will download /a/b/c but none of
+            // its descendants.
+            // This is done because excluding also the top level path would add extra complexity to the filter and
+            // would not have any measurable impact on performance because it only downloads a few extra documents, one
+            // for each excluded subtree. The transform stage will anyway filter out these paths.
+            Bson excludedFilter = descendantsFilter(mongoFilterPaths.excluded);
+            if (excludedFilter != null) {
+                filters.add(Filters.nor(excludedFilter));
+            }
+        }
+
+        // Custom regex filter to exclude paths
+        Bson customExcludedPathsFilter = createCustomExcludedEntriesFilter(customExcludeEntriesRegex);
+        if (customExcludedPathsFilter != null) {
+            filters.add(customExcludedPathsFilter);
+        }
+
+        if (filters.isEmpty()) {
+            return null;
+        } else if (filters.size() == 1) {
+            // A single condition does not need to be wrapped in an $and
+            return filters.get(0);
+        } else {
+            return Filters.and(filters);
+        }
+    }
+
+    /**
+     * Builds the filter that excludes the documents matched by the custom regex.
+     *
+     * @param customRegexPattern regex applied to the {@link NodeDocument#ID} and {@link NodeDocument#PATH} fields;
+     *                           null, empty or whitespace-only disables the filter
+     * @return a filter rejecting any document where either field matches the regex, or null if the regex is disabled
+     */
+    static Bson createCustomExcludedEntriesFilter(String customRegexPattern) {
+        boolean regexDisabled = customRegexPattern == null || customRegexPattern.trim().isEmpty();
+        if (regexDisabled) {
+            LOG.info("Mongo custom regex is disabled");
+            return null;
+        }
+        LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern);
+        // The pattern is compiled from the value as given (not trimmed) and applied to both fields
+        Pattern excludePattern = Pattern.compile(customRegexPattern);
+        Bson idMatches = Filters.regex(NodeDocument.ID, excludePattern);
+        Bson pathMatches = Filters.regex(NodeDocument.PATH, excludePattern);
+        return Filters.nor(idMatches, pathMatches);
+    }
+
+    /**
+     * Builds a filter matching the descendants of any of the given paths.
+     *
+     * @param paths root paths of the subtrees; a trailing "/" is appended to each path that does not have one
+     * @return the filter, or null if the list of paths is empty
+     */
+    private static Bson descendantsFilter(List<String> paths) {
+        if (paths.isEmpty()) {
+            return null;
+        }
+        // A document is a descendant if one of two fields matches one of the patterns built below:
+        //  - _id   - for non-long paths, where the _id is of the form "2:/foo/bar"
+        //  - _path - for long paths, where the _id is a hash and the document contains an additional _path
+        //            field with the path of the document.
+        // The $in operator is used with regular expressions to match the paths.
+        //  https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression
+        // NOTE(review): unlike the _id pattern, the _path pattern has no leading ^ anchor, so it can match the
+        // quoted path anywhere in the _path value - confirm this is intended.
+        List<Pattern> idPatterns = new ArrayList<>();
+        List<Pattern> pathPatterns = new ArrayList<>();
+
+        for (String path : paths) {
+            String subtreePrefix = path.endsWith("/") ? path : path + "/";
+            String quotedPrefix = Pattern.quote(subtreePrefix);
+            idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPrefix + ".*$"));
+            pathPatterns.add(Pattern.compile(quotedPrefix + ".*$"));
+        }
+
+        Bson idFilter = Filters.in(NodeDocument.ID, idPatterns);
+        Bson pathFilter = Filters.in(NodeDocument.PATH, pathPatterns);
+        return Filters.or(idFilter, pathFilter);
+    }
+
+    /**
+     * Collects the given paths together with all of their ancestors, up to and including the root.
+     * For the input ["/a/b/c", "/a/b/d"] the result is ["/", "/a", "/a/b", "/a/b/c", "/a/b/d"]. The input paths
+     * are part of the result even though a path is not strictly an ancestor of itself.
+     *
+     * @param paths the paths whose ancestors are collected
+     * @return the sorted list of paths, without duplicates
+     */
+    static List<String> getAncestors(List<String> paths) {
+        // The TreeSet removes the ancestors shared by several paths and keeps the result sorted
+        TreeSet<String> collected = new TreeSet<>();
+        for (String path : paths) {
+            String current = path;
+            collected.add(current);
+            // Walk up the hierarchy until the root has been added
+            while (!PathUtils.denotesRoot(current)) {
+                current = PathUtils.getParentPath(current);
+                collected.add(current);
+            }
+        }
+        return new ArrayList<>(collected);
+    }
+
+
+    /**
+     * Builds a filter matching the documents of the given paths and of all their ancestors (see
+     * {@link #getAncestors(List)}), selected by document id.
+     */
+    private static Bson ancestorsFilter(List<String> paths) {
+        List<String> ancestorIds = new ArrayList<>();
+        for (String ancestorPath : getAncestors(paths)) {
+            ancestorIds.add(Utils.getIdFromPath(ancestorPath));
+        }
+        return Filters.in(NodeDocument.ID, ancestorIds);
+    }
+
     private final int maxBatchNumberOfDocuments;
     private final BlockingQueue<NodeDocument[]> mongoDocQueue;
     private final List<PathFilter> pathFilters;
@@ -142,14 +263,15 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
     private final int maxBatchSizeBytes;
     private final StatisticsProvider statisticsProvider;
     private final MongoRegexPathFilterFactory regexPathFilterFactory;
+    private final String customExcludeEntriesRegex;
 
     private long totalEnqueueWaitTimeMillis = 0;
     private Instant lastDelayedEnqueueWarningMessageLoggedTimestamp = 
Instant.now();
     private long documentsRead = 0;
+    private long totalDataDownloadedBytes = 0;
     private long nextLastModified = 0;
     private String lastIdDownloaded = null;
 
-
     public PipelinedMongoDownloadTask(MongoDatabase mongoDatabase,
                                       MongoDocumentStore mongoDocStore,
                                       int maxBatchSizeBytes,
@@ -187,6 +309,10 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
                 OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS,
                 
DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS);
         this.regexPathFilterFactory = new 
MongoRegexPathFilterFactory(regexPathFilteringMaxNumberOfPaths);
+        this.customExcludeEntriesRegex = 
ConfigHelper.getSystemPropertyAsString(
+                
PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX,
+                
PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX
+        );
 
         //TODO This may lead to reads being routed to secondary depending on 
MongoURI
         //So caller must ensure that its safe to read from secondary
@@ -211,12 +337,15 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
             } else {
                 downloadWithNaturalOrdering();
             }
+
             long durationMillis = 
downloadStartWatch.elapsed(TimeUnit.MILLISECONDS);
             String enqueueingDelayPercentage = 
PipelinedUtils.formatAsPercentage(totalEnqueueWaitTimeMillis, durationMillis);
             String metrics = MetricsFormatter.newBuilder()
                     .add("duration", 
FormattingUtils.formatToSeconds(downloadStartWatch))
                     .add("durationSeconds", durationMillis / 1000)
                     .add("documentsDownloaded", documentsRead)
+                    .add("dataDownloadedBytes", totalDataDownloadedBytes)
+                    .add("dataDownloaded", 
IOUtils.humanReadableByteCountBin(totalDataDownloadedBytes))
                     .add("enqueueingDelayMillis", totalEnqueueWaitTimeMillis)
                     .add("enqueueingDelayPercentage", 
enqueueingDelayPercentage)
                     .build();
@@ -259,20 +388,16 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         // If regex filtering is enabled, start by downloading the ancestors 
of the path used for filtering.
         // That is, download "/", "/content", "/content/dam" for a base path 
of "/content/dam". These nodes will not be
         // matched by the regex used in the Mongo query, which assumes a 
prefix of "???:/content/dam"
-        MongoFilterPaths mongoFilterPathsDefinition = 
getPathsForRegexFiltering();
-        Bson childrenFilter;
-        if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) {
+        MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering();
+        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, 
customExcludeEntriesRegex);
+        if (mongoFilter == null) {
             LOG.info("Downloading full repository");
-            childrenFilter = null;
         } else {
+            LOG.info("Downloading from Mongo using filter: {}", mongoFilter);
             // Regex path filtering is enabled
             // Download the ancestors in a separate query. No retrials done on 
this query, as it will take only a few
             // seconds and is done at the start of the job, so if it fails, 
the job can be retried without losing much work
-            downloadAncestors(mongoFilterPathsDefinition.included);
-
-            // Filter to apply to the main query
-            childrenFilter = descendantsFilter(mongoFilterPathsDefinition);
-            LOG.info("Downloading from Mongo using filter: {}", 
childrenFilter);
+            downloadAncestors(mongoFilterPaths.included);
         }
 
         Instant failuresStartTimestamp = null; // When the last series of 
failures started
@@ -286,14 +411,14 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
             try {
                 if (lastIdDownloaded != null) {
                     LOG.info("Recovering from broken connection, finishing 
downloading documents with _modified={}", nextLastModified);
-                    downloadRange(new DownloadRange(nextLastModified, 
nextLastModified + 1, lastIdDownloaded), childrenFilter);
+                    downloadRange(new DownloadRange(nextLastModified, 
nextLastModified + 1, lastIdDownloaded), mongoFilter);
                     // We have managed to reconnect, reset the failure 
timestamp
                     failuresStartTimestamp = null;
                     numberOfFailures = 0;
                     // Continue downloading everything starting from the next 
_lastmodified value
-                    downloadRange(new DownloadRange(nextLastModified + 1, 
Long.MAX_VALUE, null), childrenFilter);
+                    downloadRange(new DownloadRange(nextLastModified + 1, 
Long.MAX_VALUE, null), mongoFilter);
                 } else {
-                    downloadRange(new DownloadRange(nextLastModified, 
Long.MAX_VALUE, null), childrenFilter);
+                    downloadRange(new DownloadRange(nextLastModified, 
Long.MAX_VALUE, null), mongoFilter);
                 }
                 downloadCompleted = true;
             } catch (MongoException e) {
@@ -358,9 +483,10 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
     private void downloadWithNaturalOrdering() throws InterruptedException, 
TimeoutException {
         // We are downloading potentially a large fraction of the repository, 
so using an index scan will be
         // inefficient. So we pass the natural hint to force MongoDB to use 
natural ordering, that is, column scan
-        MongoFilterPaths regexBasePath = getPathsForRegexFiltering();
-        if (regexBasePath == MongoFilterPaths.DOWNLOAD_ALL) {
-            LOG.info("Downloading full repository using natural order");
+        MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering();
+        Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, 
customExcludeEntriesRegex);
+        if (mongoFilter == null) {
+            LOG.info("Downloading full repository from Mongo with natural 
order");
             FindIterable<NodeDocument> mongoIterable = dbCollection
                     .withReadPreference(readPreference)
                     .find()
@@ -368,22 +494,14 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
             download(mongoIterable);
 
         } else {
-            downloadAncestors(regexBasePath.included);
-
-            Bson childrenFilter = descendantsFilter(regexBasePath);
-            FindIterable<NodeDocument> findIterable;
-            if (childrenFilter == null) {
-                LOG.info("Downloading full repository using natural order");
-                findIterable = dbCollection
-                        .withReadPreference(readPreference)
-                        .find();
-            } else {
-                LOG.info("Downloading from Mongo using filter: {}", 
childrenFilter);
-                findIterable = dbCollection
-                        .withReadPreference(readPreference)
-                        .find(childrenFilter);
-            }
-            download(findIterable.hint(NATURAL_HINT));
+            downloadAncestors(mongoFilterPaths.included);
+
+            LOG.info("Downloading from Mongo with natural order using filter: 
{}", mongoFilter);
+            FindIterable<NodeDocument> findIterable = dbCollection
+                    .withReadPreference(readPreference)
+                    .find(mongoFilter)
+                    .hint(NATURAL_HINT);
+            download(findIterable);
         }
     }
 
@@ -391,84 +509,16 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
         if (!regexPathFiltering) {
             LOG.info("Regex path filtering disabled.");
             return MongoFilterPaths.DOWNLOAD_ALL;
-        }
-        LOG.info("Computing included/excluded paths for Mongo regex path 
filtering. PathFilters: {}", pathFilters.stream()
-                .map(pf -> "PF{includedPaths=" + pf.getIncludedPaths() + ", 
excludedPaths=" + pf.getExcludedPaths() + "}")
-                .collect(Collectors.joining(", ")));
-        MongoFilterPaths mongoFilterPaths = 
this.regexPathFilterFactory.buildMongoFilter(pathFilters);
-        LOG.info("Paths used for regex filtering on Mongo: {}", 
mongoFilterPaths);
-        return mongoFilterPaths;
-    }
-
-    private Bson descendantsFilter(MongoFilterPaths 
mongoFilterPathsDefinition) {
-        if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) {
-            return null;
-        }
-        Bson pathFilter = 
descendantsFilter(mongoFilterPathsDefinition.included);
-        if (mongoFilterPathsDefinition.excluded.isEmpty()) {
-            return pathFilter;
         } else {
-            // The Mongo filter returned here will download the top level path 
of each excluded subtree, which in theory
-            // should be excluded. That is, if the tree /a/b/c is excluded, 
the filter will download /a/b/c but none of
-            // its descendants.
-            // This is done because excluding also the top level path would 
add extra complexity to the filter and
-            // would not have any measurable impact on performance because it 
only downloads a few extra documents, one
-            // for each excluded subtree. The transform stage will anyway 
filter out these paths.
-            Bson excludedFilter = 
descendantsFilter(mongoFilterPathsDefinition.excluded);
-            return Filters.and(pathFilter, Filters.nor(excludedFilter));
-        }
-    }
-
-    private static Bson descendantsFilter(List<String> paths) {
-        if (paths.isEmpty()) {
-            return null;
-        }
-        // The filter for descendants of a list of paths is a series of or 
conditions. For each path, we have to build
-        // two conditions in two different fields of the documents:
-        // _ _id   - for non-long paths - In this case, the _id is of the form 
"2:/foo/bar"
-        // _ _path - for long paths - In this case, the _id is n hash and the 
document contains an additional _path
-        //      field with the path of the document.
-        // We use the $in operator with a regular expression to match the 
paths.
-        //  
https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression
-
-        ArrayList<Pattern> pathPatterns = new ArrayList<>();
-        ArrayList<Pattern> idPatterns = new ArrayList<>();
-
-        for (String path : paths) {
-            if (!path.endsWith("/")) {
-                path = path + "/";
-            }
-            String quotedPath = Pattern.quote(path);
-            idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + 
".*$"));
-            pathPatterns.add(Pattern.compile(quotedPath + ".*$"));
-        }
-        return Filters.or(
-                Filters.in(NodeDocument.ID, idPatterns),
-                Filters.in(NodeDocument.PATH, pathPatterns)
-        );
-    }
-
-    static List<String> getAncestors(List<String> paths) {
-        TreeSet<String> ancestors = new TreeSet<>();
-        for (String child : paths) {
-            String parent = child;
-            while (true) {
-                ancestors.add(parent);
-                if (PathUtils.denotesRoot(parent)) {
-                    break;
-                }
-                parent = PathUtils.getParentPath(parent);
-            }
+            LOG.info("Computing included/excluded paths for Mongo regex path 
filtering. PathFilters: {}",
+                    pathFilters.stream()
+                            .map(pf -> "PF{includedPaths=" + 
pf.getIncludedPaths() + ", excludedPaths=" + pf.getExcludedPaths() + "}")
+                            .collect(Collectors.joining(", "))
+            );
+            MongoFilterPaths mongoFilterPaths = 
this.regexPathFilterFactory.buildMongoFilter(pathFilters);
+            LOG.info("Paths used for regex filtering on Mongo: {}", 
mongoFilterPaths);
+            return mongoFilterPaths;
         }
-        return new ArrayList<>(ancestors);
-    }
-
-
-    static Bson ancestorsFilter(List<String> paths) {
-        List<String> parentFilters = getAncestors(paths).stream()
-                .map(Utils::getIdFromPath)
-                .collect(Collectors.toList());
-        return Filters.in(NodeDocument.ID, parentFilters);
     }
 
     private void download(FindIterable<NodeDocument> mongoIterable) throws 
InterruptedException, TimeoutException {
@@ -492,6 +542,7 @@ public class PipelinedMongoDownloadTask implements 
Callable<PipelinedMongoDownlo
                     nextIndex++;
                     int docSize = (int) 
next.remove(NodeDocumentCodec.SIZE_FIELD);
                     batchSize += docSize;
+                    totalDataDownloadedBytes += docSize;
                     if (batchSize >= maxBatchSizeBytes || nextIndex == 
batch.length) {
                         LOG.trace("Enqueuing block with {} elements, estimated 
size: {} bytes", nextIndex, batchSize);
                         tryEnqueueCopy(batch, nextIndex);
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
index 12d47ab47b..b90c44b553 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java
@@ -67,6 +67,7 @@ import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
 import static java.lang.management.ManagementFactory.getPlatformMBeanServer;
+import static 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX;
 import static 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING;
 import static 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS;
 import static org.junit.Assert.assertArrayEquals;
@@ -172,7 +173,7 @@ public class PipelinedIT {
         Predicate<String> pathPredicate = s -> true;
         List<PathFilter> pathFilters = List.of(new 
PathFilter(List.of("/content/dam/2023"), List.of("/content/dam/2023/02")));
 
-        testSuccessfulDownload(pathPredicate, pathFilters,List.of(
+        testSuccessfulDownload(pathPredicate, pathFilters, List.of(
                 "/|{}",
                 "/content|{}",
                 "/content/dam|{}",
@@ -195,18 +196,18 @@ public class PipelinedIT {
         // filter out these additional documents.
         List<PathFilter> pathFilters = List.of(new 
PathFilter(List.of("/content/dam/1000", "/content/dam/2022"), 
List.of("/content/dam/2022/02", "/content/dam/2022/04")));
 
-        testSuccessfulDownload(pathPredicate, pathFilters,List.of(
-        "/|{}",
-        "/content|{}",
-        "/content/dam|{}",
-        "/content/dam/1000|{}",
-        "/content/dam/1000/12|{\"p1\":\"v100012\"}",
-        "/content/dam/2022|{}",
-        "/content/dam/2022/01|{\"p1\":\"v202201\"}",
-        "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}",
-        "/content/dam/2022/02|{\"p1\":\"v202202\"}",
-        "/content/dam/2022/03|{\"p1\":\"v202203\"}",
-        "/content/dam/2022/04|{\"p1\":\"v202204\"}"
+        testSuccessfulDownload(pathPredicate, pathFilters, List.of(
+                "/|{}",
+                "/content|{}",
+                "/content/dam|{}",
+                "/content/dam/1000|{}",
+                "/content/dam/1000/12|{\"p1\":\"v100012\"}",
+                "/content/dam/2022|{}",
+                "/content/dam/2022/01|{\"p1\":\"v202201\"}",
+                "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}",
+                "/content/dam/2022/02|{\"p1\":\"v202202\"}",
+                "/content/dam/2022/03|{\"p1\":\"v202203\"}",
+                "/content/dam/2022/04|{\"p1\":\"v202204\"}"
         ));
     }
 
@@ -321,6 +322,43 @@ public class PipelinedIT {
         testSuccessfulDownload(pathPredicate, pathFilters, expected);
     }
 
+    @Test
+    public void createFFSCustomExcludePathsRegex() throws Exception {
+        // Exclude all nodes ending in /metadata.xml and all nodes below a path section matching ".*.jpg"
+        System.setProperty(OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX, "/metadata.xml$|/.*.jpg/.*");
+        Predicate<String> pathPredicate = s -> contentDamPathFilter.filter(s) != PathFilter.Result.EXCLUDE;
+
+        // Create the content: the same two metadata children below an a.jpg node and below an image_a.png node
+        Backend writableBackend = createNodeStore(false);
+        var writableNodeStore = writableBackend.documentNodeStore;
+        @NotNull NodeBuilder root = writableNodeStore.getRoot().builder();
+        @NotNull NodeBuilder dam = root.child("content").child("dam");
+        dam.child("a.jpg").child("jcr:content").child("metadata.xml");
+        dam.child("a.jpg").child("jcr:content").child("metadata.text");
+        dam.child("image_a.png").child("jcr:content").child("metadata.text");
+        dam.child("image_a.png").child("jcr:content").child("metadata.xml");
+        writableNodeStore.merge(root, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+
+        // Build the sorted store file from a second store instance
+        Backend readOnlyBackend = createNodeStore(true);
+        PipelinedStrategy strategy = createStrategy(readOnlyBackend, pathPredicate, null);
+        File sortedStoreFile = strategy.createSortedStoreFile();
+        assertTrue(sortedStoreFile.exists());
+
+        // The a.jpg node itself is kept but nothing below it, and the metadata.xml below image_a.png is dropped
+        List<String> expected = List.of(
+                "/|{}",
+                "/content|{}",
+                "/content/dam|{}",
+                "/content/dam/a.jpg|{}",
+                "/content/dam/image_a.png|{}",
+                "/content/dam/image_a.png/jcr:content|{}",
+                "/content/dam/image_a.png/jcr:content/metadata.text|{}"
+        );
+        assertEquals(expected, Files.readAllLines(sortedStoreFile.toPath()));
+        assertMetrics();
+    }
+
     private void testSuccessfulDownload(Predicate<String> pathPredicate, 
List<PathFilter> pathFilters)
             throws CommitFailedException, IOException {
         testSuccessfulDownload(pathPredicate, pathFilters, EXPECTED_FFS);
diff --git 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
index c1f05fe26b..2b79c34dbb 100644
--- 
a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
+++ 
b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java
@@ -18,12 +18,14 @@
  */
 package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined;
 
+import com.mongodb.MongoClient;
 import com.mongodb.MongoSocketException;
 import com.mongodb.ServerAddress;
 import com.mongodb.client.FindIterable;
 import com.mongodb.client.MongoCollection;
 import com.mongodb.client.MongoCursor;
 import com.mongodb.client.MongoDatabase;
+import com.mongodb.client.model.Filters;
 import 
org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths;
 import org.apache.jackrabbit.oak.plugins.document.Collection;
 import org.apache.jackrabbit.oak.plugins.document.DocumentStore;
@@ -44,10 +46,12 @@ import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Mockito.mock;
@@ -287,4 +291,88 @@ public class PipelinedMongoDownloadTaskTest {
         var pathFilter = List.of(new PathFilter(List.of("/parent"), 
excludedPaths));
         assertEquals(new MongoFilterPaths(List.of("/parent"), List.of()), 
regexFilterBuilder.buildMongoFilter(pathFilter));
     }
-}
\ No newline at end of file
+
+    @Test
+    public void createCustomExcludeEntriesFilter() {
+        // A null, empty or whitespace-only regex disables the custom exclude filter
+        assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter(null));
+        assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter(""));
+        assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("   "));
+
+        // Any other regex must reject the documents where either the id or the path matches
+        Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
+        var expectedBson = Filters.nor(
+                Filters.regex(NodeDocument.ID, p),
+                Filters.regex(NodeDocument.PATH, p)
+        );
+        var actualBson = PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("^[0-9]{1,3}:/a/b.*$");
+
+        assertBsonEquals(expectedBson, actualBson);
+    }
+
+    @Test
+    public void computeMongoQueryFilterNoPathFilterNoExcludeFilter() {
+        // Neither a path filter nor an exclude filter: no Mongo filter is required
+        Bson filter = PipelinedMongoDownloadTask.computeMongoQueryFilter(MongoFilterPaths.DOWNLOAD_ALL, null);
+        assertNull(filter);
+    }
+
+    @Test
+    public void computeMongoQueryFilterWithPathFilterNoExcludeFilter() {
+        // A path filter but no exclude filter: only the descendants condition is expected
+        var filterPaths = new MongoFilterPaths(List.of("/parent"), List.of());
+        var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(filterPaths, null);
+
+        Pattern idPattern = Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$");
+        Pattern pathPattern = Pattern.compile(Pattern.quote("/parent/") + ".*$");
+        var expected = Filters.or(
+                Filters.in(NodeDocument.ID, idPattern),
+                Filters.in(NodeDocument.PATH, pathPattern)
+        );
+        assertBsonEquals(expected, actual);
+    }
+
+    @Test
+    public void computeMongoQueryFilterNoPathFilterWithExcludeFilter() {
+        // No path filter but an exclude filter: only the custom exclusion condition is expected
+        var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(
+                MongoFilterPaths.DOWNLOAD_ALL,
+                "^[0-9]{1,3}:/a/b.*$"
+        );
+
+        Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
+        var expected = Filters.nor(
+                Filters.regex(NodeDocument.ID, p),
+                Filters.regex(NodeDocument.PATH, p)
+        );
+        assertBsonEquals(expected, actual);
+    }
+
+    @Test
+    public void computeMongoQueryFilterWithPathFilterWithExcludeFilter() {
+        // Both a path filter and an exclude filter: the two conditions are expected to be combined
+        var filterPaths = new MongoFilterPaths(List.of("/parent"), List.of());
+        var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter(filterPaths, "^[0-9]{1,3}:/a/b.*$");
+
+        Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$");
+        var customExclusion = Filters.nor(
+                Filters.regex(NodeDocument.ID, p),
+                Filters.regex(NodeDocument.PATH, p)
+        );
+        Pattern idPattern = Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$");
+        Pattern pathPattern = Pattern.compile(Pattern.quote("/parent/") + ".*$");
+        var descendants = Filters.or(
+                Filters.in(NodeDocument.ID, idPattern),
+                Filters.in(NodeDocument.PATH, pathPattern)
+        );
+        var expected = Filters.and(customExclusion, descendants);
+        assertBsonEquals(expected, actual);
+    }
+
+    /**
+     * Asserts that two filters are rendered to equal BsonDocuments. Two null filters are considered equal.
+     * The parameter order matches the way this helper is called in this class: expected value first.
+     *
+     * @param expected the filter expected by the test, may be null
+     * @param actual   the filter produced by the code under test, may be null
+     */
+    private void assertBsonEquals(Bson expected, Bson actual) {
+        if (expected == null && actual == null) {
+            return;
+        }
+        if (expected == null || actual == null) {
+            throw new AssertionError("One of the bson is null. Actual: " + actual + ", expected: " + expected);
+        }
+        var codecRegistry = MongoClient.getDefaultCodecRegistry();
+        assertEquals(
+                expected.toBsonDocument(BsonDocument.class, codecRegistry),
+                actual.toBsonDocument(BsonDocument.class, codecRegistry)
+        );
+    }
+}
+

(jackrabbit-oak) 22/50: OAK-10592 - Add support to specify a custom regex to exclude documents from being downloaded from Mongo during indexing (#1267)

Reply via email to