This is an automated email from the ASF dual-hosted git repository. daim pushed a commit to branch DetailedGC/OAK-10199 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 7c6066a974131330d65dfd71789c5d6c2a4aa2a1 Author: Nuno Santos <[email protected]> AuthorDate: Mon Jan 8 14:30:24 2024 +0100 OAK-10589 - Improve regex path filtering to also handle cases where excludedPaths are defined (#1258) --- .../pipelined/MongoRegexPathFilterFactory.java | 149 +++++++++++++++++ .../pipelined/PipelinedMongoDownloadTask.java | 186 +++++++++++---------- .../document/flatfile/pipelined/PipelinedIT.java | 119 ++++++++++++- .../pipelined/PipelinedMongoDownloadTaskTest.java | 145 +++++++++++++--- 4 files changed, 490 insertions(+), 109 deletions(-) diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java new file mode 100644 index 0000000000..299c7b19c8 --- /dev/null +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/MongoRegexPathFilterFactory.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined; + +import org.apache.jackrabbit.oak.commons.PathUtils; +import org.apache.jackrabbit.oak.spi.filter.PathFilter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.TreeSet; + +public class MongoRegexPathFilterFactory { + + public static class MongoFilterPaths { + // Special value that means "download everything". This is used when regex path filtering is disabled or when + // the path filters would require downloading everything. When this is returned, it is preferable to do not + // use any filter in the Mongo query, even though using the values in this object as filters would also result + // in downloading everything + public static final MongoFilterPaths DOWNLOAD_ALL = new MongoFilterPaths(List.of("/"), List.of()); + + final List<String> included; + final List<String> excluded; + + public MongoFilterPaths(List<String> included, List<String> excluded) { + this.included = included; + this.excluded = excluded; + } + + @Override + public String toString() { + return "MongoFilterPaths{" + + "included=" + included + + ", excluded=" + excluded + + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + MongoFilterPaths that = (MongoFilterPaths) o; + return Objects.equals(included, that.included) && Objects.equals(excluded, that.excluded); + } + + @Override + public int hashCode() { + return Objects.hash(included, excluded); + } + } + + private final static Logger LOG = LoggerFactory.getLogger(MongoRegexPathFilterFactory.class); + + private final int maxNumberOfPaths; + + public MongoRegexPathFilterFactory(int maxNumberOfPaths) { + this.maxNumberOfPaths = maxNumberOfPaths; + } + + public MongoFilterPaths buildMongoFilter(List<PathFilter> pathFilters) { + if (pathFilters == null || pathFilters.isEmpty()) { + return MongoFilterPaths.DOWNLOAD_ALL; + } + // Merge and deduplicate the included and excluded paths of all filters. + // The included paths will also be further de-deuplicated, by removing paths that are children of other path in + // the list, that is, by keeping only the top level paths. To make it easy to de-duplicate, we sort the paths + // alphabetically. + // The list of excluded paths does not need to be sorted, but we do it anyway for consistency when logging the + // list of paths. + TreeSet<String> sortedCandidateIncludedPaths = new TreeSet<>(); + TreeSet<String> candidateExcludedPaths = new TreeSet<>(); + for (PathFilter pathFilter : pathFilters) { + sortedCandidateIncludedPaths.addAll(pathFilter.getIncludedPaths()); + candidateExcludedPaths.addAll(pathFilter.getExcludedPaths()); + } + // Keep only unique included paths. That is, if paths "/a/b" and "/a/b/c" are both in the list, keep only "/a/b" + List<String> finalIncludedPathsRoots = new ArrayList<>(); + LOG.debug("Candidate included paths: {}, candidate excluded paths: {}", sortedCandidateIncludedPaths, candidateExcludedPaths); + // The quadratic algorithm below could easily be replaced by linear time algorithm, but it's not worth the added + // complexity because the list of paths should never be more than a few tens + // Idea for linear time algorithm: after adding a path to the includedPathsRoots, advance in the list of candidates + // skipping all paths that are children of the path just added. Since the list is sorted alphabetically, the children + // of a path will be next to it in the list + for (String candidateIncludedPath : sortedCandidateIncludedPaths) { + if (finalIncludedPathsRoots.stream().noneMatch(ancestor -> PathUtils.isAncestor(ancestor, candidateIncludedPath))) { + finalIncludedPathsRoots.add(candidateIncludedPath); + } + } + + List<String> finalExcludedPaths = new ArrayList<>(); + // Keep an excluded path only if it is not needed by any filter. + for (String candidateExcludedPath : candidateExcludedPaths) { + boolean notNeeded = pathFilters.stream().allMatch(pathFilter -> pathFilter.filter(candidateExcludedPath) == PathFilter.Result.EXCLUDE); + if (notNeeded) { + finalExcludedPaths.add(candidateExcludedPath); + } + } + + // Do not filter if there are too many includedPaths. The official Mongo documentation recommends + // not using more than tens of values in the $in filter to avoid performance degradation. + // https://www.mongodb.com/docs/manual/reference/operator/query/in/ + if (finalIncludedPathsRoots.size() > maxNumberOfPaths) { + LOG.info("Number of includedPaths ({}) exceed the maximum allowed ({}), downloading everything", + finalIncludedPathsRoots.size(), maxNumberOfPaths); + return MongoFilterPaths.DOWNLOAD_ALL; + } + + // Also check the number of excluded paths and disable filtering by them if there are too many. + // We do not need to revert to download the full repository in the case where there is a reasonable number of + // included paths but too many excluded paths. The excluded paths are always children of the included paths, so + // if we do not filter on excludedPaths, we simply download everything inside the included paths. This might be + // much less than the full repository. + if (finalExcludedPaths.size() > maxNumberOfPaths) { + LOG.info("Number of excludedPaths ({}) exceed the maximum allowed ({}), disabling filtering of excludedPaths", + finalExcludedPaths.size(), maxNumberOfPaths); + finalExcludedPaths = List.of(); + } + + if (finalExcludedPaths.isEmpty()) { + // Only included paths + if (finalIncludedPathsRoots.contains("/")) { + return MongoFilterPaths.DOWNLOAD_ALL; + } else { + return new MongoFilterPaths(finalIncludedPathsRoots, List.of()); + } + } else { + return new MongoFilterPaths(finalIncludedPathsRoots, finalExcludedPaths); + } + } + +} diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java index a490d5a7a9..e0f8adf497 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java @@ -32,6 +32,7 @@ import org.apache.jackrabbit.guava.common.base.Preconditions; import org.apache.jackrabbit.guava.common.base.Stopwatch; import org.apache.jackrabbit.oak.commons.IOUtils; import org.apache.jackrabbit.oak.commons.PathUtils; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths; import org.apache.jackrabbit.oak.plugins.document.Collection; import org.apache.jackrabbit.oak.plugins.document.NodeDocument; import org.apache.jackrabbit.oak.plugins.document.mongo.MongoDocumentStore; @@ -51,22 +52,20 @@ import org.slf4j.LoggerFactory; import java.time.Duration; import java.time.Instant; import java.time.temporal.ChronoUnit; +import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Set; +import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.regex.Pattern; import java.util.stream.Collectors; -import java.util.stream.Stream; -import static com.mongodb.client.model.Filters.regex; import static com.mongodb.client.model.Sorts.ascending; public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownloadTask.Result> { @@ -101,12 +100,22 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo * filtering in the indexing job. This feature may significantly reduce the number of documents downloaded from * Mongo. * The performance gains may not be proportional to the reduction in the number of documents downloaded because Mongo - * still has to traverse all the documents. This is the case because the regex expression used for path filtering + * still has to traverse all the documents. This is required because the regex expression used for path filtering * starts with a wildcard (because the _id starts with the depth of the path, so the regex expression must ignore * this part). Because of the wildcard at the start, Mongo cannot use of the index on _id. */ public static final String OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = "oak.indexer.pipelined.mongoRegexPathFiltering"; public static final boolean DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = false; + + /** + * Maximum number of elements in the included/excluded paths list used for regex path filtering. If after + * merging and de-deduplication of the paths of all the path filters the number of included or excluded paths exceeds + * this value, then disable path filtering to avoid creating Mongo queries with large number of filters + */ + public static final String OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS = "oak.indexer.pipelined.mongoRegexPathFilteringMaxPaths"; + public static final int DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS = 20; + + // Use a short initial retry interval. In most cases if the connection to a replica fails, there will be other // replicas available so a reconnection attempt will succeed immediately. private final static long retryInitialIntervalMillis = 100; @@ -132,6 +141,7 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo private final Stopwatch downloadStartWatch = Stopwatch.createUnstarted(); private final int maxBatchSizeBytes; private final StatisticsProvider statisticsProvider; + private final MongoRegexPathFilterFactory regexPathFilterFactory; private long totalEnqueueWaitTimeMillis = 0; private Instant lastDelayedEnqueueWarningMessageLoggedTimestamp = Instant.now(); @@ -173,6 +183,10 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo this.regexPathFiltering = ConfigHelper.getSystemPropertyAsBoolean( OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING); + int regexPathFilteringMaxNumberOfPaths = ConfigHelper.getSystemPropertyAsInt( + OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS, + DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS); + this.regexPathFilterFactory = new MongoRegexPathFilterFactory(regexPathFilteringMaxNumberOfPaths); //TODO This may lead to reads being routed to secondary depending on MongoURI //So caller must ensure that its safe to read from secondary @@ -245,18 +259,20 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo // If regex filtering is enabled, start by downloading the ancestors of the path used for filtering. // That is, download "/", "/content", "/content/dam" for a base path of "/content/dam". These nodes will not be // matched by the regex used in the Mongo query, which assumes a prefix of "???:/content/dam" - Set<String> regexBasePaths = getPathsForRegexFiltering(); + MongoFilterPaths mongoFilterPathsDefinition = getPathsForRegexFiltering(); Bson childrenFilter; - if (regexBasePaths.isEmpty()) { + if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) { + LOG.info("Downloading full repository"); childrenFilter = null; } else { // Regex path filtering is enabled // Download the ancestors in a separate query. No retrials done on this query, as it will take only a few // seconds and is done at the start of the job, so if it fails, the job can be retried without losing much work - downloadAncestors(regexBasePaths); + downloadAncestors(mongoFilterPathsDefinition.included); // Filter to apply to the main query - childrenFilter = descendantsFilter(regexBasePaths); + childrenFilter = descendantsFilter(mongoFilterPathsDefinition); + LOG.info("Downloading from Mongo using filter: {}", childrenFilter); } Instant failuresStartTimestamp = null; // When the last series of failures started @@ -325,7 +341,10 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo download(mongoIterable); } - private void downloadAncestors(Set<String> basePath) throws InterruptedException, TimeoutException { + private void downloadAncestors(List<String> basePath) throws InterruptedException, TimeoutException { + if (basePath.size() == 1 && basePath.get(0).equals("/")) { + return; // No need to download ancestors of root, the root will be downloaded as part of the normal traversal + } Bson ancestorQuery = ancestorsFilter(basePath); LOG.info("Downloading ancestors of: {}, Query: {}.", basePath, ancestorQuery); FindIterable<NodeDocument> ancestorsIterable = dbCollection @@ -339,8 +358,8 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo private void downloadWithNaturalOrdering() throws InterruptedException, TimeoutException { // We are downloading potentially a large fraction of the repository, so using an index scan will be // inefficient. So we pass the natural hint to force MongoDB to use natural ordering, that is, column scan - Set<String> regexBasePath = getPathsForRegexFiltering(); - if (regexBasePath.isEmpty()) { + MongoFilterPaths regexBasePath = getPathsForRegexFiltering(); + if (regexBasePath == MongoFilterPaths.DOWNLOAD_ALL) { LOG.info("Downloading full repository using natural order"); FindIterable<NodeDocument> mongoIterable = dbCollection .withReadPreference(readPreference) @@ -349,95 +368,88 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo download(mongoIterable); } else { - downloadAncestors(regexBasePath); - - Bson childrenQuery = descendantsFilter(regexBasePath); - LOG.info("Downloading using regex path filtering. Downloading children: {}.", childrenQuery); - FindIterable<NodeDocument> childrenIterable = dbCollection - .withReadPreference(readPreference) - .find(childrenQuery) - .hint(NATURAL_HINT); - download(childrenIterable); + downloadAncestors(regexBasePath.included); + + Bson childrenFilter = descendantsFilter(regexBasePath); + FindIterable<NodeDocument> findIterable; + if (childrenFilter == null) { + LOG.info("Downloading full repository using natural order"); + findIterable = dbCollection + .withReadPreference(readPreference) + .find(); + } else { + LOG.info("Downloading from Mongo using filter: {}", childrenFilter); + findIterable = dbCollection + .withReadPreference(readPreference) + .find(childrenFilter); + } + download(findIterable.hint(NATURAL_HINT)); } } - private Set<String> getPathsForRegexFiltering() { + private MongoFilterPaths getPathsForRegexFiltering() { if (!regexPathFiltering) { LOG.info("Regex path filtering disabled."); - return Set.of(); + return MongoFilterPaths.DOWNLOAD_ALL; } - return extractIncludedPaths(pathFilters); + LOG.info("Computing included/excluded paths for Mongo regex path filtering. PathFilters: {}", pathFilters.stream() + .map(pf -> "PF{includedPaths=" + pf.getIncludedPaths() + ", excludedPaths=" + pf.getExcludedPaths() + "}") + .collect(Collectors.joining(", "))); + MongoFilterPaths mongoFilterPaths = this.regexPathFilterFactory.buildMongoFilter(pathFilters); + LOG.info("Paths used for regex filtering on Mongo: {}", mongoFilterPaths); + return mongoFilterPaths; } - /** - * Aggregates the included paths from the path filters. The final list will not contain duplicates or overlapping - * paths (i.e., /a and /a/b). - * - * @param pathFilters Empty set if path filtering should be disabled, otherwise the paths that should be included - * in the Mongo query filters - */ - // Package private for testing - static Set<String> extractIncludedPaths(List<PathFilter> pathFilters) { - // Path filtering is enabled only if there are no excludedPaths. - if (pathFilters == null) { - return Set.of(); + private Bson descendantsFilter(MongoFilterPaths mongoFilterPathsDefinition) { + if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) { + return null; } - Set<String> includedPaths = new HashSet<>(); - Set<String> excludedPaths = new HashSet<>(); - for (PathFilter pathFilter : pathFilters) { - includedPaths.addAll(pathFilter.getIncludedPaths()); - excludedPaths.addAll(pathFilter.getExcludedPaths()); + Bson pathFilter = descendantsFilter(mongoFilterPathsDefinition.included); + if (mongoFilterPathsDefinition.excluded.isEmpty()) { + return pathFilter; + } else { + // The Mongo filter returned here will download the top level path of each excluded subtree, which in theory + // should be excluded. That is, if the tree /a/b/c is excluded, the filter will download /a/b/c but none of + // its descendants. + // This is done because excluding also the top level path would add extra complexity to the filter and + // would not have any measurable impact on performance because it only downloads a few extra documents, one + // for each excluded subtree. The transform stage will anyway filter out these paths. + Bson excludedFilter = descendantsFilter(mongoFilterPathsDefinition.excluded); + return Filters.and(pathFilter, Filters.nor(excludedFilter)); } - // Sort by natural order, so that parent paths appear before any children. This makes it easier to compute the - // common ancestors of included paths - List<String> sortedIncludedPaths = includedPaths.stream() - .sorted() - .collect(Collectors.toList()); + } - LOG.info("Paths considered for regex filtering. IncludedPaths: {}, excludedPaths: {}", sortedIncludedPaths, excludedPaths); - if (sortedIncludedPaths.contains("/")) { - LOG.info("Disabling regex path filtering because root path is in the includedPaths: {}", sortedIncludedPaths); - return Set.of(); - } - if (!excludedPaths.isEmpty()) { - LOG.info("Disabling regex path filtering because there are excluded paths: {}", excludedPaths); - return Set.of(); + private static Bson descendantsFilter(List<String> paths) { + if (paths.isEmpty()) { + return null; } - - // Keep only unique included paths. That is, if paths "/a/b" and "/a/b/c" are both in the list, keep only "/a/b" - HashSet<String> includedPathsRoots = new HashSet<>(); - for (String path : sortedIncludedPaths) { - if (includedPathsRoots.stream().noneMatch(ancestor -> PathUtils.isAncestor(ancestor, path))) { - includedPathsRoots.add(path); + // The filter for descendants of a list of paths is a series of or conditions. For each path, we have to build + // two conditions in two different fields of the documents: + // _ _id - for non-long paths - In this case, the _id is of the form "2:/foo/bar" + // _ _path - for long paths - In this case, the _id is n hash and the document contains an additional _path + // field with the path of the document. + // We use the $in operator with a regular expression to match the paths. + // https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression + + ArrayList<Pattern> pathPatterns = new ArrayList<>(); + ArrayList<Pattern> idPatterns = new ArrayList<>(); + + for (String path : paths) { + if (!path.endsWith("/")) { + path = path + "/"; } + String quotedPath = Pattern.quote(path); + idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); + pathPatterns.add(Pattern.compile(quotedPath + ".*$")); } - return includedPathsRoots; - } - - private static Bson descendantsFilter(Set<String> path) { - List<Bson> filters = path.stream() - .flatMap(PipelinedMongoDownloadTask::descendantsFilter) - .collect(Collectors.toList()); - return Filters.or(filters); - } - - private static Stream<Bson> descendantsFilter(String path) { - if (!path.endsWith("/")) { - path = path + "/"; - } - String quotedPath = Pattern.quote(path); - // For entries with path sizes above a certain threshold, the _id field contains a hash instead of the path of - // the entry. The path is stored instead in the _path field. Therefore, we have to include in the filter also - // the documents with matching _path. - return Stream.of( - regex(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")), - regex(NodeDocument.PATH, Pattern.compile(quotedPath + ".*$") - )); + return Filters.or( + Filters.in(NodeDocument.ID, idPatterns), + Filters.in(NodeDocument.PATH, pathPatterns) + ); } - - static Set<String> getAncestors(Set<String> paths) { - Set<String> ancestors = new HashSet<>(); + static List<String> getAncestors(List<String> paths) { + TreeSet<String> ancestors = new TreeSet<>(); for (String child : paths) { String parent = child; while (true) { @@ -448,11 +460,11 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo parent = PathUtils.getParentPath(parent); } } - return ancestors; + return new ArrayList<>(ancestors); } - static Bson ancestorsFilter(Set<String> paths) { + static Bson ancestorsFilter(List<String> paths) { List<String> parentFilters = getAncestors(paths).stream() .map(Utils::getIdFromPath) .collect(Collectors.toList()); diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java index 122342f5d2..12d47ab47b 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java @@ -164,6 +164,103 @@ public class PipelinedIT { testSuccessfulDownload(pathPredicate, pathFilters); } + @Test + public void createFFS_mongoFiltering_include_excludes() throws Exception { + System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, "false"); + System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, "true"); + + Predicate<String> pathPredicate = s -> true; + List<PathFilter> pathFilters = List.of(new PathFilter(List.of("/content/dam/2023"), List.of("/content/dam/2023/02"))); + + testSuccessfulDownload(pathPredicate, pathFilters,List.of( + "/|{}", + "/content|{}", + "/content/dam|{}", + "/content/dam/2023|{\"p2\":\"v2023\"}", + "/content/dam/2023/01|{\"p1\":\"v202301\"}", + "/content/dam/2023/02|{}" + )); + } + + @Test + public void createFFS_mongoFiltering_include_excludes2() throws Exception { + System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, "false"); + System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, "true"); + + Predicate<String> pathPredicate = s -> true; + + // NOTE: If a path /a/b is in the excluded paths, the descendants of /a/b will not be downloaded but /a/b will + // be downloaded. This is an intentional limitation of the logic to compute the Mongo filter which was done + // to avoid the extra complexity of also filtering the root of the excluded tree. The transform stage would anyway + // filter out these additional documents. + List<PathFilter> pathFilters = List.of(new PathFilter(List.of("/content/dam/1000", "/content/dam/2022"), List.of("/content/dam/2022/02", "/content/dam/2022/04"))); + + testSuccessfulDownload(pathPredicate, pathFilters,List.of( + "/|{}", + "/content|{}", + "/content/dam|{}", + "/content/dam/1000|{}", + "/content/dam/1000/12|{\"p1\":\"v100012\"}", + "/content/dam/2022|{}", + "/content/dam/2022/01|{\"p1\":\"v202201\"}", + "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}", + "/content/dam/2022/02|{\"p1\":\"v202202\"}", + "/content/dam/2022/03|{\"p1\":\"v202203\"}", + "/content/dam/2022/04|{\"p1\":\"v202204\"}" + )); + } + + + @Test + public void createFFS_mongoFiltering_include_excludes3() throws Exception { + System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, "false"); + System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, "true"); + + Predicate<String> pathPredicate = s -> true; + + List<PathFilter> pathFilters = List.of(new PathFilter(List.of("/"), List.of("/content/dam", "/etc", "/home", "/jcr:system"))); + + testSuccessfulDownload(pathPredicate, pathFilters, List.of( + "/|{}", + "/content|{}", + "/content/dam|{}", + "/etc|{}", + "/home|{}", + "/jcr:system|{}" + )); + } + + @Test + public void createFFS_mongoFiltering_include_excludes4() throws Exception { + System.setProperty(OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS, "false"); + System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, "true"); + + Predicate<String> pathPredicate = s -> true; + + List<PathFilter> pathFilters = List.of( + new PathFilter(List.of("/content/dam/1000"), List.of()), + new PathFilter(List.of("/content/dam/2022"), List.of("/content/dam/2022/01")) + ); + + testSuccessfulDownload(pathPredicate, pathFilters, List.of( + "/|{}", + "/content|{}", + "/content/dam|{}", + "/content/dam/1000|{}", + "/content/dam/1000/12|{\"p1\":\"v100012\"}", + "/content/dam/2022|{}", + "/content/dam/2022/01|{\"p1\":\"v202201\"}", + "/content/dam/2022/02|{\"p1\":\"v202202\"}", + "/content/dam/2022/02/01|{\"p1\":\"v20220201\"}", + "/content/dam/2022/02/02|{\"p1\":\"v20220202\"}", + "/content/dam/2022/02/03|{\"p1\":\"v20220203\"}", + "/content/dam/2022/02/04|{\"p1\":\"v20220204\"}", + "/content/dam/2022/03|{\"p1\":\"v202203\"}", + "/content/dam/2022/04|{\"p1\":\"v202204\"}" + + )); + } + @Test public void createFFS_mongoFiltering_multipleIndexes() throws Exception { System.setProperty(OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING, "true"); @@ -418,11 +515,19 @@ public class PipelinedIT { private void createContent(NodeStore rwNodeStore) throws CommitFailedException { @NotNull NodeBuilder rootBuilder = rwNodeStore.getRoot().builder(); @NotNull NodeBuilder contentDamBuilder = rootBuilder.child("content").child("dam"); - contentDamBuilder.child("2023").child("01").setProperty("p1", "v202301"); - contentDamBuilder.child("2022").child("02").setProperty("p1", "v202202"); - contentDamBuilder.child("2023").child("01").setProperty("p1", "v202301"); contentDamBuilder.child("1000").child("12").setProperty("p1", "v100012"); + contentDamBuilder.child("2022").child("01").setProperty("p1", "v202201"); + contentDamBuilder.child("2022").child("01").child("01").setProperty("p1", "v20220101"); + contentDamBuilder.child("2022").child("02").setProperty("p1", "v202202"); + contentDamBuilder.child("2022").child("02").child("01").setProperty("p1", "v20220201"); + contentDamBuilder.child("2022").child("02").child("02").setProperty("p1", "v20220202"); + contentDamBuilder.child("2022").child("02").child("03").setProperty("p1", "v20220203"); + contentDamBuilder.child("2022").child("02").child("04").setProperty("p1", "v20220204"); + contentDamBuilder.child("2022").child("03").setProperty("p1", "v202203"); + contentDamBuilder.child("2022").child("04").setProperty("p1", "v202204"); contentDamBuilder.child("2023").setProperty("p2", "v2023"); + contentDamBuilder.child("2023").child("01").setProperty("p1", "v202301"); + contentDamBuilder.child("2023").child("01").setProperty("p1", "v202301"); contentDamBuilder.child("2023").child("02").child("28").setProperty("p1", "v20230228"); // Node with very long name @@ -451,7 +556,15 @@ public class PipelinedIT { "/content/dam/1000|{}", "/content/dam/1000/12|{\"p1\":\"v100012\"}", "/content/dam/2022|{}", + "/content/dam/2022/01|{\"p1\":\"v202201\"}", + "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}", "/content/dam/2022/02|{\"p1\":\"v202202\"}", + "/content/dam/2022/02/01|{\"p1\":\"v20220201\"}", + "/content/dam/2022/02/02|{\"p1\":\"v20220202\"}", + "/content/dam/2022/02/03|{\"p1\":\"v20220203\"}", + "/content/dam/2022/02/04|{\"p1\":\"v20220204\"}", + "/content/dam/2022/03|{\"p1\":\"v202203\"}", + "/content/dam/2022/04|{\"p1\":\"v202204\"}", "/content/dam/2023|{\"p2\":\"v2023\"}", "/content/dam/2023/01|{\"p1\":\"v202301\"}", "/content/dam/2023/02|{}", diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java index ae8f2ec5db..c1f05fe26b 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java @@ -24,6 +24,7 @@ import com.mongodb.client.FindIterable; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCursor; import com.mongodb.client.MongoDatabase; +import org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths; import org.apache.jackrabbit.oak.plugins.document.Collection; import org.apache.jackrabbit.oak.plugins.document.DocumentStore; import org.apache.jackrabbit.oak.plugins.document.NodeDocument; @@ -32,6 +33,7 @@ import org.apache.jackrabbit.oak.plugins.metric.MetricStatisticsProvider; import org.apache.jackrabbit.oak.spi.filter.PathFilter; import org.bson.BsonDocument; import org.bson.conversions.Bson; +import org.junit.Before; import org.junit.Test; import java.util.ArrayList; @@ -43,6 +45,7 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.stream.Collectors; +import java.util.stream.IntStream; import static org.junit.Assert.assertEquals; import static org.mockito.ArgumentMatchers.any; @@ -53,6 +56,8 @@ import static org.mockito.Mockito.when; public class PipelinedMongoDownloadTaskTest { + private MongoRegexPathFilterFactory regexFilterBuilder; + private NodeDocument newBasicDBObject(String id, long modified, DocumentStore docStore) { NodeDocument obj = Collection.NODES.newDocument(docStore); obj.put(NodeDocument.ID, "3:/content/dam/asset" + id); @@ -61,6 +66,11 @@ public class PipelinedMongoDownloadTaskTest { return obj; } + @Before + public void setUp() { + this.regexFilterBuilder = new MongoRegexPathFilterFactory(PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS); + } + @Test public void connectionToMongoFailure() throws Exception { @SuppressWarnings("unchecked") @@ -142,42 +152,139 @@ public class PipelinedMongoDownloadTaskTest { @Test public void ancestorsFilters() { - assertEquals(Set.of(), PipelinedMongoDownloadTask.getAncestors(Set.of())); - assertEquals(Set.of("/"), PipelinedMongoDownloadTask.getAncestors(Set.of("/"))); - assertEquals(Set.of("/", "/a"), PipelinedMongoDownloadTask.getAncestors(Set.of("/a"))); - assertEquals(Set.of("/", "/a", "/b", "/c", "/c/c1", "/c/c1/c2", "/c/c1/c2/c3", "/c/c1/c2/c4"), - PipelinedMongoDownloadTask.getAncestors(Set.of("/a", "/b", "/c/c1/c2/c3", "/c/c1/c2/c4")) + assertEquals(List.of(), PipelinedMongoDownloadTask.getAncestors(List.of())); + assertEquals(List.of("/"), PipelinedMongoDownloadTask.getAncestors(List.of("/"))); + assertEquals(List.of("/", "/a"), PipelinedMongoDownloadTask.getAncestors(List.of("/a"))); + assertEquals(List.of("/", "/a", "/b", "/c", "/c/c1", "/c/c1/c2", "/c/c1/c2/c3", "/c/c1/c2/c4"), + PipelinedMongoDownloadTask.getAncestors(List.of("/a", "/b", "/c/c1/c2/c3", "/c/c1/c2/c4")) ); } @Test - public void regexFilters() { - assertEquals(Set.of(), PipelinedMongoDownloadTask.extractIncludedPaths(null)); + public void regexFiltersIncludedPathsOnly() { + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(null)); - assertEquals(Set.of(), PipelinedMongoDownloadTask.extractIncludedPaths(List.of())); + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(List.of())); List<PathFilter> singlePathFilter = List.of( new PathFilter(List.of("/content/dam"), List.of()) ); - assertEquals(Set.of("/content/dam"), PipelinedMongoDownloadTask.extractIncludedPaths(singlePathFilter)); + assertEquals(new MongoFilterPaths(List.of("/content/dam"), List.of()), regexFilterBuilder.buildMongoFilter(singlePathFilter)); List<PathFilter> multipleIncludeFilters = createIncludedPathFilters("/content/dam", "/content/dam"); - assertEquals(Set.of("/content/dam"), PipelinedMongoDownloadTask.extractIncludedPaths(multipleIncludeFilters)); + assertEquals(new MongoFilterPaths(List.of("/content/dam"), List.of()), regexFilterBuilder.buildMongoFilter(multipleIncludeFilters)); - List<PathFilter> includesRoot = List.of( - new PathFilter(List.of("/"), List.of()) - ); - assertEquals(Set.of(), PipelinedMongoDownloadTask.extractIncludedPaths(includesRoot)); + List<PathFilter> includesRoot = createIncludedPathFilters("/", "/a/a1/a2"); + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(includesRoot)); List<PathFilter> multipleIncludeFiltersDifferent = createIncludedPathFilters("/a/a1", "/a/a1/a2"); - assertEquals(Set.of("/a/a1"), PipelinedMongoDownloadTask.extractIncludedPaths(multipleIncludeFiltersDifferent)); + assertEquals(new MongoFilterPaths(List.of("/a/a1"), List.of()), regexFilterBuilder.buildMongoFilter(multipleIncludeFiltersDifferent)); List<PathFilter> multipleIncludeFiltersDifferent2 = createIncludedPathFilters("/a/a1/a2", "/a/a1", "/b", "/c", "/cc"); - assertEquals(Set.of("/a/a1", "/b", "/c", "/cc"), PipelinedMongoDownloadTask.extractIncludedPaths(multipleIncludeFiltersDifferent2)); + assertEquals(new MongoFilterPaths(List.of("/a/a1", "/b", "/c", "/cc"), List.of()), regexFilterBuilder.buildMongoFilter(multipleIncludeFiltersDifferent2)); + } + + @Test + public void regexFiltersIncludedAndExcludedPaths() { + // Excludes is not empty + + // Exclude paths is inside the included path tree + List<PathFilter> withExcludeFilter1 = List.of( + new PathFilter(List.of("/a"), List.of("/a/b")) + ); + assertEquals(new MongoFilterPaths(List.of("/a"), List.of("/a/b")), regexFilterBuilder.buildMongoFilter(withExcludeFilter1)); + + // includedPath contains the root + List<PathFilter> withExcludeFilter2 = List.of( + new PathFilter(List.of("/"), List.of("/a")) + ); + assertEquals(new MongoFilterPaths(List.of("/"), List.of("/a")), regexFilterBuilder.buildMongoFilter(withExcludeFilter2)); + + // One of the filters excludes a directory that is included by another filter. The path should still be included. + List<PathFilter> withExcludeFilter3_a = List.of( + new PathFilter(List.of("/"), List.of("/a")), + new PathFilter(List.of("/"), List.of()) + ); + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(withExcludeFilter3_a)); + + List<PathFilter> withExcludeFilter3_b = List.of( + new PathFilter(List.of("/a"), List.of("/a/a_excluded")), + new PathFilter(List.of("/a"), List.of()) + ); + assertEquals(new MongoFilterPaths(List.of("/a"), List.of()), regexFilterBuilder.buildMongoFilter(withExcludeFilter3_b)); + + List<PathFilter> withExcludeFilter4 = List.of( + new PathFilter(List.of("/"), List.of("/exc_a")), + new PathFilter(List.of("/"), List.of("/exc_b")) + ); + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(withExcludeFilter4)); - List<PathFilter> withExcludeFilter = List.of( - new PathFilter(List.of("/"), List.of("/var")) + List<PathFilter> withExcludeFilter5 = List.of( + new PathFilter(List.of("/a"), List.of("/a/a_excluded")), + new PathFilter(List.of("/b"), List.of("/b/b_excluded")) ); - assertEquals(Set.of(), PipelinedMongoDownloadTask.extractIncludedPaths(withExcludeFilter)); + assertEquals(new MongoFilterPaths(List.of("/a", "/b"), List.of("/a/a_excluded", "/b/b_excluded")), regexFilterBuilder.buildMongoFilter(withExcludeFilter5)); + + List<PathFilter> withExcludeFilter6 = List.of( + new PathFilter(List.of("/"), List.of("/a", "/b", "/c")), + new PathFilter(List.of("/"), List.of("/b", "/c", "/d")) + ); + assertEquals(new MongoFilterPaths(List.of("/"), List.of("/b", "/c")), regexFilterBuilder.buildMongoFilter(withExcludeFilter6)); + + List<PathFilter> withExcludeFilter7 = List.of( + new PathFilter(List.of("/a"), List.of("/a/b")), + new PathFilter(List.of("/a"), List.of("/a/b/c")) + ); + assertEquals(new MongoFilterPaths(List.of("/"), List.of("/b", "/c")), regexFilterBuilder.buildMongoFilter(withExcludeFilter6)); + assertEquals(new MongoFilterPaths(List.of("/a"), List.of("/a/b/c")), regexFilterBuilder.buildMongoFilter(withExcludeFilter7)); + + List<PathFilter> withExcludeFilter8 = List.of( + new PathFilter( + List.of("/p1", "/p2", "/p3", "/p4", "/p5", "/p6", "/p7", "/p8", "/p9"), + List.of("/p10", "/p5/p5_s1", "/p5/p5_s2/p5_s3", "/p11") + ) + ); + assertEquals(new MongoFilterPaths( + List.of("/p1", "/p2", "/p3", "/p4", "/p5", "/p6", "/p7", "/p8", "/p9"), + List.of("/p5/p5_s1", "/p5/p5_s2/p5_s3")), + regexFilterBuilder.buildMongoFilter(withExcludeFilter8)); + } + + + @Test + public void regexFiltersLargeNumberOfIncludedPaths1() { + // Test a single path filter with many included paths. + var maximumPaths = IntStream.range(0, PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS) + .mapToObj(i -> "/p" + i) + .collect(Collectors.toList()); + var pathFilter1 = List.of(new PathFilter(maximumPaths, List.of())); + assertEquals(new MongoFilterPaths(maximumPaths.stream().sorted().collect(Collectors.toList()), List.of()), + regexFilterBuilder.buildMongoFilter(pathFilter1)); + + + var tooManyPaths = IntStream.range(0, PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS + 1) + .mapToObj(i -> "/p" + i) + .collect(Collectors.toList()); + var pathFilter2 = List.of(new PathFilter(tooManyPaths, List.of())); + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(pathFilter2)); + } + + @Test + public void regexFiltersLargeNumberOfIncludedPaths2() { + // Test many path filters each with a single included paths. + var pathFilters = IntStream.range(0, PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS + 1) + .mapToObj(i -> new PathFilter(List.of("/p" + i), List.of())) + .collect(Collectors.toList()); + assertEquals(MongoFilterPaths.DOWNLOAD_ALL, regexFilterBuilder.buildMongoFilter(pathFilters)); + } + + @Test + public void regexFiltersLargeNumberOfExcludedPaths() { + // When there are too many excluded paths, do not filter on excluded paths but still filter on included paths. + var excludedPaths = IntStream.range(0, PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS + 1) + .mapToObj(i -> "/parent/p" + i) + .collect(Collectors.toList()); + var pathFilter = List.of(new PathFilter(List.of("/parent"), excludedPaths)); + assertEquals(new MongoFilterPaths(List.of("/parent"), List.of()), regexFilterBuilder.buildMongoFilter(pathFilter)); } } \ No newline at end of file
