This is an automated email from the ASF dual-hosted git repository. daim pushed a commit to branch DetailedGC/OAK-10199 in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git
commit 0d23236438099cabde8439088507d68d05f61238 Author: Nuno Santos <[email protected]> AuthorDate: Thu Jan 18 09:17:09 2024 +0100 OAK-10592 - Add support to specify a custom regex to exclude documents from being downloaded from Mongo during indexing (#1267) --- .../document/flatfile/pipelined/ConfigHelper.java | 6 + .../pipelined/PipelinedMongoDownloadTask.java | 267 ++++++++++++--------- .../document/flatfile/pipelined/PipelinedIT.java | 64 ++++- .../pipelined/PipelinedMongoDownloadTaskTest.java | 90 ++++++- 4 files changed, 305 insertions(+), 122 deletions(-) diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java index e0e4e6fef2..9374af70b2 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java @@ -30,6 +30,12 @@ public class ConfigHelper { return result; } + public static String getSystemPropertyAsString(String name, String defaultValue) { + String result = System.getProperty(name, defaultValue); + LOG.info("Config {}={}", name, result); + return result; + } + public static boolean getSystemPropertyAsBoolean(String name, boolean defaultValue) { String sysPropValue = System.getProperty(name); boolean value; diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java index e0f8adf497..851bbc2fd9 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java +++ 
b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java @@ -46,6 +46,7 @@ import org.bson.BsonDocument; import org.bson.codecs.configuration.CodecRegistries; import org.bson.codecs.configuration.CodecRegistry; import org.bson.conversions.Bson; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,6 +70,7 @@ import java.util.stream.Collectors; import static com.mongodb.client.model.Sorts.ascending; public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownloadTask.Result> { + public static class Result { private final long documentsDownloaded; @@ -106,6 +108,14 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo */ public static final String OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = "oak.indexer.pipelined.mongoRegexPathFiltering"; public static final boolean DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING = false; + /** + * Any document with a path that matches this regex pattern will not be downloaded. This pattern will be included + * in the Mongo query, that is, the filtering is done server-side by Mongo, which avoids downloading the documents + * matching this pattern. This is typically a _suffix_, for example "/metadata.xml$|/renditions/.*.jpg$". + * To exclude subtrees such as /content/abc, use mongoFilterPaths instead. + */ + public static final String OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX = "oak.indexer.pipelined.mongoCustomExcludeEntriesRegex"; + public static final String DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX = ""; /** * Maximum number of elements in the included/excluded paths list used for regex path filtering. 
If after @@ -129,6 +139,117 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo private static final String THREAD_NAME = "mongo-dump"; + /** + * Creates the filter to be used in the Mongo query + * + * @param mongoFilterPaths The paths to be included/excluded in the filter. These define subtrees to be included or excluded. + * (see {@link MongoFilterPaths} for details) + * @param customExcludeEntriesRegex Documents with paths matching this regex are excluded from download + * @return The filter to be used in the Mongo query, or null if no filter is required + */ + static Bson computeMongoQueryFilter(@NotNull MongoFilterPaths mongoFilterPaths, String customExcludeEntriesRegex) { + var filters = new ArrayList<Bson>(4); + if (mongoFilterPaths != MongoFilterPaths.DOWNLOAD_ALL) { + filters.add(descendantsFilter(mongoFilterPaths.included)); + if (!mongoFilterPaths.excluded.isEmpty()) { + // The Mongo filter returned here will download the top level path of each excluded subtree, which in theory + // should be excluded. That is, if the tree /a/b/c is excluded, the filter will download /a/b/c but none of + // its descendants. + // This is done because excluding also the top level path would add extra complexity to the filter and + // would not have any measurable impact on performance because it only downloads a few extra documents, one + // for each excluded subtree. The transform stage will anyway filter out these paths. 
+ Bson excludedFilter = descendantsFilter(mongoFilterPaths.excluded); + if (excludedFilter != null) { + filters.add(Filters.nor(excludedFilter)); + } + } + } + + // Custom regex filter to exclude paths + Bson customExcludedPathsFilter = createCustomExcludedEntriesFilter(customExcludeEntriesRegex); + if (customExcludedPathsFilter != null) { + filters.add(customExcludedPathsFilter); + } + if (filters.isEmpty()) { + return null; + } else if (filters.size() == 1) { + return filters.get(0); + } else { + return Filters.and(filters); + } + } + + static Bson createCustomExcludedEntriesFilter(String customRegexPattern) { + if (customRegexPattern == null || customRegexPattern.trim().isEmpty()) { + LOG.info("Mongo custom regex is disabled"); + return null; + } else { + LOG.info("Excluding nodes with paths matching regex: {}", customRegexPattern); + var pattern = Pattern.compile(customRegexPattern); + return Filters.nor( + Filters.regex(NodeDocument.ID, pattern), + Filters.regex(NodeDocument.PATH, pattern) + ); + } + } + + private static Bson descendantsFilter(List<String> paths) { + if (paths.isEmpty()) { + return null; + } + // The filter for descendants of a list of paths is a series of or conditions. For each path, we have to build + // two conditions in two different fields of the documents: + // _ _id - for non-long paths - In this case, the _id is of the form "2:/foo/bar" + // _ _path - for long paths - In this case, the _id is a hash and the document contains an additional _path + // field with the path of the document. + // We use the $in operator with a regular expression to match the paths. 
+ // https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression + + ArrayList<Pattern> pathPatterns = new ArrayList<>(); + ArrayList<Pattern> idPatterns = new ArrayList<>(); + + for (String path : paths) { + if (!path.endsWith("/")) { + path = path + "/"; + } + String quotedPath = Pattern.quote(path); + idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); + pathPatterns.add(Pattern.compile(quotedPath + ".*$")); + } + return Filters.or( + Filters.in(NodeDocument.ID, idPatterns), + Filters.in(NodeDocument.PATH, pathPatterns) + ); + } + + /** + * Returns all the ancestors paths of the given list of paths. That is, if the list is ["/a/b/c", "/a/b/d"], + * this method will return ["/", "/a", "/a/b", "/a/b/c", "/a/b/d"]. Note that the paths on the input list are also + * returned, even though they are not strictly ancestors of themselves. + */ + static List<String> getAncestors(List<String> paths) { + TreeSet<String> ancestors = new TreeSet<>(); + for (String child : paths) { + String parent = child; + while (true) { + ancestors.add(parent); + if (PathUtils.denotesRoot(parent)) { + break; + } + parent = PathUtils.getParentPath(parent); + } + } + return new ArrayList<>(ancestors); + } + + + private static Bson ancestorsFilter(List<String> paths) { + List<String> parentFilters = getAncestors(paths).stream() + .map(Utils::getIdFromPath) + .collect(Collectors.toList()); + return Filters.in(NodeDocument.ID, parentFilters); + } + private final int maxBatchNumberOfDocuments; private final BlockingQueue<NodeDocument[]> mongoDocQueue; private final List<PathFilter> pathFilters; @@ -142,14 +263,15 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo private final int maxBatchSizeBytes; private final StatisticsProvider statisticsProvider; private final MongoRegexPathFilterFactory regexPathFilterFactory; + private final String customExcludeEntriesRegex; private long 
totalEnqueueWaitTimeMillis = 0; private Instant lastDelayedEnqueueWarningMessageLoggedTimestamp = Instant.now(); private long documentsRead = 0; + private long totalDataDownloadedBytes = 0; private long nextLastModified = 0; private String lastIdDownloaded = null; - public PipelinedMongoDownloadTask(MongoDatabase mongoDatabase, MongoDocumentStore mongoDocStore, int maxBatchSizeBytes, @@ -187,6 +309,10 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS, DEFAULT_OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING_MAX_PATHS); this.regexPathFilterFactory = new MongoRegexPathFilterFactory(regexPathFilteringMaxNumberOfPaths); + this.customExcludeEntriesRegex = ConfigHelper.getSystemPropertyAsString( + PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX, + PipelinedMongoDownloadTask.DEFAULT_OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX + ); //TODO This may lead to reads being routed to secondary depending on MongoURI //So caller must ensure that its safe to read from secondary @@ -211,12 +337,15 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo } else { downloadWithNaturalOrdering(); } + long durationMillis = downloadStartWatch.elapsed(TimeUnit.MILLISECONDS); String enqueueingDelayPercentage = PipelinedUtils.formatAsPercentage(totalEnqueueWaitTimeMillis, durationMillis); String metrics = MetricsFormatter.newBuilder() .add("duration", FormattingUtils.formatToSeconds(downloadStartWatch)) .add("durationSeconds", durationMillis / 1000) .add("documentsDownloaded", documentsRead) + .add("dataDownloadedBytes", totalDataDownloadedBytes) + .add("dataDownloaded", IOUtils.humanReadableByteCountBin(totalDataDownloadedBytes)) .add("enqueueingDelayMillis", totalEnqueueWaitTimeMillis) .add("enqueueingDelayPercentage", enqueueingDelayPercentage) .build(); @@ -259,20 +388,16 @@ public class PipelinedMongoDownloadTask 
implements Callable<PipelinedMongoDownlo // If regex filtering is enabled, start by downloading the ancestors of the path used for filtering. // That is, download "/", "/content", "/content/dam" for a base path of "/content/dam". These nodes will not be // matched by the regex used in the Mongo query, which assumes a prefix of "???:/content/dam" - MongoFilterPaths mongoFilterPathsDefinition = getPathsForRegexFiltering(); - Bson childrenFilter; - if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) { + MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering(); + Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex); + if (mongoFilter == null) { LOG.info("Downloading full repository"); - childrenFilter = null; } else { + LOG.info("Downloading from Mongo using filter: {}", mongoFilter); // Regex path filtering is enabled // Download the ancestors in a separate query. No retrials done on this query, as it will take only a few // seconds and is done at the start of the job, so if it fails, the job can be retried without losing much work - downloadAncestors(mongoFilterPathsDefinition.included); - - // Filter to apply to the main query - childrenFilter = descendantsFilter(mongoFilterPathsDefinition); - LOG.info("Downloading from Mongo using filter: {}", childrenFilter); + downloadAncestors(mongoFilterPaths.included); } Instant failuresStartTimestamp = null; // When the last series of failures started @@ -286,14 +411,14 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo try { if (lastIdDownloaded != null) { LOG.info("Recovering from broken connection, finishing downloading documents with _modified={}", nextLastModified); - downloadRange(new DownloadRange(nextLastModified, nextLastModified + 1, lastIdDownloaded), childrenFilter); + downloadRange(new DownloadRange(nextLastModified, nextLastModified + 1, lastIdDownloaded), mongoFilter); // We have managed to reconnect, reset the failure 
timestamp failuresStartTimestamp = null; numberOfFailures = 0; // Continue downloading everything starting from the next _lastmodified value - downloadRange(new DownloadRange(nextLastModified + 1, Long.MAX_VALUE, null), childrenFilter); + downloadRange(new DownloadRange(nextLastModified + 1, Long.MAX_VALUE, null), mongoFilter); } else { - downloadRange(new DownloadRange(nextLastModified, Long.MAX_VALUE, null), childrenFilter); + downloadRange(new DownloadRange(nextLastModified, Long.MAX_VALUE, null), mongoFilter); } downloadCompleted = true; } catch (MongoException e) { @@ -358,9 +483,10 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo private void downloadWithNaturalOrdering() throws InterruptedException, TimeoutException { // We are downloading potentially a large fraction of the repository, so using an index scan will be // inefficient. So we pass the natural hint to force MongoDB to use natural ordering, that is, column scan - MongoFilterPaths regexBasePath = getPathsForRegexFiltering(); - if (regexBasePath == MongoFilterPaths.DOWNLOAD_ALL) { - LOG.info("Downloading full repository using natural order"); + MongoFilterPaths mongoFilterPaths = getPathsForRegexFiltering(); + Bson mongoFilter = computeMongoQueryFilter(mongoFilterPaths, customExcludeEntriesRegex); + if (mongoFilter == null) { + LOG.info("Downloading full repository from Mongo with natural order"); FindIterable<NodeDocument> mongoIterable = dbCollection .withReadPreference(readPreference) .find() @@ -368,22 +494,14 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo download(mongoIterable); } else { - downloadAncestors(regexBasePath.included); - - Bson childrenFilter = descendantsFilter(regexBasePath); - FindIterable<NodeDocument> findIterable; - if (childrenFilter == null) { - LOG.info("Downloading full repository using natural order"); - findIterable = dbCollection - .withReadPreference(readPreference) - .find(); - } else { - 
LOG.info("Downloading from Mongo using filter: {}", childrenFilter); - findIterable = dbCollection - .withReadPreference(readPreference) - .find(childrenFilter); - } - download(findIterable.hint(NATURAL_HINT)); + downloadAncestors(mongoFilterPaths.included); + + LOG.info("Downloading from Mongo with natural order using filter: {}", mongoFilter); + FindIterable<NodeDocument> findIterable = dbCollection + .withReadPreference(readPreference) + .find(mongoFilter) + .hint(NATURAL_HINT); + download(findIterable); } } @@ -391,84 +509,16 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo if (!regexPathFiltering) { LOG.info("Regex path filtering disabled."); return MongoFilterPaths.DOWNLOAD_ALL; - } - LOG.info("Computing included/excluded paths for Mongo regex path filtering. PathFilters: {}", pathFilters.stream() - .map(pf -> "PF{includedPaths=" + pf.getIncludedPaths() + ", excludedPaths=" + pf.getExcludedPaths() + "}") - .collect(Collectors.joining(", "))); - MongoFilterPaths mongoFilterPaths = this.regexPathFilterFactory.buildMongoFilter(pathFilters); - LOG.info("Paths used for regex filtering on Mongo: {}", mongoFilterPaths); - return mongoFilterPaths; - } - - private Bson descendantsFilter(MongoFilterPaths mongoFilterPathsDefinition) { - if (mongoFilterPathsDefinition == MongoFilterPaths.DOWNLOAD_ALL) { - return null; - } - Bson pathFilter = descendantsFilter(mongoFilterPathsDefinition.included); - if (mongoFilterPathsDefinition.excluded.isEmpty()) { - return pathFilter; } else { - // The Mongo filter returned here will download the top level path of each excluded subtree, which in theory - // should be excluded. That is, if the tree /a/b/c is excluded, the filter will download /a/b/c but none of - // its descendants. 
- // This is done because excluding also the top level path would add extra complexity to the filter and - // would not have any measurable impact on performance because it only downloads a few extra documents, one - // for each excluded subtree. The transform stage will anyway filter out these paths. - Bson excludedFilter = descendantsFilter(mongoFilterPathsDefinition.excluded); - return Filters.and(pathFilter, Filters.nor(excludedFilter)); - } - } - - private static Bson descendantsFilter(List<String> paths) { - if (paths.isEmpty()) { - return null; - } - // The filter for descendants of a list of paths is a series of or conditions. For each path, we have to build - // two conditions in two different fields of the documents: - // _ _id - for non-long paths - In this case, the _id is of the form "2:/foo/bar" - // _ _path - for long paths - In this case, the _id is n hash and the document contains an additional _path - // field with the path of the document. - // We use the $in operator with a regular expression to match the paths. 
- // https://www.mongodb.com/docs/manual/reference/operator/query/in/#use-the--in-operator-with-a-regular-expression - - ArrayList<Pattern> pathPatterns = new ArrayList<>(); - ArrayList<Pattern> idPatterns = new ArrayList<>(); - - for (String path : paths) { - if (!path.endsWith("/")) { - path = path + "/"; - } - String quotedPath = Pattern.quote(path); - idPatterns.add(Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")); - pathPatterns.add(Pattern.compile(quotedPath + ".*$")); - } - return Filters.or( - Filters.in(NodeDocument.ID, idPatterns), - Filters.in(NodeDocument.PATH, pathPatterns) - ); - } - - static List<String> getAncestors(List<String> paths) { - TreeSet<String> ancestors = new TreeSet<>(); - for (String child : paths) { - String parent = child; - while (true) { - ancestors.add(parent); - if (PathUtils.denotesRoot(parent)) { - break; - } - parent = PathUtils.getParentPath(parent); - } + LOG.info("Computing included/excluded paths for Mongo regex path filtering. PathFilters: {}", + pathFilters.stream() + .map(pf -> "PF{includedPaths=" + pf.getIncludedPaths() + ", excludedPaths=" + pf.getExcludedPaths() + "}") + .collect(Collectors.joining(", ")) + ); + MongoFilterPaths mongoFilterPaths = this.regexPathFilterFactory.buildMongoFilter(pathFilters); + LOG.info("Paths used for regex filtering on Mongo: {}", mongoFilterPaths); + return mongoFilterPaths; } - return new ArrayList<>(ancestors); - } - - - static Bson ancestorsFilter(List<String> paths) { - List<String> parentFilters = getAncestors(paths).stream() - .map(Utils::getIdFromPath) - .collect(Collectors.toList()); - return Filters.in(NodeDocument.ID, parentFilters); } private void download(FindIterable<NodeDocument> mongoIterable) throws InterruptedException, TimeoutException { @@ -492,6 +542,7 @@ public class PipelinedMongoDownloadTask implements Callable<PipelinedMongoDownlo nextIndex++; int docSize = (int) next.remove(NodeDocumentCodec.SIZE_FIELD); batchSize += docSize; + totalDataDownloadedBytes += 
docSize; if (batchSize >= maxBatchSizeBytes || nextIndex == batch.length) { LOG.trace("Enqueuing block with {} elements, estimated size: {} bytes", nextIndex, batchSize); tryEnqueueCopy(batch, nextIndex); diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java index 12d47ab47b..b90c44b553 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java @@ -67,6 +67,7 @@ import java.util.function.Predicate; import java.util.stream.Collectors; import static java.lang.management.ManagementFactory.getPlatformMBeanServer; +import static org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX; import static org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_MONGO_REGEX_PATH_FILTERING; import static org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.PipelinedMongoDownloadTask.OAK_INDEXER_PIPELINED_RETRY_ON_CONNECTION_ERRORS; import static org.junit.Assert.assertArrayEquals; @@ -172,7 +173,7 @@ public class PipelinedIT { Predicate<String> pathPredicate = s -> true; List<PathFilter> pathFilters = List.of(new PathFilter(List.of("/content/dam/2023"), List.of("/content/dam/2023/02"))); - testSuccessfulDownload(pathPredicate, pathFilters,List.of( + testSuccessfulDownload(pathPredicate, pathFilters, List.of( "/|{}", "/content|{}", "/content/dam|{}", @@ -195,18 +196,18 @@ public class PipelinedIT { // filter out these additional documents. 
List<PathFilter> pathFilters = List.of(new PathFilter(List.of("/content/dam/1000", "/content/dam/2022"), List.of("/content/dam/2022/02", "/content/dam/2022/04"))); - testSuccessfulDownload(pathPredicate, pathFilters,List.of( - "/|{}", - "/content|{}", - "/content/dam|{}", - "/content/dam/1000|{}", - "/content/dam/1000/12|{\"p1\":\"v100012\"}", - "/content/dam/2022|{}", - "/content/dam/2022/01|{\"p1\":\"v202201\"}", - "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}", - "/content/dam/2022/02|{\"p1\":\"v202202\"}", - "/content/dam/2022/03|{\"p1\":\"v202203\"}", - "/content/dam/2022/04|{\"p1\":\"v202204\"}" + testSuccessfulDownload(pathPredicate, pathFilters, List.of( + "/|{}", + "/content|{}", + "/content/dam|{}", + "/content/dam/1000|{}", + "/content/dam/1000/12|{\"p1\":\"v100012\"}", + "/content/dam/2022|{}", + "/content/dam/2022/01|{\"p1\":\"v202201\"}", + "/content/dam/2022/01/01|{\"p1\":\"v20220101\"}", + "/content/dam/2022/02|{\"p1\":\"v202202\"}", + "/content/dam/2022/03|{\"p1\":\"v202203\"}", + "/content/dam/2022/04|{\"p1\":\"v202204\"}" )); } @@ -321,6 +322,43 @@ public class PipelinedIT { testSuccessfulDownload(pathPredicate, pathFilters, expected); } + @Test + public void createFFSCustomExcludePathsRegex() throws Exception { + // Filter all nodes ending in /metadata.xml or having a path section with ".*.jpg" + System.setProperty(OAK_INDEXER_PIPELINED_MONGO_CUSTOM_EXCLUDE_ENTRIES_REGEX, "/metadata.xml$|/.*.jpg/.*"); + Predicate<String> pathPredicate = s -> contentDamPathFilter.filter(s) != PathFilter.Result.EXCLUDE; + + Backend rwStore = createNodeStore(false); + + // Create content + var rwNodeStore = rwStore.documentNodeStore; + @NotNull NodeBuilder rootBuilder = rwNodeStore.getRoot().builder(); + @NotNull NodeBuilder contentDamBuilder = rootBuilder.child("content").child("dam"); + contentDamBuilder.child("a.jpg").child("jcr:content").child("metadata.xml"); + contentDamBuilder.child("a.jpg").child("jcr:content").child("metadata.text"); + 
contentDamBuilder.child("image_a.png").child("jcr:content").child("metadata.text"); + contentDamBuilder.child("image_a.png").child("jcr:content").child("metadata.xml"); + rwNodeStore.merge(rootBuilder, EmptyHook.INSTANCE, CommitInfo.EMPTY); + + Backend roStore = createNodeStore(true); + PipelinedStrategy pipelinedStrategy = createStrategy(roStore, pathPredicate, null); + + File file = pipelinedStrategy.createSortedStoreFile(); + + assertTrue(file.exists()); + var expected = List.of( + "/|{}", + "/content|{}", + "/content/dam|{}", + "/content/dam/a.jpg|{}", + "/content/dam/image_a.png|{}", + "/content/dam/image_a.png/jcr:content|{}", + "/content/dam/image_a.png/jcr:content/metadata.text|{}" + ); + assertEquals(expected, Files.readAllLines(file.toPath())); + assertMetrics(); + } + private void testSuccessfulDownload(Predicate<String> pathPredicate, List<PathFilter> pathFilters) throws CommitFailedException, IOException { testSuccessfulDownload(pathPredicate, pathFilters, EXPECTED_FFS); diff --git a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java index c1f05fe26b..2b79c34dbb 100644 --- a/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java +++ b/oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTaskTest.java @@ -18,12 +18,14 @@ */ package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined; +import com.mongodb.MongoClient; import com.mongodb.MongoSocketException; import com.mongodb.ServerAddress; import com.mongodb.client.FindIterable; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCursor; import com.mongodb.client.MongoDatabase; +import 
com.mongodb.client.model.Filters; import org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined.MongoRegexPathFilterFactory.MongoFilterPaths; import org.apache.jackrabbit.oak.plugins.document.Collection; import org.apache.jackrabbit.oak.plugins.document.DocumentStore; @@ -44,10 +46,12 @@ import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; +import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; import static org.mockito.Mockito.mock; @@ -287,4 +291,88 @@ public class PipelinedMongoDownloadTaskTest { var pathFilter = List.of(new PathFilter(List.of("/parent"), excludedPaths)); assertEquals(new MongoFilterPaths(List.of("/parent"), List.of()), regexFilterBuilder.buildMongoFilter(pathFilter)); } -} \ No newline at end of file + + @Test + public void createCustomExcludeEntriesFilter() { + assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter(null)); + assertNull(PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("")); + + Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$"); + var expectedBson = Filters.nor( + Filters.regex(NodeDocument.ID, p), + Filters.regex(NodeDocument.PATH, p) + ); + var actualBson = PipelinedMongoDownloadTask.createCustomExcludedEntriesFilter("^[0-9]{1,3}:/a/b.*$"); + + assertBsonEquals(expectedBson, actualBson); + } + + @Test + public void computeMongoQueryFilterNoPathFilterNoExcludeFilter() { + // No path filter and no exclude filter + assertNull( + PipelinedMongoDownloadTask.computeMongoQueryFilter( + MongoFilterPaths.DOWNLOAD_ALL, + null + ) + ); + } + + @Test + public void computeMongoQueryFilterWithPathFilterNoExcludeFilter() { + // Path filter but 
no exclude filter + var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter( + new MongoFilterPaths(List.of("/parent"), List.of()), + null + ); + var expected = Filters.or( + Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")), + Filters.in(NodeDocument.PATH, Pattern.compile(Pattern.quote("/parent/") + ".*$")) + ); + assertBsonEquals(expected, actual); + } + + @Test + public void computeMongoQueryFilterNoPathFilterWithExcludeFilter() { + var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter( + MongoFilterPaths.DOWNLOAD_ALL, + "^[0-9]{1,3}:/a/b.*$" + ); + Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$"); + assertBsonEquals( + Filters.nor(Filters.regex(NodeDocument.ID, p), Filters.regex(NodeDocument.PATH, p)), + actual + ); + } + + @Test + public void computeMongoQueryFilterWithPathFilterWithExcludeFilter() { + var actual = PipelinedMongoDownloadTask.computeMongoQueryFilter( + new MongoFilterPaths(List.of("/parent"), List.of()), + "^[0-9]{1,3}:/a/b.*$" + ); + + Pattern p = Pattern.compile("^[0-9]{1,3}:/a/b.*$"); + var expected = + Filters.and( + Filters.nor(Filters.regex(NodeDocument.ID, p), Filters.regex(NodeDocument.PATH, p)), + Filters.or( + Filters.in(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + Pattern.quote("/parent/") + ".*$")), + Filters.in(NodeDocument.PATH, Pattern.compile(Pattern.quote("/parent/") + ".*$")) + )); + assertBsonEquals(expected, actual); + } + + private void assertBsonEquals(Bson actual, Bson expected) { + if (actual == null && expected == null) { + return; + } else if (actual == null || expected == null) { + throw new AssertionError("One of the bson is null. Actual: " + actual + ", expected: " + expected); + } + assertEquals( + actual.toBsonDocument(BsonDocument.class, MongoClient.getDefaultCodecRegistry()), + expected.toBsonDocument(BsonDocument.class, MongoClient.getDefaultCodecRegistry()) + ); + } +} +
