nfsantos commented on code in PR #1042: URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1316861299
########## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java: ########## @@ -225,12 +282,96 @@ private void downloadRange(DownloadRange range) throws InterruptedException, Tim download(mongoIterable); } - private void downloadAll() throws InterruptedException, TimeoutException { - LOG.info("Traversing all documents"); - FindIterable<BasicDBObject> mongoIterable = dbCollection - .withReadPreference(readPreference) - .find(); - download(mongoIterable); + private void downloadWithNaturalOrdering() throws InterruptedException, TimeoutException { + // We are downloading potentially a large fraction of the repository, so using an index scan will be + // inefficient. So we pass the natural hint to force MongoDB to use natural ordering, that is, column scan + String regexBasePath = getPathForRegexFiltering(); + if (regexBasePath == null) { + LOG.info("Downloading full repository using natural order"); + FindIterable<BasicDBObject> mongoIterable = dbCollection + .withReadPreference(readPreference) + .find() + .hint(NATURAL_HINT); + download(mongoIterable); + + } else { + Bson ancestorQuery = ancestorsFilter(regexBasePath); + // Do not use natural order to download ancestors. The number of ancestors will always be small, likely less + // than 10, so let MongoDB use an index to find them. + LOG.info("Downloading using regex path filtering. Downloading ancestors: {}.", ancestorQuery); + FindIterable<BasicDBObject> ancestorsIterable = dbCollection + .withReadPreference(readPreference) + .find(ancestorQuery); + download(ancestorsIterable); + + Bson childrenQuery = childrenFilter(regexBasePath); + LOG.info("Downloading using regex path filtering. Downloading children: {}.", childrenQuery); + FindIterable<BasicDBObject> childrenIterable = dbCollection + .withReadPreference(readPreference) + .find(childrenQuery) + .hint(NATURAL_HINT); + download(childrenIterable); + } + } + + private String getPathForRegexFiltering() { + if (!regexPathFiltering) { + LOG.info("Regex path filtering disabled."); + return null; + } + return getSingleIncludedPath(pathFilters); + } + + // Package private for testing + static String getSingleIncludedPath(List<PathFilter> pathFilters) { + // For the time being, we only enable path filtering if there is a single include path across all indexes and no + // exclude paths. This is the case for most of the larger indexes. We can consider generalizing this in the future. + LOG.info("Creating regex filter from pathFilters: " + pathFilters); + if (pathFilters == null) { + return null; + } + Set<String> includedPaths = pathFilters.stream() + .flatMap(pathFilter -> pathFilter.getIncludedPaths().stream()) + .collect(Collectors.toSet()); + + Set<String> excludedPaths = pathFilters.stream() + .flatMap(pathFilter -> pathFilter.getExcludedPaths().stream()) + .collect(Collectors.toSet()); + + if (excludedPaths.isEmpty() && includedPaths.size() == 1) { + return includedPaths.stream().iterator().next(); + } else { + return null; + } + } + + private static Bson childrenFilter(String path) { + if (!path.endsWith("/")) { + path = path + "/"; + } + String quotedPath = Pattern.quote(path); + // For entries with path sizes above a certain threshold, the _id field contains a hash instead of the path of + // the entry. The path is stored instead in the _path field. Therefore, we have to include in the filter also + // the documents with matching _path. + return Filters.or( + regex(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")), + regex(NodeDocument.PATH, Pattern.compile(quotedPath + ".*$")) + ); + } + + private static Bson ancestorsFilter(String path) { Review Comment: Good point. I added a test and code to handle the case where the path used for filtering is long enough to be handled as a long path: 2ae36bdea6ae2552d3610c45bcb1e406e703b353 I expect that this will never happen in practice, as it would mean that one of the `includePaths` is extremely long, but we should guard against it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org