nfsantos commented on code in PR #1042:
URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1311356431
##########
oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java:
##########
@@ -225,12 +282,96 @@ private void downloadRange(DownloadRange range) throws
InterruptedException, Tim
download(mongoIterable);
}
- private void downloadAll() throws InterruptedException, TimeoutException {
- LOG.info("Traversing all documents");
- FindIterable<BasicDBObject> mongoIterable = dbCollection
- .withReadPreference(readPreference)
- .find();
- download(mongoIterable);
+ private void downloadWithNaturalOrdering() throws InterruptedException,
TimeoutException {
+ // We are downloading potentially a large fraction of the repository,
so using an index scan will be
+ // inefficient. So we pass the natural hint to force MongoDB to use
natural ordering, that is, column scan
+ String regexBasePath = getPathForRegexFiltering();
+ if (regexBasePath == null) {
+ LOG.info("Downloading full repository using natural order");
+ FindIterable<BasicDBObject> mongoIterable = dbCollection
+ .withReadPreference(readPreference)
+ .find()
+ .hint(NATURAL_HINT);
+ download(mongoIterable);
+
+ } else {
+ Bson ancestorQuery = ancestorsFilter(regexBasePath);
+ // Do not use natural order to download ancestors. The number of
ancestors will always be small, likely less
+ // than 10, so let MongoDB use an index to find them.
+ LOG.info("Downloading using regex path filtering. Downloading
ancestors: {}.", ancestorQuery);
+ FindIterable<BasicDBObject> ancestorsIterable = dbCollection
+ .withReadPreference(readPreference)
+ .find(ancestorQuery);
+ download(ancestorsIterable);
+
+ Bson childrenQuery = childrenFilter(regexBasePath);
+ LOG.info("Downloading using regex path filtering. Downloading
children: {}.", childrenQuery);
+ FindIterable<BasicDBObject> childrenIterable = dbCollection
+ .withReadPreference(readPreference)
+ .find(childrenQuery)
+ .hint(NATURAL_HINT);
+ download(childrenIterable);
+ }
+ }
+
+ private String getPathForRegexFiltering() {
+ if (!regexPathFiltering) {
+ LOG.info("Regex path filtering disabled.");
+ return null;
+ }
+ return getSingleIncludedPath(pathFilters);
+ }
+
+ // Package private for testing
+ static String getSingleIncludedPath(List<PathFilter> pathFilters) {
+ // For the time being, we only enable path filtering if there is a
single include path across all indexes and no
+ // exclude paths. This is the case for most of the larger indexes. We
can consider generalizing this in the future.
+ LOG.info("Creating regex filter from pathFilters: " + pathFilters);
+ if (pathFilters == null) {
+ return null;
+ }
+ Set<String> includedPaths = pathFilters.stream()
+ .flatMap(pathFilter -> pathFilter.getIncludedPaths().stream())
+ .collect(Collectors.toSet());
+
+ Set<String> excludedPaths = pathFilters.stream()
+ .flatMap(pathFilter -> pathFilter.getExcludedPaths().stream())
+ .collect(Collectors.toSet());
+
+ if (excludedPaths.isEmpty() && includedPaths.size() == 1) {
+ return includedPaths.stream().iterator().next();
+ } else {
+ return null;
+ }
+ }
+
+ private static Bson childrenFilter(String path) {
+ if (!path.endsWith("/")) {
+ path = path + "/";
Review Comment:
This is being done, see line 195, there are even comments explaining how
this case is handled:
```
// If regex filtering is enabled, start by downloading the ancestors
of the path used for filtering.
// That is, download "/", "/content", "/content/dam" for a base path
of "/content/dam". These nodes will not be
// matched by the regex used in the Mongo query, which assumes a
prefix of "???:/content/dam"
...
if (regexBasePath != null) {
// Regex path filtering is enabled
// Download the ancestors in a separate query. No retrials done
on this query, as it will take only a few
// seconds and is done at the start of the job, so if it fails,
the job can be retried without losing much work
LOG.info("Using regex filtering with path {}", regexBasePath);
Bson ancestorsQuery = ancestorsFilter(regexBasePath);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]