[GitHub] [jackrabbit-oak] nfsantos commented on a diff in pull request #1042: OAK-10358 - Filter by path on MongoDB query when downloading repository in Indexing job
nfsantos commented on code in PR #1042: URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1316874389 ## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java: ## @@ -215,8 +190,90 @@ private void reportProgress(String id) { traversalLog.trace(id); } -private void downloadRange(DownloadRange range) throws InterruptedException, TimeoutException { -BsonDocument findQuery = range.getFindQuery(); + +private void downloadWithRetryOnConnectionErrors() throws InterruptedException, TimeoutException { +// If regex filtering is enabled, start by downloading the ancestors of the path used for filtering. +// That is, download "/", "/content", "/content/dam" for a base path of "/content/dam". These nodes will not be +// matched by the regex used in the Mongo query, which assumes a prefix of "???:/content/dam" +Bson childrenFilter = null; +String regexBasePath = getPathForRegexFiltering(); +if (regexBasePath != null) { +// Regex path filtering is enabled +// Download the ancestors in a separate query. 
No retrials done on this query, as it will take only a few +// seconds and is done at the start of the job, so if it fails, the job can be retried without losing much work +LOG.info("Using regex filtering with path {}", regexBasePath); +Bson ancestorsQuery = ancestorsFilter(regexBasePath); +LOG.info("Downloading ancestors of {}", regexBasePath); +// Let Mongo decide which index to use for this query, it will return very few documents +FindIterable mongoIterable = dbCollection +.withReadPreference(readPreference) +.find(ancestorsQuery); +download(mongoIterable); +// Filter to apply to the main query +childrenFilter = childrenFilter(regexBasePath); +} + +Instant failuresStartTimestamp = null; // When the last series of failures started +long retryIntervalMs = retryInitialIntervalMillis; +int numberOfFailures = 0; +boolean downloadCompleted = false; +Map exceptions = new HashMap<>(); +this.nextLastModified = 0; +this.lastIdDownloaded = null; +while (!downloadCompleted) { +try { +if (lastIdDownloaded != null) { +LOG.info("Recovering from broken connection, finishing downloading documents with _modified={}", nextLastModified); +downloadRange(new DownloadRange(nextLastModified, nextLastModified + 1, lastIdDownloaded), childrenFilter); +// We have managed to reconnect, reset the failure timestamp +failuresStartTimestamp = null; +numberOfFailures = 0; +// Continue downloading everything starting from the next _lastmodified value +downloadRange(new DownloadRange(nextLastModified + 1, Long.MAX_VALUE, null), childrenFilter); +} else { +downloadRange(new DownloadRange(nextLastModified, Long.MAX_VALUE, null), childrenFilter); +} +downloadCompleted = true; +} catch (MongoException e) { +if (e instanceof MongoInterruptedException || e instanceof MongoIncompatibleDriverException) { +// Non-recoverable exceptions +throw e; +} +if (failuresStartTimestamp == null) { +failuresStartTimestamp = Instant.now().truncatedTo(ChronoUnit.SECONDS); +} +LOG.warn("Connection error downloading 
from MongoDB.", e); +long secondsSinceStartOfFailures = Duration.between(failuresStartTimestamp, Instant.now()).toSeconds(); +if (secondsSinceStartOfFailures > retryDuringSeconds) { +// Give up. Get a string of all exceptions that were thrown +StringBuilder summary = new StringBuilder(); +for (Map.Entry entry : exceptions.entrySet()) { + summary.append("\n\t").append(entry.getValue()).append("x: ").append(entry.getKey()); +} +throw new RetryException(retryDuringSeconds, summary.toString(), e); +} else { +numberOfFailures++; +LOG.warn("Retrying download in {} ms; number of times failed: {}; current series of failures started at: {} ({} seconds ago)", +retryIntervalMs, numberOfFailures, failuresStartTimestamp, secondsSinceStartOfFailures); +exceptions.compute(e.getClass().getSimpleName() + " - " + e.getMessage(), +(key, val) -> val == null ? 1 : val + 1);
[GitHub] [jackrabbit-oak] nfsantos commented on a diff in pull request #1042: OAK-10358 - Filter by path on MongoDB query when downloading repository in Indexing job
nfsantos commented on code in PR #1042: URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1316861299 ## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java: ## @@ -225,12 +282,96 @@ private void downloadRange(DownloadRange range) throws InterruptedException, Tim download(mongoIterable); } -private void downloadAll() throws InterruptedException, TimeoutException { -LOG.info("Traversing all documents"); -FindIterable mongoIterable = dbCollection -.withReadPreference(readPreference) -.find(); -download(mongoIterable); +private void downloadWithNaturalOrdering() throws InterruptedException, TimeoutException { +// We are downloading potentially a large fraction of the repository, so using an index scan will be +// inefficient. So we pass the natural hint to force MongoDB to use natural ordering, that is, column scan +String regexBasePath = getPathForRegexFiltering(); +if (regexBasePath == null) { +LOG.info("Downloading full repository using natural order"); +FindIterable mongoIterable = dbCollection +.withReadPreference(readPreference) +.find() +.hint(NATURAL_HINT); +download(mongoIterable); + +} else { +Bson ancestorQuery = ancestorsFilter(regexBasePath); +// Do not use natural order to download ancestors. The number of ancestors will always be small, likely less +// than 10, so let MongoDB use an index to find them. +LOG.info("Downloading using regex path filtering. Downloading ancestors: {}.", ancestorQuery); +FindIterable ancestorsIterable = dbCollection +.withReadPreference(readPreference) +.find(ancestorQuery); +download(ancestorsIterable); + +Bson childrenQuery = childrenFilter(regexBasePath); +LOG.info("Downloading using regex path filtering. 
Downloading children: {}.", childrenQuery); +FindIterable childrenIterable = dbCollection +.withReadPreference(readPreference) +.find(childrenQuery) +.hint(NATURAL_HINT); +download(childrenIterable); +} +} + +private String getPathForRegexFiltering() { +if (!regexPathFiltering) { +LOG.info("Regex path filtering disabled."); +return null; +} +return getSingleIncludedPath(pathFilters); +} + +// Package private for testing +static String getSingleIncludedPath(List pathFilters) { +// For the time being, we only enable path filtering if there is a single include path across all indexes and no +// exclude paths. This is the case for most of the larger indexes. We can consider generalizing this in the future. +LOG.info("Creating regex filter from pathFilters: " + pathFilters); +if (pathFilters == null) { +return null; +} +Set includedPaths = pathFilters.stream() +.flatMap(pathFilter -> pathFilter.getIncludedPaths().stream()) +.collect(Collectors.toSet()); + +Set excludedPaths = pathFilters.stream() +.flatMap(pathFilter -> pathFilter.getExcludedPaths().stream()) +.collect(Collectors.toSet()); + +if (excludedPaths.isEmpty() && includedPaths.size() == 1) { +return includedPaths.stream().iterator().next(); +} else { +return null; +} +} + +private static Bson childrenFilter(String path) { +if (!path.endsWith("/")) { +path = path + "/"; +} +String quotedPath = Pattern.quote(path); +// For entries with path sizes above a certain threshold, the _id field contains a hash instead of the path of +// the entry. The path is stored instead in the _path field. Therefore, we have to include in the filter also +// the documents with matching _path. +return Filters.or( +regex(NodeDocument.ID, Pattern.compile("^[0-9]{1,3}:" + quotedPath + ".*$")), +regex(NodeDocument.PATH, Pattern.compile(quotedPath + ".*$")) +); +} + +private static Bson ancestorsFilter(String path) { Review Comment: Good point. 
I added a test and code to handle the case where the path used for filtering is long enough to be handled as a long path: 2ae36bdea6ae2552d3610c45bcb1e406e703b353 I expect that this will never happen in practice, as it would mean that one of the `includePaths` is extremely long, but we should guard against it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [jackrabbit-oak] nfsantos commented on a diff in pull request #1042: OAK-10358 - Filter by path on MongoDB query when downloading repository in Indexing job
nfsantos commented on code in PR #1042: URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1311791210 ## oak-store-spi/src/main/java/org/apache/jackrabbit/oak/spi/filter/PathFilter.java: ## @@ -108,8 +108,7 @@ public static PathFilter from(@NotNull NodeBuilder defn) { * If both are empty then all paths would be considered to be included * * @param includes list of paths which should not be included - * @param excludes list of p - * aths which should be included + * @param excludes list of paths which should be included Review Comment: Yes, it looks like a mistake in the documentation. This PR does not change this documentation, I only fixed the wrong line break, but now I fixed the text of the docs as well. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [jackrabbit-oak] nfsantos commented on a diff in pull request #1042: OAK-10358 - Filter by path on MongoDB query when downloading repository in Indexing job
nfsantos commented on code in PR #1042: URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1311356431 ## oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java: ## @@ -225,12 +282,96 @@ private void downloadRange(DownloadRange range) throws InterruptedException, Tim download(mongoIterable); } -private void downloadAll() throws InterruptedException, TimeoutException { -LOG.info("Traversing all documents"); -FindIterable mongoIterable = dbCollection -.withReadPreference(readPreference) -.find(); -download(mongoIterable); +private void downloadWithNaturalOrdering() throws InterruptedException, TimeoutException { +// We are downloading potentially a large fraction of the repository, so using an index scan will be +// inefficient. So we pass the natural hint to force MongoDB to use natural ordering, that is, column scan +String regexBasePath = getPathForRegexFiltering(); +if (regexBasePath == null) { +LOG.info("Downloading full repository using natural order"); +FindIterable mongoIterable = dbCollection +.withReadPreference(readPreference) +.find() +.hint(NATURAL_HINT); +download(mongoIterable); + +} else { +Bson ancestorQuery = ancestorsFilter(regexBasePath); +// Do not use natural order to download ancestors. The number of ancestors will always be small, likely less +// than 10, so let MongoDB use an index to find them. +LOG.info("Downloading using regex path filtering. Downloading ancestors: {}.", ancestorQuery); +FindIterable ancestorsIterable = dbCollection +.withReadPreference(readPreference) +.find(ancestorQuery); +download(ancestorsIterable); + +Bson childrenQuery = childrenFilter(regexBasePath); +LOG.info("Downloading using regex path filtering. 
Downloading children: {}.", childrenQuery); +FindIterable childrenIterable = dbCollection +.withReadPreference(readPreference) +.find(childrenQuery) +.hint(NATURAL_HINT); +download(childrenIterable); +} +} + +private String getPathForRegexFiltering() { +if (!regexPathFiltering) { +LOG.info("Regex path filtering disabled."); +return null; +} +return getSingleIncludedPath(pathFilters); +} + +// Package private for testing +static String getSingleIncludedPath(List pathFilters) { +// For the time being, we only enable path filtering if there is a single include path across all indexes and no +// exclude paths. This is the case for most of the larger indexes. We can consider generalizing this in the future. +LOG.info("Creating regex filter from pathFilters: " + pathFilters); +if (pathFilters == null) { +return null; +} +Set includedPaths = pathFilters.stream() +.flatMap(pathFilter -> pathFilter.getIncludedPaths().stream()) +.collect(Collectors.toSet()); + +Set excludedPaths = pathFilters.stream() +.flatMap(pathFilter -> pathFilter.getExcludedPaths().stream()) +.collect(Collectors.toSet()); + +if (excludedPaths.isEmpty() && includedPaths.size() == 1) { +return includedPaths.stream().iterator().next(); +} else { +return null; +} +} + +private static Bson childrenFilter(String path) { +if (!path.endsWith("/")) { +path = path + "/"; Review Comment: This is being done, see line 195, there are even comments explaining how this case is handled: ``` // If regex filtering is enabled, start by downloading the ancestors of the path used for filtering. // That is, download "/", "/content", "/content/dam" for a base path of "/content/dam". These nodes will not be // matched by the regex used in the Mongo query, which assumes a prefix of "???:/content/dam" ... if (regexBasePath != null) { // Regex path filtering is enabled // Download the ancestors in a separate query. 
No retrials done on this query, as it will take only a few // seconds and is done at the start of the job, so if it fails, the job can be retried without losing much work LOG.info("Using regex filtering with path {}", regexBasePath); Bson ancestorsQuery = ancestorsFilter(regexBasePath); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [jackrabbit-oak] nfsantos commented on a diff in pull request #1042: OAK-10358 - Filter by path on MongoDB query when downloading repository in Indexing job
nfsantos commented on code in PR #1042: URL: https://github.com/apache/jackrabbit-oak/pull/1042#discussion_r1311353211 ## oak-run-commons/src/test/java/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedIT.java: ## @@ -66,52 +69,95 @@ public class PipelinedIT { @Rule public final TemporaryFolder sortFolder = new TemporaryFolder(); +private static final PathFilter contentDamPathFilter = new PathFilter(List.of("/content/dam"), List.of()); + +private static final int LONG_PATH_TEST_LEVELS = 30; +private static final String LONG_PATH_LEVEL_STRING = "Z12345678901234567890-Level_"; + @BeforeClass -public static void checkMongoDbAvailable() { +public static void setup() throws IOException { Assume.assumeTrue(MongoUtils.isAvailable()); -} - -@Before -public void setup() throws IOException { +// Generate dynamically the entries expected for the long path tests +StringBuilder path = new StringBuilder("/content/dam"); Review Comment: There are more nodes added in the file below, see in line 266: ```java private static final List EXPECTED_FFS = new ArrayList<>(List.of( "/|{}", "/content|{}", "/content/dam|{}", "/content/dam/1000|{}", "/content/dam/1000/12|{\"p1\":\"v100012\"}", "/content/dam/2022|{}", "/content/dam/2022/02|{\"p1\":\"v202202\"}", "/content/dam/2023|{\"p2\":\"v2023\"}", "/content/dam/2023/01|{\"p1\":\"v202301\"}", "/content/dam/2023/02|{}", "/content/dam/2023/02/28|{\"p1\":\"v20230228\"}" )); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@jackrabbit.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org