vinothchandar commented on a change in pull request #1008: [HUDI-80] Leverage Commit metadata to figure out partitions to be cleaned for Cleaning by commits mode URL: https://github.com/apache/incubator-hudi/pull/1008#discussion_r344985313
########## File path: hudi-client/src/main/java/org/apache/hudi/io/HoodieCleanHelper.java ########## @@ -76,6 +80,45 @@ public HoodieCleanHelper(HoodieTable<T> hoodieTable, HoodieWriteConfig config) { .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); } + /** + * Returns list of partitions where clean operations needs to be performed + * @param newInstantToRetain New instant to be retained after this cleanup operation + * @return list of partitions to scan for cleaning + * @throws IOException when underlying file-system throws this exception + */ + public List<String> getPartitionPathsToClean(Option<HoodieInstant> newInstantToRetain) throws IOException { + if (config.isIncrementalCleaner() && newInstantToRetain.isPresent() + && (HoodieCleaningPolicy.KEEP_LATEST_COMMITS == config.getCleanerPolicy())) { + Option<HoodieInstant> lastClean = + hoodieTable.getCleanTimeline().filterCompletedInstants().lastInstant(); + if (lastClean.isPresent()) { + HoodieCleanMetadata cleanMetadata = AvroUtils + .deserializeHoodieCleanMetadata(hoodieTable.getActiveTimeline().getInstantDetails(lastClean.get()).get()); + if ((cleanMetadata.getEarliestCommitToRetain() != null) || (cleanMetadata.getEarliestCommitToRetain() != "")) { + logger.warn("Incremental Cleaning mode is enabled. Looking up partition-paths that have since changed " + + "since last cleaned at " + cleanMetadata.getEarliestCommitToRetain() + + ". New Instant to retain : " + newInstantToRetain); + return hoodieTable.getCompletedCommitTimeline().getInstants().filter(instant -> { Review comment: Do we need both the delta and commit timelines? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services