majian1998 commented on code in PR #9243:
URL: https://github.com/apache/hudi/pull/9243#discussion_r1270223787
##########
hudi-client/hudi-spark-client/src/test/java/org/apache/hudi/table/functional/TestCleanPlanExecutor.java:
##########
@@ -683,4 +694,56 @@ public void testKeepXHoursWithCleaning(
assertFalse(testTable.baseFileExists(p0, firstCommitTs, file1P0C0));
assertFalse(testTable.baseFileExists(p1, firstCommitTs, file1P1C0));
}
+
+ @Test
+ public void testGetEarliestCommitToRetain() {
+ HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
+ .withPath(basePath)
+ .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
+ .withMetadataConfig(HoodieMetadataConfig.newBuilder()
+ .withAssumeDatePartitioning(true)
+ .build())
+ .withAutoCommit(false)
+ .withCleanConfig(HoodieCleanConfig.newBuilder()
+ .withIncrementalCleaningMode(true)
+
.withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
+
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
+ .retainCommits(5)
+ .build())
+ .build();
+ SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
+ IntStream.rangeClosed(1, 9).mapToObj(i -> {
+ String newCommitTime = "00" + i;
+ List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 10);
+ JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
+ writeClient.startCommitWithTime(newCommitTime);
+ JavaRDD<WriteStatus> writeStatues = writeClient.insert(writeRecords,
newCommitTime);
+ // Assuming the first commit is pending, simulating the situation where
all instants before the first pending commit have been achieved.
Review Comment:
So, in the next incremental clean, if this commit is still pending, no commits
will be cleaned at either the starting or the ending point of that clean.
However, if this pending commit is rolled back or executed, then the starting
point will be this pending commit and the ending point will be the normal
commit. In this case, the clean will be executed normally and no files will be
missed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]