yihua commented on a change in pull request #4994:
URL: https://github.com/apache/hudi/pull/4994#discussion_r825338047



##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -345,16 +358,59 @@ public void doMetadataTableValidation() {
     boolean finalResult = true;
     metaClient.reloadActiveTimeline();
     String basePath = metaClient.getBasePath();
+    List<String> baseFilesUnderDeletion = Collections.emptyList();
+
+    if (cfg.skipUnderDeletionDataFiles) {
+      HoodieTimeline pendingCleaningTimeline = metaClient.getActiveTimeline()
+          .getCleanerTimeline()
+          .filter(instant -> instant.getState() != 
HoodieInstant.State.COMPLETED);

Review comment:
       `filterInflights()` or `filterInflightsAndRequested()` can be used here. 
 Also, should only inflight cleaning instants be considered?  Requested 
cleaning instants should not affect the validation.

##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -495,11 +592,31 @@ private void validateLatestFileSlices(
     LOG.info("Validation of getLatestFileSlices succeeded for partition " + 
partitionPath);
   }
 
+  private List<FileSlice> 
filterFileSliceBasedOnUnderDeletionFiles(List<FileSlice> 
sortedLatestFileSliceList, List<String> baseFilesUnderDeletion) {

Review comment:
       Could `baseFilesUnderDeletion` be a `Set<String>` to speed up lookup?

##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -410,42 +466,64 @@ public void doMetadataTableValidation() {
    * @param metadataTableBasedContext Validation context containing 
information based on metadata table
    * @param fsBasedContext            Validation context containing 
information based on the file system
    * @param partitionPath             Partition path String
+   * @param baseFilesUnderDeletion    Base files under un-complete cleaner 
action
    */
   private void validateFilesInPartition(
       HoodieMetadataValidationContext metadataTableBasedContext,
-      HoodieMetadataValidationContext fsBasedContext, String partitionPath) {
+      HoodieMetadataValidationContext fsBasedContext, String partitionPath,
+      List<String> baseFilesUnderDeletion) {
     if (cfg.validateLatestFileSlices) {
-      validateLatestFileSlices(metadataTableBasedContext, fsBasedContext, 
partitionPath);
+      validateLatestFileSlices(metadataTableBasedContext, fsBasedContext, 
partitionPath, baseFilesUnderDeletion);
     }
 
     if (cfg.validateLatestBaseFiles) {
-      validateLatestBaseFiles(metadataTableBasedContext, fsBasedContext, 
partitionPath);
+      validateLatestBaseFiles(metadataTableBasedContext, fsBasedContext, 
partitionPath, baseFilesUnderDeletion);
     }
 
     if (cfg.validateAllFileGroups) {
-      validateAllFileGroups(metadataTableBasedContext, fsBasedContext, 
partitionPath);
+      validateAllFileGroups(metadataTableBasedContext, fsBasedContext, 
partitionPath, baseFilesUnderDeletion);
     }
 
     if (cfg.validateAllColumnStats) {
-      validateAllColumnStats(metadataTableBasedContext, fsBasedContext, 
partitionPath);
+      validateAllColumnStats(metadataTableBasedContext, fsBasedContext, 
partitionPath, baseFilesUnderDeletion);
     }
 
     if (cfg.validateBloomFilters) {
-      validateBloomFilters(metadataTableBasedContext, fsBasedContext, 
partitionPath);
+      validateBloomFilters(metadataTableBasedContext, fsBasedContext, 
partitionPath, baseFilesUnderDeletion);
     }
   }
 
   private void validateAllFileGroups(
       HoodieMetadataValidationContext metadataTableBasedContext,
-      HoodieMetadataValidationContext fsBasedContext, String partitionPath) {
-    List<FileSlice> allFileSlicesFromMeta = metadataTableBasedContext
-        .getSortedAllFileGroupList(partitionPath).stream()
-        .flatMap(HoodieFileGroup::getAllFileSlices).sorted(new 
FileSliceComparator())
-        .collect(Collectors.toList());
-    List<FileSlice> allFileSlicesFromFS = fsBasedContext
-        .getSortedAllFileGroupList(partitionPath).stream()
-        .flatMap(HoodieFileGroup::getAllFileSlices).sorted(new 
FileSliceComparator())
-        .collect(Collectors.toList());
+      HoodieMetadataValidationContext fsBasedContext,
+      String partitionPath,
+      List<String> baseFilesUnderDeletion) {

Review comment:
       Similar comment here regarding variable naming.

##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -345,16 +358,59 @@ public void doMetadataTableValidation() {
     boolean finalResult = true;
     metaClient.reloadActiveTimeline();
     String basePath = metaClient.getBasePath();
+    List<String> baseFilesUnderDeletion = Collections.emptyList();

Review comment:
       nit: similar here for variable naming

##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -345,16 +358,59 @@ public void doMetadataTableValidation() {
     boolean finalResult = true;
     metaClient.reloadActiveTimeline();
     String basePath = metaClient.getBasePath();
+    List<String> baseFilesUnderDeletion = Collections.emptyList();
+
+    if (cfg.skipUnderDeletionDataFiles) {
+      HoodieTimeline pendingCleaningTimeline = metaClient.getActiveTimeline()
+          .getCleanerTimeline()
+          .filter(instant -> instant.getState() != 
HoodieInstant.State.COMPLETED);
+
+      baseFilesUnderDeletion = 
pendingCleaningTimeline.getInstants().flatMap(instant -> {
+        try {
+          if (instant.isInflight()) {
+            // convert inflight instant to requested and get clean plan
+            instant = new HoodieInstant(HoodieInstant.State.REQUESTED, 
instant.getAction(), instant.getTimestamp());
+          }
+          HoodieCleanerPlan cleanerPlan = 
CleanerUtils.getCleanerPlan(metaClient, instant);
+
+          return 
cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().flatMap(cleanerFIleInfoList
 -> {
+            return cleanerFIleInfoList.stream().map(fileInfo -> {
+              return new Path(fileInfo.getFilePath()).getName();
+            });
+          });
+
+        } catch (HoodieIOException ex) {
+
+          if (ex.getIOException() instanceof FileNotFoundException) {
+            // cleaner instant could be deleted by archive and 
FileNotFoundException could be threw during getInstantDetails function

Review comment:
       Inflight cleaning instants shouldn't be archived.  Is this handling the 
case where the inflight cleaning instant was deleted due to a failed clean 
action?

##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -345,16 +358,59 @@ public void doMetadataTableValidation() {
     boolean finalResult = true;
     metaClient.reloadActiveTimeline();
     String basePath = metaClient.getBasePath();
+    List<String> baseFilesUnderDeletion = Collections.emptyList();
+
+    if (cfg.skipUnderDeletionDataFiles) {
+      HoodieTimeline pendingCleaningTimeline = metaClient.getActiveTimeline()
+          .getCleanerTimeline()
+          .filter(instant -> instant.getState() != 
HoodieInstant.State.COMPLETED);
+
+      baseFilesUnderDeletion = 
pendingCleaningTimeline.getInstants().flatMap(instant -> {
+        try {
+          if (instant.isInflight()) {
+            // convert inflight instant to requested and get clean plan
+            instant = new HoodieInstant(HoodieInstant.State.REQUESTED, 
instant.getAction(), instant.getTimestamp());
+          }
+          HoodieCleanerPlan cleanerPlan = 
CleanerUtils.getCleanerPlan(metaClient, instant);
+
+          return 
cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().flatMap(cleanerFIleInfoList
 -> {

Review comment:
       nit: `cleanerFIleInfoList` -> `cleanerFileInfoList`




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to