zhangyue19921010 commented on a change in pull request #4878:
URL: https://github.com/apache/hudi/pull/4878#discussion_r813588000



##########
File path: 
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -500,19 +578,179 @@ public int compare(FileSlice o1, FileSlice o2) {
     }
   }
 
-  public static class HoodieBaseFileCompactor implements 
Comparator<HoodieBaseFile>, Serializable {
+  public static class HoodieBaseFileComparator implements 
Comparator<HoodieBaseFile>, Serializable {
 
     @Override
     public int compare(HoodieBaseFile o1, HoodieBaseFile o2) {
       return o1.getPath().compareTo(o2.getPath());
     }
   }
 
-  public static class HoodieFileGroupCompactor implements 
Comparator<HoodieFileGroup>, Serializable {
+  public static class HoodieFileGroupComparator implements 
Comparator<HoodieFileGroup>, Serializable {
 
     @Override
     public int compare(HoodieFileGroup o1, HoodieFileGroup o2) {
       return o1.getFileGroupId().compareTo(o2.getFileGroupId());
     }
   }
-}
\ No newline at end of file
+
+  public static class HoodieColumnRangeMetadataComparator
+      implements Comparator<HoodieColumnRangeMetadata<String>>, Serializable {
+
+    @Override
+    public int compare(HoodieColumnRangeMetadata<String> o1, 
HoodieColumnRangeMetadata<String> o2) {
+      return o1.toString().compareTo(o2.toString());
+    }
+  }
+
+  /**
+   * Class for storing relevant information for metadata table validation.
+   * <p>
+   * If metadata table is disabled, the APIs provide the information, e.g., 
file listing,
+   * index, from the file system and base files.  If metadata table is 
enabled, the APIs
+   * provide the information from the metadata table.  The same API is 
expected to return
+   * the same information regardless of whether metadata table is enabled, 
which is
+   * verified in the {@link HoodieMetadataTableValidator}.
+   */
+  private static class HoodieMetadataValidationContext implements Serializable 
{
+    private HoodieTableMetaClient metaClient;
+    private HoodieTableFileSystemView fileSystemView;
+    private HoodieTableMetadata tableMetadata;
+    private boolean enableMetadataTable;
+    private List<String> allColumnNameList;
+
+    public HoodieMetadataValidationContext(
+        HoodieEngineContext engineContext, Config cfg, HoodieTableMetaClient 
metaClient,
+        boolean enableMetadataTable) {
+      this.metaClient = metaClient;
+      this.enableMetadataTable = enableMetadataTable;
+      HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
+          .enable(enableMetadataTable)
+          .withMetadataIndexBloomFilter(enableMetadataTable)
+          .withMetadataIndexColumnStats(enableMetadataTable)
+          .withMetadataIndexForAllColumns(enableMetadataTable)
+          .withAssumeDatePartitioning(cfg.assumeDatePartitioning)
+          .build();
+      this.fileSystemView = 
FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+          metaClient, metadataConfig);
+      this.tableMetadata = HoodieTableMetadata.create(engineContext, 
metadataConfig, metaClient.getBasePath(),
+          FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());
+      if 
(metaClient.getCommitsTimeline().filterCompletedInstants().countInstants() > 0) 
{
+        this.allColumnNameList = getAllColumnNames();
+      }
+    }
+
+    public List<HoodieBaseFile> getSortedLatestBaseFileList(String 
partitionPath) {
+      return fileSystemView.getLatestBaseFiles(partitionPath)
+          .sorted(new HoodieBaseFileComparator()).collect(Collectors.toList());
+    }
+
+    public List<FileSlice> getSortedLatestFileSliceList(String partitionPath) {
+      return fileSystemView.getLatestFileSlices(partitionPath)
+          .sorted(new FileSliceComparator()).collect(Collectors.toList());
+    }
+
+    public List<HoodieFileGroup> getSortedAllFileGroupList(String 
partitionPath) {
+      return fileSystemView.getAllFileGroups(partitionPath)
+          .sorted(new 
HoodieFileGroupComparator()).collect(Collectors.toList());
+    }
+
+    public List<HoodieColumnRangeMetadata<String>> getSortedColumnStatsList(
+        String partitionPath, List<String> baseFileNameList) {
+      LOG.info("All column names for getting column stats: " + 
allColumnNameList);
+      if (enableMetadataTable) {
+        List<Pair<String, String>> partitionFileNameList = 
baseFileNameList.stream()
+            .map(filename -> Pair.of(partitionPath, 
filename)).collect(Collectors.toList());
+        return allColumnNameList.stream()
+            .flatMap(columnName ->
+                tableMetadata.getColumnStats(partitionFileNameList, 
columnName).values().stream()
+                    .map(stats -> new HoodieColumnRangeMetadata<>(
+                        stats.getFileName(),
+                        columnName,
+                        stats.getMinValue(),
+                        stats.getMaxValue(),
+                        stats.getNullCount(),
+                        stats.getValueCount(),
+                        stats.getTotalSize(),
+                        stats.getTotalUncompressedSize()))
+                    .collect(Collectors.toList())
+                    .stream())
+            .sorted(new HoodieColumnRangeMetadataComparator())
+            .collect(Collectors.toList());
+      } else {
+        return baseFileNameList.stream().flatMap(filename ->
+                new ParquetUtils().readRangeFromParquetMetadata(

Review comment:
       we use ParquetUtils to build HoodieColumnRangeMetadata when fs based
   and use `tableMetadata.getColumnStats` to build it when metaBased.
   
   Also I see that the api `getColumnStats` is not supported in 
FileSystemBackedTableMetadata
   ```
     @Override
     public Map<Pair<String, String>, HoodieMetadataColumnStats> 
getColumnStats(final List<Pair<String, String>> partitionNameFileNameList, 
final String columnName)
         throws HoodieMetadataException {
       throw new HoodieMetadataException("Unsupported operation: 
getColumnsStats!");
     }
   ```
   Maybe we could move this logic into `FileSystemBackedTableMetadata` ? 
   
   Just a suggestion and it looks good to me for current logic.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to