zhangyue19921010 commented on a change in pull request #4878:
URL: https://github.com/apache/hudi/pull/4878#discussion_r813588000
##########
File path:
hudi-utilities/src/main/java/org/apache/hudi/utilities/HoodieMetadataTableValidator.java
##########
@@ -500,19 +578,179 @@ public int compare(FileSlice o1, FileSlice o2) {
}
}
- public static class HoodieBaseFileCompactor implements
Comparator<HoodieBaseFile>, Serializable {
+ public static class HoodieBaseFileComparator implements
Comparator<HoodieBaseFile>, Serializable {
@Override
public int compare(HoodieBaseFile o1, HoodieBaseFile o2) {
return o1.getPath().compareTo(o2.getPath());
}
}
- public static class HoodieFileGroupCompactor implements
Comparator<HoodieFileGroup>, Serializable {
+ public static class HoodieFileGroupComparator implements
Comparator<HoodieFileGroup>, Serializable {
@Override
public int compare(HoodieFileGroup o1, HoodieFileGroup o2) {
return o1.getFileGroupId().compareTo(o2.getFileGroupId());
}
}
-}
\ No newline at end of file
+
+ public static class HoodieColumnRangeMetadataComparator
+ implements Comparator<HoodieColumnRangeMetadata<String>>, Serializable {
+
+ @Override
+ public int compare(HoodieColumnRangeMetadata<String> o1,
HoodieColumnRangeMetadata<String> o2) {
+ return o1.toString().compareTo(o2.toString());
+ }
+ }
+
+ /**
+ * Class for storing relevant information for metadata table validation.
+ * <p>
+ * If metadata table is disabled, the APIs provide the information, e.g.,
file listing,
+ * index, from the file system and base files. If metadata table is
enabled, the APIs
+ * provide the information from the metadata table. The same API is
expected to return
+ * the same information regardless of whether metadata table is enabled,
which is
+ * verified in the {@link HoodieMetadataTableValidator}.
+ */
+ private static class HoodieMetadataValidationContext implements Serializable
{
+ private HoodieTableMetaClient metaClient;
+ private HoodieTableFileSystemView fileSystemView;
+ private HoodieTableMetadata tableMetadata;
+ private boolean enableMetadataTable;
+ private List<String> allColumnNameList;
+
+ public HoodieMetadataValidationContext(
+ HoodieEngineContext engineContext, Config cfg, HoodieTableMetaClient
metaClient,
+ boolean enableMetadataTable) {
+ this.metaClient = metaClient;
+ this.enableMetadataTable = enableMetadataTable;
+ HoodieMetadataConfig metadataConfig = HoodieMetadataConfig.newBuilder()
+ .enable(enableMetadataTable)
+ .withMetadataIndexBloomFilter(enableMetadataTable)
+ .withMetadataIndexColumnStats(enableMetadataTable)
+ .withMetadataIndexForAllColumns(enableMetadataTable)
+ .withAssumeDatePartitioning(cfg.assumeDatePartitioning)
+ .build();
+ this.fileSystemView =
FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+ metaClient, metadataConfig);
+ this.tableMetadata = HoodieTableMetadata.create(engineContext,
metadataConfig, metaClient.getBasePath(),
+ FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());
+ if
(metaClient.getCommitsTimeline().filterCompletedInstants().countInstants() > 0)
{
+ this.allColumnNameList = getAllColumnNames();
+ }
+ }
+
+ public List<HoodieBaseFile> getSortedLatestBaseFileList(String
partitionPath) {
+ return fileSystemView.getLatestBaseFiles(partitionPath)
+ .sorted(new HoodieBaseFileComparator()).collect(Collectors.toList());
+ }
+
+ public List<FileSlice> getSortedLatestFileSliceList(String partitionPath) {
+ return fileSystemView.getLatestFileSlices(partitionPath)
+ .sorted(new FileSliceComparator()).collect(Collectors.toList());
+ }
+
+ public List<HoodieFileGroup> getSortedAllFileGroupList(String
partitionPath) {
+ return fileSystemView.getAllFileGroups(partitionPath)
+ .sorted(new
HoodieFileGroupComparator()).collect(Collectors.toList());
+ }
+
+ public List<HoodieColumnRangeMetadata<String>> getSortedColumnStatsList(
+ String partitionPath, List<String> baseFileNameList) {
+ LOG.info("All column names for getting column stats: " +
allColumnNameList);
+ if (enableMetadataTable) {
+ List<Pair<String, String>> partitionFileNameList =
baseFileNameList.stream()
+ .map(filename -> Pair.of(partitionPath,
filename)).collect(Collectors.toList());
+ return allColumnNameList.stream()
+ .flatMap(columnName ->
+ tableMetadata.getColumnStats(partitionFileNameList,
columnName).values().stream()
+ .map(stats -> new HoodieColumnRangeMetadata<>(
+ stats.getFileName(),
+ columnName,
+ stats.getMinValue(),
+ stats.getMaxValue(),
+ stats.getNullCount(),
+ stats.getValueCount(),
+ stats.getTotalSize(),
+ stats.getTotalUncompressedSize()))
+ .collect(Collectors.toList())
+ .stream())
+ .sorted(new HoodieColumnRangeMetadataComparator())
+ .collect(Collectors.toList());
+ } else {
+ return baseFileNameList.stream().flatMap(filename ->
+ new ParquetUtils().readRangeFromParquetMetadata(
Review comment:
we use ParquetUtils to build HoodieColumnRangeMetadata when fs based
and use `tableMetadata.getColumnStats` to build it when metaBased.
Also I see that the api `getColumnStats` is not supported in
FileSystemBackedTableMetadata
```
@Override
public Map<Pair<String, String>, HoodieMetadataColumnStats>
getColumnStats(final List<Pair<String, String>> partitionNameFileNameList,
final String columnName)
throws HoodieMetadataException {
throw new HoodieMetadataException("Unsupported operation:
getColumnsStats!");
}
```
Maybe we could move this logic into `FileSystemBackedTableMetadata` ?
Just a suggestion and it looks good to me for current logic.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]