danny0405 commented on code in PR #12132:
URL: https://github.com/apache/hudi/pull/12132#discussion_r1808053349
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/stats/ColumnStatsIndices.java:
##########
@@ -85,16 +89,109 @@ public class ColumnStatsIndices {
private ColumnStatsIndices() {
}
- public static List<RowData> readColumnStatsIndex(String basePath,
HoodieMetadataConfig metadataConfig, String[] targetColumns) {
+ public static Set<String> candidatePartitionsInMetadataTable(
+ String basePath,
+ HoodieMetadataConfig metadataConfig,
+ RowType rowType,
+ @Nullable DataPruner dataPruner,
+ List<String> filesOrPartitions) {
+ if (dataPruner == null) {
+ return null;
+ }
+ final List<RowData> indexRows =
+ ColumnStatsIndices.readPartitionStatsIndex(basePath, metadataConfig,
dataPruner.getReferencedCols());
+ return candidatesInMetadataTable(rowType, dataPruner, indexRows,
filesOrPartitions);
+ }
+
+ public static Set<String> candidateFilesInMetadataTable(
+ String basePath,
+ HoodieMetadataConfig metadataConfig,
+ RowType rowType,
+ @Nullable DataPruner dataPruner,
+ List<String> filesOrPartitions) {
+ if (dataPruner == null) {
+ return null;
+ }
+ final List<RowData> indexRows =
+ ColumnStatsIndices.readFileColumnStatsIndex(basePath, metadataConfig,
dataPruner.getReferencedCols());
+ return candidatesInMetadataTable(rowType, dataPruner, indexRows,
filesOrPartitions);
+ }
+
+ /**
+ * Computes pruned list of candidate base-files' names based on provided
list of data filters.
+ * conditions, by leveraging Metadata Table's Column Statistics index
(hereon referred as ColStats for brevity)
+ * bearing "min", "max", "num_nulls" statistics for all columns.
+ *
+ * <p>NOTE: This method has to return complete set of candidate files, since
only provided candidates will
+ * ultimately be scanned as part of query execution. Hence, this method has
to maintain the
+ * invariant of conservatively including every base-file's name, that is NOT
referenced in its index.
+ *
+ * <p>The {@code filters} must all be simple.
+ *
+ * @param dataPruner the data pruner built from push-down filters.
Review Comment:
Please add Javadoc `@param` descriptions for all of the method's parameters.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]