danny0405 commented on code in PR #12132:
URL: https://github.com/apache/hudi/pull/12132#discussion_r1808052131
##########
hudi-flink-datasource/hudi-flink/src/main/java/org/apache/hudi/source/prune/PartitionPruners.java:
##########
@@ -125,6 +138,154 @@ public Set<String> filter(Collection<String> partitions) {
}
}
+ /**
+ * ColumnStats partition pruner for hoodie table source which enables partition stats index.
+ *
+ * <p>Note: The data of new partitions created after the job starts could be read if they match the
+ * filter conditions.
+ */
+ public static class ColumnStatsPartitionPruner implements PartitionPruner {
+ private final String basePath;
+ private final HoodieMetadataConfig metadataConfig;
+ private final DataPruner dataPruner;
+ private final RowType rowType;
+
+ public ColumnStatsPartitionPruner(
+ RowType rowType,
+ String basePath,
+ HoodieMetadataConfig metadataConfig,
+ DataPruner dataPruner) {
+ this.rowType = rowType;
+ this.basePath = basePath;
+ this.metadataConfig = metadataConfig;
+ this.dataPruner = dataPruner;
+ }
+
+ @Override
+ public Set<String> filter(Collection<String> partitions) {
+ Set<String> candidatePartitions =
ColumnStatsIndices.candidatePartitionsInMetadataTable(
+ basePath, metadataConfig, rowType, dataPruner, new
ArrayList<>(partitions));
+ if (candidatePartitions == null) {
+ return new HashSet<>(partitions);
+ }
+ return
partitions.stream().filter(candidatePartitions::contains).collect(Collectors.toSet());
+ }
+ }
+
+ /**
+ * Chained partition pruner for hoodie table source which combines multiple partition pruners.
+ */
+ public static class ChainedPartitionPruner implements PartitionPruner {
+ private final List<PartitionPruner> pruners;
+
+ public ChainedPartitionPruner(List<PartitionPruner> pruners) {
+ this.pruners = pruners;
+ }
+
+ @Override
+ public Set<String> filter(Collection<String> partitions) {
+ for (PartitionPruner pruner: pruners) {
+ partitions = pruner.filter(partitions);
+ }
+ return new HashSet<>(partitions);
+ }
+ }
+
+ public static Builder builder() {
+ return new Builder();
+ }
+
+ public static class Builder {
+ private RowType rowType;
+ private String basePath;
+ private Configuration conf;
+ private DataPruner dataPruner;
+ private List<ExpressionEvaluators.Evaluator> partitionEvaluators;
+ private List<String> partitionKeys;
+ private List<DataType> partitionTypes;
+ private String defaultParName;
+ private boolean hivePartition;
+ private Collection<String> candidatePartitions;
+
+ private Builder() {
+ }
+
+ public Builder rowType(RowType rowType) {
+ this.rowType = rowType;
+ return this;
+ }
+
+ public Builder basePath(String basePath) {
+ this.basePath = basePath;
+ return this;
+ }
+
+ public Builder conf(Configuration conf) {
+ this.conf = conf;
+ return this;
+ }
Review Comment:
Should we use a builder here? The different pruners take disparate sets of
parameters.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]