pltbkd commented on code in PR #20415: URL: https://github.com/apache/flink/pull/20415#discussion_r938487910
########## flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceDynamicFileEnumerator.java: ##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connectors.hive;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.connector.file.src.FileSourceSplit;
+import org.apache.flink.connector.file.src.enumerate.DynamicFileEnumerator;
+import org.apache.flink.connectors.hive.util.JobConfUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.connector.source.DynamicFilteringData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.table.types.logical.LogicalTypeFamily;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * A {@link DynamicFileEnumerator} implementation for hive source. It uses {@link
+ * HiveSourceFileEnumerator#createInputSplits} to generate splits like HiveSourceFileEnumerator, but
+ * only enumerates {@link HiveTablePartition}s that exist in the {@link DynamicFilteringData} if a
+ * DynamicFilteringData is provided.
+ */
+public class HiveSourceDynamicFileEnumerator implements DynamicFileEnumerator {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HiveSourceDynamicFileEnumerator.class);
+
+    private final String table;
+    private final List<String> dynamicPartitionKeys;
+    // For a non-partitioned hive table, partitions contains only one partition whose
+    // partitionValues is empty.
+    private final List<HiveTablePartition> allPartitions;
+    private final int threadNum;
+    private final JobConf jobConf;
+
+    private transient List<HiveTablePartition> finalPartitions;
+
+    public HiveSourceDynamicFileEnumerator(
+            String table,
+            List<String> dynamicPartitionKeys,
+            List<HiveTablePartition> allPartitions,
+            int threadNum,
+            JobConf jobConf) {
+        this.table = checkNotNull(table);
+        this.dynamicPartitionKeys = checkNotNull(dynamicPartitionKeys);
+        this.allPartitions = checkNotNull(allPartitions);
+        this.threadNum = threadNum;
+        this.jobConf = checkNotNull(jobConf);
+
+        this.finalPartitions = this.allPartitions;
+    }
+
+    public void setDynamicFilteringData(DynamicFilteringData data) {
+        LOG.debug("Filtering partitions of table {} based on the data: {}", table, data);
+        if (!data.isFiltering()) {
+            finalPartitions = allPartitions;
+            return;
+        }
+        finalPartitions = new ArrayList<>();
+        RowType rowType = data.getRowType();
+        Preconditions.checkArgument(rowType.getFieldCount() == dynamicPartitionKeys.size());
+        for (HiveTablePartition partition : allPartitions) {
+            RowData partitionRow = createRowData(rowType, partition.getPartitionSpec());
+            if (data.contains(partitionRow)) {
+                finalPartitions.add(partition);
+            }
+        }
+        LOG.info(
+                "Dynamic filtering table {}, original partition number is {}, remaining partition number {}",
+                table,
+                allPartitions.size(),
+                finalPartitions.size());
+    }
+
+    @VisibleForTesting
+    RowData createRowData(RowType rowType, Map<String, String> partitionSpec) {
+        GenericRowData rowData = new GenericRowData(rowType.getFieldCount());
+        for (int i = 0; i < rowType.getFieldCount(); ++i) {
+            String value = partitionSpec.get(dynamicPartitionKeys.get(i));
+            Object convertedValue;
+            if (JobConfUtils.getDefaultPartitionName(jobConf).equals(value)) {
+                // Keep the same behavior as the hive source when it reads the default partition:
+                // use the default partition name if the partition field is of string type,
+                // and null otherwise.
+                // See HivePartitionUtils#restorePartitionValueFromType.
+
+                // By default, the type of a hive partition field is nullable.
+                // See HiveTypeUtil#toFlinkPrimitiveType.
+                if (rowType.getTypeAt(i).is(LogicalTypeFamily.CHARACTER_STRING)) {
+                    convertedValue = StringData.fromString(value);
+                } else {
+                    convertedValue = null;
+                }
+            } else {
+                switch (rowType.getTypeAt(i).getTypeRoot()) {
+                    case CHAR:
+                    case VARCHAR:
+                        convertedValue = StringData.fromString(value);
+                        break;
+                    case TINYINT:
+                        convertedValue = Byte.parseByte(value);
+                        break;
+                    case SMALLINT:
+                        convertedValue = Short.parseShort(value);
+                        break;
+                    case INTEGER:
+                    case DATE:

Review Comment:
   Thanks for the correction! I'm wondering if we can parse all types with the util method?
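   For illustration, a rough sketch of what parsing via the util method might look like. It assumes `HivePartitionUtils#restorePartitionValueFromType` (referenced in the comment above) accepts a `HiveShim`, the raw partition string, the field `DataType`, and the default partition name, that the enumerator would carry a `hiveVersion` field to load a shim, and that a `DataStructureConverter` is used to turn the returned external value into Flink's internal format. These are assumptions for illustration, not the actual change in this PR:

   ```java
   // Rough sketch only: the util signature, the hiveVersion field, and the converter usage
   // are assumptions, not the code proposed in this PR.
   @VisibleForTesting
   RowData createRowData(RowType rowType, Map<String, String> partitionSpec) {
       String defaultPartitionName = JobConfUtils.getDefaultPartitionName(jobConf);
       // Assumes the enumerator is given the hive version so a shim can be loaded.
       HiveShim hiveShim = HiveShimLoader.loadHiveShim(hiveVersion);
       GenericRowData rowData = new GenericRowData(rowType.getFieldCount());
       for (int i = 0; i < rowType.getFieldCount(); ++i) {
           String value = partitionSpec.get(dynamicPartitionKeys.get(i));
           DataType fieldType = TypeConversions.fromLogicalToDataType(rowType.getTypeAt(i));
           // Let the shared util parse every supported partition type, including the
           // default-partition handling, the same way the hive source reader does.
           Object externalValue =
                   HivePartitionUtils.restorePartitionValueFromType(
                           hiveShim, value, fieldType, defaultPartitionName);
           // The util returns external values (String, LocalDate, BigDecimal, ...), which still
           // have to be converted to the internal representation expected by
           // DynamicFilteringData (StringData, int for DATE, DecimalData, ...).
           DataStructureConverter<Object, Object> converter =
                   DataStructureConverters.getConverter(fieldType);
           converter.open(Thread.currentThread().getContextClassLoader());
           rowData.setField(i, converter.toInternalOrNull(externalValue));
       }
       return rowData;
   }
   ```

   With something along these lines, the per-type `switch` in the quoted code would collapse into a single call, at the cost of an extra external-to-internal conversion step.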
