luoyuxia commented on code in PR #20415:
URL: https://github.com/apache/flink/pull/20415#discussion_r938399637


##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceDynamicFileEnumerator.java:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connectors.hive;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.connector.file.src.FileSourceSplit;
+import org.apache.flink.connector.file.src.enumerate.DynamicFileEnumerator;
+import org.apache.flink.connectors.hive.util.JobConfUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.connector.source.DynamicFilteringData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.table.types.logical.LogicalTypeFamily;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * A {@link DynamicFileEnumerator} implementation for hive source. It uses {@link
+ * HiveSourceFileEnumerator#createInputSplits} to generate splits like HiveSourceFileEnumerator, but
+ * only enumerates {@link HiveTablePartition}s that exist in the {@link DynamicFilteringData} if a
+ * DynamicFilteringData is provided.
+ */
+public class HiveSourceDynamicFileEnumerator implements DynamicFileEnumerator {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HiveSourceDynamicFileEnumerator.class);
+
+    private final String table;
+    private final List<String> dynamicPartitionKeys;
+    // For non-partition hive table, partitions only contains one partition which partitionValues is
+    // empty.
+    private final List<HiveTablePartition> allPartitions;
+    private final int threadNum;
+    private final JobConf jobConf;
+
+    private transient List<HiveTablePartition> finalPartitions;
+
+    public HiveSourceDynamicFileEnumerator(
+            String table,
+            List<String> dynamicPartitionKeys,
+            List<HiveTablePartition> allPartitions,
+            int threadNum,
+            JobConf jobConf) {
+        this.table = checkNotNull(table);
+        this.dynamicPartitionKeys = checkNotNull(dynamicPartitionKeys);
+        this.allPartitions = checkNotNull(allPartitions);
+        this.threadNum = threadNum;
+        this.jobConf = checkNotNull(jobConf);
+
+        this.finalPartitions = this.allPartitions;
+    }
+
+    public void setDynamicFilteringData(DynamicFilteringData data) {
+        LOG.debug("Filtering partitions of table {} based on the data: {}", table, data);
+        if (!data.isFiltering()) {
+            finalPartitions = allPartitions;
+            return;
+        }
+        finalPartitions = new ArrayList<>();
+        RowType rowType = data.getRowType();
+        Preconditions.checkArgument(rowType.getFieldCount() == dynamicPartitionKeys.size());
+        for (HiveTablePartition partition : allPartitions) {
+            RowData partitionRow = createRowData(rowType, partition.getPartitionSpec());
+            if (data.contains(partitionRow)) {
+                finalPartitions.add(partition);
+            }
+        }
+        LOG.info(
+                "Dynamic filtering table {}, original partition number is {}, remaining partition number {}",
+                table,
+                allPartitions.size(),
+                finalPartitions.size());
+    }
+
+    @VisibleForTesting
+    RowData createRowData(RowType rowType, Map<String, String> partitionSpec) {
+        GenericRowData rowData = new GenericRowData(rowType.getFieldCount());
+        for (int i = 0; i < rowType.getFieldCount(); ++i) {
+            String value = partitionSpec.get(dynamicPartitionKeys.get(i));
+            Object convertedValue;
+            if (JobConfUtils.getDefaultPartitionName(jobConf).equals(value)) {
+                // Keep the same way as hive source reads from default partition. Use the default
+                // partition name if the partition field is string type, use null otherwise.
+                // See HivePartitionUtils#restorePartitionValueFromType.
+
+                // By default, the type of hive partition field is nullable.
+                // See HiveTypeUtil#toFlinkPrimitiveType
+                if (rowType.getTypeAt(i).is(LogicalTypeFamily.CHARACTER_STRING)) {
+                    convertedValue = StringData.fromString(value);
+                } else {
+                    convertedValue = null;
+                }
+            } else {
+                switch (rowType.getTypeAt(i).getTypeRoot()) {
+                    case CHAR:
+                    case VARCHAR:
+                        convertedValue = StringData.fromString(value);
+                        break;
+                    case TINYINT:
+                        convertedValue = Byte.parseByte(value);
+                        break;
+                    case SMALLINT:
+                        convertedValue = Short.parseShort(value);
+                        break;
+                    case INTEGER:
+                    case DATE:
+                    case TIME_WITHOUT_TIME_ZONE:

Review Comment:
   Since `time` is not supported in Hive, I think `case TIME_WITHOUT_TIME_ZONE` can be removed.
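   For illustration, a minimal sketch of the suggested narrowing (keeping the existing conversion; how `DATE` itself should be converted is raised in a separate comment):
   ```java
   // Sketch only: Hive partition columns cannot be of TIME type, so the
   // TIME_WITHOUT_TIME_ZONE label can simply be dropped from this group.
   case INTEGER:
   case DATE:
       convertedValue = Integer.parseInt(value);
       break;
   ```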



##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceDynamicFileEnumerator.java:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connectors.hive;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.connector.file.src.FileSourceSplit;
+import org.apache.flink.connector.file.src.enumerate.DynamicFileEnumerator;
+import org.apache.flink.connectors.hive.util.JobConfUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.connector.source.DynamicFilteringData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.table.types.logical.LogicalTypeFamily;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * A {@link DynamicFileEnumerator} implementation for hive source. It uses {@link
+ * HiveSourceFileEnumerator#createInputSplits} to generate splits like HiveSourceFileEnumerator, but
+ * only enumerates {@link HiveTablePartition}s that exist in the {@link DynamicFilteringData} if a
+ * DynamicFilteringData is provided.
+ */
+public class HiveSourceDynamicFileEnumerator implements DynamicFileEnumerator {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HiveSourceDynamicFileEnumerator.class);
+
+    private final String table;
+    private final List<String> dynamicPartitionKeys;
+    // For non-partition hive table, partitions only contains one partition which partitionValues is
+    // empty.
+    private final List<HiveTablePartition> allPartitions;
+    private final int threadNum;
+    private final JobConf jobConf;
+
+    private transient List<HiveTablePartition> finalPartitions;
+
+    public HiveSourceDynamicFileEnumerator(
+            String table,
+            List<String> dynamicPartitionKeys,
+            List<HiveTablePartition> allPartitions,
+            int threadNum,
+            JobConf jobConf) {
+        this.table = checkNotNull(table);
+        this.dynamicPartitionKeys = checkNotNull(dynamicPartitionKeys);
+        this.allPartitions = checkNotNull(allPartitions);
+        this.threadNum = threadNum;
+        this.jobConf = checkNotNull(jobConf);
+
+        this.finalPartitions = this.allPartitions;
+    }
+
+    public void setDynamicFilteringData(DynamicFilteringData data) {
+        LOG.debug("Filtering partitions of table {} based on the data: {}", table, data);
+        if (!data.isFiltering()) {
+            finalPartitions = allPartitions;
+            return;
+        }
+        finalPartitions = new ArrayList<>();
+        RowType rowType = data.getRowType();
+        Preconditions.checkArgument(rowType.getFieldCount() == dynamicPartitionKeys.size());
+        for (HiveTablePartition partition : allPartitions) {
+            RowData partitionRow = createRowData(rowType, partition.getPartitionSpec());
+            if (data.contains(partitionRow)) {
+                finalPartitions.add(partition);
+            }
+        }
+        LOG.info(
+                "Dynamic filtering table {}, original partition number is {}, remaining partition number {}",
+                table,
+                allPartitions.size(),
+                finalPartitions.size());
+    }
+
+    @VisibleForTesting
+    RowData createRowData(RowType rowType, Map<String, String> partitionSpec) {
+        GenericRowData rowData = new GenericRowData(rowType.getFieldCount());
+        for (int i = 0; i < rowType.getFieldCount(); ++i) {
+            String value = partitionSpec.get(dynamicPartitionKeys.get(i));
+            Object convertedValue;
+            if (JobConfUtils.getDefaultPartitionName(jobConf).equals(value)) {
+                // Keep the same way as hive source reads from default partition. Use the default
+                // partition name if the partition field is string type, use null otherwise.
+                // See HivePartitionUtils#restorePartitionValueFromType.
+
+                // By default, the type of hive partition field is nullable.
+                // See HiveTypeUtil#toFlinkPrimitiveType
+                if (rowType.getTypeAt(i).is(LogicalTypeFamily.CHARACTER_STRING)) {
+                    convertedValue = StringData.fromString(value);
+                } else {
+                    convertedValue = null;
+                }
+            } else {
+                switch (rowType.getTypeAt(i).getTypeRoot()) {
+                    case CHAR:
+                    case VARCHAR:
+                        convertedValue = StringData.fromString(value);
+                        break;
+                    case TINYINT:
+                        convertedValue = Byte.parseByte(value);
+                        break;
+                    case SMALLINT:
+                        convertedValue = Short.parseShort(value);
+                        break;
+                    case INTEGER:
+                    case DATE:
+                    case TIME_WITHOUT_TIME_ZONE:
+                        convertedValue = Integer.parseInt(value);
+                        break;
+                    case BIGINT:
+                        convertedValue = Long.parseLong(value);
+                        break;
+                    case TIMESTAMP_WITHOUT_TIME_ZONE:
+                    case TIMESTAMP_WITH_LOCAL_TIME_ZONE:

Review Comment:
   `TIMESTAMP_WITH_LOCAL_TIME_ZONE` is also not supported in Hive.
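   For illustration, a sketch of what would remain (the existing conversion logic unchanged):
   ```java
   // Sketch only: TIMESTAMP_WITH_LOCAL_TIME_ZONE cannot be the type of a Hive
   // partition column, so only the plain timestamp label would be kept here.
   case TIMESTAMP_WITHOUT_TIME_ZONE:
       final int precision = getPrecision(rowType.getTypeAt(i));
       // ... existing TimestampData-based conversion of `value` ...
       break;
   ```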



##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveTableSource.java:
##########
@@ -247,6 +254,30 @@ public void applyPartitions(List<Map<String, String>> remainingPartitions) {
         }
     }
 
+    @Override
+    public List<String> applyDynamicFiltering(List<String> candidateFilterFields) {
+        if (catalogTable.getPartitionKeys() != null
+                && catalogTable.getPartitionKeys().size() != 0) {
+            checkArgument(
+                    !candidateFilterFields.isEmpty(),
+                    "At least one field should be provided for dynamic filtering");
+            checkState(
+                    dynamicPartitionKeys == null, "Dynamic filtering should not be applied twice.");
+
+            // only accept partition fields to do dynamic partition pruning
+            this.dynamicPartitionKeys = new ArrayList<>();
+            for (String field : candidateFilterFields) {
+                if (catalogTable.getPartitionKeys().contains(field)) {

Review Comment:
   Should we warn when we meet a field that does not exist in `catalogTable.getPartitionKeys`?
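   For illustration, a rough sketch of such a warning (assuming a `LOG` field is available in `HiveTableSource`, which may need to be added):
   ```java
   // Sketch only: keep partition-key fields and warn about the rest, so it is
   // visible why a candidate field was ignored for dynamic partition pruning.
   for (String field : candidateFilterFields) {
       if (catalogTable.getPartitionKeys().contains(field)) {
           this.dynamicPartitionKeys.add(field);
       } else {
           LOG.warn(
                   "Field {} is not a partition key of the table and will be ignored for dynamic filtering.",
                   field);
       }
   }
   ```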



##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceDynamicFileEnumerator.java:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connectors.hive;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.connector.file.src.FileSourceSplit;
+import org.apache.flink.connector.file.src.enumerate.DynamicFileEnumerator;
+import org.apache.flink.connectors.hive.util.JobConfUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.connector.source.DynamicFilteringData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.table.types.logical.LogicalTypeFamily;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * A {@link DynamicFileEnumerator} implementation for hive source. It uses {@link
+ * HiveSourceFileEnumerator#createInputSplits} to generate splits like HiveSourceFileEnumerator, but
+ * only enumerates {@link HiveTablePartition}s that exist in the {@link DynamicFilteringData} if a
+ * DynamicFilteringData is provided.
+ */
+public class HiveSourceDynamicFileEnumerator implements DynamicFileEnumerator {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HiveSourceDynamicFileEnumerator.class);
+
+    private final String table;
+    private final List<String> dynamicPartitionKeys;
+    // For non-partition hive table, partitions only contains one partition which partitionValues is
+    // empty.
+    private final List<HiveTablePartition> allPartitions;
+    private final int threadNum;
+    private final JobConf jobConf;
+
+    private transient List<HiveTablePartition> finalPartitions;
+
+    public HiveSourceDynamicFileEnumerator(
+            String table,
+            List<String> dynamicPartitionKeys,
+            List<HiveTablePartition> allPartitions,
+            int threadNum,
+            JobConf jobConf) {
+        this.table = checkNotNull(table);
+        this.dynamicPartitionKeys = checkNotNull(dynamicPartitionKeys);
+        this.allPartitions = checkNotNull(allPartitions);
+        this.threadNum = threadNum;
+        this.jobConf = checkNotNull(jobConf);
+
+        this.finalPartitions = this.allPartitions;
+    }
+
+    public void setDynamicFilteringData(DynamicFilteringData data) {
+        LOG.debug("Filtering partitions of table {} based on the data: {}", table, data);
+        if (!data.isFiltering()) {
+            finalPartitions = allPartitions;
+            return;
+        }
+        finalPartitions = new ArrayList<>();
+        RowType rowType = data.getRowType();
+        Preconditions.checkArgument(rowType.getFieldCount() == dynamicPartitionKeys.size());
+        for (HiveTablePartition partition : allPartitions) {
+            RowData partitionRow = createRowData(rowType, partition.getPartitionSpec());
+            if (data.contains(partitionRow)) {
+                finalPartitions.add(partition);
+            }
+        }
+        LOG.info(
+                "Dynamic filtering table {}, original partition number is {}, remaining partition number {}",
+                table,
+                allPartitions.size(),
+                finalPartitions.size());
+    }
+
+    @VisibleForTesting
+    RowData createRowData(RowType rowType, Map<String, String> partitionSpec) {
+        GenericRowData rowData = new GenericRowData(rowType.getFieldCount());
+        for (int i = 0; i < rowType.getFieldCount(); ++i) {
+            String value = partitionSpec.get(dynamicPartitionKeys.get(i));
+            Object convertedValue;
+            if (JobConfUtils.getDefaultPartitionName(jobConf).equals(value)) {
+                // Keep the same way as hive source reads from default partition. Use the default
+                // partition name if the partition field is string type, use null otherwise.
+                // See HivePartitionUtils#restorePartitionValueFromType.
+
+                // By default, the type of hive partition field is nullable.
+                // See HiveTypeUtil#toFlinkPrimitiveType
+                if (rowType.getTypeAt(i).is(LogicalTypeFamily.CHARACTER_STRING)) {
+                    convertedValue = StringData.fromString(value);
+                } else {
+                    convertedValue = null;
+                }
+            } else {
+                switch (rowType.getTypeAt(i).getTypeRoot()) {
+                    case CHAR:
+                    case VARCHAR:
+                        convertedValue = StringData.fromString(value);
+                        break;
+                    case TINYINT:
+                        convertedValue = Byte.parseByte(value);
+                        break;
+                    case SMALLINT:
+                        convertedValue = Short.parseShort(value);
+                        break;
+                    case INTEGER:
+                    case DATE:
+                    case TIME_WITHOUT_TIME_ZONE:
+                        convertedValue = Integer.parseInt(value);
+                        break;
+                    case BIGINT:
+                        convertedValue = Long.parseLong(value);
+                        break;
+                    case TIMESTAMP_WITHOUT_TIME_ZONE:
+                    case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
+                        final int precision = getPrecision(rowType.getTypeAt(i));
+                        if (TimestampData.isCompact(precision)) {
+                            long timestamp = Long.parseLong(value);

Review Comment:
   Ditto.



##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceDynamicFileEnumerator.java:
##########
@@ -0,0 +1,218 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.connectors.hive;
+
+import org.apache.flink.annotation.VisibleForTesting;
+import org.apache.flink.connector.file.src.FileSourceSplit;
+import org.apache.flink.connector.file.src.enumerate.DynamicFileEnumerator;
+import org.apache.flink.connectors.hive.util.JobConfUtils;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.table.connector.source.DynamicFilteringData;
+import org.apache.flink.table.data.GenericRowData;
+import org.apache.flink.table.data.RowData;
+import org.apache.flink.table.data.StringData;
+import org.apache.flink.table.data.TimestampData;
+import org.apache.flink.table.types.logical.LogicalTypeFamily;
+import org.apache.flink.table.types.logical.RowType;
+import org.apache.flink.util.Preconditions;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.flink.table.types.logical.utils.LogicalTypeChecks.getPrecision;
+import static org.apache.flink.util.Preconditions.checkNotNull;
+
+/**
+ * A {@link DynamicFileEnumerator} implementation for hive source. It uses {@link
+ * HiveSourceFileEnumerator#createInputSplits} to generate splits like HiveSourceFileEnumerator, but
+ * only enumerates {@link HiveTablePartition}s that exist in the {@link DynamicFilteringData} if a
+ * DynamicFilteringData is provided.
+ */
+public class HiveSourceDynamicFileEnumerator implements DynamicFileEnumerator {
+
+    private static final Logger LOG =
+            LoggerFactory.getLogger(HiveSourceDynamicFileEnumerator.class);
+
+    private final String table;
+    private final List<String> dynamicPartitionKeys;
+    // For non-partition hive table, partitions only contains one partition which partitionValues is
+    // empty.
+    private final List<HiveTablePartition> allPartitions;
+    private final int threadNum;
+    private final JobConf jobConf;
+
+    private transient List<HiveTablePartition> finalPartitions;
+
+    public HiveSourceDynamicFileEnumerator(
+            String table,
+            List<String> dynamicPartitionKeys,
+            List<HiveTablePartition> allPartitions,
+            int threadNum,
+            JobConf jobConf) {
+        this.table = checkNotNull(table);
+        this.dynamicPartitionKeys = checkNotNull(dynamicPartitionKeys);
+        this.allPartitions = checkNotNull(allPartitions);
+        this.threadNum = threadNum;
+        this.jobConf = checkNotNull(jobConf);
+
+        this.finalPartitions = this.allPartitions;
+    }
+
+    public void setDynamicFilteringData(DynamicFilteringData data) {
+        LOG.debug("Filtering partitions of table {} based on the data: {}", table, data);
+        if (!data.isFiltering()) {
+            finalPartitions = allPartitions;
+            return;
+        }
+        finalPartitions = new ArrayList<>();
+        RowType rowType = data.getRowType();
+        Preconditions.checkArgument(rowType.getFieldCount() == dynamicPartitionKeys.size());
+        for (HiveTablePartition partition : allPartitions) {
+            RowData partitionRow = createRowData(rowType, partition.getPartitionSpec());
+            if (data.contains(partitionRow)) {
+                finalPartitions.add(partition);
+            }
+        }
+        LOG.info(
+                "Dynamic filtering table {}, original partition number is {}, remaining partition number {}",
+                table,
+                allPartitions.size(),
+                finalPartitions.size());
+    }
+
+    @VisibleForTesting
+    RowData createRowData(RowType rowType, Map<String, String> partitionSpec) {
+        GenericRowData rowData = new GenericRowData(rowType.getFieldCount());
+        for (int i = 0; i < rowType.getFieldCount(); ++i) {
+            String value = partitionSpec.get(dynamicPartitionKeys.get(i));
+            Object convertedValue;
+            if (JobConfUtils.getDefaultPartitionName(jobConf).equals(value)) {
+                // Keep the same way as hive source reads from default partition. Use the default
+                // partition name if the partition field is string type, use null otherwise.
+                // See HivePartitionUtils#restorePartitionValueFromType.
+
+                // By default, the type of hive partition field is nullable.
+                // See HiveTypeUtil#toFlinkPrimitiveType
+                if (rowType.getTypeAt(i).is(LogicalTypeFamily.CHARACTER_STRING)) {
+                    convertedValue = StringData.fromString(value);
+                } else {
+                    convertedValue = null;
+                }
+            } else {
+                switch (rowType.getTypeAt(i).getTypeRoot()) {
+                    case CHAR:
+                    case VARCHAR:
+                        convertedValue = StringData.fromString(value);
+                        break;
+                    case TINYINT:
+                        convertedValue = Byte.parseByte(value);
+                        break;
+                    case SMALLINT:
+                        convertedValue = Short.parseShort(value);
+                        break;
+                    case INTEGER:
+                    case DATE:

Review Comment:
   Since the value will be `yyyy-[m]m-[d]d` when the partition's type is `DATE`, we can't convert the value to `Integer` directly.
   We can follow the conversion logic in `HivePartitionUtils#restorePartitionValueFromType`.
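   For illustration, a sketch of that kind of conversion (not the exact `HivePartitionUtils` code, just the idea; `java.sql.Date.valueOf` accepts the `yyyy-[m]m-[d]d` form):
   ```java
   // Sketch only: turn the partition value string into the epoch-day int that
   // Flink's internal DATE representation expects, instead of Integer.parseInt.
   case DATE:
       convertedValue = (int) java.sql.Date.valueOf(value).toLocalDate().toEpochDay();
       break;
   ```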



##########
flink-connectors/flink-connector-hive/src/main/java/org/apache/flink/connectors/hive/HiveSourceBuilder.java:
##########
@@ -91,6 +91,7 @@ public class HiveSourceBuilder {
     private int[] projectedFields;
     private Long limit;
     private List<HiveTablePartition> partitions;
+    private List<String> dynamicPartitionKeys;

Review Comment:
   nit:
   Since there already exist the concepts of static partition / dynamic partition, and I think what we have here is different from that concept in Hive, the name may be confusing. What about `partitionKeysForDynamicFilter` or another name you prefer?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
