This is an automated email from the ASF dual-hosted git repository.
zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new eb57ac9a0ae HIVE-21599: Wrong results for partitioned Parquet table
when files contain partition column (Soumyakanti Das reviewed by Stamatis
Zampetakis, Aman Sinha, Alessandro Solimando)
eb57ac9a0ae is described below
commit eb57ac9a0aef456f25b559a4ac225ac9ebf40508
Author: Soumyakanti Das <[email protected]>
AuthorDate: Tue Nov 8 16:14:09 2022 -0800
HIVE-21599: Wrong results for partitioned Parquet table when files contain
partition column (Soumyakanti Das reviewed by Stamatis Zampetakis, Aman Sinha,
Alessandro Solimando)
Closes #3742
---
data/files/parquet_partition/pcol=100/000000_0 | Bin 0 -> 761 bytes
data/files/parquet_partition/pcol=200/000000_0 | Bin 0 -> 761 bytes
data/files/parquet_partition/pcol=300/000000_0 | Bin 0 -> 761 bytes
.../org/apache/hadoop/hive/ql/exec/Utilities.java | 29 ++++++++++
.../apache/hadoop/hive/ql/io/HiveInputFormat.java | 1 +
.../ql/io/parquet/ParquetRecordReaderBase.java | 19 ++++++-
.../queries/clientpositive/parquet_partition_col.q | 37 +++++++++++++
.../llap/parquet_partition_col.q.out | 61 +++++++++++++++++++++
8 files changed, 146 insertions(+), 1 deletion(-)
diff --git a/data/files/parquet_partition/pcol=100/000000_0
b/data/files/parquet_partition/pcol=100/000000_0
new file mode 100644
index 00000000000..fe3dc6a5288
Binary files /dev/null and b/data/files/parquet_partition/pcol=100/000000_0
differ
diff --git a/data/files/parquet_partition/pcol=200/000000_0
b/data/files/parquet_partition/pcol=200/000000_0
new file mode 100644
index 00000000000..4f9e6cf017c
Binary files /dev/null and b/data/files/parquet_partition/pcol=200/000000_0
differ
diff --git a/data/files/parquet_partition/pcol=300/000000_0
b/data/files/parquet_partition/pcol=300/000000_0
new file mode 100644
index 00000000000..a16616e8d3a
Binary files /dev/null and b/data/files/parquet_partition/pcol=300/000000_0
differ
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index b2c3fbbda1f..c205f2c974f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -161,6 +161,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.secrets.URISecretSource;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
@@ -4273,6 +4274,34 @@ public final class Utilities {
}
}
+ /**
+ * Sets partition column names to the configuration, if there is available
info in the operator.
+ */
+ public static void setPartitionColumnNames(Configuration conf,
TableScanOperator tableScanOp) {
+ TableScanDesc scanDesc = tableScanOp.getConf();
+ Table metadata = scanDesc.getTableMetadata();
+ if (metadata == null) {
+ return;
+ }
+ List<FieldSchema> partCols = metadata.getPartCols();
+ if (partCols != null && !partCols.isEmpty()) {
+ conf.set(serdeConstants.LIST_PARTITION_COLUMNS,
MetaStoreUtils.getColumnNamesFromFieldSchema(partCols));
+ }
+ }
+
+ /**
+ * Returns a list with partition column names present in the configuration,
+ * or empty if there is no such information available.
+ */
+ public static List<String> getPartitionColumnNames(Configuration conf) {
+ String colNames = conf.get(serdeConstants.LIST_PARTITION_COLUMNS);
+ if (colNames != null) {
+ return splitColNames(new ArrayList<>(), colNames);
+ } else {
+ return Collections.emptyList();
+ }
+ }
+
/**
* Create row key and value object inspectors for reduce vectorization.
* The row object inspector used by ReduceWork needs to be a **standard**
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 6f38d680a86..de93573e303 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -918,6 +918,7 @@ public class HiveInputFormat<K extends WritableComparable,
V extends Writable>
}
Utilities.addTableSchemaToConf(jobConf, tableScan);
+ Utilities.setPartitionColumnNames(jobConf, tableScan);
// construct column name list and types for reference by filter push down
Utilities.setColumnNameList(jobConf, tableScan);
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index a665c2586a3..529c13871e3 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -16,6 +16,7 @@ package org.apache.hadoop.hive.ql.io.parquet;
import com.google.common.base.Strings;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import
org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
@@ -40,6 +41,7 @@ import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
+import org.apache.parquet.schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -196,7 +198,8 @@ public abstract class ParquetRecordReaderBase {
// Create the Parquet FilterPredicate without including columns that do
not exist
// on the schema (such as partition columns).
- FilterPredicate p =
ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columns);
+ MessageType newSchema = getSchemaWithoutPartitionColumns(conf, schema);
+ FilterPredicate p =
ParquetFilterPredicateConverter.toFilterPredicate(sarg, newSchema, columns);
if (p != null) {
// Filter may have sensitive information. Do not send to debug.
LOG.debug("PARQUET predicate push down generated.");
@@ -209,6 +212,20 @@ public abstract class ParquetRecordReaderBase {
}
}
+ private MessageType getSchemaWithoutPartitionColumns(JobConf conf,
MessageType schema) {
+ List<String> partCols = Utilities.getPartitionColumnNames(conf);
+ if (partCols.isEmpty()) {
+ return schema;
+ }
+ List<Type> newFields = new ArrayList<>();
+ for (Type field : schema.getFields()) {
+ if (!partCols.contains(field.getName())) {
+ newFields.add(field);
+ }
+ }
+ return new MessageType(schema.getName(), newFields);
+ }
+
public List<BlockMetaData> getFilteredBlocks() {
return filteredBlocks;
}
diff --git a/ql/src/test/queries/clientpositive/parquet_partition_col.q
b/ql/src/test/queries/clientpositive/parquet_partition_col.q
new file mode 100644
index 00000000000..ae3f2bd06ff
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_partition_col.q
@@ -0,0 +1,37 @@
+-- The peculiarity of this test is that the partitioning column exists inside
each individual Parquet
+-- file (under data/files/parquet_partition) and at the same time it is also
present in the directory
+-- structure.
+--
+-- The schema of the Parquet files is shown below:
+-- {
+-- "type" : "record",
+-- "name" : "hive_schema",
+-- "fields" : [ {
+-- "name" : "strcol",
+-- "type" : [ "null", "string" ],
+-- "default" : null
+-- }, {
+-- "name" : "intcol",
+-- "type" : [ "null", "int" ],
+-- "default" : null
+-- }, {
+-- "name" : "pcol",
+-- "type" : [ "null", "int" ],
+-- "default" : null
+-- } ]
+-- }
+-- The test case necessitates the table to be external with location already
specified; we don't
+-- want the data to be reloaded, because reloading it would alter the actual problem.
+
+create external table test(
+ strcol string,
+ intcol integer
+) partitioned by (pcol int)
+stored as parquet
+location '../../data/files/parquet_partition';
+
+msck repair table test;
+
+select * from test where pcol=100 and intcol=2;
+select * from test where PCOL=200 and intcol=3;
+select * from test where `pCol`=300 and intcol=5;
diff --git
a/ql/src/test/results/clientpositive/llap/parquet_partition_col.q.out
b/ql/src/test/results/clientpositive/llap/parquet_partition_col.q.out
new file mode 100644
index 00000000000..1583792dbb5
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_partition_col.q.out
@@ -0,0 +1,61 @@
+PREHOOK: query: create external table test(
+ strcol string,
+ intcol integer
+) partitioned by (pcol int)
+stored as parquet
+location '../../data/files/parquet_partition'
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: create external table test(
+ strcol string,
+ intcol integer
+) partitioned by (pcol int)
+stored as parquet
+location '../../data/files/parquet_partition'
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: msck repair table test
+PREHOOK: type: MSCK
+PREHOOK: Output: default@test
+POSTHOOK: query: msck repair table test
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@test
+Partitions not in metastore: test:pcol=100 test:pcol=200 test:pcol=300
+#### A masked pattern was here ####
+PREHOOK: query: select * from test where pcol=100 and intcol=2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@pcol=100
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where pcol=100 and intcol=2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@pcol=100
+#### A masked pattern was here ####
+b 2 100
+PREHOOK: query: select * from test where PCOL=200 and intcol=3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@pcol=200
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where PCOL=200 and intcol=3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@pcol=200
+#### A masked pattern was here ####
+c 3 200
+PREHOOK: query: select * from test where `pCol`=300 and intcol=5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@pcol=300
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where `pCol`=300 and intcol=5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@pcol=300
+#### A masked pattern was here ####
+e 5 300