This is an automated email from the ASF dual-hosted git repository.
zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new eb57ac9a0ae HIVE-21599: Wrong results for partitioned Parquet table
when files contain partition column (Soumyakanti Das reviewed by Stamatis
Zampetakis, Aman Sinha, Alessandro Solimando)
eb57ac9a0ae is described below
commit eb57ac9a0aef456f25b559a4ac225ac9ebf40508
Author: Soumyakanti Das <[email protected]>
AuthorDate: Tue Nov 8 16:14:09 2022 -0800
HIVE-21599: Wrong results for partitioned Parquet table when files contain
partition column (Soumyakanti Das reviewed by Stamatis Zampetakis, Aman Sinha,
Alessandro Solimando)
Closes #3742
---
data/files/parquet_partition/pcol=100/000000_0 | Bin 0 -> 761 bytes
data/files/parquet_partition/pcol=200/000000_0 | Bin 0 -> 761 bytes
data/files/parquet_partition/pcol=300/000000_0 | Bin 0 -> 761 bytes
.../org/apache/hadoop/hive/ql/exec/Utilities.java | 29 ++++++++++
.../apache/hadoop/hive/ql/io/HiveInputFormat.java | 1 +
.../ql/io/parquet/ParquetRecordReaderBase.java | 19 ++++++-
.../queries/clientpositive/parquet_partition_col.q | 37 +++++++++++++
.../llap/parquet_partition_col.q.out | 61 +++++++++++++++++++++
8 files changed, 146 insertions(+), 1 deletion(-)
diff --git a/data/files/parquet_partition/pcol=100/000000_0
b/data/files/parquet_partition/pcol=100/000000_0
new file mode 100644
index 00000000000..fe3dc6a5288
Binary files /dev/null and b/data/files/parquet_partition/pcol=100/000000_0
differ
diff --git a/data/files/parquet_partition/pcol=200/000000_0
b/data/files/parquet_partition/pcol=200/000000_0
new file mode 100644
index 00000000000..4f9e6cf017c
Binary files /dev/null and b/data/files/parquet_partition/pcol=200/000000_0
differ
diff --git a/data/files/parquet_partition/pcol=300/000000_0
b/data/files/parquet_partition/pcol=300/000000_0
new file mode 100644
index 00000000000..a16616e8d3a
Binary files /dev/null and b/data/files/parquet_partition/pcol=300/000000_0
differ
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index b2c3fbbda1f..c205f2c974f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -161,6 +161,7 @@ import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.PlanUtils;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.secrets.URISecretSource;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.ql.stats.StatsFactory;
@@ -4273,6 +4274,34 @@ public final class Utilities {
}
}
+ /**
+ * Sets partition column names to the configuration, if there is available
info in the operator.
+ */
+ public static void setPartitionColumnNames(Configuration conf,
TableScanOperator tableScanOp) {
+ TableScanDesc scanDesc = tableScanOp.getConf();
+ Table metadata = scanDesc.getTableMetadata();
+ if (metadata == null) {
+ return;
+ }
+ List<FieldSchema> partCols = metadata.getPartCols();
+ if (partCols != null && !partCols.isEmpty()) {
+ conf.set(serdeConstants.LIST_PARTITION_COLUMNS,
MetaStoreUtils.getColumnNamesFromFieldSchema(partCols));
+ }
+ }
+
+ /**
+ * Returns a list with partition column names present in the configuration,
+ * or empty if there is no such information available.
+ */
+ public static List<String> getPartitionColumnNames(Configuration conf) {
+ String colNames = conf.get(serdeConstants.LIST_PARTITION_COLUMNS);
+ if (colNames != null) {
+ return splitColNames(new ArrayList<>(), colNames);
+ } else {
+ return Collections.emptyList();
+ }
+ }
+
/**
* Create row key and value object inspectors for reduce vectorization.
* The row object inspector used by ReduceWork needs to be a **standard**
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
index 6f38d680a86..de93573e303 100755
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/HiveInputFormat.java
@@ -918,6 +918,7 @@ public class HiveInputFormat<K extends WritableComparable,
V extends Writable>
}
Utilities.addTableSchemaToConf(jobConf, tableScan);
+ Utilities.setPartitionColumnNames(jobConf, tableScan);
// construct column name list and types for reference by filter push down
Utilities.setColumnNameList(jobConf, tableScan);
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
index a665c2586a3..529c13871e3 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/ParquetRecordReaderBase.java
@@ -16,6 +16,7 @@ package org.apache.hadoop.hive.ql.io.parquet;
import com.google.common.base.Strings;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.parquet.read.DataWritableReadSupport;
import
org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter;
@@ -40,6 +41,7 @@ import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
+import org.apache.parquet.schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -196,7 +198,8 @@ public abstract class ParquetRecordReaderBase {
// Create the Parquet FilterPredicate without including columns that do
not exist
// on the schema (such as partition columns).
- FilterPredicate p =
ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema, columns);
+ MessageType newSchema = getSchemaWithoutPartitionColumns(conf, schema);
+ FilterPredicate p =
ParquetFilterPredicateConverter.toFilterPredicate(sarg, newSchema, columns);
if (p != null) {
// Filter may have sensitive information. Do not send to debug.
LOG.debug("PARQUET predicate push down generated.");
@@ -209,6 +212,20 @@ public abstract class ParquetRecordReaderBase {
}
}
+ private MessageType getSchemaWithoutPartitionColumns(JobConf conf,
MessageType schema) {
+ List<String> partCols = Utilities.getPartitionColumnNames(conf);
+ if (partCols.isEmpty()) {
+ return schema;
+ }
+ List<Type> newFields = new ArrayList<>();
+ for (Type field : schema.getFields()) {
+ if (!partCols.contains(field.getName())) {
+ newFields.add(field);
+ }
+ }
+ return new MessageType(schema.getName(), newFields);
+ }
+
public List<BlockMetaData> getFilteredBlocks() {
return filteredBlocks;
}
diff --git a/ql/src/test/queries/clientpositive/parquet_partition_col.q
b/ql/src/test/queries/clientpositive/parquet_partition_col.q
new file mode 100644
index 00000000000..ae3f2bd06ff
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/parquet_partition_col.q
@@ -0,0 +1,37 @@
+-- The peculiarity of this test is that the partitioning column exists inside
each individual Parquet
+-- file (under data/files/parquet_partition) and at the same time it is also
present in the directory
+-- structure.
+--
+-- The schema of the Parquet files is shown below:
+-- {
+-- "type" : "record",
+-- "name" : "hive_schema",
+-- "fields" : [ {
+-- "name" : "strcol",
+-- "type" : [ "null", "string" ],
+-- "default" : null
+-- }, {
+-- "name" : "intcol",
+-- "type" : [ "null", "int" ],
+-- "default" : null
+-- }, {
+-- "name" : "pcol",
+-- "type" : [ "null", "int" ],
+-- "default" : null
+-- } ]
+-- }
+-- The test case necessitates the table to be external with location already
specified; we don't
+-- want the data to be reloaded, because reloading it would alter the actual problem.
+
+create external table test(
+ strcol string,
+ intcol integer
+) partitioned by (pcol int)
+stored as parquet
+location '../../data/files/parquet_partition';
+
+msck repair table test;
+
+select * from test where pcol=100 and intcol=2;
+select * from test where PCOL=200 and intcol=3;
+select * from test where `pCol`=300 and intcol=5;
diff --git
a/ql/src/test/results/clientpositive/llap/parquet_partition_col.q.out
b/ql/src/test/results/clientpositive/llap/parquet_partition_col.q.out
new file mode 100644
index 00000000000..1583792dbb5
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/parquet_partition_col.q.out
@@ -0,0 +1,61 @@
+PREHOOK: query: create external table test(
+ strcol string,
+ intcol integer
+) partitioned by (pcol int)
+stored as parquet
+location '../../data/files/parquet_partition'
+PREHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test
+POSTHOOK: query: create external table test(
+ strcol string,
+ intcol integer
+) partitioned by (pcol int)
+stored as parquet
+location '../../data/files/parquet_partition'
+POSTHOOK: type: CREATETABLE
+#### A masked pattern was here ####
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test
+PREHOOK: query: msck repair table test
+PREHOOK: type: MSCK
+PREHOOK: Output: default@test
+POSTHOOK: query: msck repair table test
+POSTHOOK: type: MSCK
+POSTHOOK: Output: default@test
+Partitions not in metastore: test:pcol=100 test:pcol=200 test:pcol=300
+#### A masked pattern was here ####
+PREHOOK: query: select * from test where pcol=100 and intcol=2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@pcol=100
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where pcol=100 and intcol=2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@pcol=100
+#### A masked pattern was here ####
+b 2 100
+PREHOOK: query: select * from test where PCOL=200 and intcol=3
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@pcol=200
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where PCOL=200 and intcol=3
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@pcol=200
+#### A masked pattern was here ####
+c 3 200
+PREHOOK: query: select * from test where `pCol`=300 and intcol=5
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test
+PREHOOK: Input: default@test@pcol=300
+#### A masked pattern was here ####
+POSTHOOK: query: select * from test where `pCol`=300 and intcol=5
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test
+POSTHOOK: Input: default@test@pcol=300
+#### A masked pattern was here ####
+e 5 300