Repository: hive Updated Branches: refs/heads/master 29736e438 -> d31dc22ae
HIVE-15782: query on parquet table returns incorrect result when hive.optimize.index.filter is set to true (Aihua Xu, reviewed by Yongzhi Chen) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/d31dc22a Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/d31dc22a Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/d31dc22a Branch: refs/heads/master Commit: d31dc22ae5d9f0983d985f45551f20058777d524 Parents: 29736e4 Author: Aihua Xu <aihu...@apache.org> Authored: Wed Feb 1 14:23:15 2017 -0500 Committer: Aihua Xu <aihu...@apache.org> Committed: Mon Feb 6 09:21:39 2017 -0500 ---------------------------------------------------------------------- .../hive/ql/io/parquet/LeafFilterFactory.java | 13 +++-- .../read/ParquetFilterPredicateConverter.java | 19 ++++---- .../parquet/TestParquetRecordReaderWrapper.java | 39 +++++++-------- .../clientpositive/parquet_ppd_multifiles.q | 13 +++++ .../clientpositive/parquet_ppd_multifiles.q.out | 50 ++++++++++++++++++++ 5 files changed, 100 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/d31dc22a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java index f95ebcd..3bd01f2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/LeafFilterFactory.java @@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf.Operator; - +import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.parquet.filter2.predicate.FilterApi; import org.apache.parquet.filter2.predicate.FilterPredicate; import org.apache.parquet.io.api.Binary; @@ -167,9 +167,11 @@ public class LeafFilterFactory { * supported yet. * @param type FilterPredicateType * @return + * @throws HiveException Exception is thrown for unsupported data types so we can skip filtering */ - public FilterPredicateLeafBuilder getLeafFilterBuilderByType(PredicateLeaf.Type type, - Type parquetType){ + public FilterPredicateLeafBuilder getLeafFilterBuilderByType( + PredicateLeaf.Type type, + Type parquetType) throws HiveException { switch (type){ case LONG: if (parquetType.asPrimitiveType().getPrimitiveTypeName() == @@ -193,8 +195,9 @@ public class LeafFilterFactory { case DECIMAL: case TIMESTAMP: default: - LOG.debug("Conversion to Parquet FilterPredicate not supported for " + type); - return null; + String msg = "Conversion to Parquet FilterPredicate not supported for " + type; + LOG.debug(msg); + throw new HiveException(msg); } } } http://git-wip-us.apache.org/repos/asf/hive/blob/d31dc22a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetFilterPredicateConverter.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetFilterPredicateConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetFilterPredicateConverter.java index 786a260..47777f8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetFilterPredicateConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/read/ParquetFilterPredicateConverter.java @@ -39,7 +39,8 @@ public class ParquetFilterPredicateConverter { /** * Translate the search argument to the filter predicate parquet uses. It includes * only the columns from the passed schema. - * @return translate the sarg into a filter predicate + * @return a filter predicate translated from search argument. null is returned + * if failed to convert. */ public static FilterPredicate toFilterPredicate(SearchArgument sarg, MessageType schema) { Set<String> columns = null; @@ -50,13 +51,17 @@ public class ParquetFilterPredicateConverter { } } - return translate(sarg.getExpression(), sarg.getLeaves(), columns, schema); + try { + return translate(sarg.getExpression(), sarg.getLeaves(), columns, schema); + } catch(Exception e) { + return null; + } } private static FilterPredicate translate(ExpressionTree root, List<PredicateLeaf> leaves, Set<String> columns, - MessageType schema) { + MessageType schema) throws Exception { FilterPredicate p = null; switch (root.getOperator()) { case OR: @@ -113,15 +118,13 @@ public class ParquetFilterPredicateConverter { } private static FilterPredicate buildFilterPredicateFromPredicateLeaf - (PredicateLeaf leaf, Type parquetType) { + (PredicateLeaf leaf, Type parquetType) throws Exception { LeafFilterFactory leafFilterFactory = new LeafFilterFactory(); FilterPredicateLeafBuilder builder; try { builder = leafFilterFactory .getLeafFilterBuilderByType(leaf.getType(), parquetType); - if (builder == null) { - return null; - } + if (isMultiLiteralsOperator(leaf.getOperator())) { return builder.buildPredicate(leaf.getOperator(), leaf.getLiteralList(), @@ -134,7 +137,7 @@ public class ParquetFilterPredicateConverter { } } catch (Exception e) { LOG.error("fail to build predicate filter leaf with errors" + e, e); - return null; + throw e; } } http://git-wip-us.apache.org/repos/asf/hive/blob/d31dc22a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetRecordReaderWrapper.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetRecordReaderWrapper.java b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetRecordReaderWrapper.java index 35d0342..1d6fc31 100644 --- a/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetRecordReaderWrapper.java +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/parquet/TestParquetRecordReaderWrapper.java @@ -18,20 +18,19 @@ package org.apache.hadoop.hive.ql.io.parquet; -import static junit.framework.Assert.assertEquals; - import org.apache.hadoop.hive.common.type.HiveChar; import org.apache.hadoop.hive.common.type.HiveVarchar; import org.apache.hadoop.hive.ql.io.parquet.read.ParquetFilterPredicateConverter; import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf; import org.apache.hadoop.hive.ql.io.sarg.SearchArgument; -import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.TruthValue; import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory; import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.MessageTypeParser; import org.junit.Test; +import static org.junit.Assert.assertEquals; + import java.sql.Date; import org.apache.parquet.filter2.predicate.FilterPredicate; @@ -41,10 +40,6 @@ import org.apache.parquet.filter2.predicate.FilterPredicate; */ public class TestParquetRecordReaderWrapper { - private static TruthValue[] values(TruthValue... vals) { - return vals; - } - @Test public void testBuilder() throws Exception { SearchArgument sarg = SearchArgumentFactory.newBuilder() @@ -69,6 +64,10 @@ public class TestParquetRecordReaderWrapper { assertEquals(expected, p.toString()); } + /** + * Check the converted filter predicate is null if unsupported types are included + * @throws Exception + */ @Test public void testBuilderComplexTypes() throws Exception { SearchArgument sarg = @@ -83,8 +82,8 @@ public class TestParquetRecordReaderWrapper { .build(); MessageType schema = MessageTypeParser.parseMessageType("message test {" + " required int32 x; required binary y; required binary z;}"); - assertEquals("lteq(y, Binary{\"hi \"})", - ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema).toString()); + assertEquals(null, + ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema)); sarg = SearchArgumentFactory.newBuilder() .startNot() @@ -101,13 +100,14 @@ public class TestParquetRecordReaderWrapper { schema = MessageTypeParser.parseMessageType("message test {" + " optional int32 x; required binary y; required int32 z;" + " optional binary a;}"); - FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema); - String expected = - "and(and(not(eq(x, null)), not(or(or(eq(z, 1), eq(z, 2)), eq(z, 3)))), " + - "not(eq(a, Binary{\"stinger\"})))"; - assertEquals(expected, p.toString()); + assertEquals(null, + ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema)); } + /** + * Check the converted filter predicate is null if unsupported types are included + * @throws Exception + */ @Test public void testBuilderComplexTypes2() throws Exception { SearchArgument sarg = @@ -122,8 +122,8 @@ public class TestParquetRecordReaderWrapper { .build(); MessageType schema = MessageTypeParser.parseMessageType("message test {" + " required int32 x; required binary y; required binary z;}"); - assertEquals("lteq(y, Binary{\"hi \"})", - ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema).toString()); + assertEquals(null, + ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema)); sarg = SearchArgumentFactory.newBuilder() .startNot() @@ -140,11 +140,8 @@ public class TestParquetRecordReaderWrapper { schema = MessageTypeParser.parseMessageType("message test {" + " optional int32 x; required binary y; required int32 z;" + " optional binary a;}"); - - FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema); - String expected = "and(and(not(eq(x, null)), not(or(or(eq(z, 1), eq(z, 2)), eq(z, 3)))), " + - "not(eq(a, Binary{\"stinger\"})))"; - assertEquals(expected, p.toString()); + assertEquals(null, + ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema)); } @Test http://git-wip-us.apache.org/repos/asf/hive/blob/d31dc22a/ql/src/test/queries/clientpositive/parquet_ppd_multifiles.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/parquet_ppd_multifiles.q b/ql/src/test/queries/clientpositive/parquet_ppd_multifiles.q new file mode 100644 index 0000000..6483684 --- /dev/null +++ b/ql/src/test/queries/clientpositive/parquet_ppd_multifiles.q @@ -0,0 +1,13 @@ +CREATE TABLE parquet_ppd_multifiles ( + name string, + dec decimal(5,0) +) stored as parquet; + +insert into table parquet_ppd_multifiles values('Jim', 3); +insert into table parquet_ppd_multifiles values('Tom', 5); + +set hive.optimize.index.filter=false; +select * from parquet_ppd_multifiles where (name = 'Jim' or dec = 5); + +set hive.optimize.index.filter=true; +select * from parquet_ppd_multifiles where (name = 'Jim' or dec = 5); http://git-wip-us.apache.org/repos/asf/hive/blob/d31dc22a/ql/src/test/results/clientpositive/parquet_ppd_multifiles.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/parquet_ppd_multifiles.q.out b/ql/src/test/results/clientpositive/parquet_ppd_multifiles.q.out new file mode 100644 index 0000000..d7688f8 --- /dev/null +++ b/ql/src/test/results/clientpositive/parquet_ppd_multifiles.q.out @@ -0,0 +1,50 @@ +PREHOOK: query: CREATE TABLE parquet_ppd_multifiles ( + name string, + dec decimal(5,0) +) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquet_ppd_multifiles +POSTHOOK: query: CREATE TABLE parquet_ppd_multifiles ( + name string, + dec decimal(5,0) +) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquet_ppd_multifiles +PREHOOK: query: insert into table parquet_ppd_multifiles values('Jim', 3) +PREHOOK: type: QUERY +PREHOOK: Output: default@parquet_ppd_multifiles +POSTHOOK: query: insert into table parquet_ppd_multifiles values('Jim', 3) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@parquet_ppd_multifiles +POSTHOOK: Lineage: parquet_ppd_multifiles.dec EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: parquet_ppd_multifiles.name SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: insert into table parquet_ppd_multifiles values('Tom', 5) +PREHOOK: type: QUERY +PREHOOK: Output: default@parquet_ppd_multifiles +POSTHOOK: query: insert into table parquet_ppd_multifiles values('Tom', 5) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@parquet_ppd_multifiles +POSTHOOK: Lineage: parquet_ppd_multifiles.dec EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: parquet_ppd_multifiles.name SIMPLE [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: select * from parquet_ppd_multifiles where (name = 'Jim' or dec = 5) +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_ppd_multifiles +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_ppd_multifiles where (name = 'Jim' or dec = 5) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_ppd_multifiles +#### A masked pattern was here #### +Jim 3 +Tom 5 +PREHOOK: query: select * from parquet_ppd_multifiles where (name = 'Jim' or dec = 5) +PREHOOK: type: QUERY +PREHOOK: Input: default@parquet_ppd_multifiles +#### A masked pattern was here #### +POSTHOOK: query: select * from parquet_ppd_multifiles where (name = 'Jim' or dec = 5) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquet_ppd_multifiles +#### A masked pattern was here #### +Jim 3 +Tom 5