HIVE-13826: Make VectorUDFAdaptor work for GenericUDFBetween when used as FILTER (Matt McCline, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/71725869 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/71725869 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/71725869 Branch: refs/heads/java8 Commit: 7172586966739de0ba1659bf9abcea40d109b341 Parents: 9bebaf6 Author: Matt McCline <[email protected]> Authored: Thu May 26 08:25:29 2016 -0700 Committer: Matt McCline <[email protected]> Committed: Thu May 26 08:25:29 2016 -0700 ---------------------------------------------------------------------- .../ql/exec/vector/VectorizationContext.java | 22 +++- .../clientpositive/vector_between_columns.q | 8 +- .../tez/vector_between_columns.q.out | 116 +++++++++++++++++- .../clientpositive/vector_between_columns.q.out | 117 ++++++++++++++++++- 4 files changed, 250 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/71725869/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 886e222..a76e31d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -1965,8 +1965,17 @@ public class VectorizationContext { private VectorExpression getCustomUDFExpression(ExprNodeGenericFuncDesc expr, Mode mode) throws HiveException { - if (mode != Mode.PROJECTION) { - return null; + boolean isFilter = false; // Assume. + if (mode == Mode.FILTER) { + + // Is output type a BOOLEAN? + TypeInfo resultTypeInfo = expr.getTypeInfo(); + if (resultTypeInfo.getCategory() == Category.PRIMITIVE && + ((PrimitiveTypeInfo) resultTypeInfo).getPrimitiveCategory() == PrimitiveCategory.BOOLEAN) { + isFilter = true; + } else { + return null; + } } //GenericUDFBridge udfBridge = (GenericUDFBridge) expr.getGenericUDF(); @@ -2032,7 +2041,14 @@ public class VectorizationContext { for (Integer i : exprResultColumnNums) { ocm.freeOutputColumn(i); } - return ve; + + if (isFilter) { + SelectColumnIsTrue filterVectorExpr = new SelectColumnIsTrue(outputCol); + filterVectorExpr.setChildExpressions(new VectorExpression[] {ve}); + return filterVectorExpr; + } else { + return ve; + } } public static boolean isStringFamily(String resultType) { http://git-wip-us.apache.org/repos/asf/hive/blob/71725869/ql/src/test/queries/clientpositive/vector_between_columns.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_between_columns.q b/ql/src/test/queries/clientpositive/vector_between_columns.q index ba38445..41f9243 100644 --- a/ql/src/test/queries/clientpositive/vector_between_columns.q +++ b/ql/src/test/queries/clientpositive/vector_between_columns.q @@ -7,8 +7,7 @@ set hive.mapred.mode=nonstrict; -- SORT_QUERY_RESULTS -- --- The following WILL NOT BE ABLE TO USE the VectorUDFAdaptor to GenericUDFBetween --- because the mode = FILTER is not supported yet. +-- Verify the VectorUDFAdaptor to GenericUDFBetween works for PROJECTION and FILTER. -- create table if not exists TSINT_txt ( RNUM int , CSINT smallint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LINES TERMINATED BY '\n'; @@ -25,6 +24,11 @@ create table TSINT stored as orc AS SELECT * FROM TSINT_txt; create table TINT stored as orc AS SELECT * FROM TINT_txt; +explain +select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint; + +select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint; + explain select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint; http://git-wip-us.apache.org/repos/asf/hive/blob/71725869/ql/src/test/results/clientpositive/tez/vector_between_columns.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/vector_between_columns.q.out b/ql/src/test/results/clientpositive/tez/vector_between_columns.q.out index 8a9978b..939aab5 100644 --- a/ql/src/test/results/clientpositive/tez/vector_between_columns.q.out +++ b/ql/src/test/results/clientpositive/tez/vector_between_columns.q.out @@ -1,7 +1,6 @@ PREHOOK: query: -- SORT_QUERY_RESULTS -- --- The following WILL NOT BE ABLE TO USE the VectorUDFAdaptor to GenericUDFBetween --- because the mode = FILTER is not supported yet. +-- Verify the VectorUDFAdaptor to GenericUDFBetween works for PROJECTION and FILTER. -- create table if not exists TSINT_txt ( RNUM int , CSINT smallint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LINES TERMINATED BY '\n' @@ -10,8 +9,7 @@ PREHOOK: Output: database:default PREHOOK: Output: default@TSINT_txt POSTHOOK: query: -- SORT_QUERY_RESULTS -- --- The following WILL NOT BE ABLE TO USE the VectorUDFAdaptor to GenericUDFBetween --- because the mode = FILTER is not supported yet. +-- Verify the VectorUDFAdaptor to GenericUDFBetween works for PROJECTION and FILTER. -- create table if not exists TSINT_txt ( RNUM int , CSINT smallint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LINES TERMINATED BY '\n' @@ -70,6 +68,115 @@ POSTHOOK: Output: default@TINT POSTHOOK: Lineage: tint.cint SIMPLE [(tint_txt)tint_txt.FieldSchema(name:cint, type:int, comment:null), ] POSTHOOK: Lineage: tint.rnum SIMPLE [(tint_txt)tint_txt.FieldSchema(name:rnum, type:int, comment:null), ] tint_txt.rnum tint_txt.cint +Warning: Map Join MAPJOIN[9][bigTable=?] in task 'Map 1' is a cross product +PREHOOK: query: explain +select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +PREHOOK: type: QUERY +POSTHOOK: query: explain +select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 2 (BROADCAST_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: tint + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: rnum (type: int), cint (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + input vertices: + 1 Map 2 + Statistics: Num rows: 5 Data size: 39 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col2 (type: int), _col1 (type: int), _col3 (type: smallint), CASE WHEN (_col1 BETWEEN _col3 AND _col3) THEN ('Ok') ELSE ('NoOk') END (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 5 Data size: 39 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 39 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map 2 + Map Operator Tree: + TableScan + alias: tsint + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: rnum (type: int), csint (type: smallint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: smallint) + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[9][bigTable=?] in task 'Map 1' is a cross product +PREHOOK: query: select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +PREHOOK: type: QUERY +PREHOOK: Input: default@tint +PREHOOK: Input: default@tsint +#### A masked pattern was here #### +POSTHOOK: query: select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tint +POSTHOOK: Input: default@tsint +#### A masked pattern was here #### +tint.rnum tsint.rnum tint.cint tsint.csint between_col +0 0 NULL NULL NoOk +0 1 NULL -1 NoOk +0 2 NULL 0 NoOk +0 3 NULL 1 NoOk +0 4 NULL 10 NoOk +1 0 -1 NULL NoOk +1 1 -1 -1 Ok +1 2 -1 0 NoOk +1 3 -1 1 NoOk +1 4 -1 10 NoOk +2 0 0 NULL NoOk +2 1 0 -1 NoOk +2 2 0 0 Ok +2 3 0 1 NoOk +2 4 0 10 NoOk +3 0 1 NULL NoOk +3 1 1 -1 NoOk +3 2 1 0 NoOk +3 3 1 1 Ok +3 4 1 10 NoOk +4 0 10 NULL NoOk +4 1 10 -1 NoOk +4 2 10 0 NoOk +4 3 10 1 NoOk +4 4 10 10 Ok Warning: Map Join MAPJOIN[10][bigTable=?] in task 'Map 1' is a cross product PREHOOK: query: explain select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint @@ -123,6 +230,7 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized Map 2 Map Operator Tree: TableScan http://git-wip-us.apache.org/repos/asf/hive/blob/71725869/ql/src/test/results/clientpositive/vector_between_columns.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_between_columns.q.out b/ql/src/test/results/clientpositive/vector_between_columns.q.out index 5143074..a1bd6c6 100644 --- a/ql/src/test/results/clientpositive/vector_between_columns.q.out +++ b/ql/src/test/results/clientpositive/vector_between_columns.q.out @@ -1,7 +1,6 @@ PREHOOK: query: -- SORT_QUERY_RESULTS -- --- The following WILL NOT BE ABLE TO USE the VectorUDFAdaptor to GenericUDFBetween --- because the mode = FILTER is not supported yet. +-- Verify the VectorUDFAdaptor to GenericUDFBetween works for PROJECTION and FILTER. -- create table if not exists TSINT_txt ( RNUM int , CSINT smallint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LINES TERMINATED BY '\n' @@ -10,8 +9,7 @@ PREHOOK: Output: database:default PREHOOK: Output: default@TSINT_txt POSTHOOK: query: -- SORT_QUERY_RESULTS -- --- The following WILL NOT BE ABLE TO USE the VectorUDFAdaptor to GenericUDFBetween --- because the mode = FILTER is not supported yet. +-- Verify the VectorUDFAdaptor to GenericUDFBetween works for PROJECTION and FILTER. -- create table if not exists TSINT_txt ( RNUM int , CSINT smallint ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' LINES TERMINATED BY '\n' @@ -70,6 +68,116 @@ POSTHOOK: Output: default@TINT POSTHOOK: Lineage: tint.cint SIMPLE [(tint_txt)tint_txt.FieldSchema(name:cint, type:int, comment:null), ] POSTHOOK: Lineage: tint.rnum SIMPLE [(tint_txt)tint_txt.FieldSchema(name:rnum, type:int, comment:null), ] tint_txt.rnum tint_txt.cint +Warning: Map Join MAPJOIN[9][bigTable=?] in task 'Stage-3:MAPRED' is a cross product +PREHOOK: query: explain +select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +PREHOOK: type: QUERY +POSTHOOK: query: explain +select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +POSTHOOK: type: QUERY +Explain +STAGE DEPENDENCIES: + Stage-4 is a root stage + Stage-3 depends on stages: Stage-4 + Stage-0 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-4 + Map Reduce Local Work + Alias -> Map Local Tables: + $hdt$_0:tint + Fetch Operator + limit: -1 + Alias -> Map Local Operator Tree: + $hdt$_0:tint + TableScan + alias: tint + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: rnum (type: int), cint (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + HashTable Sink Operator + keys: + 0 + 1 + + Stage: Stage-3 + Map Reduce + Map Operator Tree: + TableScan + alias: tsint + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: rnum (type: int), csint (type: smallint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 5 Data size: 36 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 5 Data size: 39 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: int), _col2 (type: int), _col1 (type: int), _col3 (type: smallint), CASE WHEN (_col1 BETWEEN _col3 AND _col3) THEN ('Ok') ELSE ('NoOk') END (type: string) + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 5 Data size: 39 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 5 Data size: 39 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Local Work: + Map Reduce Local Work + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Map Join MAPJOIN[9][bigTable=?] in task 'Stage-3:MAPRED' is a cross product +PREHOOK: query: select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +PREHOOK: type: QUERY +PREHOOK: Input: default@tint +PREHOOK: Input: default@tsint +#### A masked pattern was here #### +POSTHOOK: query: select tint.rnum, tsint.rnum, tint.cint, tsint.csint, (case when (tint.cint between tsint.csint and tsint.csint) then "Ok" else "NoOk" end) as between_col from tint , tsint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@tint +POSTHOOK: Input: default@tsint +#### A masked pattern was here #### +tint.rnum tsint.rnum tint.cint tsint.csint between_col +0 0 NULL NULL NoOk +0 1 NULL -1 NoOk +0 2 NULL 0 NoOk +0 3 NULL 1 NoOk +0 4 NULL 10 NoOk +1 0 -1 NULL NoOk +1 1 -1 -1 Ok +1 2 -1 0 NoOk +1 3 -1 1 NoOk +1 4 -1 10 NoOk +2 0 0 NULL NoOk +2 1 0 -1 NoOk +2 2 0 0 Ok +2 3 0 1 NoOk +2 4 0 10 NoOk +3 0 1 NULL NoOk +3 1 1 -1 NoOk +3 2 1 0 NoOk +3 3 1 1 Ok +3 4 1 10 NoOk +4 0 10 NULL NoOk +4 1 10 -1 NoOk +4 2 10 0 NoOk +4 3 10 1 NoOk +4 4 10 10 Ok Warning: Map Join MAPJOIN[10][bigTable=?] in task 'Stage-3:MAPRED' is a cross product PREHOOK: query: explain select tint.rnum, tsint.rnum, tint.cint, tsint.csint from tint , tsint where tint.cint between tsint.csint and tsint.csint @@ -136,6 +244,7 @@ STAGE PLANS: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized Local Work: Map Reduce Local Work
