HIVE-12209: Vectorize simple UDFs with null arguments (Gopal V, reviewed by Sergey Shelukhin)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/db2c5009 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/db2c5009 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/db2c5009 Branch: refs/heads/master-fixed Commit: db2c5009b243aeb5be09225b03476d1c12ebef84 Parents: 492a10f Author: Gopal V <[email protected]> Authored: Mon Nov 2 19:42:35 2015 -0800 Committer: Gopal V <[email protected]> Committed: Mon Nov 2 19:42:35 2015 -0800 ---------------------------------------------------------------------- .../ql/exec/vector/VectorizationContext.java | 7 +- .../ql/exec/vector/udf/VectorUDFArgDesc.java | 19 ++-- .../queries/clientpositive/vectorized_case.q | 19 ++++ .../clientpositive/spark/vectorized_case.q.out | 109 +++++++++++++++++-- .../clientpositive/tez/vectorized_case.q.out | 109 +++++++++++++++++-- .../clientpositive/vectorized_case.q.out | 69 ++++++++++++ 6 files changed, 301 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/db2c5009/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 3489c9c..e7a829e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -2022,12 +2022,7 @@ public class VectorizationContext { variableArgPositions.add(i); argDescs[i].setVariable(getInputColumnIndex(((ExprNodeColumnDesc) child).getColumn())); } else if (child instanceof ExprNodeConstantDesc) { - if (((ExprNodeConstantDesc) child).getValue() == null) { - // cannot handle constant null at the moment - throw new HiveException("Unable to vectorize custom UDF. Custom udf containing " - + "constant null argument cannot be currently vectorized."); - } - // this is a constant + // this is a constant (or null) argDescs[i].setConstant((ExprNodeConstantDesc) child); } else { throw new HiveException("Unable to vectorize custom UDF. Encountered unsupported expr desc : " http://git-wip-us.apache.org/repos/asf/hive/blob/db2c5009/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java index e113980..6abfe63 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/udf/VectorUDFArgDesc.java @@ -59,13 +59,18 @@ public class VectorUDFArgDesc implements Serializable { * during initialization. */ public void prepareConstant() { - PrimitiveCategory pc = ((PrimitiveTypeInfo) constExpr.getTypeInfo()) - .getPrimitiveCategory(); - - // Convert from Java to Writable - Object writableValue = PrimitiveObjectInspectorFactory - .getPrimitiveJavaObjectInspector(pc).getPrimitiveWritableObject( - constExpr.getValue()); + final Object writableValue; + if (constExpr != null) { + PrimitiveCategory pc = ((PrimitiveTypeInfo) constExpr.getTypeInfo()) + .getPrimitiveCategory(); + + // Convert from Java to Writable + writableValue = PrimitiveObjectInspectorFactory + .getPrimitiveJavaObjectInspector(pc).getPrimitiveWritableObject( + constExpr.getValue()); + } else { + writableValue = null; + } constObjVal = new GenericUDF.DeferredJavaObject(writableValue); } http://git-wip-us.apache.org/repos/asf/hive/blob/db2c5009/ql/src/test/queries/clientpositive/vectorized_case.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vectorized_case.q b/ql/src/test/queries/clientpositive/vectorized_case.q index 8799fbb..e74bf82 100644 --- a/ql/src/test/queries/clientpositive/vectorized_case.q +++ b/ql/src/test/queries/clientpositive/vectorized_case.q @@ -1,4 +1,5 @@ set hive.explain.user=false; +set hive.fetch.task.conversion=none; set hive.vectorized.execution.enabled = true ; explain @@ -36,3 +37,21 @@ where csmallint = 418 or csmallint = 12205 or csmallint = 10583 ; +explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +; http://git-wip-us.apache.org/repos/asf/hive/blob/db2c5009/ql/src/test/results/clientpositive/spark/vectorized_case.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/vectorized_case.q.out b/ql/src/test/results/clientpositive/spark/vectorized_case.q.out index c2250e6..ade9cfe 100644 --- a/ql/src/test/results/clientpositive/spark/vectorized_case.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorized_case.q.out @@ -35,21 +35,40 @@ or csmallint = 12205 or csmallint = 10583 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-0 is a root stage + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE ('c') END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN ('b') ELSE ('c') END (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: - TableScan - alias: alltypesorc - Filter Operator - predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) - Select Operator - expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE ('c') END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN ('b') ELSE ('c') END (type: string) - outputColumnNames: _col0, _col1, _col2 - ListSink + ListSink PREHOOK: query: select csmallint, @@ -93,3 +112,75 @@ POSTHOOK: Input: default@alltypesorc 10583 c c 418 a a 12205 b b +PREHOOK: query: explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE (null) END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN (null) ELSE ('c') END (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + http://git-wip-us.apache.org/repos/asf/hive/blob/db2c5009/ql/src/test/results/clientpositive/tez/vectorized_case.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/vectorized_case.q.out b/ql/src/test/results/clientpositive/tez/vectorized_case.q.out index c2250e6..136714d 100644 --- a/ql/src/test/results/clientpositive/tez/vectorized_case.q.out +++ b/ql/src/test/results/clientpositive/tez/vectorized_case.q.out @@ -35,21 +35,40 @@ or csmallint = 12205 or csmallint = 10583 POSTHOOK: type: QUERY STAGE DEPENDENCIES: - Stage-0 is a root stage + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE ('c') END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN ('b') ELSE ('c') END (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: - TableScan - alias: alltypesorc - Filter Operator - predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) - Select Operator - expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE ('c') END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN ('b') ELSE ('c') END (type: string) - outputColumnNames: _col0, _col1, _col2 - ListSink + ListSink PREHOOK: query: select csmallint, @@ -93,3 +112,75 @@ POSTHOOK: Input: default@alltypesorc 10583 c c 418 a a 12205 b b +PREHOOK: query: explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE (null) END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN (null) ELSE ('c') END (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + http://git-wip-us.apache.org/repos/asf/hive/blob/db2c5009/ql/src/test/results/clientpositive/vectorized_case.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vectorized_case.q.out b/ql/src/test/results/clientpositive/vectorized_case.q.out index 73bf12d..347a93e 100644 --- a/ql/src/test/results/clientpositive/vectorized_case.q.out +++ b/ql/src/test/results/clientpositive/vectorized_case.q.out @@ -109,3 +109,72 @@ POSTHOOK: Input: default@alltypesorc 10583 c c 418 a a 12205 b b +PREHOOK: query: explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + csmallint, + case + when csmallint = 418 then "a" + when csmallint = 12205 then "b" + else null + end, + case csmallint + when 418 then "a" + when 12205 then null + else "c" + end +from alltypesorc +where csmallint = 418 +or csmallint = 12205 +or csmallint = 10583 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((csmallint = 418) or (csmallint = 12205) or (csmallint = 10583)) (type: boolean) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: csmallint (type: smallint), CASE WHEN ((csmallint = 418)) THEN ('a') WHEN ((csmallint = 12205)) THEN ('b') ELSE (null) END (type: string), CASE (csmallint) WHEN (418) THEN ('a') WHEN (12205) THEN (null) ELSE ('c') END (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +
