HIVE-13340 : Vectorization: from_unixtime UDF shim (Gopal V via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <hashut...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/7049f49d Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/7049f49d Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/7049f49d Branch: refs/heads/llap Commit: 7049f49d9574587b2eb5896bab8415d7cd7c1ef1 Parents: e7f69f0 Author: Gopal V <gop...@apache.org> Authored: Wed Mar 23 02:07:00 2016 -0800 Committer: Ashutosh Chauhan <hashut...@apache.org> Committed: Wed Apr 13 12:00:07 2016 -0700 ---------------------------------------------------------------------- .../ql/exec/vector/VectorizationContext.java | 2 + .../optimizer/ConstantPropagateProcFactory.java | 22 ++- .../hive/ql/optimizer/physical/Vectorizer.java | 2 + ql/src/test/queries/clientpositive/foldts.q | 20 +++ ql/src/test/results/clientpositive/foldts.q.out | 154 +++++++++++++++++++ .../clientpositive/udf_to_unix_timestamp.q.out | 2 +- 6 files changed, 197 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/7049f49d/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java index 329c1d5..86025ef 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java @@ -102,6 +102,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.GroupByDesc; import org.apache.hadoop.hive.ql.udf.SettableUDF; import org.apache.hadoop.hive.ql.udf.UDFConv; +import org.apache.hadoop.hive.ql.udf.UDFFromUnixTime; import org.apache.hadoop.hive.ql.udf.UDFHex; import org.apache.hadoop.hive.ql.udf.UDFRegExpExtract; import org.apache.hadoop.hive.ql.udf.UDFRegExpReplace; @@ -761,6 +762,7 @@ public class VectorizationContext { || udfClass.equals(UDFRegExpExtract.class) || udfClass.equals(UDFRegExpReplace.class) || udfClass.equals(UDFConv.class) + || udfClass.equals(UDFFromUnixTime.class) && isIntFamily(arg0Type(expr)) || isCastToIntFamily(udfClass) && isStringFamily(arg0Type(expr)) || isCastToFloatFamily(udfClass) && isStringFamily(arg0Type(expr)) || udfClass.equals(UDFToString.class) && http://git-wip-us.apache.org/repos/asf/hive/blob/7049f49d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java index bdc7448..8c1f34d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ConstantPropagateProcFactory.java @@ -77,6 +77,8 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotNull; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNull; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUnixTimeStamp; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFWhen; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector; @@ -229,7 +231,7 @@ public final class ConstantPropagateProcFactory { public static ExprNodeDesc foldExpr(ExprNodeGenericFuncDesc funcDesc) { GenericUDF udf = funcDesc.getGenericUDF(); - if (!isDeterministicUdf(udf)) { + if (!isDeterministicUdf(udf, funcDesc.getChildren())) { return funcDesc; } return evaluateFunction(funcDesc.getGenericUDF(),funcDesc.getChildren(), funcDesc.getChildren()); @@ -347,7 +349,7 @@ public final class ConstantPropagateProcFactory { } // Don't evaluate nondeterministic function since the value can only calculate during runtime. - if (!isDeterministicUdf(udf)) { + if (!isDeterministicUdf(udf, newExprs)) { if (LOG.isDebugEnabled()) { LOG.debug("Function " + udf.getClass() + " is undeterministic. Don't evalulate immediately."); } @@ -406,7 +408,7 @@ public final class ConstantPropagateProcFactory { } // Don't evaluate nondeterministic function since the value can only calculate during runtime. - if (!isDeterministicUdf(udf)) { + if (!isDeterministicUdf(udf, newExprs)) { if (LOG.isDebugEnabled()) { LOG.debug("Function " + udf.getClass() + " is undeterministic. Don't evaluate immediately."); } @@ -457,12 +459,17 @@ public final class ConstantPropagateProcFactory { return desc; } - private static boolean isDeterministicUdf(GenericUDF udf) { + private static boolean isDeterministicUdf(GenericUDF udf, List<ExprNodeDesc> children) { UDFType udfType = udf.getClass().getAnnotation(UDFType.class); if (udf instanceof GenericUDFBridge) { udfType = ((GenericUDFBridge) udf).getUdfClass().getAnnotation(UDFType.class); } if (udfType.deterministic() == false) { + if (udf.getClass().equals(GenericUDFUnixTimeStamp.class) + && children != null && children.size() > 0) { + // unix_timestamp is polymorphic (ignore class annotations) + return true; + } return false; } @@ -817,6 +824,13 @@ public final class ConstantPropagateProcFactory { } } + if (udf instanceof GenericUDFUnixTimeStamp) { + if (newExprs.size() >= 1) { + // unix_timestamp(args) -> to_unix_timestamp(args) + return ExprNodeGenericFuncDesc.newInstance(new GenericUDFToUnixTimeStamp(), newExprs); + } + } + return null; } http://git-wip-us.apache.org/repos/asf/hive/blob/7049f49d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index d806b97..1ddd9be 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -126,6 +126,7 @@ import org.apache.hadoop.hive.ql.udf.UDFCos; import org.apache.hadoop.hive.ql.udf.UDFDayOfMonth; import org.apache.hadoop.hive.ql.udf.UDFDegrees; import org.apache.hadoop.hive.ql.udf.UDFExp; +import org.apache.hadoop.hive.ql.udf.UDFFromUnixTime; import org.apache.hadoop.hive.ql.udf.UDFHex; import org.apache.hadoop.hive.ql.udf.UDFHour; import org.apache.hadoop.hive.ql.udf.UDFLength; @@ -247,6 +248,7 @@ public class Vectorizer implements PhysicalPlanResolver { supportedGenericUDFs.add(UDFSecond.class); supportedGenericUDFs.add(UDFWeekOfYear.class); supportedGenericUDFs.add(GenericUDFToUnixTimeStamp.class); + supportedGenericUDFs.add(UDFFromUnixTime.class); supportedGenericUDFs.add(GenericUDFDateAdd.class); supportedGenericUDFs.add(GenericUDFDateSub.class); http://git-wip-us.apache.org/repos/asf/hive/blob/7049f49d/ql/src/test/queries/clientpositive/foldts.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/foldts.q b/ql/src/test/queries/clientpositive/foldts.q new file mode 100644 index 0000000..362cac2 --- /dev/null +++ b/ql/src/test/queries/clientpositive/foldts.q @@ -0,0 +1,20 @@ + +set hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +explain +select ctimestamp1, unix_timestamp(ctimestamp1), to_unix_timestamp(ctimestamp1) from alltypesorc limit 1; + +select ctimestamp1, unix_timestamp(ctimestamp1), to_unix_timestamp(ctimestamp1) from alltypesorc limit 1; + +create temporary table src1orc stored as orc as select * from src1; + +explain +select from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1; + +select from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1; + +explain +select from_unixtime(unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1; + +select from_unixtime(unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1; http://git-wip-us.apache.org/repos/asf/hive/blob/7049f49d/ql/src/test/results/clientpositive/foldts.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/foldts.q.out b/ql/src/test/results/clientpositive/foldts.q.out new file mode 100644 index 0000000..4c78495 --- /dev/null +++ b/ql/src/test/results/clientpositive/foldts.q.out @@ -0,0 +1,154 @@ +PREHOOK: query: explain +select ctimestamp1, unix_timestamp(ctimestamp1), to_unix_timestamp(ctimestamp1) from alltypesorc limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select ctimestamp1, unix_timestamp(ctimestamp1), to_unix_timestamp(ctimestamp1) from alltypesorc limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctimestamp1 (type: timestamp), to_unix_timestamp(ctimestamp1) (type: bigint), to_unix_timestamp(ctimestamp1) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select ctimestamp1, unix_timestamp(ctimestamp1), to_unix_timestamp(ctimestamp1) from alltypesorc limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ctimestamp1, unix_timestamp(ctimestamp1), to_unix_timestamp(ctimestamp1) from alltypesorc limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +1969-12-31 15:59:46.674 -13 -13 +PREHOOK: query: create temporary table src1orc stored as orc as select * from src1 +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@src1 +PREHOOK: Output: database:default +PREHOOK: Output: default@src1orc +POSTHOOK: query: create temporary table src1orc stored as orc as select * from src1 +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@src1 +POSTHOOK: Output: database:default +POSTHOOK: Output: default@src1orc +PREHOOK: query: explain +select from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +Wednesday +PREHOOK: query: explain +select from_unixtime(unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select from_unixtime(unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: from_unixtime(to_unix_timestamp(ctimestamp1), 'EEEE') (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 215 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select from_unixtime(unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select from_unixtime(unix_timestamp(ctimestamp1), 'EEEE') from alltypesorc limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +Wednesday http://git-wip-us.apache.org/repos/asf/hive/blob/7049f49d/ql/src/test/results/clientpositive/udf_to_unix_timestamp.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/udf_to_unix_timestamp.q.out b/ql/src/test/results/clientpositive/udf_to_unix_timestamp.q.out index ce82461..3d31664 100644 --- a/ql/src/test/results/clientpositive/udf_to_unix_timestamp.q.out +++ b/ql/src/test/results/clientpositive/udf_to_unix_timestamp.q.out @@ -103,7 +103,7 @@ STAGE PLANS: alias: src Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE Filter Operator - predicate: (unix_timestamp(key) > 10) (type: boolean) + predicate: (to_unix_timestamp(key) > 10) (type: boolean) Statistics: Num rows: 166 Data size: 1763 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: key (type: string), value (type: string)