Repository: hive
Updated Branches:
  refs/heads/master 093341624 -> d35ad0677

HIVE-14155: Vectorization: Custom UDF Vectorization annotations are ignored (Gopal V, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/d35ad067
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/d35ad067
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/d35ad067

Branch: refs/heads/master
Commit: d35ad06779650c8b5f6c259413bf03e9909ba72f
Parents: 0933416
Author: Gopal V <gop...@apache.org>
Authored: Sat Aug 27 01:25:28 2016 -0700
Committer: Gopal V <gop...@apache.org>
Committed: Sat Aug 27 01:25:36 2016 -0700

----------------------------------------------------------------------
 itests/custom-udfs/pom.xml                      |   1 +
 .../udf-vectorized-badexample/pom.xml           |  43 ++++++++
 .../hive/it/custom/udfs/GenericUDFRot13.java    |  32 ++++++
 .../custom/udfs/vector/VectorStringRot13.java   |  46 ++++++++
 .../ql/exec/vector/VectorizationContext.java    | 106 ++++++++++---------
 .../test/queries/clientpositive/vector_udf3.q   |  13 +++
 .../results/clientpositive/vector_udf3.q.out    |  76 +++++++++++++
 7 files changed, 266 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/itests/custom-udfs/pom.xml
----------------------------------------------------------------------
diff --git a/itests/custom-udfs/pom.xml b/itests/custom-udfs/pom.xml
index 3e7443c..b230b41 100644
--- a/itests/custom-udfs/pom.xml
+++ b/itests/custom-udfs/pom.xml
@@ -42,6 +42,7 @@ limitations under the License.
     <module>udf-classloader-util</module>
     <module>udf-classloader-udf1</module>
     <module>udf-classloader-udf2</module>
+    <module>udf-vectorized-badexample</module>
   </modules>
 
   <dependencies>


http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/itests/custom-udfs/udf-vectorized-badexample/pom.xml
----------------------------------------------------------------------
diff --git a/itests/custom-udfs/udf-vectorized-badexample/pom.xml b/itests/custom-udfs/udf-vectorized-badexample/pom.xml
new file mode 100644
index 0000000..35c1a2f
--- /dev/null
+++ b/itests/custom-udfs/udf-vectorized-badexample/pom.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.hive</groupId>
+    <artifactId>hive-it-custom-udfs</artifactId>
+    <version>2.2.0-SNAPSHOT</version>
+    <relativePath>../pom.xml</relativePath>
+  </parent>
+
+  <groupId>org.apache.hive.hive-it-custom-udfs</groupId>
+  <artifactId>udf-vectorized-badexample</artifactId>
+  <packaging>jar</packaging>
+  <name>Hive Integration - Custom UDFs - udf-vectorized-badexample</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.hive.hive-it-custom-udfs</groupId>
+      <artifactId>udf-classloader-util</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+  </dependencies>
+
+  <properties>
+    <hive.path.to.root>../../..</hive.path.to.root>
+  </properties>
+
+</project>


http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java
----------------------------------------------------------------------
diff --git a/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java b/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java
new file mode 100644
index 0000000..8941175
--- /dev/null
+++ b/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/GenericUDFRot13.java
@@ -0,0 +1,32 @@
+package hive.it.custom.udfs;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import hive.it.custom.udfs.vector.VectorStringRot13;
+
+@VectorizedExpressions(value = { VectorStringRot13.class })
+public class GenericUDFRot13 extends GenericUDF {
+
+  @Override
+  public Object evaluate(DeferredObject[] arg0) throws HiveException {
+    /* this is the bad part - the vectorized UDF returns the right result */
+    return new Text("Unvectorized");
+  }
+
+  @Override
+  public String getDisplayString(String[] arg0) {
+    return String.format("Rot13(%s)", arg0[0]);
+  }
+
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arg0)
+      throws UDFArgumentException {
+    return PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+  }
+
+}
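
Side note on the test design: the row-mode evaluate() above deliberately returns "Unvectorized" so that the new q-file test can tell which execution path actually ran. For contrast only (not part of the committed patch), a consistent row-mode implementation would apply the same mapping as the vectorized expression shown further below; a minimal sketch, with a made-up helper named rot13Char, could look roughly like this inside the same class:

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
      Object arg = arg0[0].get();
      if (arg == null) {
        return null;
      }
      String s = arg.toString();
      StringBuilder sb = new StringBuilder(s.length());
      for (int i = 0; i < s.length(); i++) {
        // same per-character mapping as VectorStringRot13.rot13()
        sb.append(rot13Char(s.charAt(i)));
      }
      return new Text(sb.toString());
    }

    private static char rot13Char(char c) {
      if ((c >= 'a' && c <= 'm') || (c >= 'A' && c <= 'M')) {
        return (char) (c + 13);
      }
      if ((c >= 'n' && c <= 'z') || (c >= 'N' && c <= 'Z')) {
        return (char) (c - 13);
      }
      return c;
    }

With that body, the row-mode and vectorized paths would return identical results, which is exactly what the committed test class avoids so that the divergence is observable.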

http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java
----------------------------------------------------------------------
diff --git a/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java b/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java
new file mode 100644
index 0000000..7fbfe32
--- /dev/null
+++ b/itests/custom-udfs/udf-vectorized-badexample/src/main/java/hive/it/custom/udfs/vector/VectorStringRot13.java
@@ -0,0 +1,46 @@
+package hive.it.custom.udfs.vector;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor;
+import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUnaryUDF;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.StringUnaryUDFDirect;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
+import org.apache.orc.impl.TreeReaderFactory.BytesColumnVectorUtil;
+
+public class VectorStringRot13 extends StringUnaryUDFDirect {
+
+  public VectorStringRot13(int inputColumn, int outputColumn) {
+    super(inputColumn, outputColumn);
+  }
+
+  public VectorStringRot13() {
+    super();
+  }
+
+  @Override
+  protected void func(BytesColumnVector outV, byte[][] vector, int[] start,
+      int[] length, int i) {
+    int off = start[i];
+    int len = length[i];
+    byte[] src = vector[i];
+    byte[] dst = new byte[len];
+    for (int j = 0; j < len ; j++) {
+      dst[j] = rot13(src[off+j]);
+    }
+    outV.setVal(i, dst, 0, length[i]);
+  }
+
+  private byte rot13(byte b) {
+    if (b >= 'a' && b <= 'm' || b >= 'A' && b <= 'M' ) {
+      return (byte) (b+13);
+    }
+    if (b >= 'n' && b <= 'z' || b >= 'N' && b <= 'Z') {
+      return (byte) (b-13);
+    }
+    return b;
+  }
+}
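
For illustration only (not part of the committed patch): StringUnaryUDFDirect calls func() once per row of the batch, handing it the raw byte arrays of the input BytesColumnVector, so the expression can be exercised without running a query. A minimal driver sketch, assuming the two itests classes above are on the classpath; the class name Rot13BatchDemo and the sample value are invented here:

    import java.nio.charset.StandardCharsets;

    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

    import hive.it.custom.udfs.vector.VectorStringRot13;

    public class Rot13BatchDemo {
      public static void main(String[] args) {
        // Two columns: column 0 holds the input strings, column 1 receives the output.
        VectorizedRowBatch batch = new VectorizedRowBatch(2);
        BytesColumnVector in = new BytesColumnVector();
        BytesColumnVector out = new BytesColumnVector();
        in.initBuffer();
        out.initBuffer();
        batch.cols[0] = in;
        batch.cols[1] = out;

        // One row of input.
        byte[] value = "Hello, Hive!".getBytes(StandardCharsets.UTF_8);
        in.setVal(0, value, 0, value.length);
        batch.size = 1;

        // inputColumn = 0, outputColumn = 1, matching the (int, int) constructor above.
        VectorStringRot13 rot13 = new VectorStringRot13(0, 1);
        rot13.evaluate(batch);

        String result = new String(out.vector[0], out.start[0], out.length[0],
            StandardCharsets.UTF_8);
        System.out.println(result);   // expected: Uryyb, Uvir!
      }
    }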

http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 3f71fa8..f088941 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -531,58 +531,54 @@ public class VectorizationContext {
       ve = getColumnVectorExpression((ExprNodeColumnDesc) exprDesc, mode);
     } else if (exprDesc instanceof ExprNodeGenericFuncDesc) {
       ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc;
-      if (isCustomUDF(expr)) {
-        ve = getCustomUDFExpression(expr, mode);
-      } else {
-
-        // Add cast expression if needed. Child expressions of a udf may return different data types
-        // and that would require converting their data types to evaluate the udf.
-        // For example decimal column added to an integer column would require integer column to be
-        // cast to decimal.
-        List<ExprNodeDesc> childExpressions = getChildExpressionsWithImplicitCast(expr.getGenericUDF(),
-            exprDesc.getChildren(), exprDesc.getTypeInfo());
-        ve = getGenericUdfVectorExpression(expr.getGenericUDF(),
-            childExpressions, mode, exprDesc.getTypeInfo());
-        if (ve == null) {
-          // Ok, no vectorized class available. No problem -- try to use the VectorUDFAdaptor
-          // when configured.
-          //
-          // NOTE: We assume if hiveVectorAdaptorUsageMode has not been set it because we are
-          // executing a test that didn't create a HiveConf, etc. No usage of VectorUDFAdaptor in
-          // that case.
-          if (hiveVectorAdaptorUsageMode != null) {
-            switch (hiveVectorAdaptorUsageMode) {
-            case NONE:
-              // No VectorUDFAdaptor usage.
+      // Add cast expression if needed. Child expressions of a udf may return different data types
+      // and that would require converting their data types to evaluate the udf.
+      // For example decimal column added to an integer column would require integer column to be
+      // cast to decimal.
+      // Note: this is a no-op for custom UDFs
+      List<ExprNodeDesc> childExpressions = getChildExpressionsWithImplicitCast(expr.getGenericUDF(),
+          exprDesc.getChildren(), exprDesc.getTypeInfo());
+      ve = getGenericUdfVectorExpression(expr.getGenericUDF(),
+          childExpressions, mode, exprDesc.getTypeInfo());
+      if (ve == null) {
+        // Ok, no vectorized class available. No problem -- try to use the VectorUDFAdaptor
+        // when configured.
+        //
+        // NOTE: We assume if hiveVectorAdaptorUsageMode has not been set it because we are
+        // executing a test that didn't create a HiveConf, etc. No usage of VectorUDFAdaptor in
+        // that case.
+        if (hiveVectorAdaptorUsageMode != null) {
+          switch (hiveVectorAdaptorUsageMode) {
+          case NONE:
+            // No VectorUDFAdaptor usage.
+            throw new HiveException(
+                "Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
+                  + " because hive.vectorized.adaptor.usage.mode=none");
+          case CHOSEN:
+            if (isNonVectorizedPathUDF(expr, mode)) {
+              ve = getCustomUDFExpression(expr, mode);
+            } else {
               throw new HiveException(
                   "Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
-                  + " because hive.vectorized.adaptor.usage.mode=none");
-            case CHOSEN:
-              if (isNonVectorizedPathUDF(expr, mode)) {
-                ve = getCustomUDFExpression(expr, mode);
-              } else {
-                throw new HiveException(
-                    "Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
-                    + " because hive.vectorized.adaptor.usage.mode=chosen "
-                    + " and the UDF wasn't one of the chosen ones");
-              }
-              break;
-            case ALL:
-              if (LOG.isDebugEnabled()) {
-                LOG.debug("We will try to use the VectorUDFAdaptor for " + exprDesc.toString()
-                    + " because hive.vectorized.adaptor.usage.mode=all");
-              }
-              ve = getCustomUDFExpression(expr, mode);
-              break;
-            default:
-              throw new RuntimeException("Unknown hive vector adaptor usage mode " +
-                  hiveVectorAdaptorUsageMode.name());
+                  + " because hive.vectorized.adaptor.usage.mode=chosen "
+                  + " and the UDF wasn't one of the chosen ones");
             }
-            if (ve == null) {
-              throw new HiveException(
-                  "Unable vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
-                      + " even for the VectorUDFAdaptor");
+            break;
+          case ALL:
+            if (LOG.isDebugEnabled()) {
+              LOG.debug("We will try to use the VectorUDFAdaptor for " + exprDesc.toString()
+                + " because hive.vectorized.adaptor.usage.mode=all");
             }
+            ve = getCustomUDFExpression(expr, mode);
+            break;
+          default:
+            throw new RuntimeException("Unknown hive vector adaptor usage mode " +
+              hiveVectorAdaptorUsageMode.name());
+          }
+          if (ve == null) {
+            throw new HiveException(
+                "Unable vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
+                  + " even for the VectorUDFAdaptor");
           }
         }
       }
@@ -650,8 +646,13 @@
    */
   private List<ExprNodeDesc> getChildExpressionsWithImplicitCast(GenericUDF genericUDF,
       List<ExprNodeDesc> children, TypeInfo returnType) throws HiveException {
-    if (isExcludedFromCast(genericUDF)) {
+    if (isCustomUDF(genericUDF.getUdfName())) {
+      // no implicit casts possible
+      return children;
+    }
+
+    if (isExcludedFromCast(genericUDF)) {
       // No implicit cast needed
       return children;
     }
 
@@ -946,9 +947,12 @@
   }
 
   // Return true if this is a custom UDF or custom GenericUDF.
-  // This is for use only in the planner. It will fail in a task.
+  // These two functions are for use only in the planner. They will fail in a task.
   public static boolean isCustomUDF(ExprNodeGenericFuncDesc expr) {
-    String udfName = expr.getFuncText();
+    return isCustomUDF(expr.getFuncText());
+  }
+
+  private static boolean isCustomUDF(String udfName) {
     if (udfName == null) {
       return false;
     }
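
For illustration only (not part of the committed patch): the fix above boils down to ordering. getGenericUdfVectorExpression() now runs for custom UDFs as well, so a @VectorizedExpressions annotation on a class such as GenericUDFRot13 is considered before any VectorUDFAdaptor fallback governed by hive.vectorized.adaptor.usage.mode. The annotation is runtime-retained and can be read back with plain reflection; a small sketch, where the helper class VectorizedCandidates is hypothetical:

    import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
    import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;

    import hive.it.custom.udfs.GenericUDFRot13;

    public class VectorizedCandidates {
      // Lists the VectorExpression classes a GenericUDF advertises via @VectorizedExpressions,
      // or reports that the class carries no such annotation.
      public static void dump(Class<? extends GenericUDF> udfClass) {
        VectorizedExpressions ann = udfClass.getAnnotation(VectorizedExpressions.class);
        if (ann == null) {
          System.out.println(udfClass.getName() + ": no @VectorizedExpressions annotation");
          return;
        }
        for (Class<? extends VectorExpression> veClass : ann.value()) {
          System.out.println(udfClass.getName() + " -> " + veClass.getName());
        }
      }

      public static void main(String[] args) {
        dump(GenericUDFRot13.class);   // expected: ... -> hive.it.custom.udfs.vector.VectorStringRot13
      }
    }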

http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/ql/src/test/queries/clientpositive/vector_udf3.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_udf3.q b/ql/src/test/queries/clientpositive/vector_udf3.q
new file mode 100644
index 0000000..8a4df79
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_udf3.q
@@ -0,0 +1,13 @@
+ADD JAR ivy://org.apache.hive.hive-it-custom-udfs:udf-vectorized-badexample:+;
+
+CREATE TEMPORARY FUNCTION rot13 as 'hive.it.custom.udfs.GenericUDFRot13';
+
+set hive.vectorized.execution.enabled=true;
+
+EXPLAIN SELECT rot13(cstring1) from alltypesorc;
+
+SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10;
+
+set hive.vectorized.execution.enabled=false;
+
+SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10;


http://git-wip-us.apache.org/repos/asf/hive/blob/d35ad067/ql/src/test/results/clientpositive/vector_udf3.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_udf3.q.out b/ql/src/test/results/clientpositive/vector_udf3.q.out
new file mode 100644
index 0000000..7c6a90a
--- /dev/null
+++ b/ql/src/test/results/clientpositive/vector_udf3.q.out
@@ -0,0 +1,76 @@
+PREHOOK: query: CREATE TEMPORARY FUNCTION rot13 as 'hive.it.custom.udfs.GenericUDFRot13'
+PREHOOK: type: CREATEFUNCTION
+PREHOOK: Output: rot13
+POSTHOOK: query: CREATE TEMPORARY FUNCTION rot13 as 'hive.it.custom.udfs.GenericUDFRot13'
+POSTHOOK: type: CREATEFUNCTION
+POSTHOOK: Output: rot13
+PREHOOK: query: EXPLAIN SELECT rot13(cstring1) from alltypesorc
+PREHOOK: type: QUERY
+POSTHOOK: query: EXPLAIN SELECT rot13(cstring1) from alltypesorc
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: alltypesorc
+            Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: Rot13(cstring1) (type: string)
+              outputColumnNames: _col0
+              Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+      Execution mode: vectorized
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+yy2GiGM	ll2TvTZ
+yxN0212hM17E8J8bJj8D7b	lkA0212uZ17R8W8oWw8Q7o
+ywA68u76Jv06axCv451avL4	ljN68h76Wi06nkPi451niY4
+yvNv1q	liAi1d
+yv3gnG4a33hD7bIm7oxE5rw	li3taT4n33uQ7oVz7bkR5ej
+yv1js	li1wf
+yujO07KWj	lhwB07XJw
+ytpx1RL8F2I	lgck1EY8S2V
+ytj7g5W	lgw7t5J
+ytgaJW1Gvrkv5wFUJU2y1S	lgtnWJ1Tiexi5jSHWH2l1F
+PREHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT cstring1, rot13(cstring1) from alltypesorc order by cstring1 desc limit 10
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+yy2GiGM	Unvectorized
+yxN0212hM17E8J8bJj8D7b	Unvectorized
+ywA68u76Jv06axCv451avL4	Unvectorized
+yvNv1q	Unvectorized
+yv3gnG4a33hD7bIm7oxE5rw	Unvectorized
+yv1js	Unvectorized
+yujO07KWj	Unvectorized
+ytpx1RL8F2I	Unvectorized
+ytj7g5W	Unvectorized
+ytgaJW1Gvrkv5wFUJU2y1S	Unvectorized