HIVE-13267: Vectorization: Add SelectLikeStringColScalar for non-filter operations (Gopal V, reviewed by Matt McCline)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/51609a0f Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/51609a0f Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/51609a0f Branch: refs/heads/java8 Commit: 51609a0f242ca96de2d6c92b57d4343f89e0d9cc Parents: d2dac26 Author: Gopal V <[email protected]> Authored: Wed May 25 16:50:33 2016 -0700 Committer: Gopal V <[email protected]> Committed: Wed May 25 16:50:33 2016 -0700 ---------------------------------------------------------------------- ...AbstractFilterStringColLikeStringScalar.java | 2 +- .../SelectStringColLikeStringScalar.java | 179 +++++++++++++++++++ .../org/apache/hadoop/hive/ql/udf/UDFLike.java | 3 +- .../test/queries/clientpositive/vector_udf2.q | 29 +++ .../results/clientpositive/vector_udf2.q.out | 110 ++++++++++++ 5 files changed, 321 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/51609a0f/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java index b70beef..c50af8d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java @@ -63,7 +63,7 @@ public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpr * @param pattern * @return */ - private Checker createChecker(String pattern) { + Checker createChecker(String pattern) { for (CheckerFactory checkerFactory : getCheckerFactories()) { Checker checker = checkerFactory.tryCreate(pattern); if (checker != null) { http://git-wip-us.apache.org/repos/asf/hive/blob/51609a0f/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/SelectStringColLikeStringScalar.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/SelectStringColLikeStringScalar.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/SelectStringColLikeStringScalar.java new file mode 100644 index 0000000..b914196 --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/SelectStringColLikeStringScalar.java @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.expressions; + +import java.nio.charset.StandardCharsets; + +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.Descriptor; +import org.apache.hadoop.hive.ql.exec.vector.expressions.AbstractFilterStringColLikeStringScalar.Checker; +import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; + +public class SelectStringColLikeStringScalar extends VectorExpression { + + private static final long serialVersionUID = 1L; + + private int colNum; + private int outputColumn; + private byte[] pattern; + transient Checker checker = null; + + public SelectStringColLikeStringScalar() { + super(); + } + + public SelectStringColLikeStringScalar(int colNum, byte[] pattern, int outputColumn) { + super(); + this.colNum = colNum; + this.pattern = pattern; + this.outputColumn = outputColumn; + } + + @Override + public void evaluate(VectorizedRowBatch batch) { + if (checker == null) { + checker = borrowChecker(); + } + + if (childExpressions != null) { + super.evaluateChildren(batch); + } + + BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum]; + int[] sel = batch.selected; + boolean[] nullPos = inputColVector.isNull; + int n = batch.size; + byte[][] vector = inputColVector.vector; + int[] length = inputColVector.length; + int[] start = inputColVector.start; + + LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn]; + long[] outputVector = outV.vector; + + // return immediately if batch is empty + if (n == 0) { + return; + } + + outV.noNulls = inputColVector.noNulls; + outV.isRepeating = inputColVector.isRepeating; + + if (inputColVector.noNulls) { + if (inputColVector.isRepeating) { + outputVector[0] = (checker.check(vector[0], start[0], length[0]) ? 1 : 0); + outV.isNull[0] = false; + } else if (batch.selectedInUse) { + for (int j = 0; j != n; j++) { + int i = sel[j]; + outputVector[i] = (checker.check(vector[i], start[i], length[i]) ? 1 : 0); + outV.isNull[i] = false; + } + } else { + for (int i = 0; i != n; i++) { + outputVector[i] = (checker.check(vector[i], start[i], length[i]) ? 1 : 0); + outV.isNull[i] = false; + } + } + } else { + if (inputColVector.isRepeating) { + //All must be selected otherwise size would be zero. Repeating property will not change. + if (!nullPos[0]) { + outputVector[0] = (checker.check(vector[0], start[0], length[0]) ? 1 : 0); + outV.isNull[0] = false; + } else { + outputVector[0] = LongColumnVector.NULL_VALUE; + outV.isNull[0] = true; + } + } else if (batch.selectedInUse) { + for (int j = 0; j != n; j++) { + int i = sel[j]; + if (!nullPos[i]) { + outputVector[i] = (checker.check(vector[i], start[i], length[i]) ? 1 : 0); + outV.isNull[i] = false; + } else { + outputVector[0] = LongColumnVector.NULL_VALUE; + outV.isNull[i] = true; + } + } + } else { + for (int i = 0; i != n; i++) { + if (!nullPos[i]) { + outputVector[i] = (checker.check(vector[i], start[i], length[i]) ? 1 : 0); + outV.isNull[i] = false; + } else { + outputVector[0] = LongColumnVector.NULL_VALUE; + outV.isNull[i] = true; + } + } + } + } + } + + private Checker borrowChecker() { + FilterStringColLikeStringScalar fil = new FilterStringColLikeStringScalar(); + return fil.createChecker(new String(pattern, StandardCharsets.UTF_8)); + } + + public int getColNum() { + return colNum; + } + + public void setColNum(int colNum) { + this.colNum = colNum; + } + + public byte[] getPattern() { + return pattern; + } + + public void setPattern(byte[] pattern) { + this.pattern = pattern; + } + + public void setOutputColumn(int outputColumn) { + this.outputColumn = outputColumn; + } + + @Override + public int getOutputColumn() { + return outputColumn; + } + + @Override + public String getOutputType() { + return "String_Family"; + } + + @Override + public Descriptor getDescriptor() { + return (new VectorExpressionDescriptor.Builder()) + .setMode( + VectorExpressionDescriptor.Mode.PROJECTION) + .setNumArguments(2) + .setArgumentTypes( + VectorExpressionDescriptor.ArgumentType.STRING_FAMILY, + VectorExpressionDescriptor.ArgumentType.STRING) + .setInputExpressionTypes( + VectorExpressionDescriptor.InputExpressionType.COLUMN, + VectorExpressionDescriptor.InputExpressionType.SCALAR).build(); + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/51609a0f/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java index 85d0363..7bcd36e 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLike.java @@ -25,6 +25,7 @@ import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions; import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColLikeStringScalar; +import org.apache.hadoop.hive.ql.exec.vector.expressions.SelectStringColLikeStringScalar; import org.apache.hadoop.io.BooleanWritable; import org.apache.hadoop.io.Text; @@ -37,7 +38,7 @@ import org.apache.hadoop.io.Text; extended = "Example:\n" + " > SELECT a.* FROM srcpart a WHERE a.hr _FUNC_ '%2' LIMIT 1;\n" + " 27 val_27 2008-04-08 12") -@VectorizedExpressions({FilterStringColLikeStringScalar.class}) +@VectorizedExpressions({FilterStringColLikeStringScalar.class, SelectStringColLikeStringScalar.class}) public class UDFLike extends UDF { private final Text lastLikePattern = new Text(); private Pattern p = null; http://git-wip-us.apache.org/repos/asf/hive/blob/51609a0f/ql/src/test/queries/clientpositive/vector_udf2.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_udf2.q b/ql/src/test/queries/clientpositive/vector_udf2.q new file mode 100644 index 0000000..e349d14 --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_udf2.q @@ -0,0 +1,29 @@ +SET hive.vectorized.execution.enabled=true; +set hive.fetch.task.conversion=none; + +drop table varchar_udf_2; + +create table varchar_udf_2 (c1 string, c2 string, c3 varchar(10), c4 varchar(20)) STORED AS ORC; +insert overwrite table varchar_udf_2 + select key, value, key, value from src where key = '238' limit 1; + +explain +select + c1 LIKE '%38%', + c2 LIKE 'val_%', + c3 LIKE '%38', + c1 LIKE '%3x8%', + c2 LIKE 'xval_%', + c3 LIKE '%x38' +from varchar_udf_2 limit 1; + +select + c1 LIKE '%38%', + c2 LIKE 'val_%', + c3 LIKE '%38', + c1 LIKE '%3x8%', + c2 LIKE 'xval_%', + c3 LIKE '%x38' +from varchar_udf_2 limit 1; + +drop table varchar_udf_2; http://git-wip-us.apache.org/repos/asf/hive/blob/51609a0f/ql/src/test/results/clientpositive/vector_udf2.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_udf2.q.out b/ql/src/test/results/clientpositive/vector_udf2.q.out new file mode 100644 index 0000000..42e7041 --- /dev/null +++ b/ql/src/test/results/clientpositive/vector_udf2.q.out @@ -0,0 +1,110 @@ +PREHOOK: query: drop table varchar_udf_2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table varchar_udf_2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table varchar_udf_2 (c1 string, c2 string, c3 varchar(10), c4 varchar(20)) STORED AS ORC +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@varchar_udf_2 +POSTHOOK: query: create table varchar_udf_2 (c1 string, c2 string, c3 varchar(10), c4 varchar(20)) STORED AS ORC +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchar_udf_2 +PREHOOK: query: insert overwrite table varchar_udf_2 + select key, value, key, value from src where key = '238' limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@varchar_udf_2 +POSTHOOK: query: insert overwrite table varchar_udf_2 + select key, value, key, value from src where key = '238' limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@varchar_udf_2 +POSTHOOK: Lineage: varchar_udf_2.c1 SIMPLE [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: varchar_udf_2.c2 SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: varchar_udf_2.c3 EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: varchar_udf_2.c4 EXPRESSION [(src)src.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: explain +select + c1 LIKE '%38%', + c2 LIKE 'val_%', + c3 LIKE '%38', + c1 LIKE '%3x8%', + c2 LIKE 'xval_%', + c3 LIKE '%x38' +from varchar_udf_2 limit 1 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select + c1 LIKE '%38%', + c2 LIKE 'val_%', + c3 LIKE '%38', + c1 LIKE '%3x8%', + c2 LIKE 'xval_%', + c3 LIKE '%x38' +from varchar_udf_2 limit 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: varchar_udf_2 + Statistics: Num rows: 1 Data size: 356 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: (c1 like '%38%') (type: boolean), (c2 like 'val_%') (type: boolean), (c3 like '%38') (type: boolean), (c1 like '%3x8%') (type: boolean), (c2 like 'xval_%') (type: boolean), (c3 like '%x38') (type: boolean) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 1 Data size: 356 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 356 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 356 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select + c1 LIKE '%38%', + c2 LIKE 'val_%', + c3 LIKE '%38', + c1 LIKE '%3x8%', + c2 LIKE 'xval_%', + c3 LIKE '%x38' +from varchar_udf_2 limit 1 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_udf_2 +#### A masked pattern was here #### +POSTHOOK: query: select + c1 LIKE '%38%', + c2 LIKE 'val_%', + c3 LIKE '%38', + c1 LIKE '%3x8%', + c2 LIKE 'xval_%', + c3 LIKE '%x38' +from varchar_udf_2 limit 1 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_udf_2 +#### A masked pattern was here #### +true true true false false false +PREHOOK: query: drop table varchar_udf_2 +PREHOOK: type: DROPTABLE +PREHOOK: Input: default@varchar_udf_2 +PREHOOK: Output: default@varchar_udf_2 +POSTHOOK: query: drop table varchar_udf_2 +POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@varchar_udf_2 +POSTHOOK: Output: default@varchar_udf_2
