Repository: hive
Updated Branches:
  refs/heads/branch-1 293e22e0e -> 78bedc8e2
  refs/heads/branch-2.0 54760abdc -> 69440a62a


HIVE-13957 : vectorized IN is inconsistent with non-vectorized (at least for decimal in (string)) (Sergey Shelukhin, reviewed by Matt McCline)

Conflicts:
        ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
        ql/src/test/results/clientpositive/spark/vector_between_in.q.out
        ql/src/test/results/clientpositive/tez/vector_between_in.q.out
        ql/src/test/results/clientpositive/vector_between_in.q.out


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/69440a62
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/69440a62
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/69440a62

Branch: refs/heads/branch-2.0
Commit: 69440a62af477e8c237030ca1ed0120c7dc7d787
Parents: 54760ab
Author: Sergey Shelukhin <ser...@apache.org>
Authored: Mon Jun 13 18:32:12 2016 -0700
Committer: Sergey Shelukhin <ser...@apache.org>
Committed: Mon Jun 13 18:48:14 2016 -0700

----------------------------------------------------------------------
 .../ql/exec/vector/VectorizationContext.java    |  30 ++++--
 .../hive/ql/udf/generic/GenericUDFUtils.java    |  52 +++++++--
 .../clientpositive/vector_string_decimal.q      |  21 ++++
 .../spark/vector_between_in.q.out               |   2 -
 .../clientpositive/tez/vector_between_in.q.out  |   2 -
 .../clientpositive/vector_between_in.q.out      |   2 -
 .../clientpositive/vector_string_decimal.q.out  | 106 +++++++++++++++++++
 .../hive/serde2/typeinfo/HiveDecimalUtils.java  |   4 +-
 8 files changed, 192 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
index 1eb960d..6601a87 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizationContext.java
@@ -494,8 +494,8 @@ public class VectorizationContext {
   * Given a udf and its children, return the common type to which the children's type should be
    * cast.
    */
-  private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf, List<ExprNodeDesc> children,
-      TypeInfo returnType) {
+  private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf,
+      List<ExprNodeDesc> children, TypeInfo returnType) throws HiveException {
     TypeInfo commonType;
     if (genericUdf instanceof GenericUDFBaseCompare) {
 
@@ -507,9 +507,20 @@ public class VectorizationContext {
         commonType = returnType;
       }
     } else if (genericUdf instanceof GenericUDFIn) {
-
-      // Cast to the type of the first child
-      return children.get(0).getTypeInfo();
+      TypeInfo colTi = children.get(0).getTypeInfo();
+      if (colTi.getCategory() != Category.PRIMITIVE) {
+        return colTi; // Handled later, only struct will be supported.
+      }
+      TypeInfo opTi = GenericUDFUtils.deriveInType(children);
+      if (opTi == null || opTi.getCategory() != Category.PRIMITIVE) {
+        throw new HiveException("Cannot vectorize IN() - common type is " + opTi);
+      }
+      if (((PrimitiveTypeInfo)colTi).getPrimitiveCategory() !=
+          ((PrimitiveTypeInfo)opTi).getPrimitiveCategory()) {
+        throw new HiveException("Cannot vectorize IN() - casting a column is not supported. "
+            + "Column type is " + colTi + " but the common type is " + opTi);
+      }
+      return colTi;
     } else {
       // The children type should be converted to return type
       commonType = returnType;
@@ -606,6 +617,7 @@ public class VectorizationContext {
     }
     PrimitiveTypeInfo ptinfo = (PrimitiveTypeInfo) inputTypeInfo;
     int precision = getPrecisionForType(ptinfo);
+    // TODO: precision and scale would be practically invalid for string conversion (38,38)
     int scale = HiveDecimalUtils.getScaleForType(ptinfo);
     return new DecimalTypeInfo(precision, scale);
   }
@@ -1496,8 +1508,8 @@ public class VectorizationContext {
   /**
   * Create a filter or boolean-valued expression for column IN ( <list-of-constants> )
    */
-  private VectorExpression getInExpression(List<ExprNodeDesc> childExpr, Mode mode, TypeInfo returnType)
-      throws HiveException {
+  private VectorExpression getInExpression(List<ExprNodeDesc> childExpr,
+      VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
     ExprNodeDesc colExpr = childExpr.get(0);
     List<ExprNodeDesc> inChildren = childExpr.subList(1, childExpr.size());
 
@@ -1505,7 +1517,7 @@ public class VectorizationContext {
     colType = VectorizationContext.mapTypeNameSynonyms(colType);
     TypeInfo colTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colType);
     Category category = colTypeInfo.getCategory();
-    if (category == Category.STRUCT){
+    if (category == Category.STRUCT) {
      return getStructInExpression(childExpr, colExpr, colTypeInfo, inChildren, mode, returnType);
     } else if (category != Category.PRIMITIVE) {
       return null;
@@ -1526,6 +1538,8 @@ public class VectorizationContext {
 
     // determine class
     Class<?> cl = null;
+    // TODO: the below assumes that all the arguments to IN are of the same type;
+    //       non-vectorized validates that explicitly during UDF init.
     if (isIntFamily(colType)) {
      cl = (mode == Mode.FILTER ? FilterLongColumnInList.class : LongColumnInList.class);
       long[] inVals = new long[childrenForInList.size()];

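For reference, the guard added above amounts to the standalone sketch below (illustrative only: the class and method names are made up, but every API it touches appears in this diff). The effect is that IN() is no longer vectorized when the column itself would have to be cast to the operands' common type — exactly the decimal-column-vs-string-literals case from HIVE-13957.

// Sketch only - not part of the patch. Restates the new IN() type guard.
import java.util.List;

import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

public class InTypeGuardSketch {
  static TypeInfo commonTypeForIn(List<ExprNodeDesc> children) throws HiveException {
    TypeInfo colTi = children.get(0).getTypeInfo();
    if (colTi.getCategory() != Category.PRIMITIVE) {
      return colTi; // struct IN() is handled by a separate code path
    }
    // Common type of the column and all IN() operands, as row mode derives it.
    TypeInfo opTi = GenericUDFUtils.deriveInType(children);
    if (opTi == null || opTi.getCategory() != Category.PRIMITIVE) {
      throw new HiveException("Cannot vectorize IN() - common type is " + opTi);
    }
    // The old code cast the IN() list to the column's type, while row mode
    // compares at the common type; the two could disagree. Refusing to cast
    // the column keeps vectorized and non-vectorized results consistent.
    if (((PrimitiveTypeInfo) colTi).getPrimitiveCategory()
        != ((PrimitiveTypeInfo) opTi).getPrimitiveCategory()) {
      throw new HiveException("Cannot vectorize IN() - casting a column is not supported");
    }
    return colTi;
  }
}
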
http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
index 3bbe783..2c4c0d0 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFUtils.java
@@ -23,14 +23,17 @@ import java.lang.reflect.Method;
 import java.lang.reflect.ParameterizedType;
 import java.lang.reflect.Type;
 import java.util.HashMap;
+import java.util.List;
 
 import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
 import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
 import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
 import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
 import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.IdentityConverter;
@@ -168,17 +171,7 @@ public final class GenericUDFUtils {
         return false;
       }
 
-      /**
-       * TODO: Hack fix until HIVE-5848 is addressed. non-exact type shouldn't be promoted
-       * to exact type, as FunctionRegistry.getCommonClass() might do. This corrects
-       * that.
-       */
-      if (commonTypeInfo instanceof DecimalTypeInfo) {
-        if ((!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo) oiTypeInfo)) ||
-            (!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo) rTypeInfo))) {
-          commonTypeInfo = TypeInfoFactory.doubleTypeInfo;
-        }
-      }
+      commonTypeInfo = updateCommonTypeForDecimal(commonTypeInfo, oiTypeInfo, rTypeInfo);
 
       returnObjectInspector = TypeInfoUtils
           .getStandardWritableObjectInspectorFromTypeInfo(commonTypeInfo);
@@ -239,6 +232,43 @@ public final class GenericUDFUtils {
 
   }
 
+  protected static TypeInfo updateCommonTypeForDecimal(
+      TypeInfo commonTypeInfo, TypeInfo ti, TypeInfo returnType) {
+    /**
+     * TODO: Hack fix until HIVE-5848 is addressed. non-exact type shouldn't be promoted
+     * to exact type, as FunctionRegistry.getCommonClass() might do. This corrects
+     * that.
+     */
+    if (commonTypeInfo instanceof DecimalTypeInfo) {
+      if ((!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo)ti)) ||
+          (!FunctionRegistry.isExactNumericType((PrimitiveTypeInfo)returnType))) {
+        return TypeInfoFactory.doubleTypeInfo;
+      }
+    }
+    return commonTypeInfo;
+  }
+
+  // Based on update() above.
+  public static TypeInfo deriveInType(List<ExprNodeDesc> children) {
+    TypeInfo returnType = null;
+    for (ExprNodeDesc node : children) {
+      TypeInfo ti = node.getTypeInfo();
+      if (ti.getCategory() == Category.PRIMITIVE
+        && ((PrimitiveTypeInfo)ti).getPrimitiveCategory() == PrimitiveCategory.VOID) {
+        continue;
+      }
+      if (returnType == null) {
+        returnType = ti;
+        continue;
+      }
+      if (returnType == ti) continue;
+      TypeInfo commonTypeInfo = FunctionRegistry.getCommonClass(returnType, ti);
+      if (commonTypeInfo == null) return null;
+      returnType = updateCommonTypeForDecimal(commonTypeInfo, ti, returnType);
+    }
+    return returnType;
+  }
+
   /**
    * Convert parameters for the method if needed.
    */

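A hedged usage sketch of the new deriveInType() helper for the case exercised by vector_string_decimal.q below: a decimal(18,0) column compared against string literals. The ExprNodeColumnDesc/ExprNodeConstantDesc constructor signatures used here are assumed to be the usual ones and are not part of this patch.

// Illustrative only - not from the patch. Shows what deriveInType() is fed for
// "id IN ('100000000', '200000000')" where id is decimal(18,0).
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class DeriveInTypeSketch {
  public static void main(String[] args) {
    List<ExprNodeDesc> children = new ArrayList<>();
    // children.get(0) is the column, the rest are the IN() list constants.
    children.add(new ExprNodeColumnDesc(
        TypeInfoFactory.getDecimalTypeInfo(18, 0), "id", "orc_decimal", false));
    children.add(new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "100000000"));
    children.add(new ExprNodeConstantDesc(TypeInfoFactory.stringTypeInfo, "200000000"));

    TypeInfo common = GenericUDFUtils.deriveInType(children);
    // 'common' is whatever FunctionRegistry.getCommonClass() picks for decimal
    // vs. string (demoted to double if a decimal comes back, per the HIVE-5848
    // workaround above); the patch only relies on it not being decimal(18,0),
    // so the VectorizationContext guard throws and this IN() stays in row mode.
    System.out.println("common IN() type: " + common);
  }
}
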
http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/queries/clientpositive/vector_string_decimal.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_string_decimal.q b/ql/src/test/queries/clientpositive/vector_string_decimal.q
new file mode 100644
index 0000000..e69cd77
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_string_decimal.q
@@ -0,0 +1,21 @@
+set hive.vectorized.execution.enabled=false;
+set hive.fetch.task.conversion=none;
+
+drop table orc_decimal;
+drop table staging;
+create table orc_decimal (id decimal(18,0)) stored as orc;
+
+create table staging (id decimal(18,0));
+
+insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0);
+
+insert overwrite table orc_decimal select id from staging;
+
+set hive.vectorized.execution.enabled=true;
+
+explain
+select * from orc_decimal where id in ('100000000', '200000000');
+select * from orc_decimal where id in ('100000000', '200000000');
+
+drop table orc_decimal;
+drop table staging;

http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
index f1ff784..71e13ab 100644
--- a/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/spark/vector_between_in.q.out
@@ -149,7 +149,6 @@ STAGE PLANS:
                         key expressions: _col0 (type: decimal(20,10))
                         sort order: +
                        Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE
-            Execution mode: vectorized
         Reducer 2 
             Execution mode: vectorized
             Reduce Operator Tree:
@@ -205,7 +204,6 @@ STAGE PLANS:
                           sort order: 
                          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                           value expressions: _col0 (type: bigint)
-            Execution mode: vectorized
         Reducer 2 
             Execution mode: vectorized
             Reduce Operator Tree:

http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
index 9466ab2..b8be37e 100644
--- a/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/tez/vector_between_in.q.out
@@ -152,7 +152,6 @@ STAGE PLANS:
                         key expressions: _col0 (type: decimal(20,10))
                         sort order: +
                        Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE
-            Execution mode: vectorized
         Reducer 2 
             Execution mode: vectorized
             Reduce Operator Tree:
@@ -209,7 +208,6 @@ STAGE PLANS:
                           sort order: 
                          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                           value expressions: _col0 (type: bigint)
-            Execution mode: vectorized
         Reducer 2 
             Execution mode: vectorized
             Reduce Operator Tree:

http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/vector_between_in.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_between_in.q.out b/ql/src/test/results/clientpositive/vector_between_in.q.out
index b80da1b..d14e0f2 100644
--- a/ql/src/test/results/clientpositive/vector_between_in.q.out
+++ b/ql/src/test/results/clientpositive/vector_between_in.q.out
@@ -130,7 +130,6 @@ STAGE PLANS:
                   key expressions: _col0 (type: decimal(20,10))
                   sort order: +
                  Statistics: Num rows: 6144 Data size: 1233808 Basic stats: COMPLETE Column stats: NONE
-      Execution mode: vectorized
       Reduce Operator Tree:
         Select Operator
           expressions: KEY.reducesinkkey0 (type: decimal(20,10))
@@ -179,7 +178,6 @@ STAGE PLANS:
                     sort order: 
                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE
                     value expressions: _col0 (type: bigint)
-      Execution mode: vectorized
       Reduce Operator Tree:
         Group By Operator
           aggregations: count(VALUE._col0)

http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/ql/src/test/results/clientpositive/vector_string_decimal.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_string_decimal.q.out b/ql/src/test/results/clientpositive/vector_string_decimal.q.out
new file mode 100644
index 0000000..e0a3563
--- /dev/null
+++ b/ql/src/test/results/clientpositive/vector_string_decimal.q.out
@@ -0,0 +1,106 @@
+PREHOOK: query: drop table orc_decimal
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table orc_decimal
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: drop table staging
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table staging
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create table orc_decimal (id decimal(18,0)) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@orc_decimal
+POSTHOOK: query: create table orc_decimal (id decimal(18,0)) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@orc_decimal
+PREHOOK: query: create table staging (id decimal(18,0))
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@staging
+POSTHOOK: query: create table staging (id decimal(18,0))
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@staging
+PREHOOK: query: insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@staging
+POSTHOOK: query: insert into staging values (34324.0), (100000000.0), (200000000.0), (300000000.0)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@staging
+POSTHOOK: Lineage: staging.id EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert overwrite table orc_decimal select id from staging
+PREHOOK: type: QUERY
+PREHOOK: Input: default@staging
+PREHOOK: Output: default@orc_decimal
+POSTHOOK: query: insert overwrite table orc_decimal select id from staging
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@staging
+POSTHOOK: Output: default@orc_decimal
+POSTHOOK: Lineage: orc_decimal.id SIMPLE [(staging)staging.FieldSchema(name:id, type:decimal(18,0), comment:null), ]
+PREHOOK: query: explain
+select * from orc_decimal where id in ('100000000', '200000000')
+PREHOOK: type: QUERY
+POSTHOOK: query: explain
+select * from orc_decimal where id in ('100000000', '200000000')
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: orc_decimal
+            Statistics: Num rows: 4 Data size: 448 Basic stats: COMPLETE Column stats: NONE
+            Filter Operator
+              predicate: (id) IN ('100000000', '200000000') (type: boolean)
+              Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE
+              Select Operator
+                expressions: id (type: decimal(18,0))
+                outputColumnNames: _col0
+                Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 224 Basic stats: COMPLETE Column stats: NONE
+                  table:
+                      input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select * from orc_decimal where id in ('100000000', '200000000')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@orc_decimal
+#### A masked pattern was here ####
+POSTHOOK: query: select * from orc_decimal where id in ('100000000', '200000000')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@orc_decimal
+#### A masked pattern was here ####
+100000000
+200000000
+PREHOOK: query: drop table orc_decimal
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@orc_decimal
+PREHOOK: Output: default@orc_decimal
+POSTHOOK: query: drop table orc_decimal
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@orc_decimal
+POSTHOOK: Output: default@orc_decimal
+PREHOOK: query: drop table staging
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@staging
+PREHOOK: Output: default@staging
+POSTHOOK: query: drop table staging
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@staging
+POSTHOOK: Output: default@staging

http://git-wip-us.apache.org/repos/asf/hive/blob/69440a62/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
----------------------------------------------------------------------
diff --git a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
index cdd20bb..5caaf6b 100644
--- a/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
+++ b/serde/src/java/org/apache/hadoop/hive/serde2/typeinfo/HiveDecimalUtils.java
@@ -77,7 +77,7 @@ public class HiveDecimalUtils {
     case VOID:
       return 1;
     default:
-      return HiveDecimal.MAX_PRECISION;
+      return HiveDecimal.SYSTEM_DEFAULT_PRECISION;
     }
   }
 
@@ -100,7 +100,7 @@ public class HiveDecimalUtils {
     case VOID:
       return 0;
     default:
-      return HiveDecimal.MAX_SCALE;
+      return HiveDecimal.SYSTEM_DEFAULT_SCALE;
     }
   }
 
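Why the default cases above moved off MAX_PRECISION/MAX_SCALE: for a non-numeric type such as string, decimal(38,38) leaves no integer digits (the "(38,38)" flagged in the VectorizationContext TODO), whereas the system default keeps a usable decimal shape. A small illustrative check, assuming getPrecisionForType is public alongside getScaleForType and that HiveDecimal's system defaults are 38/18:

// Illustrative check of the defaults after this change; not part of the patch.
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class DecimalDefaultsSketch {
  public static void main(String[] args) {
    PrimitiveTypeInfo stringTi = TypeInfoFactory.stringTypeInfo;
    int precision = HiveDecimalUtils.getPrecisionForType(stringTi); // now SYSTEM_DEFAULT_PRECISION
    int scale = HiveDecimalUtils.getScaleForType(stringTi);         // now SYSTEM_DEFAULT_SCALE
    // Before the patch this printed decimal(38,38); now it prints the system
    // default (expected decimal(38,18)), which at least leaves integer digits.
    System.out.println("decimal(" + precision + "," + scale + ")");
  }
}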
