Repository: hive
Updated Branches:
  refs/heads/branch-2 9c23c3b81 -> 95a66afb4
HIVE-18422 : Vectorized input format should not be used when input format is excluded and row.serde is enabled (Vihang Karajgaonkar, reviewed by Matt McCline) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/95a66afb Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/95a66afb Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/95a66afb Branch: refs/heads/branch-2 Commit: 95a66afb4211a925f0a290416fcdc4c267969ac6 Parents: 9c23c3b Author: Vihang Karajgaonkar <[email protected]> Authored: Wed Jan 10 14:18:12 2018 -0800 Committer: Vihang Karajgaonkar <[email protected]> Committed: Tue Jan 23 18:06:40 2018 -0800 ---------------------------------------------------------------------- .../hive/ql/optimizer/physical/Vectorizer.java | 21 ++- .../vectorization_input_format_excludes.q | 23 +++ .../vectorization_input_format_excludes.q.out | 164 +++++++++++++++++++ .../vectorization_input_format_excludes.q.out | 152 +++++++++++++++++ 4 files changed, 358 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/95a66afb/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 11e623a..09e9993 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -434,7 +434,9 @@ public class Vectorizer implements PhysicalPlanResolver { List<Integer> dataColumnNums; int partitionColumnCount; - boolean useVectorizedInputFileFormat; + //not to be confused with useVectorizedInputFileFormat at Vectorizer level + //which represents the value of configuration 
hive.vectorized.use.vectorized.input.format + private boolean useVectorizedInputFileFormat; boolean groupByVectorOutput; boolean allNative; @@ -856,6 +858,20 @@ public class Vectorizer implements PhysicalPlanResolver { return false; } + private boolean shouldUseVectorizedInputFormat(Set<String> inputFileFormatClassNames) { + if (inputFileFormatClassNames == null || inputFileFormatClassNames.isEmpty() + || !useVectorizedInputFileFormat) { + return useVectorizedInputFileFormat; + } + //Global config of vectorized input format is enabled; check if these inputformats are excluded + for (String inputFormat : inputFileFormatClassNames) { + if(isInputFormatExcluded(inputFormat, vectorizedInputFormatExcludes)) { + return false; + } + } + return true; + } + private boolean isInputFormatExcluded(String inputFileFormatClassName, Collection<Class<?>> excludes) { Class<?> ifClass = null; try { @@ -1029,7 +1045,8 @@ public class Vectorizer implements PhysicalPlanResolver { vectorTaskColumnInfo.setAllTypeInfos(allTypeInfoList); vectorTaskColumnInfo.setDataColumnNums(dataColumnNums); vectorTaskColumnInfo.setPartitionColumnCount(partitionColumnCount); - vectorTaskColumnInfo.setUseVectorizedInputFileFormat(useVectorizedInputFileFormat); + vectorTaskColumnInfo.setUseVectorizedInputFileFormat( + shouldUseVectorizedInputFormat(inputFileFormatClassNameSet)); // Always set these so EXPLAIN can see. 
mapWork.setVectorizationInputFileFormatClassNameSet(inputFileFormatClassNameSet); http://git-wip-us.apache.org/repos/asf/hive/blob/95a66afb/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q b/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q index 78d8249..0db7748 100644 --- a/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q +++ b/ql/src/test/queries/clientpositive/vectorization_input_format_excludes.q @@ -167,3 +167,26 @@ select ctinyint, stddev_pop(cdouble) from alltypes_orc group by ctinyint; + +-- test when input format is excluded row serde is used for vectorization +set hive.vectorized.input.format.excludes=org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat,org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; +set hive.vectorized.use.vectorized.input.format=true; +set hive.vectorized.use.row.serde.deserialize=true; + +create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc; + +insert into orcTbl values (54, 9), (-104, 25), (-112, 24); + +explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10; + +select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10; + +create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet; + +insert into parquetTbl values (54, 9), (-104, 25), (-112, 24); + +explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10; + +SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10; http://git-wip-us.apache.org/repos/asf/hive/blob/95a66afb/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out 
b/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out index bf6c9fc..8bc7d93 100644 --- a/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out +++ b/ql/src/test/results/clientpositive/spark/vectorization_input_format_excludes.q.out @@ -1374,3 +1374,167 @@ POSTHOOK: Input: default@alltypes_orc 8 1070764888 -15778 1034 8.0 9562.355155774725 9 626923679 -13629 25 9.0 10157.217948808622 NULL 1073418988 -16379 3115 NULL 305051.4870777435 +PREHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orcTbl +POSTHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orcTbl +PREHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@orctbl +POSTHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@orctbl +POSTHOOK: Lineage: orctbl.t1 SCRIPT [] +POSTHOOK: Lineage: orctbl.t2 SCRIPT [] +PREHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: orctbl + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) 
(type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@orctbl +#### A masked pattern was here #### +POSTHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orctbl +#### A masked pattern was here #### +54 9 63 +PREHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquetTbl +POSTHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquetTbl +PREHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@parquettbl +POSTHOOK: query: insert into parquetTbl 
values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@parquettbl +POSTHOOK: Lineage: parquettbl.t1 SCRIPT [] +POSTHOOK: Lineage: parquettbl.t2 SCRIPT [] +PREHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: parquettbl + Statistics: Num rows: 3 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + inputFormatFeatureSupport: [] + featureSupportInUse: [] + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator 
+ limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquettbl +#### A masked pattern was here #### +POSTHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquettbl +#### A masked pattern was here #### +54 9 63 http://git-wip-us.apache.org/repos/asf/hive/blob/95a66afb/ql/src/test/results/clientpositive/vectorization_input_format_excludes.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vectorization_input_format_excludes.q.out b/ql/src/test/results/clientpositive/vectorization_input_format_excludes.q.out index f049047..0409ced 100644 --- a/ql/src/test/results/clientpositive/vectorization_input_format_excludes.q.out +++ b/ql/src/test/results/clientpositive/vectorization_input_format_excludes.q.out @@ -1334,3 +1334,155 @@ POSTHOOK: Input: default@alltypes_orc 8 1070764888 -15778 1034 8.0 9562.355155774725 9 626923679 -13629 25 9.0 10157.217948808622 NULL 1073418988 -16379 3115 NULL 305051.4870777435 +PREHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@orcTbl +POSTHOOK: query: create table orcTbl (t1 tinyint, t2 tinyint) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@orcTbl +PREHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Output: default@orctbl +POSTHOOK: query: insert into orcTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@orctbl +POSTHOOK: Lineage: orctbl.t1 EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: orctbl.t2 EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: orctbl + Statistics: Num rows: 3 Data size: 24 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select t1, t2, (t1+t2) from orcTbl where (t1+t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@orctbl +#### A masked pattern was here #### +POSTHOOK: query: select t1, t2, 
(t1+t2) from orcTbl where (t1+t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@orctbl +#### A masked pattern was here #### +54 9 63 +PREHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@parquetTbl +POSTHOOK: query: create table parquetTbl (t1 tinyint, t2 tinyint) +stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@parquetTbl +PREHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +PREHOOK: type: QUERY +PREHOOK: Output: default@parquettbl +POSTHOOK: query: insert into parquetTbl values (54, 9), (-104, 25), (-112, 24) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@parquettbl +POSTHOOK: Lineage: parquettbl.t1 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: parquettbl.t2 EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +PREHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: parquettbl + Statistics: Num rows: 3 Data size: 6 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (UDFToInteger((t1 + t2)) > 10) (type: boolean) + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: t1 (type: tinyint), t2 (type: tinyint), (t1 + t2) (type: tinyint) + outputColumnNames: _col0, 
_col1, _col2 + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 2 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.row.serde.deserialize IS true + groupByVectorOutput: true + inputFileFormats: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@parquettbl +#### A masked pattern was here #### +POSTHOOK: query: SELECT t1, t2, (t1 + t2) FROM parquetTbl WHERE (t1 + t2) > 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@parquettbl +#### A masked pattern was here #### +54 9 63
