This is an automated email from the ASF dual-hosted git repository. vgarg pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new ece9a54 HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez) ece9a54 is described below commit ece9a54b78e41e2acfe9cd5df45fcddf3a4b05ab Author: Vineet Garg <vg...@apache.org> AuthorDate: Fri Aug 23 13:25:16 2019 -0700 HIVE-22074: Slow compilation due to IN to OR transformation (Vineet Garg, reviewed by Jesus Camacho Rodriguez) --- .../java/org/apache/hadoop/hive/conf/HiveConf.java | 2 + .../calcite/translator/RexNodeConverter.java | 25 +++- .../hadoop/hive/ql/parse/CalcitePlanner.java | 11 +- .../hadoop/hive/ql/parse/TypeCheckProcFactory.java | 26 ++-- .../queries/clientpositive/in_typecheck_char.q | 12 ++ .../results/clientpositive/in_typecheck_char.q.out | 146 +++++++++++++++++++++ 6 files changed, 205 insertions(+), 17 deletions(-) diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 9fd7dca..69408f6 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2270,6 +2270,8 @@ public class HiveConf extends Configuration { "Whether to transform OR clauses in Filter operators into IN clauses"), HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 2, "Minimum number of OR clauses needed to transform into IN clauses"), + HIVEOPT_TRANSFORM_IN_MAXNODES("hive.optimize.transform.in.maxnodes", 16, + "Maximum number of IN expressions beyond which IN will not be transformed into OR clause"), HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true, "Whether to transform count distinct into two stages"), HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java index 100ee0b..63882fd 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/RexNodeConverter.java @@ -142,6 +142,7 @@ public class RexNodeConverter { private final RowResolver outerRR; private final ImmutableMap<String, Integer> outerNameToPosMap; private int correlatedId; + private final int maxNodesForInToOrTransformation; //Constructor used by HiveRexExecutorImpl public RexNodeConverter(RelOptCluster cluster) { @@ -151,13 +152,15 @@ public class RexNodeConverter { //subqueries will need outer query's row resolver public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType, ImmutableMap<String, Integer> outerNameToPosMap, - ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR, int offset, boolean flattenExpr, int correlatedId) { + ImmutableMap<String, Integer> nameToPosMap, RowResolver hiveRR, RowResolver outerRR, + int maxNodesForInToOrTransformation, int offset, boolean flattenExpr, int correlatedId) { this.cluster = cluster; this.inputCtxs = ImmutableList.of(new InputCtx(inpDataType, nameToPosMap, hiveRR, offset)); this.flattenExpr = flattenExpr; this.outerRR = outerRR; this.outerNameToPosMap = outerNameToPosMap; this.correlatedId = correlatedId; + this.maxNodesForInToOrTransformation = maxNodesForInToOrTransformation; } public RexNodeConverter(RelOptCluster cluster, RelDataType inpDataType, @@ -167,6 +170,7 @@ public class RexNodeConverter { this.flattenExpr = flattenExpr; this.outerRR = null; this.outerNameToPosMap = null; + this.maxNodesForInToOrTransformation = 0; } public RexNodeConverter(RelOptCluster cluster, List<InputCtx> inpCtxLst, boolean flattenExpr) { @@ -175,6 +179,7 @@ public class RexNodeConverter { this.flattenExpr = flattenExpr; this.outerRR = null; this.outerNameToPosMap = null; + this.maxNodesForInToOrTransformation = 0; } public RexNode convert(ExprNodeDesc expr) throws SemanticException { @@ -423,12 +428,22 @@ public class RexNodeConverter { // from IN [A,B] => EQUALS [A,B] // except complex types calciteOp = SqlStdOperatorTable.EQUALS; - } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)) { + } else if (RexUtil.isReferenceOrAccess(childRexNodeLst.get(0), true)){ // if it is more than an single item in an IN clause, // transform from IN [A,B,C] => OR [EQUALS [A,B], EQUALS [A,C]] // except complex types - childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst); - calciteOp = SqlStdOperatorTable.OR; + // Rewrite to OR is done only if number of operands are less than + // the threshold configured + boolean rewriteToOr = true; + if(this.maxNodesForInToOrTransformation != 0) { + if(childRexNodeLst.size() > this.maxNodesForInToOrTransformation) { + rewriteToOr = false; + } + } + if(rewriteToOr) { + childRexNodeLst = rewriteInClauseChildren(calciteOp, childRexNodeLst); + calciteOp = SqlStdOperatorTable.OR; + } } } else if (calciteOp.getKind() == SqlKind.COALESCE && childRexNodeLst.size() > 1) { @@ -576,7 +591,7 @@ public class RexNodeConverter { RexNode node = nodes.get(i); if ((i % 2 == 1 || i == nodes.size() - 1) && !node.getType().getSqlTypeName().equals(retType.getSqlTypeName())) { - newNodes.add(cluster.getRexBuilder().makeCast(retType, node)); + newNodes.add(cluster.getRexBuilder().makeCast(retType, node)); } else { newNodes.add(node); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 212d27a..8d9718f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -3195,6 +3195,7 @@ public class CalcitePlanner extends SemanticAnalyzer { .get(srcRel); RexNode convertedFilterExpr = new RexNodeConverter(cluster, srcRel.getRowType(), outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), outerRR, + HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES), 0, true, subqueryId).convert(filterCondn); RexNode factoredFilterExpr = RexUtil .pullFactors(cluster.getRexBuilder(), convertedFilterExpr); @@ -3428,8 +3429,9 @@ public class CalcitePlanner extends SemanticAnalyzer { ImmutableMap<String, Integer> hiveColNameCalcitePosMap = this.relToHiveColNameCalcitePosMap .get(srcRel); RexNode convertedFilterLHS = new RexNodeConverter(cluster, srcRel.getRowType(), - outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), - outerRR, 0, true, subqueryId).convert(subQueryExpr); + outerNameToPosMap, hiveColNameCalcitePosMap, relToHiveRR.get(srcRel), + outerRR, HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES), + 0, true, subqueryId).convert(subQueryExpr); RelNode filterRel = new HiveFilter(cluster, cluster.traitSetOf(HiveRelNode.CONVENTION), srcRel, convertedFilterLHS); @@ -4695,8 +4697,9 @@ public class CalcitePlanner extends SemanticAnalyzer { List<RexNode> calciteColLst = new ArrayList<RexNode>(); RexNodeConverter rexNodeConv = new RexNodeConverter(cluster, srcRel.getRowType(), - outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel), - outerRR, 0, false, subqueryId); + outerNameToPosMap, buildHiveColNameToInputPosMap(col_list, inputRR), relToHiveRR.get(srcRel), + outerRR, HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES), + 0, false, subqueryId); for (ExprNodeDesc colExpr : col_list) { calciteColLst.add(rexNodeConv.convert(colExpr)); } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java index 0c81986..a4c1b9a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TypeCheckProcFactory.java @@ -1220,16 +1220,26 @@ public class TypeCheckProcFactory { } outputOpList.add(nullConst); } + if (!ctx.isCBOExecuted()) { - ArrayList<ExprNodeDesc> orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children); - if (orOperands != null) { - if (orOperands.size() == 1) { - orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false)); + + HiveConf conf; + try { + conf = Hive.get().getConf(); + } catch (HiveException e) { + throw new SemanticException(e); + } + if( children.size() <= HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVEOPT_TRANSFORM_IN_MAXNODES)) { + ArrayList<ExprNodeDesc> orOperands = TypeCheckProcFactoryUtils.rewriteInToOR(children); + if (orOperands != null) { + if (orOperands.size() == 1) { + orOperands.add(new ExprNodeConstantDesc(TypeInfoFactory.booleanTypeInfo, false)); + } + funcText = "or"; + genericUDF = new GenericUDFOPOr(); + children.clear(); + children.addAll(orOperands); } - funcText = "or"; - genericUDF = new GenericUDFOPOr(); - children.clear(); - children.addAll(orOperands); } } } diff --git a/ql/src/test/queries/clientpositive/in_typecheck_char.q b/ql/src/test/queries/clientpositive/in_typecheck_char.q index 3955c4b..a144d51 100644 --- a/ql/src/test/queries/clientpositive/in_typecheck_char.q +++ b/ql/src/test/queries/clientpositive/in_typecheck_char.q @@ -22,3 +22,15 @@ select 'expected 2',count(*) from ax where (s,t) in (('a','a'),(null, 'bb')); -- this is right now broken; HIVE-20779 should fix it explain select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null; select 'expected 1',count(*) from ax where ((s,t) in (('a','a'),(null, 'bb'))) is null; + +set hive.optimize.point.lookup=false; +explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); +explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); + +set hive.optimize.transform.in.maxnodes=20; +explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); +explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z'); diff --git a/ql/src/test/results/clientpositive/in_typecheck_char.q.out b/ql/src/test/results/clientpositive/in_typecheck_char.q.out index cb9e777..4d59ddf 100644 --- a/ql/src/test/results/clientpositive/in_typecheck_char.q.out +++ b/ql/src/test/results/clientpositive/in_typecheck_char.q.out @@ -262,3 +262,149 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@ax #### A masked pattern was here #### expected 1 1 +PREHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[IN($1, _UTF-16LE'a ', _UTF-16LE'bb ', _UTF-16LE'aa ', _UTF-16LE'bbb ', _UTF-16LE'ab ', _UTF-16LE'ba ', _UTF-16LE'aaa ', _UTF-16LE'bbb ', _UTF-16LE'abc ', _UTF-16LE'bc ', _UTF-16LE'ac ', _UTF-16LE'bca ', _UTF-16LE'cab ', _UTF-16LE'cb ', _UTF-16LE'ca ', _UTF-16LE'cbc ', _UTF-16LE'z ')]) + HiveTableScan(table=[[default, ax]], table:alias=[ax]) + +PREHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: ax + filterExpr: (t) IN ('a ', 'bb ', 'aa ', 'bbb ', 'ab ', 'ba ', 'aaa ', 'bbb ', 'abc ', 'bc ', 'ac ', 'bca ', 'cab ', 'cb ', 'ca ', 'cbc ', 'z ') (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (t) IN ('a ', 'bb ', 'aa ', 'bbb ', 'ab ', 'ba ', 'aaa ', 'bbb ', 'abc ', 'bc ', 'ac ', 'bca ', 'cab ', 'cb ', 'ca ', 'cbc ', 'z ') (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain cbo select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count()]) + HiveFilter(condition=[OR(=($1, _UTF-16LE'a '), =($1, _UTF-16LE'bb '), =($1, _UTF-16LE'aa '), =($1, _UTF-16LE'bbb '), =($1, _UTF-16LE'ab '), =($1, _UTF-16LE'ba '), =($1, _UTF-16LE'aaa '), =($1, _UTF-16LE'abc '), =($1, _UTF-16LE'bc '), =($1, _UTF-16LE'ac '), =($1, _UTF-16LE'bca '), =($1, _UTF-16LE'cab '), =($1, _UTF-16LE'cb '), =($1, _UTF-16LE'ca '), =($1, _UTF-16LE'cbc '), =($1, _UTF-16LE [...] + HiveTableScan(table=[[default, ax]], table:alias=[ax]) + +PREHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +PREHOOK: type: QUERY +PREHOOK: Input: default@ax +#### A masked pattern was here #### +POSTHOOK: query: explain select count(*) from ax where t in +('a','bb','aa','bbb','ab','ba','aaa','bbb','abc','bc','ac','bca','cab','cb','ca','cbc','z') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@ax +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: ax + filterExpr: ((t = 'a ') or (t = 'bb ') or (t = 'aa ') or (t = 'bbb ') or (t = 'ab ') or (t = 'ba ') or (t = 'aaa ') or (t = 'abc ') or (t = 'bc ') or (t = 'ac ') or (t = 'bca ') or (t = 'cab ') or (t = 'cb ') or (t = 'ca ') or (t = 'cbc ') or (t = 'z ')) (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((t = 'a ') or (t = 'bb ') or (t = 'aa ') or (t = 'bbb ') or (t = 'ab ') or (t = 'ba ') or (t = 'aaa ') or (t = 'abc ') or (t = 'bc ') or (t = 'ac ') or (t = 'bca ') or (t = 'cab ') or (t = 'cb ') or (t = 'ca ') or (t = 'cbc ') or (t = 'z ')) (type: boolean) + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + Statistics: Num rows: 3 Data size: 255 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: count() + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Execution mode: vectorized + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +