Repository: hive Updated Branches: refs/heads/master 5201f188b -> be05e32e5
HIVE-12021: HivePreFilteringRule may introduce wrong common operands (Jesus Camacho Rodriguez, reviewed by Laljo John Pullokkaran) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/be05e32e Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/be05e32e Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/be05e32e Branch: refs/heads/master Commit: be05e32e58f54c0185551cb61d9769d2ceb5bbdd Parents: 5201f18 Author: Jesus Camacho Rodriguez <[email protected]> Authored: Sun Oct 4 11:40:30 2015 +0100 Committer: Jesus Camacho Rodriguez <[email protected]> Committed: Fri Oct 9 09:04:49 2015 +0100 ---------------------------------------------------------------------- .../calcite/rules/HivePreFilteringRule.java | 58 ++++++++------ .../clientpositive/filter_cond_pushdown.q | 5 ++ .../clientpositive/filter_cond_pushdown.q.out | 80 ++++++++++++++++++++ 3 files changed, 121 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/be05e32e/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java index 349c7f8..5824127 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HivePreFilteringRule.java @@ -47,6 +47,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFilter; import com.google.common.collect.ImmutableList; import com.google.common.collect.LinkedHashMultimap; import com.google.common.collect.Multimap; +import com.google.common.collect.Sets; public class HivePreFilteringRule extends RelOptRule { @@ -178,54 +179,67 @@ public class HivePreFilteringRule extends RelOptRule { assert condition.getKind() == SqlKind.OR; Multimap<String,RexNode> reductionCondition = LinkedHashMultimap.create(); + // Data structure to control whether a certain reference is present in every operand + Set<String> refsInAllOperands = null; + // 1. We extract the information necessary to create the predicate for the new // filter; currently we support comparison functions, in and between ImmutableList<RexNode> operands = RexUtil.flattenOr(((RexCall) condition).getOperands()); - for (RexNode operand: operands) { + for (int i = 0; i < operands.size(); i++) { + final RexNode operand = operands.get(i); + final RexNode operandCNF = RexUtil.toCnf(rexBuilder, operand); final List<RexNode> conjunctions = RelOptUtil.conjunctions(operandCNF); - boolean addedToReductionCondition = false; // Flag to control whether we have added a new factor - // to the reduction predicate + + Set<String> refsInCurrentOperand = Sets.newHashSet(); for (RexNode conjunction: conjunctions) { + // We do not know what it is, we bail out for safety if (!(conjunction instanceof RexCall)) { - continue; + return new ArrayList<>(); } RexCall conjCall = (RexCall) conjunction; + RexNode ref = null; if(COMPARISON.contains(conjCall.getOperator().getKind())) { if (conjCall.operands.get(0) instanceof RexInputRef && conjCall.operands.get(1) instanceof RexLiteral) { - reductionCondition.put(conjCall.operands.get(0).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(0); } else if (conjCall.operands.get(1) instanceof RexInputRef && conjCall.operands.get(0) instanceof RexLiteral) { - reductionCondition.put(conjCall.operands.get(1).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(1); + } else { + // We do not know what it is, we bail out for safety + return new ArrayList<>(); } } else if(conjCall.getOperator().getKind().equals(SqlKind.IN)) { - reductionCondition.put(conjCall.operands.get(0).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(0); } else if(conjCall.getOperator().getKind().equals(SqlKind.BETWEEN)) { - reductionCondition.put(conjCall.operands.get(1).toString(), - conjCall); - addedToReductionCondition = true; + ref = conjCall.operands.get(1); + } else { + // We do not know what it is, we bail out for safety + return new ArrayList<>(); } + + String stringRef = ref.toString(); + reductionCondition.put(stringRef, conjCall); + refsInCurrentOperand.add(stringRef); } - // If we did not add any factor, we can bail out - if (!addedToReductionCondition) { + // Updates the references that are present in every operand up till now + if (i == 0) { + refsInAllOperands = refsInCurrentOperand; + } else { + refsInAllOperands = Sets.intersection(refsInAllOperands, refsInCurrentOperand); + } + // If we did not add any factor or there are no common factors, we can bail out + if (refsInAllOperands.isEmpty()) { return new ArrayList<>(); } } // 2. We gather the common factors and return them List<RexNode> commonOperands = new ArrayList<>(); - for (Entry<String,Collection<RexNode>> pair : reductionCondition.asMap().entrySet()) { - if (pair.getValue().size() == operands.size()) { - commonOperands.add(RexUtil.composeDisjunction(rexBuilder, pair.getValue(), false)); - } + for (String ref : refsInAllOperands) { + commonOperands.add(RexUtil.composeDisjunction(rexBuilder, reductionCondition.get(ref), false)); } return commonOperands; } http://git-wip-us.apache.org/repos/asf/hive/blob/be05e32e/ql/src/test/queries/clientpositive/filter_cond_pushdown.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/filter_cond_pushdown.q b/ql/src/test/queries/clientpositive/filter_cond_pushdown.q index 5e23b71..2425706 100644 --- a/ql/src/test/queries/clientpositive/filter_cond_pushdown.q +++ b/ql/src/test/queries/clientpositive/filter_cond_pushdown.q @@ -17,3 +17,8 @@ JOIN ( JOIN (SELECT * FROM cbo_t3 t3 WHERE c_int=1) t3 ON t2.key=t3.c_int WHERE ((t2.key=t3.key) AND (t2.c_float + t3.c_float > 2)) OR ((t2.key=t3.key) AND (t2.c_int + t3.c_int > 2))) t4 ON t1.key=t4.key; + +EXPLAIN +SELECT f.key, f.value, m.value +FROM src f JOIN src m ON(f.key = m.key AND m.value is not null AND m.value !='') +WHERE (f.value IN ('2008-04-08','2008-04-10') AND f.value IN ('2008-04-08','2008-04-09') AND m.value='2008-04-10') OR (m.value='2008-04-08'); http://git-wip-us.apache.org/repos/asf/hive/blob/be05e32e/ql/src/test/results/clientpositive/filter_cond_pushdown.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/filter_cond_pushdown.q.out b/ql/src/test/results/clientpositive/filter_cond_pushdown.q.out index af42d5c..99eb3f7 100644 --- a/ql/src/test/results/clientpositive/filter_cond_pushdown.q.out +++ b/ql/src/test/results/clientpositive/filter_cond_pushdown.q.out @@ -380,3 +380,83 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: EXPLAIN +SELECT f.key, f.value, m.value +FROM src f JOIN src m ON(f.key = m.key AND m.value is not null AND m.value !='') +WHERE (f.value IN ('2008-04-08','2008-04-10') AND f.value IN ('2008-04-08','2008-04-09') AND m.value='2008-04-10') OR (m.value='2008-04-08') +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN +SELECT f.key, f.value, m.value +FROM src f JOIN src m ON(f.key = m.key AND m.value is not null AND m.value !='') +WHERE (f.value IN ('2008-04-08','2008-04-10') AND f.value IN ('2008-04-08','2008-04-09') AND m.value='2008-04-10') OR (m.value='2008-04-08') +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: f + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key is not null (type: boolean) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + TableScan + alias: f + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((((value = '2008-04-10') or (value = '2008-04-08')) and value is not null) and (value <> '')) and key is not null) (type: boolean) + Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 125 Data size: 1328 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string) + 1 _col0 (type: string) + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 275 Data size: 2921 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((_col1) IN ('2008-04-08', '2008-04-10') and (_col1) IN ('2008-04-08', '2008-04-09') and (_col3 = '2008-04-10')) or (_col3 = '2008-04-08')) (type: boolean) + Statistics: Num rows: 171 Data size: 1816 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: string), _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 171 Data size: 1816 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 171 Data size: 1816 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +
