IMPALA-6392: Consistent explain format for parquet predicate statistics

In EXPLAIN_LEVEL=2+, change the explain format for parquet predicate
statistics to output each tuple descriptor per line. This change is to
make it consistent with the output of other predicates.
Before: parquet statistics predicates: c_custkey < 10, o_orderkey < 5, l_linenumber < 3 After: parquet statistics predicates: c_custkey < 10 parquet statistics predicates on o: o_orderkey < 5 parquet statistics predicates on o_lineitems: l_linenumber < 3 Testing: - Ran existing planner tests and updated the ones that are affected by this change. - Ran end-to-end tests in query_test Change-Id: Ia3d55ab6a1ae551867a9f68b3622844102cc854e Reviewed-on: http://gerrit.cloudera.org:8080/9223 Tested-by: Impala Public Jenkins Reviewed-by: Alex Behm <alex.b...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/3d7d8209 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/3d7d8209 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/3d7d8209 Branch: refs/heads/2.x Commit: 3d7d8209edf77216b8d990ea5b0eb6a16d06fc07 Parents: 1a632e7 Author: Fredy Wijaya <fwij...@cloudera.com> Authored: Tue Feb 6 01:05:14 2018 -0600 Committer: Impala Public Jenkins <impala-public-jenk...@gerrit.cloudera.org> Committed: Tue Feb 13 21:10:13 2018 +0000 ---------------------------------------------------------------------- .../org/apache/impala/planner/HdfsScanNode.java | 46 +++++++++++++++----- .../queries/PlannerTest/constant-folding.test | 3 +- .../queries/PlannerTest/mt-dop-validation.test | 12 +++-- .../queries/PlannerTest/parquet-filtering.test | 8 ++-- 4 files changed, 51 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/3d7d8209/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java index 45ad8d6..7735f98 100644 --- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java +++ 
b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java @@ -225,9 +225,10 @@ public class HdfsScanNode extends ScanNode { // data when scanning Parquet files. private final List<Expr> minMaxConjuncts_ = Lists.newArrayList(); - // List of PlanNode conjuncts that have been transformed into conjuncts in - // 'minMaxConjuncts_'. - private final List<Expr> minMaxOriginalConjuncts_ = Lists.newArrayList(); + // Map from TupleDescriptor to list of PlanNode conjuncts that have been transformed + // into conjuncts in 'minMaxConjuncts_'. + private final Map<TupleDescriptor, List<Expr>> minMaxOriginalConjuncts_ = + Maps.newLinkedHashMap(); // Tuple that is used to materialize statistics when scanning Parquet files. For each // column it can contain 0, 1, or 2 slots, depending on whether the column needs to be @@ -470,10 +471,10 @@ public class HdfsScanNode extends ScanNode { BinaryPredicate.Operator op = binaryPred.getOp(); if (op == BinaryPredicate.Operator.LT || op == BinaryPredicate.Operator.LE || op == BinaryPredicate.Operator.GE || op == BinaryPredicate.Operator.GT) { - minMaxOriginalConjuncts_.add(binaryPred); + addMinMaxOriginalConjunct(slotRef.getDesc().getParent(), binaryPred); buildStatsPredicate(analyzer, slotRef, binaryPred, op); } else if (op == BinaryPredicate.Operator.EQ) { - minMaxOriginalConjuncts_.add(binaryPred); + addMinMaxOriginalConjunct(slotRef.getDesc().getParent(), binaryPred); // TODO: this could be optimized for boolean columns. 
buildStatsPredicate(analyzer, slotRef, binaryPred, BinaryPredicate.Operator.LE); buildStatsPredicate(analyzer, slotRef, binaryPred, BinaryPredicate.Operator.GE); @@ -513,11 +514,20 @@ public class HdfsScanNode extends ScanNode { BinaryPredicate maxBound = new BinaryPredicate(BinaryPredicate.Operator.LE, children.get(0).clone(), max.clone()); - minMaxOriginalConjuncts_.add(inPred); + addMinMaxOriginalConjunct(slotRef.getDesc().getParent(), inPred); buildStatsPredicate(analyzer, slotRef, minBound, minBound.getOp()); buildStatsPredicate(analyzer, slotRef, maxBound, maxBound.getOp()); } + private void addMinMaxOriginalConjunct(TupleDescriptor tupleDesc, Expr expr) { + List<Expr> exprs = minMaxOriginalConjuncts_.get(tupleDesc); + if (exprs == null) { + exprs = new ArrayList<Expr>(); + minMaxOriginalConjuncts_.put(tupleDesc, exprs); + } + exprs.add(expr); + } + private void tryComputeMinMaxPredicate(Analyzer analyzer, Expr pred) { if (pred instanceof BinaryPredicate) { tryComputeBinaryMinMaxPredicate(analyzer, (BinaryPredicate) pred); @@ -1080,16 +1090,32 @@ public class HdfsScanNode extends ScanNode { numPartitionsNoDiskIds_, numPartitions_, numFilesNoDiskIds_, totalFiles_, numScanRangesNoDiskIds_, scanRanges_.size())); } - if (!minMaxOriginalConjuncts_.isEmpty()) { - output.append(String.format("%sparquet statistics predicates: %s\n", - detailPrefix, getExplainString(minMaxOriginalConjuncts_))); - } + // Groups the min max original conjuncts by tuple descriptor. + output.append(getMinMaxOriginalConjunctsExplainString(detailPrefix)); // Groups the dictionary filterable conjuncts by tuple descriptor. output.append(getDictionaryConjunctsExplainString(detailPrefix)); } return output.toString(); } + // Helper method that prints min max original conjuncts by tuple descriptor. 
+ private String getMinMaxOriginalConjunctsExplainString(String prefix) { + StringBuilder output = new StringBuilder(); + for (Map.Entry<TupleDescriptor, List<Expr>> entry : + minMaxOriginalConjuncts_.entrySet()) { + TupleDescriptor tupleDesc = entry.getKey(); + List<Expr> exprs = entry.getValue(); + if (tupleDesc == getTupleDesc()) { + output.append(String.format("%sparquet statistics predicates: %s\n", prefix, + getExplainString(exprs))); + } else { + output.append(String.format("%sparquet statistics predicates on %s: %s\n", + prefix, tupleDesc.getAlias(), getExplainString(exprs))); + } + } + return output.toString(); + } + // Helper method that prints the dictionary filterable conjuncts by tuple descriptor. private String getDictionaryConjunctsExplainString(String prefix) { StringBuilder output = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/impala/blob/3d7d8209/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test index 2b2d5ef..f25ad0a 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test @@ -54,7 +54,8 @@ PLAN-ROOT SINK table: rows=150000 size=292.36MB columns missing stats: c_orders extrapolated-rows=disabled - parquet statistics predicates: c_custkey > 10, o_orderkey = 4 + parquet statistics predicates: c_custkey > 10 + parquet statistics predicates on o: o_orderkey = 4 parquet dictionary predicates: c_custkey > 10 parquet dictionary predicates on o: o_orderkey = 4 parquet dictionary predicates on o_lineitems: 20 + l_linenumber < 0 
http://git-wip-us.apache.org/repos/asf/impala/blob/3d7d8209/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test index f3a46de..61d646b 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/mt-dop-validation.test @@ -251,7 +251,9 @@ PLAN-ROOT SINK table: rows=150000 size=292.36MB columns missing stats: c_orders extrapolated-rows=disabled - parquet statistics predicates: c_custkey < 10, o_orderkey < 5, l_linenumber < 3 + parquet statistics predicates: c_custkey < 10 + parquet statistics predicates on o: o_orderkey < 5 + parquet statistics predicates on o_lineitems: l_linenumber < 3 parquet dictionary predicates: c_custkey < 10 parquet dictionary predicates on o: o_orderkey < 5 parquet dictionary predicates on o_lineitems: l_linenumber < 3 @@ -314,7 +316,9 @@ Per-Host Resources: mem-estimate=264.00MB mem-reservation=0B table: rows=150000 size=292.36MB columns missing stats: c_orders extrapolated-rows=disabled - parquet statistics predicates: c_custkey < 10, o_orderkey < 5, l_linenumber < 3 + parquet statistics predicates: c_custkey < 10 + parquet statistics predicates on o: o_orderkey < 5 + parquet statistics predicates on o_lineitems: l_linenumber < 3 parquet dictionary predicates: c_custkey < 10 parquet dictionary predicates on o: o_orderkey < 5 parquet dictionary predicates on o_lineitems: l_linenumber < 3 @@ -368,7 +372,7 @@ PLAN-ROOT SINK table: rows=150000 size=292.36MB columns missing stats: c_orders, c_orders extrapolated-rows=disabled - parquet statistics predicates: o1.o_orderkey < 5 + parquet statistics predicates on o1: o1.o_orderkey < 5 parquet dictionary predicates on o1: 
o1.o_orderkey < 5 mem-estimate=88.00MB mem-reservation=0B tuple-ids=0 row-size=270B cardinality=150000 @@ -421,7 +425,7 @@ Per-Host Resources: mem-estimate=269.81MB mem-reservation=5.81MB table: rows=150000 size=292.36MB columns missing stats: c_orders, c_orders extrapolated-rows=disabled - parquet statistics predicates: o1.o_orderkey < 5 + parquet statistics predicates on o1: o1.o_orderkey < 5 parquet dictionary predicates on o1: o1.o_orderkey < 5 mem-estimate=88.00MB mem-reservation=0B tuple-ids=0 row-size=270B cardinality=150000 http://git-wip-us.apache.org/repos/asf/impala/blob/3d7d8209/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test index e7dee4e..2b602c9 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/parquet-filtering.test @@ -150,7 +150,7 @@ PLAN-ROOT SINK table: rows=unavailable size=unavailable columns missing stats: id extrapolated-rows=disabled - parquet statistics predicates: a.item.e < -10 + parquet statistics predicates on a: a.item.e < -10 parquet dictionary predicates on a: a.item.e < -10 mem-estimate=32.00MB mem-reservation=0B tuple-ids=0 row-size=24B cardinality=unavailable @@ -327,7 +327,9 @@ PLAN-ROOT SINK table: rows=150000 size=292.36MB columns missing stats: c_orders extrapolated-rows=disabled - parquet statistics predicates: c_custkey > 0, o.o_orderkey > 0, l.l_partkey > 0 + parquet statistics predicates: c_custkey > 0 + parquet statistics predicates on o: o.o_orderkey > 0 + parquet statistics predicates on l: l.l_partkey > 0 parquet dictionary predicates: c_custkey > 0 parquet dictionary predicates on o: o.o_orderkey > 0 parquet dictionary predicates on l: 
l.l_partkey > 0 @@ -435,7 +437,7 @@ PLAN-ROOT SINK table: rows=150000 size=292.36MB columns missing stats: c_orders extrapolated-rows=disabled - parquet statistics predicates: l.l_shipdate = '1994-08-19', l.l_receiptdate = '1994-08-24', l.l_shipmode = 'RAIL', l.l_returnflag = 'R' + parquet statistics predicates on l: l.l_shipdate = '1994-08-19', l.l_receiptdate = '1994-08-24', l.l_shipmode = 'RAIL', l.l_returnflag = 'R' parquet dictionary predicates on l: l.l_shipdate = '1994-08-19', l.l_receiptdate = '1994-08-24', l.l_shipmode = 'RAIL', l.l_returnflag = 'R' mem-estimate=176.00MB mem-reservation=0B tuple-ids=0 row-size=50B cardinality=150000