IMPALA-4731/IMPALA-397/IMPALA-4728: Materialize sort exprs Previously, exprs used in sorts were evaluated lazily. This can potentially be bad for performance if the exprs are expensive to evaluate, and it can lead to crashes if the exprs are non-deterministic, as this violates assumptions of our sorting algorithm.
This patch addresses these issues by materializing ordering exprs. It does so when the expr is non-deterministic (including when it contains a UDF, which we cannot currently know if they are non-deterministic), or when its cost exceeds a threshold (or the cost is unknown). Testing: - Added e2e tests in test_sort.py. - Updated planner tests. Change-Id: Ifefdaff8557a30ac44ea82ed428e6d1ffbca2e9e Reviewed-on: http://gerrit.cloudera.org:8080/6322 Reviewed-by: Thomas Tauber-Marshall <[email protected]> Tested-by: Impala Public Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/6cddb952 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/6cddb952 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/6cddb952 Branch: refs/heads/master Commit: 6cddb952cefedd373b2a1ce71a1b3cff2e774d70 Parents: 42ca45e Author: Thomas Tauber-Marshall <[email protected]> Authored: Tue Jan 31 10:33:07 2017 -0800 Committer: Impala Public Jenkins <[email protected]> Committed: Wed Apr 26 22:34:04 2017 +0000 ---------------------------------------------------------------------- .../java/org/apache/impala/analysis/Expr.java | 13 +- .../impala/analysis/FunctionCallExpr.java | 6 +- .../org/apache/impala/analysis/QueryStmt.java | 6 +- .../org/apache/impala/analysis/SortInfo.java | 107 +++++++++--- .../apache/impala/planner/AnalyticPlanner.java | 8 +- .../org/apache/impala/planner/SortNode.java | 11 ++ .../org/apache/impala/planner/PlannerTest.java | 9 + .../queries/PlannerTest/constant-folding.test | 1 + .../PlannerTest/sort-expr-materialization.test | 169 +++++++++++++++++++ tests/query_test/test_sort.py | 47 +++++- 10 files changed, 340 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/main/java/org/apache/impala/analysis/Expr.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java index 1a1c92b..e28ab48 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Expr.java +++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java @@ -168,8 +168,17 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl new com.google.common.base.Predicate<Expr>() { @Override public boolean apply(Expr arg) { - return arg instanceof FunctionCallExpr && - !((FunctionCallExpr)arg).isNondeterministicBuiltinFn(); + return arg instanceof FunctionCallExpr + && ((FunctionCallExpr) arg).isNondeterministicBuiltinFn(); + } + }; + + public final static com.google.common.base.Predicate<Expr> IS_UDF_PREDICATE = + new com.google.common.base.Predicate<Expr>() { + @Override + public boolean apply(Expr arg) { + return arg instanceof FunctionCallExpr + && !((FunctionCallExpr) arg).getFnName().isBuiltin(); } }; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java index 1e06254..5895326 100644 --- a/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java +++ b/fe/src/main/java/org/apache/impala/analysis/FunctionCallExpr.java @@ -234,9 +234,9 @@ public class FunctionCallExpr extends Expr { static boolean isNondeterministicBuiltinFnName(String fnName) { if (fnName.equalsIgnoreCase("rand") || fnName.equalsIgnoreCase("random") || fnName.equalsIgnoreCase("uuid")) { - return false; + return true; } - return true; + return false; } /** @@ -280,7 +280,7 @@ public class FunctionCallExpr extends Expr { fnName = path.get(path.size() - 1); } // Non-deterministic functions are never constant. - if (!isNondeterministicBuiltinFnName(fnName)) { + if (isNondeterministicBuiltinFnName(fnName)) { return false; } // Sleep is a special function for testing. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/main/java/org/apache/impala/analysis/QueryStmt.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/QueryStmt.java b/fe/src/main/java/org/apache/impala/analysis/QueryStmt.java index 135f2e4..69b9625 100644 --- a/fe/src/main/java/org/apache/impala/analysis/QueryStmt.java +++ b/fe/src/main/java/org/apache/impala/analysis/QueryStmt.java @@ -250,8 +250,10 @@ public abstract class QueryStmt extends StatementBase { ExprSubstitutionMap smap = sortInfo_.createSortTupleInfo(resultExprs_, analyzer); for (int i = 0; i < smap.size(); ++i) { - Preconditions.checkState(smap.getLhs().get(i) instanceof SlotRef); - Preconditions.checkState(smap.getRhs().get(i) instanceof SlotRef); + if (!(smap.getLhs().get(i) instanceof SlotRef) + || !(smap.getRhs().get(i) instanceof SlotRef)) { + continue; + } SlotRef inputSlotRef = (SlotRef) smap.getLhs().get(i); SlotRef outputSlotRef = (SlotRef) smap.getRhs().get(i); if (hasLimit()) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/main/java/org/apache/impala/analysis/SortInfo.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/analysis/SortInfo.java b/fe/src/main/java/org/apache/impala/analysis/SortInfo.java index 31f4d18..6c46231 100644 --- a/fe/src/main/java/org/apache/impala/analysis/SortInfo.java +++ b/fe/src/main/java/org/apache/impala/analysis/SortInfo.java @@ -18,7 +18,9 @@ package org.apache.impala.analysis; import org.apache.impala.common.TreeNode; +import java.util.ArrayList; import java.util.List; +import java.util.ListIterator; import java.util.Set; import com.google.common.base.Preconditions; @@ -34,10 +36,22 @@ import com.google.common.collect.Sets; * particular input row (materialize all row slots) */ public class SortInfo { + // All ordering exprs with cost greater than this will be materialized. Since we don't + // currently have any information about actual function costs, this value is intended to + // ensure that all expensive functions will be materialized while still leaving simple + // operations unmaterialized, for example 'SlotRef + SlotRef' should have a cost below + // this threshold. + // TODO: rethink this when we have a better cost model. + private static final float SORT_MATERIALIZATION_COST_THRESHOLD = + Expr.FUNCTION_CALL_COST; + private List<Expr> orderingExprs_; private final List<Boolean> isAscOrder_; // True if "NULLS FIRST", false if "NULLS LAST", null if not specified. private final List<Boolean> nullsFirstParams_; + // Subset of ordering exprs that are materialized. Populated in + // createMaterializedOrderExprs(), used for EXPLAIN output. + private List<Expr> materializedOrderingExprs_; // The single tuple that is materialized, sorted, and output by a sort operator // (i.e. SortNode or TopNNode) private TupleDescriptor sortTupleDesc_; @@ -52,6 +66,7 @@ public class SortInfo { orderingExprs_ = orderingExprs; isAscOrder_ = isAscOrder; nullsFirstParams_ = nullsFirstParams; + materializedOrderingExprs_ = Lists.newArrayList(); } /** @@ -61,6 +76,7 @@ public class SortInfo { orderingExprs_ = Expr.cloneList(other.orderingExprs_); isAscOrder_ = Lists.newArrayList(other.isAscOrder_); nullsFirstParams_ = Lists.newArrayList(other.nullsFirstParams_); + materializedOrderingExprs_ = Expr.cloneList(other.materializedOrderingExprs_); sortTupleDesc_ = other.sortTupleDesc_; if (other.sortTupleSlotExprs_ != null) { sortTupleSlotExprs_ = Expr.cloneList(other.sortTupleSlotExprs_); @@ -85,6 +101,7 @@ public class SortInfo { public List<Expr> getOrderingExprs() { return orderingExprs_; } public List<Boolean> getIsAscOrder() { return isAscOrder_; } public List<Boolean> getNullsFirstParams() { return nullsFirstParams_; } + public List<Expr> getMaterializedOrderingExprs() { return materializedOrderingExprs_; } public List<Expr> getSortTupleSlotExprs() { return sortTupleSlotExprs_; } public TupleDescriptor getSortTupleDescriptor() { return sortTupleDesc_; } @@ -93,6 +110,7 @@ public class SortInfo { * of asc/desc. */ public List<Boolean> getNullsFirst() { + Preconditions.checkState(orderingExprs_.size() == nullsFirstParams_.size()); List<Boolean> nullsFirst = Lists.newArrayList(); for (int i = 0; i < orderingExprs_.size(); ++i) { nullsFirst.add(OrderByElement.nullsFirst(nullsFirstParams_.get(i), @@ -146,42 +164,51 @@ public class SortInfo { /** * Create a tuple descriptor for the single tuple that is materialized, sorted, and - * output by the sort node. Done by materializing slot refs in the order-by and given - * result expressions. Those slot refs in the ordering and result exprs are substituted - * with slot refs into the new tuple. This simplifies the sorting logic for total and - * top-n sorts. The substitution map is returned. - * TODO: We could do something more sophisticated than simply copying input slot refs - - * e.g. compute some order-by expressions. + * output by the sort node. Materializes slots required by 'resultExprs' as well as + * non-deterministic and expensive order by exprs. The materialized exprs are + * substituted with slot refs into the new tuple. This simplifies the sorting logic for + * total and top-n sorts. The substitution map is returned. */ public ExprSubstitutionMap createSortTupleInfo( List<Expr> resultExprs, Analyzer analyzer) { - // sourceSlots contains the slots from the sort input to materialize. - Set<SlotRef> sourceSlots = Sets.newHashSet(); - - TreeNode.collect(resultExprs, Predicates.instanceOf(SlotRef.class), sourceSlots); - TreeNode.collect(orderingExprs_, Predicates.instanceOf(SlotRef.class), sourceSlots); - // The descriptor for the tuples on which the sort operates. TupleDescriptor sortTupleDesc = analyzer.getDescTbl().createTupleDescriptor("sort"); sortTupleDesc.setIsMaterialized(true); - List<Expr> sortTupleExprs = Lists.newArrayList(); - // substOrderBy is the mapping from slot refs in the sort node's input to slot refs in - // the materialized sort tuple. Each slot ref in the input gets cloned and builds up - // the tuple operated on and returned by the sort node. - ExprSubstitutionMap substOrderBy = new ExprSubstitutionMap(); + // substOrderBy is a mapping from exprs evaluated on the sort input that get + // materialized into the sort tuple to their corresponding SlotRefs in the sort tuple. + // The following exprs are materialized: + // 1. Ordering exprs that we chose to materialize + // 2. SlotRefs against the sort input contained in the result and ordering exprs after + // substituting the materialized ordering exprs. + + // Case 1: + ExprSubstitutionMap substOrderBy = + createMaterializedOrderExprs(sortTupleDesc, analyzer); + sortTupleExprs.addAll(substOrderBy.getLhs()); + + // Case 2: SlotRefs in the result and ordering exprs after substituting the + // materialized ordering exprs. + Set<SlotRef> sourceSlots = Sets.newHashSet(); + TreeNode.collect(Expr.substituteList(resultExprs, substOrderBy, analyzer, false), + Predicates.instanceOf(SlotRef.class), sourceSlots); + TreeNode.collect(Expr.substituteList(orderingExprs_, substOrderBy, analyzer, false), + Predicates.instanceOf(SlotRef.class), sourceSlots); for (SlotRef origSlotRef: sourceSlots) { - SlotDescriptor origSlotDesc = origSlotRef.getDesc(); - SlotDescriptor materializedDesc = - analyzer.copySlotDescriptor(origSlotDesc, sortTupleDesc); - SlotRef cloneRef = new SlotRef(materializedDesc); - substOrderBy.put(origSlotRef, cloneRef); - sortTupleExprs.add(origSlotRef); + // Don't rematerialize slots that are already in the sort tuple. + if (origSlotRef.getDesc().getParent().getId() != sortTupleDesc.getId()) { + SlotDescriptor origSlotDesc = origSlotRef.getDesc(); + SlotDescriptor materializedDesc = + analyzer.copySlotDescriptor(origSlotDesc, sortTupleDesc); + SlotRef cloneRef = new SlotRef(materializedDesc); + substOrderBy.put(origSlotRef, cloneRef); + sortTupleExprs.add(origSlotRef); + } } - // The ordering exprs still point to the old slot refs and need to be replaced with - // ones that point to the slot refs into the sort's output tuple. + // The ordering exprs are evaluated against the sort tuple, so they must reflect the + // materialization decision above. substituteOrderingExprs(substOrderBy, analyzer); // Update the tuple descriptor used to materialize the input of the sort. @@ -189,4 +216,34 @@ public class SortInfo { return substOrderBy; } + + /** + * Materialize ordering exprs by creating slots for them in 'sortTupleDesc' if they: + * - contain a non-deterministic expr + * - contain a UDF (since we don't know if they're deterministic) + * - are more expensive than a cost threshold + * - don't have a cost set + * + * Populates 'materializedOrderingExprs_' and returns a mapping from the original + * ordering exprs to the new SlotRefs. It is expected that this smap will be passed into + * substituteOrderingExprs() by the caller. + */ + public ExprSubstitutionMap createMaterializedOrderExprs( + TupleDescriptor sortTupleDesc, Analyzer analyzer) { + ExprSubstitutionMap substOrderBy = new ExprSubstitutionMap(); + for (Expr origOrderingExpr : orderingExprs_) { + if (!origOrderingExpr.hasCost() + || origOrderingExpr.getCost() > SORT_MATERIALIZATION_COST_THRESHOLD + || origOrderingExpr.contains(Expr.IS_NONDETERMINISTIC_BUILTIN_FN_PREDICATE) + || origOrderingExpr.contains(Expr.IS_UDF_PREDICATE)) { + SlotDescriptor materializedDesc = analyzer.addSlotDescriptor(sortTupleDesc); + materializedDesc.initFromExpr(origOrderingExpr); + materializedDesc.setIsMaterialized(true); + SlotRef materializedRef = new SlotRef(materializedDesc); + substOrderBy.put(origOrderingExpr, materializedRef); + materializedOrderingExprs_.add(origOrderingExpr); + } + } + return substOrderBy; + } } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/main/java/org/apache/impala/planner/AnalyticPlanner.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/AnalyticPlanner.java b/fe/src/main/java/org/apache/impala/planner/AnalyticPlanner.java index 6d726ec..08dd9f5 100644 --- a/fe/src/main/java/org/apache/impala/planner/AnalyticPlanner.java +++ b/fe/src/main/java/org/apache/impala/planner/AnalyticPlanner.java @@ -308,8 +308,12 @@ public class AnalyticPlanner { } } - SortInfo sortInfo = new SortInfo( - Expr.substituteList(sortExprs, sortSmap, analyzer_, false), isAsc, nullsFirst); + SortInfo sortInfo = new SortInfo(sortExprs, isAsc, nullsFirst); + ExprSubstitutionMap smap = + sortInfo.createMaterializedOrderExprs(sortTupleDesc, analyzer_); + sortSlotExprs.addAll(smap.getLhs()); + sortSmap = ExprSubstitutionMap.combine(sortSmap, smap); + sortInfo.substituteOrderingExprs(sortSmap, analyzer_); if (LOG.isTraceEnabled()) { LOG.trace("sortinfo exprs: " + Expr.debugString(sortInfo.getOrderingExprs())); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/main/java/org/apache/impala/planner/SortNode.java ---------------------------------------------------------------------- diff --git a/fe/src/main/java/org/apache/impala/planner/SortNode.java b/fe/src/main/java/org/apache/impala/planner/SortNode.java index ef05499..177565a 100644 --- a/fe/src/main/java/org/apache/impala/planner/SortNode.java +++ b/fe/src/main/java/org/apache/impala/planner/SortNode.java @@ -187,6 +187,17 @@ public class SortNode extends PlanNode { } output.append("\n"); } + + if (detailLevel.ordinal() >= TExplainLevel.EXTENDED.ordinal() + && info_.getMaterializedOrderingExprs().size() > 0) { + output.append(detailPrefix + "materialized: "); + for (int i = 0; i < info_.getMaterializedOrderingExprs().size(); ++i) { + if (i > 0) output.append(", "); + output.append(info_.getMaterializedOrderingExprs().get(i).toSql()); + } + output.append("\n"); + } + return output.toString(); } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/fe/src/test/java/org/apache/impala/planner/PlannerTest.java ---------------------------------------------------------------------- diff --git a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java index 363c59c..80ba3b2 100644 --- a/fe/src/test/java/org/apache/impala/planner/PlannerTest.java +++ b/fe/src/test/java/org/apache/impala/planner/PlannerTest.java @@ -19,6 +19,7 @@ package org.apache.impala.planner; import org.apache.impala.catalog.Catalog; import org.apache.impala.catalog.Db; +import org.apache.impala.catalog.Type; import org.apache.impala.common.ImpalaException; import org.apache.impala.common.RuntimeEnv; import org.apache.impala.testutil.TestUtils; @@ -32,6 +33,7 @@ import org.junit.Assume; import org.junit.Test; import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; // All planner tests, except for S3 specific tests should go here. public class PlannerTest extends PlannerTestBase { @@ -379,4 +381,11 @@ public class PlannerTest extends PlannerTestBase { runPlannerTestFile("resource-requirements", options, false); } + @Test + public void testSortExprMaterialization() { + addTestFunction("TestFn", Lists.newArrayList(Type.DOUBLE), false); + TQueryOptions options = defaultQueryOptions(); + options.setExplain_level(TExplainLevel.EXTENDED); + runPlannerTestFile("sort-expr-materialization", options); + } } http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test index 7effd9b..0a30a97 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/constant-folding.test @@ -261,6 +261,7 @@ PLAN-ROOT SINK | 01:SORT | order by: concat('ab', string_col) ASC NULLS FIRST, greatest(20, bigint_col) ASC +| materialized: concat('ab', string_col), greatest(20, bigint_col) | mem-estimate=16.00MB mem-reservation=48.00MB | tuple-ids=3 row-size=29B cardinality=7300 | http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test b/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test new file mode 100644 index 0000000..b7e22a9 --- /dev/null +++ b/testdata/workloads/functional-planner/queries/PlannerTest/sort-expr-materialization.test @@ -0,0 +1,169 @@ +# sort on a non-deterministic expr, gets materialized +select * from functional.alltypes order by random() +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:SORT +| order by: random() ASC +| materialized: random() +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=1 row-size=105B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=97B cardinality=7300 +==== +# sort on a deterministic expr that exceeds the cost threshold +select * from functional.alltypes order by abs(id) + abs(id) +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:SORT +| order by: abs(id) + abs(id) ASC +| materialized: abs(id) + abs(id) +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=1 row-size=105B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=97B cardinality=7300 +==== +# sort on a deterministic expr that doesn't exceed the cost threshold +select * from functional.alltypes order by tinyint_col + 1 +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:SORT +| order by: tinyint_col + 1 ASC +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=1 row-size=97B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=97B cardinality=7300 +==== +# sort on multiple exprs, subset is materialized +select * from functional.alltypes +order by dayofweek(timestamp_col), true, id + 1, string_col = date_string_col, id = tinyint_col +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:SORT +| order by: dayofweek(timestamp_col) ASC, TRUE ASC, id + 1 ASC, string_col = date_string_col ASC, id = tinyint_col ASC +| materialized: dayofweek(timestamp_col), string_col = date_string_col +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=1 row-size=102B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=97B cardinality=7300 +==== +# expensive analytic order by expr gets materialized +select last_value(id) over (order by to_date(timestamp_col), bool_col is null) +from functional.alltypes +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +02:ANALYTIC +| functions: last_value(id) +| order by: to_date(timestamp_col) ASC, bool_col IS NULL ASC +| window: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW +| mem-estimate=0B mem-reservation=16.00MB +| tuple-ids=3,2 row-size=41B cardinality=7300 +| +01:SORT +| order by: to_date(timestamp_col) ASC, bool_col IS NULL ASC +| materialized: to_date(timestamp_col) +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=3 row-size=37B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=21B cardinality=7300 +==== +# expensive order by expr in top-n gets materialized +select id from functional.alltypes order by string_col like 'a.*b', id * bigint_col, +regexp_replace(string_col, 'a.*b', 'c') limit 10 +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:TOP-N [LIMIT=10] +| order by: string_col LIKE 'a.*b' ASC, id * bigint_col ASC, regexp_replace(string_col, 'a.*b', 'c') ASC +| materialized: string_col LIKE 'a.*b', regexp_replace(string_col, 'a.*b', 'c') +| mem-estimate=290B mem-reservation=0B +| tuple-ids=1 row-size=29B cardinality=10 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=29B cardinality=7300 +==== +# sort on udf, gets materialized +select * from functional.alltypes order by TestFn(double_col) +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:SORT +| order by: default.testfn(double_col) ASC +| materialized: default.testfn(double_col) +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=1 row-size=101B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=97B cardinality=7300 +==== +# sort expr contains SlotRefs that don't need to be materialized separately +select concat(date_string_col, string_col) c from functional.alltypes order by c +---- PLAN +F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1 +PLAN-ROOT SINK +| mem-estimate=0B mem-reservation=0B +| +01:SORT +| order by: concat(date_string_col, string_col) ASC +| materialized: concat(date_string_col, string_col) +| mem-estimate=16.00MB mem-reservation=48.00MB +| tuple-ids=1 row-size=16B cardinality=7300 +| +00:SCAN HDFS [functional.alltypes] + partitions=24/24 files=24 size=478.45KB + table stats: 7300 rows total + column stats: all + mem-estimate=128.00MB mem-reservation=0B + tuple-ids=0 row-size=41B cardinality=7300 +==== http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/6cddb952/tests/query_test/test_sort.py ---------------------------------------------------------------------- diff --git a/tests/query_test/test_sort.py b/tests/query_test/test_sort.py index 228d25d..1b77b6f 100644 --- a/tests/query_test/test_sort.py +++ b/tests/query_test/test_sort.py @@ -17,11 +17,12 @@ from tests.common.impala_test_suite import ImpalaTestSuite -def transpose_results(result): +def transpose_results(result, map_fn=lambda x: x): """Given a query result (list of strings, each string represents a row), return a list - of columns, where each column is a list of strings.""" + of columns, where each column is a list of strings. Optionally, map_fn can be provided + to be applied to every value, eg. to convert the strings to their underlying types.""" split_result = [row.split('\t') for row in result] - return [list(l) for l in zip(*split_result)] + return [map(map_fn, list(l)) for l in zip(*split_result)] class TestQueryFullSort(ImpalaTestSuite): """Test class to do functional validation of sorting when data is spilled to disk.""" @@ -154,3 +155,43 @@ class TestQueryFullSort(ImpalaTestSuite): query, exec_option, table_format=table_format).data) assert(result[0] == sorted(result[0])) +class TestRandomSort(ImpalaTestSuite): + @classmethod + def get_workload(self): + return 'functional' + + def test_order_by_random(self): + """Tests that 'order by random()' works as expected.""" + # "order by random()" with different seeds should produce different orderings. + seed_query = "select * from functional.alltypestiny order by random(%s)" + results_seed0 = self.execute_query(seed_query % "0") + results_seed1 = self.execute_query(seed_query % "1") + assert results_seed0.data != results_seed1.data + assert sorted(results_seed0.data) == sorted(results_seed1.data) + + # Include "random()" in the select list to check that it's sorted correctly. + results = transpose_results(self.execute_query( + "select random() as r from functional.alltypessmall order by r").data, + lambda x: float(x)) + assert(results[0] == sorted(results[0])) + + # Like above, but with a limit. + results = transpose_results(self.execute_query( + "select random() as r from functional.alltypes order by r limit 100").data, + lambda x: float(x)) + assert(results == sorted(results)) + + # "order by random()" inside an inline view. + query = "select r from (select random() r from functional.alltypessmall) v order by r" + results = transpose_results(self.execute_query(query).data, lambda x: float(x)) + assert (results == sorted(results)) + + def test_analytic_order_by_random(self): + """Tests that a window function over 'order by random()' works as expected.""" + # Since we use the same random seed and a very small table, the following queries + # should be equivalent. + results = transpose_results(self.execute_query("select id from " + "functional.alltypestiny order by random(2)").data) + analytic_results = transpose_results(self.execute_query("select last_value(id) over " + "(order by random(2)) from functional.alltypestiny").data) + assert results == analytic_results
