Yingyi Bu has submitted this change and it was merged. Change subject: ASTERIXDB-1487: fix the wrong plan when we prune the selective branch. ......................................................................
ASTERIXDB-1487: fix the wrong plan when we prune the selective branch. 1. Add the test case of ASTERIX-1487 with single join branch required. 2. Disable the join branch pruning in case of unnestmap following datasourcescan. - We need to prune the join branch when it is NOT required by the upstream operators and its generated join key is derived from the same DATASOURCE of the other branch. - We SHOULD NOT prune the join branch if there exists a selective operator (UNNESTMAP, LOUNNESTMAP, LIMIT, SELECT) located between the join operator and DATASOURCESCAN. Change-Id: I1aef69a2278853fd9f8020da6639331b367ed5ad Reviewed-on: https://asterix-gerrit.ics.uci.edu/1119 Tested-by: Jenkins <[email protected]> Integration-Tests: Jenkins <[email protected]> Reviewed-by: Yingyi Bu <[email protected]> --- M asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/RemoveUnusedOneToOneEquiJoinRule.java M asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java A asterixdb/asterix-app/src/test/resources/optimizerts/queries/select-self-join.aql M asterixdb/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/issue741.plan A asterixdb/asterix-app/src/test/resources/optimizerts/results/select-self-join.plan A asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.ddl.aql A asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.2.update.aql A asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.3.query.aql A asterixdb/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.adm M asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml 10 files changed, 225 insertions(+), 21 deletions(-) Approvals: Yingyi Bu: Looks good to me, approved Jenkins: Verified; Verified Objections: Jenkins: Violations found diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/RemoveUnusedOneToOneEquiJoinRule.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/RemoveUnusedOneToOneEquiJoinRule.java index 2e43912..03c7663 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/RemoveUnusedOneToOneEquiJoinRule.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/RemoveUnusedOneToOneEquiJoinRule.java @@ -50,6 +50,7 @@ * 1. The live variables of one input branch of the join are not used in the upstream plan * 2. The join is an inner equi join * 3. The join condition only uses variables that correspond to primary keys of the same dataset + * 4. The records of one input branch will not be filtered by the selective operators till join. * Notice that the last condition implies a 1:1 join, i.e., the join does not change the result cardinality. * Joins that satisfy the above conditions may be introduced by other rules * which use surrogate optimizations. Such an optimization aims to reduce data copies and communication costs by @@ -61,11 +62,11 @@ */ public class RemoveUnusedOneToOneEquiJoinRule implements IAlgebraicRewriteRule { - private final Set<LogicalVariable> parentsUsedVars = new HashSet<LogicalVariable>(); - private final List<LogicalVariable> usedVars = new ArrayList<LogicalVariable>(); - private final List<LogicalVariable> liveVars = new ArrayList<LogicalVariable>(); - private final List<LogicalVariable> pkVars = new ArrayList<LogicalVariable>(); - private final List<DataSourceScanOperator> dataScans = new ArrayList<DataSourceScanOperator>(); + private final Set<LogicalVariable> parentsUsedVars = new HashSet<>(); + private final List<LogicalVariable> usedVars = new ArrayList<>(); + private final List<LogicalVariable> liveVars = new ArrayList<>(); + private final List<LogicalVariable> pkVars = new ArrayList<>(); + private final List<DataSourceScanOperator> dataScans = new ArrayList<>(); private boolean hasRun = false; @Override @@ -179,9 +180,35 @@ // keys from datasource scans of the same dataset. return -1; } + // Suppose we Project B over A.a ~= B.b, where A's fields are involved in a selective operator. + // We expect the post-plan will NOT prune the join part derived from A. + if (unusedJoinBranchIndex >= 0 + && isSelectionAboveDataScan(opRef.getValue().getInputs().get(unusedJoinBranchIndex))) { + unusedJoinBranchIndex = -1; + } return unusedJoinBranchIndex; } + private boolean isSelectionAboveDataScan(Mutable<ILogicalOperator> opRef) { + boolean hasSelection = false; + AbstractLogicalOperator op = (AbstractLogicalOperator) opRef.getValue(); + LogicalOperatorTag tag = op.getOperatorTag(); + switch (tag) { + case DATASOURCESCAN: + return false; + case UNNEST_MAP: + case LEFT_OUTER_UNNEST_MAP: + case LIMIT: + case SELECT: + return true; + default: + for (Mutable<ILogicalOperator> inputOp : op.getInputs()) { + hasSelection |= isSelectionAboveDataScan(inputOp); + } + } + return hasSelection; + } + private void gatherProducingDataScans(Mutable<ILogicalOperator> opRef, List<LogicalVariable> joinUsedVars, List<DataSourceScanOperator> dataScans) { AbstractLogicalOperator op = (AbstractLogicalOperator) opRef.getValue(); diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java index 0bc5e78..8f42904 100644 --- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java +++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java @@ -65,6 +65,7 @@ import org.apache.hyracks.algebricks.core.algebra.functions.FunctionIdentifier; import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractBinaryJoinOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator; +import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator.ExecutionMode; import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractUnnestMapOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.AssignOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.DataSourceScanOperator; @@ -73,7 +74,6 @@ import org.apache.hyracks.algebricks.core.algebra.operators.logical.SelectOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.UnionAllOperator; import org.apache.hyracks.algebricks.core.algebra.operators.logical.UnnestOperator; -import org.apache.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator.ExecutionMode; import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.LogicalOperatorDeepCopyWithNewVariablesVisitor; import org.apache.hyracks.algebricks.core.algebra.operators.logical.visitors.VariableUtilities; import org.apache.hyracks.algebricks.core.algebra.util.OperatorManipulationUtil; diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/queries/select-self-join.aql b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/select-self-join.aql new file mode 100644 index 0000000..720bacd --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/queries/select-self-join.aql @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/* + * Description : Tests that self-join on primary key with select introduces surrogate join. + * Success : Yes + */ + +drop dataverse test if exists; +create dataverse test; +use dataverse test; + +write output to asterix_nc1:"opttest/select-self-join.adm"; + +create type empType as open { +id: int, +sal: int +} + +create dataset Emps(empType) primary key id; + +for $e1 in dataset Emps +for $e2 in (for $e3 in dataset Emps where $e3.sal > 1000 return $e3) +where $e1.id = $e2.id +return $e1 diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/issue741.plan b/asterixdb/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/issue741.plan index 4e40dd2..5b08bf5 100644 --- a/asterixdb/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/issue741.plan +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/issue741.plan @@ -13,20 +13,33 @@ -- STABLE_SORT [$$25(ASC)] |PARTITIONED| -- HASH_PARTITION_EXCHANGE [$$25] |PARTITIONED| -- STREAM_PROJECT |PARTITIONED| - -- STREAM_SELECT |PARTITIONED| - -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- HYBRID_HASH_JOIN [$$36][$$25] |PARTITIONED| -- ONE_TO_ONE_EXCHANGE |PARTITIONED| - -- BTREE_SEARCH |PARTITIONED| - -- ONE_TO_ONE_EXCHANGE |PARTITIONED| - -- STABLE_SORT [$$39(ASC)] |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- STREAM_SELECT |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- HASH_PARTITION_EXCHANGE [$$25] |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- STREAM_SELECT |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| -- ONE_TO_ONE_EXCHANGE |PARTITIONED| - -- LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH |PARTITIONED| - -- BROADCAST_EXCHANGE |PARTITIONED| - -- STREAM_PROJECT |PARTITIONED| - -- STREAM_SELECT |PARTITIONED| - -- STREAM_PROJECT |PARTITIONED| - -- ASSIGN |PARTITIONED| - -- ONE_TO_ONE_EXCHANGE |PARTITIONED| - -- DATASOURCE_SCAN |PARTITIONED| - -- ONE_TO_ONE_EXCHANGE |PARTITIONED| - -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- BTREE_SEARCH |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STABLE_SORT [$$39(ASC)] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH |PARTITIONED| + -- BROADCAST_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- STREAM_SELECT |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ASSIGN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| diff --git a/asterixdb/asterix-app/src/test/resources/optimizerts/results/select-self-join.plan b/asterixdb/asterix-app/src/test/resources/optimizerts/results/select-self-join.plan new file mode 100644 index 0000000..45c0e3e --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/optimizerts/results/select-self-join.plan @@ -0,0 +1,16 @@ +-- DISTRIBUTE_RESULT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- HYBRID_HASH_JOIN [$$11][$$12] |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- STREAM_PROJECT |PARTITIONED| + -- STREAM_SELECT |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- DATASOURCE_SCAN |PARTITIONED| + -- ONE_TO_ONE_EXCHANGE |PARTITIONED| + -- EMPTY_TUPLE_SOURCE |PARTITIONED| diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.ddl.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.ddl.aql new file mode 100644 index 0000000..811c2b5 --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.ddl.aql @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +drop dataverse fuzzyjoin if exists; + +create dataverse fuzzyjoin; + +use dataverse fuzzyjoin; + +create type DBLPType as open { + id: int64, + dblpid: string?, + title: string?, + authors: string?, + misc: string? +} + +create type CSXType as open { + id: int64, + csxid: string?, + title: string?, + authors: string?, + misc: string? +} + +create dataset DBLP(DBLPType) primary key id; +create dataset CSX(CSXType) primary key id; + +create index author_index on DBLP(authors) type keyword; diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.2.update.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.2.update.aql new file mode 100644 index 0000000..fc2fb4b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.2.update.aql @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin; + +load dataset DBLP +using localfs +(("path"="asterix_nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")); + +load dataset CSX +using localfs +(("path"="asterix_nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")); + diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.3.query.aql b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.3.query.aql new file mode 100644 index 0000000..7c65b3b --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.3.query.aql @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +use dataverse fuzzyjoin; + +set simthreshold '.7f' + +let $s := count( +for $t in dataset('CSX') +for $o in dataset('DBLP') +where contains($o.title, "System") and +word-tokens($o.authors) ~= word-tokens($t.authors) +return $t) +return $s diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.adm new file mode 100644 index 0000000..00750ed --- /dev/null +++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/dblp-csx-aqlplus_5/dblp-csx-aqlplus_5.1.adm @@ -0,0 +1 @@ +3 diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml index 5750b28..2f277cc 100644 --- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml +++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite.xml @@ -2656,6 +2656,11 @@ </compilation-unit> </test-case> <test-case FilePath="fuzzyjoin"> + <compilation-unit name="dblp-csx-aqlplus_5"> + <output-dir compare="Text">dblp-csx-aqlplus_5</output-dir> + </compilation-unit> + </test-case> + <test-case FilePath="fuzzyjoin"> <compilation-unit name="dblp-csx-dblp-aqlplus_1"> <output-dir compare="Text">dblp-csx-dblp-aqlplus_1</output-dir> </compilation-unit> -- To view, visit https://asterix-gerrit.ics.uci.edu/1119 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1aef69a2278853fd9f8020da6639331b367ed5ad Gerrit-PatchSet: 16 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Wenhai Li <[email protected]> Gerrit-Reviewer: Chen Li <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Jianfeng Jia <[email protected]> Gerrit-Reviewer: Taewoo Kim <[email protected]> Gerrit-Reviewer: Till Westmann <[email protected]> Gerrit-Reviewer: Wenhai Li <[email protected]> Gerrit-Reviewer: Yingyi Bu <[email protected]> Gerrit-Reviewer: Yingyi Bu <[email protected]>
