This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 7d043864ff5f2b46b40e589bd8f19eaa23308eca Author: ttttttz <[email protected]> AuthorDate: Wed Aug 14 11:15:12 2024 +0800 IMPALA-13274: Filter out illegal output for certain join nodes Filter out illegal output for certain join nodes, including those with join operators LEFT_ANTI_JOIN, LEFT_SEMI_JOIN, NULL_AWARE_LEFT_ANTI_JOIN, and ICEBERG_DELETE_JOIN. For these join nodes, we only retain the tuple ids of the outer side while computing tuple ids. If the illegal output from these join nodes is referenced by the parent node, it may cause the backend to crash due to missing tuple id. Tests - Add e2e test Change-Id: I50b82d85737025df2fdd9e7ab0fca2385e642415 Reviewed-on: http://gerrit.cloudera.org:8080/21671 Reviewed-by: Impala Public Jenkins <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> --- .../java/org/apache/impala/planner/JoinNode.java | 33 ++++++ .../queries/QueryTest/miss-tuple-joins.test | 116 +++++++++++++++++++++ tests/query_test/test_join_queries.py | 6 ++ 3 files changed, 155 insertions(+) diff --git a/fe/src/main/java/org/apache/impala/planner/JoinNode.java b/fe/src/main/java/org/apache/impala/planner/JoinNode.java index e91dee0a0..43dc651f2 100644 --- a/fe/src/main/java/org/apache/impala/planner/JoinNode.java +++ b/fe/src/main/java/org/apache/impala/planner/JoinNode.java @@ -28,6 +28,7 @@ import org.apache.impala.analysis.AnalyticExpr; import org.apache.impala.analysis.Analyzer; import org.apache.impala.analysis.BinaryPredicate; import org.apache.impala.analysis.Expr; +import org.apache.impala.analysis.ExprSubstitutionMap; import org.apache.impala.analysis.JoinOperator; import org.apache.impala.analysis.SlotDescriptor; import org.apache.impala.analysis.SlotRef; @@ -194,6 +195,38 @@ public abstract class JoinNode extends PlanNode { } } + @Override + public ExprSubstitutionMap getOutputSmap() { + // Filter out illegal output for certain join nodes, including those with + // join operators LEFT_ANTI_JOIN, LEFT_SEMI_JOIN, NULL_AWARE_LEFT_ANTI_JOIN, + // and ICEBERG_DELETE_JOIN. + switch (joinOp_) { + case LEFT_ANTI_JOIN: + case LEFT_SEMI_JOIN: + case NULL_AWARE_LEFT_ANTI_JOIN: + case ICEBERG_DELETE_JOIN: { + ExprSubstitutionMap result = new ExprSubstitutionMap(); + List<Expr> lhs = Expr.cloneList(outputSmap_.getLhs()); + List<Expr> rhs = Expr.cloneList(outputSmap_.getRhs()); + for (int i = 0; i < rhs.size(); i++) { + if (rhs.get(i) instanceof SlotRef) { + SlotRef slotRef = (SlotRef) rhs.get(i); + TupleId tid = slotRef.getDesc().getParent().getId(); + // If the tid is not in the current node's tuple ids, skip it. + if (!tupleIds_.contains(tid)) { + continue; + } + } + result.put(lhs.get(i), rhs.get(i)); + } + return result; + } + default: { + return outputSmap_; + } + } + } + /** * Returns true if the join node can be inverted. Inversions are not allowed * in the following cases: diff --git a/testdata/workloads/functional-query/queries/QueryTest/miss-tuple-joins.test b/testdata/workloads/functional-query/queries/QueryTest/miss-tuple-joins.test new file mode 100644 index 000000000..6585d2cf4 --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/miss-tuple-joins.test @@ -0,0 +1,116 @@ +==== +---- QUERY +CREATE TABLE test_miss_tuple_1 ( + f0 INT +) +STORED AS PARQUET; +==== +---- QUERY +CREATE TABLE test_miss_tuple_2 ( + f0 INT +) +STORED AS PARQUET; +==== +---- QUERY +INSERT INTO test_miss_tuple_1 VALUES (1); +==== +---- QUERY +INSERT INTO test_miss_tuple_1 VALUES (2); +==== +---- QUERY +INSERT INTO test_miss_tuple_1 VALUES (3); +==== +---- QUERY +INSERT INTO test_miss_tuple_2 VALUES (1); +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 +WHERE f0 NOT IN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) +ORDER BY f0, rand() LIMIT 200000; +---- TYPES +int +---- RESULTS +2 +3 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 +WHERE f0 NOT IN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY abs(1) LIMIT 10000) +ORDER BY f0, abs(1) LIMIT 200000; +---- TYPES +int +---- RESULTS +2 +3 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 +WHERE f0 IN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) +ORDER BY f0, rand() LIMIT 200000; +---- TYPES +int +---- RESULTS +1 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 +WHERE f0 IN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY abs(1) LIMIT 10000) +ORDER BY f0, abs(1) LIMIT 200000; +---- TYPES +int +---- RESULTS +1 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 t1 +LEFT SEMI JOIN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2 +ON t1.f0 = t2.f0 +ORDER BY f0, rand() LIMIT 10000; +---- TYPES +int +---- RESULTS +1 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 t1 +LEFT ANTI JOIN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2 +ON t1.f0 = t2.f0 +ORDER BY f0, rand() LIMIT 10000; +---- TYPES +int +---- RESULTS +2 +3 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 t1 +RIGHT SEMI JOIN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2 +ON t1.f0 = t2.f0 +ORDER BY f0, rand() LIMIT 10000; +---- TYPES +int +---- RESULTS +1 +==== +---- QUERY +SELECT * + FROM test_miss_tuple_1 t1 +RIGHT ANTI JOIN + (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2 +ON t1.f0 = t2.f0 +ORDER BY f0, rand() LIMIT 10000; +==== \ No newline at end of file diff --git a/tests/query_test/test_join_queries.py b/tests/query_test/test_join_queries.py index 90105af7c..ddcf59f4d 100644 --- a/tests/query_test/test_join_queries.py +++ b/tests/query_test/test_join_queries.py @@ -118,6 +118,12 @@ class TestJoinQueries(ImpalaTestSuite): new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') self.run_test_case('QueryTest/empty-build-joins', new_vector) + def test_miss_tuple_joins(self, vector, unique_database): + new_vector = deepcopy(vector) + new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') + new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') + self.run_test_case('QueryTest/miss-tuple-joins', new_vector, unique_database) + class TestTPCHJoinQueries(ImpalaTestSuite): # Uses the TPC-H dataset in order to have larger joins. Needed for example to test # the repartitioning codepaths.
