This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 7d043864ff5f2b46b40e589bd8f19eaa23308eca
Author: ttttttz <[email protected]>
AuthorDate: Wed Aug 14 11:15:12 2024 +0800

    IMPALA-13274: Filter out illegal output for certain join nodes
    
    Filter out illegal output for certain join nodes, namely those with
    join operators LEFT_ANTI_JOIN, LEFT_SEMI_JOIN, NULL_AWARE_LEFT_ANTI_JOIN,
    and ICEBERG_DELETE_JOIN. For these join nodes, only the tuple ids of the
    outer side are retained when computing tuple ids, so any output that
    refers to a tuple from the other side is illegal. If such illegal output
    is referenced by the parent node, it may cause the backend to crash due
    to a missing tuple id.
    
    Tests
    - Add e2e test
    
    Change-Id: I50b82d85737025df2fdd9e7ab0fca2385e642415
    Reviewed-on: http://gerrit.cloudera.org:8080/21671
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../java/org/apache/impala/planner/JoinNode.java   |  33 ++++++
 .../queries/QueryTest/miss-tuple-joins.test        | 116 +++++++++++++++++++++
 tests/query_test/test_join_queries.py              |   6 ++
 3 files changed, 155 insertions(+)

diff --git a/fe/src/main/java/org/apache/impala/planner/JoinNode.java b/fe/src/main/java/org/apache/impala/planner/JoinNode.java
index e91dee0a0..43dc651f2 100644
--- a/fe/src/main/java/org/apache/impala/planner/JoinNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/JoinNode.java
@@ -28,6 +28,7 @@ import org.apache.impala.analysis.AnalyticExpr;
 import org.apache.impala.analysis.Analyzer;
 import org.apache.impala.analysis.BinaryPredicate;
 import org.apache.impala.analysis.Expr;
+import org.apache.impala.analysis.ExprSubstitutionMap;
 import org.apache.impala.analysis.JoinOperator;
 import org.apache.impala.analysis.SlotDescriptor;
 import org.apache.impala.analysis.SlotRef;
@@ -194,6 +195,38 @@ public abstract class JoinNode extends PlanNode {
     }
   }
 
+  @Override
+  public ExprSubstitutionMap getOutputSmap() {
+    // Filter out illegal output for certain join nodes, including those with
+    // join operators LEFT_ANTI_JOIN, LEFT_SEMI_JOIN, NULL_AWARE_LEFT_ANTI_JOIN,
+    // and ICEBERG_DELETE_JOIN.
+    switch (joinOp_) {
+      case LEFT_ANTI_JOIN:
+      case LEFT_SEMI_JOIN:
+      case NULL_AWARE_LEFT_ANTI_JOIN:
+      case ICEBERG_DELETE_JOIN: {
+        ExprSubstitutionMap result = new ExprSubstitutionMap();
+        List<Expr> lhs = Expr.cloneList(outputSmap_.getLhs());
+        List<Expr> rhs = Expr.cloneList(outputSmap_.getRhs());
+        for (int i = 0; i < rhs.size(); i++) {
+          if (rhs.get(i) instanceof SlotRef) {
+            SlotRef slotRef = (SlotRef) rhs.get(i);
+            TupleId tid = slotRef.getDesc().getParent().getId();
+            // If the tid is not in the current node's tuple ids, skip it.
+            if (!tupleIds_.contains(tid)) {
+              continue;
+            }
+          }
+          result.put(lhs.get(i), rhs.get(i));
+        }
+        return result;
+      }
+      default: {
+        return outputSmap_;
+      }
+    }
+  }
+
   /**
    * Returns true if the join node can be inverted. Inversions are not allowed
    * in the following cases:
diff --git a/testdata/workloads/functional-query/queries/QueryTest/miss-tuple-joins.test b/testdata/workloads/functional-query/queries/QueryTest/miss-tuple-joins.test
new file mode 100644
index 000000000..6585d2cf4
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/miss-tuple-joins.test
@@ -0,0 +1,116 @@
+====
+---- QUERY
+CREATE TABLE test_miss_tuple_1 (
+  f0 INT
+)
+STORED AS PARQUET;
+====
+---- QUERY
+CREATE TABLE test_miss_tuple_2 (
+  f0 INT
+)
+STORED AS PARQUET;
+====
+---- QUERY
+INSERT INTO test_miss_tuple_1 VALUES (1);
+====
+---- QUERY
+INSERT INTO test_miss_tuple_1 VALUES (2);
+====
+---- QUERY
+INSERT INTO test_miss_tuple_1 VALUES (3);
+====
+---- QUERY
+INSERT INTO test_miss_tuple_2 VALUES (1);
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1
+WHERE f0 NOT IN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000)
+ORDER BY f0, rand() LIMIT 200000;
+---- TYPES
+int
+---- RESULTS
+2
+3
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1
+WHERE f0 NOT IN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY abs(1) LIMIT 10000)
+ORDER BY f0, abs(1) LIMIT 200000;
+---- TYPES
+int
+---- RESULTS
+2
+3
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1
+WHERE f0 IN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000)
+ORDER BY f0, rand() LIMIT 200000;
+---- TYPES
+int
+---- RESULTS
+1
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1
+WHERE f0 IN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY abs(1) LIMIT 10000)
+ORDER BY f0, abs(1) LIMIT 200000;
+---- TYPES
+int
+---- RESULTS
+1
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1 t1
+LEFT SEMI JOIN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2
+ON t1.f0 = t2.f0
+ORDER BY f0, rand() LIMIT 10000;
+---- TYPES
+int
+---- RESULTS
+1
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1 t1
+LEFT ANTI JOIN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2
+ON t1.f0 = t2.f0
+ORDER BY f0, rand() LIMIT 10000;
+---- TYPES
+int
+---- RESULTS
+2
+3
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1 t1
+RIGHT SEMI JOIN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2
+ON t1.f0 = t2.f0
+ORDER BY f0, rand() LIMIT 10000;
+---- TYPES
+int
+---- RESULTS
+1
+====
+---- QUERY
+SELECT *
+  FROM test_miss_tuple_1 t1
+RIGHT ANTI JOIN
+  (SELECT f0 FROM test_miss_tuple_2 ORDER BY rand() LIMIT 10000) t2
+ON t1.f0 = t2.f0
+ORDER BY f0, rand() LIMIT 10000;
+====
\ No newline at end of file
diff --git a/tests/query_test/test_join_queries.py b/tests/query_test/test_join_queries.py
index 90105af7c..ddcf59f4d 100644
--- a/tests/query_test/test_join_queries.py
+++ b/tests/query_test/test_join_queries.py
@@ -118,6 +118,12 @@ class TestJoinQueries(ImpalaTestSuite):
     new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
     self.run_test_case('QueryTest/empty-build-joins', new_vector)
 
+  def test_miss_tuple_joins(self, vector, unique_database):
+    new_vector = deepcopy(vector)
+    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
+    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
+    self.run_test_case('QueryTest/miss-tuple-joins', new_vector, unique_database)
+
 class TestTPCHJoinQueries(ImpalaTestSuite):
   # Uses the TPC-H dataset in order to have larger joins. Needed for example to test
   # the repartitioning codepaths.

Reply via email to