This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new ea52c3f83bd [fix](join)Consider mark join when computing 
right_col_idx(#50720) (#51156)
ea52c3f83bd is described below

commit ea52c3f83bd4d6195c6bac4030c287113947c2ca
Author: Jerry Hu <[email protected]>
AuthorDate: Wed Jun 11 11:14:30 2025 +0800

    [fix](join)Consider mark join when computing right_col_idx(#50720) (#51156)
    
    ### What problem does this PR solve?
    
    Pick #50720 #50993 #51124
    
    Co-authored-by: zhangdong <[email protected]>
---
 be/src/pipeline/exec/hashjoin_probe_operator.cpp   |  31 ++-
 be/src/pipeline/exec/hashjoin_probe_operator.h     |   6 +
 .../pipeline/exec/join/process_hash_table_probe.h  |  10 +-
 .../exec/join/process_hash_table_probe_impl.h      | 144 ++++++++----
 be/src/vec/common/hash_table/join_hash_table.h     |  44 ++--
 .../data/query_p0/join/mark_join/mark_join.out     | Bin 239 -> 1007 bytes
 .../join/mark_join/right_semi_mark_join.out        | Bin 0 -> 568 bytes
 .../query_p0/join/mark_join/mark_join.groovy       | 251 +++++++++++++++++++--
 .../join/mark_join/right_semi_mark_join.groovy     | 109 +++++++++
 9 files changed, 509 insertions(+), 86 deletions(-)

diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp 
b/be/src/pipeline/exec/hashjoin_probe_operator.cpp
index c969f60fa72..82e4b732193 100644
--- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp
+++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp
@@ -73,6 +73,7 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) {
     RETURN_IF_ERROR(JoinProbeLocalState::open(state));
 
     auto& p = _parent->cast<HashJoinProbeOperatorX>();
+    _right_col_idx = p._right_col_idx;
     std::visit(
             [&](auto&& join_op_variants, auto have_other_join_conjunct) {
                 using JoinOpType = std::decay_t<decltype(join_op_variants)>;
@@ -511,7 +512,7 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, 
RuntimeState* state)
     const bool probe_dispose_null =
             _match_all_probe || _build_unique || _join_op == 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
             _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op == 
TJoinOp::LEFT_ANTI_JOIN ||
-            _join_op == TJoinOp::LEFT_SEMI_JOIN;
+            _join_op == TJoinOp::LEFT_SEMI_JOIN || _is_mark_join;
     const std::vector<TEqJoinCondition>& eq_join_conjuncts = 
tnode.hash_join_node.eq_join_conjuncts;
     std::vector<bool> probe_not_ignore_null(eq_join_conjuncts.size());
     size_t conjuncts_index = 0;
@@ -642,14 +643,34 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) {
         }
     }
 
-    const int right_col_idx =
-            (_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : 
_left_table_data_types.size();
+    _right_col_idx = (_is_right_semi_anti && !_have_other_join_conjunct &&
+                      (!_is_mark_join || _mark_join_conjuncts.empty()))
+                             ? 0
+                             : _left_table_data_types.size();
     size_t idx = 0;
     for (const auto* slot : slots_to_check) {
         auto data_type = slot->get_data_type_ptr();
-        const auto slot_on_left = idx < right_col_idx;
+        const auto slot_on_left = idx < _right_col_idx;
+
+        if (slot_on_left) {
+            if (idx >= _left_table_data_types.size()) {
+                return Status::InternalError(
+                        "Join node(id={}, OP={}) intermediate slot({}, #{})'s 
on left table "
+                        "idx out bound of _left_table_data_types: {} vs {}",
+                        _node_id, _join_op, slot->col_name(), slot->id(), idx,
+                        _left_table_data_types.size());
+            }
+        } else if (idx - _right_col_idx >= _right_table_data_types.size()) {
+            return Status::InternalError(
+                    "Join node(id={}, OP={}) intermediate slot({}, #{})'s on 
right table "
+                    "idx out bound of _right_table_data_types: {} vs {}(idx = 
{}, _right_col_idx = "
+                    "{})",
+                    _node_id, _join_op, slot->col_name(), slot->id(), idx - 
_right_col_idx,
+                    _right_table_data_types.size(), idx, _right_col_idx);
+        }
+
         auto target_data_type = slot_on_left ? _left_table_data_types[idx]
-                                             : _right_table_data_types[idx - 
right_col_idx];
+                                             : _right_table_data_types[idx - 
_right_col_idx];
         ++idx;
         if (data_type->equals(*target_data_type)) {
             continue;
diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h 
b/be/src/pipeline/exec/hashjoin_probe_operator.h
index 66d709e6541..ee0bb78ee4f 100644
--- a/be/src/pipeline/exec/hashjoin_probe_operator.h
+++ b/be/src/pipeline/exec/hashjoin_probe_operator.h
@@ -116,6 +116,9 @@ private:
     std::unique_ptr<HashTableCtxVariants> _process_hashtable_ctx_variants =
             std::make_unique<HashTableCtxVariants>();
 
+    // Index of column(slot) from right table in the `_intermediate_row_desc`.
+    size_t _right_col_idx;
+
     RuntimeProfile::Counter* _probe_expr_call_timer = nullptr;
     RuntimeProfile::Counter* _probe_side_output_timer = nullptr;
     RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr;
@@ -185,6 +188,9 @@ private:
     std::vector<bool> _right_output_slot_flags;
     std::vector<std::string> _right_table_column_names;
     const std::vector<TExpr> _partition_exprs;
+
+    // Index of column(slot) from right table in the `_intermediate_row_desc`.
+    size_t _right_col_idx;
 };
 
 } // namespace pipeline
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h 
b/be/src/pipeline/exec/join/process_hash_table_probe.h
index 7a5c34fb845..4e1a22a9fab 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe.h
@@ -23,6 +23,7 @@
 #include "vec/columns/column.h"
 #include "vec/columns/columns_number.h"
 #include "vec/common/arena.h"
+#include "vec/common/custom_allocator.h"
 
 namespace doris {
 namespace vectorized {
@@ -137,8 +138,15 @@ struct ProcessHashTableProbe {
     RuntimeProfile::Counter* _probe_side_output_timer = nullptr;
     RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr;
 
-    int _right_col_idx;
+    // See `HashJoinProbeOperatorX::_right_col_idx`
+    const int _right_col_idx;
+
     int _right_col_len;
+
+    // For right semi with mark join conjunct, we need to store the mark join 
flags
+    // in the hash table.
+    // -1 means null, 0 means false, 1 means true
+    DorisVector<int8_t> mark_join_flags;
 };
 
 } // namespace pipeline
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h 
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 1f9d127549a..b88d1f42541 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -56,9 +56,7 @@ 
ProcessHashTableProbe<JoinOpType>::ProcessHashTableProbe(HashJoinProbeLocalState
           _build_side_output_timer(parent->_build_side_output_timer),
           _probe_side_output_timer(parent->_probe_side_output_timer),
           _finish_probe_phase_timer(parent->_finish_probe_phase_timer),
-          _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct)
-                                 ? 0
-                                 : _parent->left_table_data_types().size()),
+          _right_col_idx(parent->_right_col_idx),
           _right_col_len(_parent->right_table_data_types().size()) {}
 
 template <int JoinOpType>
@@ -66,11 +64,10 @@ void 
ProcessHashTableProbe<JoinOpType>::build_side_output_column(
         vectorized::MutableColumns& mcol, const std::vector<bool>& 
output_slot_flags, int size,
         bool have_other_join_conjunct, bool is_mark_join) {
     SCOPED_TIMER(_build_side_output_timer);
-    constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::RIGHT_ANTI_JOIN 
||
-                                       JoinOpType == TJoinOp::RIGHT_SEMI_JOIN 
||
-                                       JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
+    constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
+                                       JoinOpType == TJoinOp::LEFT_SEMI_JOIN ||
                                        JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
-                                       JoinOpType == TJoinOp::LEFT_SEMI_JOIN;
+                                       JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
 
     constexpr auto probe_all =
             JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == 
TJoinOp::FULL_OUTER_JOIN;
@@ -209,7 +206,7 @@ Status 
ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c
                         (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN ||
                          JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ||
                          JoinOpType == 
doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
-                         (is_mark_join && JoinOpType != 
doris::TJoinOp::RIGHT_SEMI_JOIN)));
+                         (is_mark_join)));
     }
 
     auto& mcol = mutable_block.mutable_columns();
@@ -268,8 +265,8 @@ Status 
ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c
     build_side_output_column(mcol, *_right_output_slot_flags, current_offset, 
with_other_conjuncts,
                              is_mark_join);
 
-    if constexpr (with_other_conjuncts || (JoinOpType != 
TJoinOp::RIGHT_SEMI_JOIN &&
-                                           JoinOpType != 
TJoinOp::RIGHT_ANTI_JOIN)) {
+    if (with_other_conjuncts || !_parent->_mark_join_conjuncts.empty() ||
+        (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != 
TJoinOp::RIGHT_ANTI_JOIN)) {
         auto check_all_match_one = [](const std::vector<uint32_t>& vecs, 
uint32_t probe_idx,
                                       int size) {
             if (!size || vecs[0] != probe_idx || vecs[size - 1] != probe_idx + 
size - 1) {
@@ -291,7 +288,14 @@ Status 
ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c
 
     output_block->swap(mutable_block.to_block());
 
-    if constexpr (is_mark_join && JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) {
+    if constexpr (is_mark_join) {
+        if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN ||
+                      JoinOpType == TJoinOp::RIGHT_ANTI_JOIN) {
+            if (mark_join_flags.empty()) {
+                mark_join_flags.resize(hash_table_ctx.hash_table->size(), 0);
+            }
+        }
+
         return do_mark_join_conjuncts<with_other_conjuncts>(
                 output_block, hash_table_ctx.hash_table->get_bucket_size());
     } else if constexpr (with_other_conjuncts) {
@@ -363,15 +367,19 @@ template <int JoinOpType>
 template <bool with_other_conjuncts>
 Status 
ProcessHashTableProbe<JoinOpType>::do_mark_join_conjuncts(vectorized::Block* 
output_block,
                                                                  size_t 
hash_table_bucket_size) {
-    DCHECK(JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
-           JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
-           JoinOpType == TJoinOp::LEFT_SEMI_JOIN ||
-           JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN);
+    if (JoinOpType != TJoinOp::LEFT_ANTI_JOIN && JoinOpType != 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN &&
+        JoinOpType != TJoinOp::LEFT_SEMI_JOIN && JoinOpType != 
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN &&
+        JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != 
TJoinOp::RIGHT_ANTI_JOIN) {
+        return Status::InternalError("join type {} is not supported", 
JoinOpType);
+    }
 
     constexpr bool is_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
-                                  JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN;
+                                  JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
+                                  JoinOpType == TJoinOp::RIGHT_ANTI_JOIN;
     constexpr bool is_null_aware_join = JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN ||
                                         JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN;
+    constexpr bool is_right_half_join =
+            JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || JoinOpType == 
TJoinOp::RIGHT_ANTI_JOIN;
 
     const auto row_count = output_block->rows();
     if (!row_count) {
@@ -458,37 +466,71 @@ Status 
ProcessHashTableProbe<JoinOpType>::do_mark_join_conjuncts(vectorized::Blo
     const bool should_be_null_if_build_side_has_null =
             *_has_null_in_build_side && is_null_aware_join && 
!with_other_conjuncts;
     for (size_t i = 0; i != row_count; ++i) {
-        bool not_matched_before = _parent->_last_probe_match != 
_probe_indexs[i];
-        if (_build_indexs[i] == 0) {
-            bool has_null_mark_value = _parent->_last_probe_null_mark == 
_probe_indexs[i];
-            if (not_matched_before) {
-                filter_map[i] = true;
-                mark_null_map[i] = has_null_mark_value || 
should_be_null_if_build_side_has_null;
-                mark_filter_data[i] = false;
+        if constexpr (is_right_half_join) {
+            const auto& build_index = _build_indexs[i];
+            if (build_index == 0) {
+                continue;
+            }
+
+            if (mark_join_flags[build_index] == 1) {
+                continue;
+            }
+
+            if (mark_null_map[i]) {
+                mark_join_flags[build_index] = -1;
+            } else if (mark_filter_data[i]) {
+                mark_join_flags[build_index] = 1;
             }
         } else {
-            if (mark_null_map[i]) { // is null
-                _parent->_last_probe_null_mark = _probe_indexs[i];
-            } else {
-                if (mark_filter_data[i] && not_matched_before) {
-                    _parent->_last_probe_match = _probe_indexs[i];
+            bool not_matched_before = _parent->_last_probe_match != 
_probe_indexs[i];
+            if (_build_indexs[i] == 0) {
+                bool has_null_mark_value = _parent->_last_probe_null_mark == 
_probe_indexs[i];
+                if (not_matched_before) {
                     filter_map[i] = true;
+                    mark_null_map[i] = has_null_mark_value || 
should_be_null_if_build_side_has_null;
+                    mark_filter_data[i] = false;
+                }
+            } else {
+                if (mark_null_map[i]) { // is null
+                    _parent->_last_probe_null_mark = _probe_indexs[i];
+                } else {
+                    if (mark_filter_data[i] && not_matched_before) {
+                        _parent->_last_probe_match = _probe_indexs[i];
+                        filter_map[i] = true;
+                    }
                 }
             }
         }
     }
 
-    if constexpr (is_anti_join) {
-        // flip the mark column
-        for (size_t i = 0; i != row_count; ++i) {
-            mark_filter_data[i] ^= 1; // not null/ null
+    if constexpr (is_right_half_join) {
+        if constexpr (is_anti_join) {
+            // flip the mark column
+            for (size_t i = 0; i != row_count; ++i) {
+                if (mark_join_flags[i] == -1) {
+                    // -1 means null.
+                    continue;
+                }
+
+                mark_join_flags[i] ^= 1;
+            }
+        }
+        // For right semi/anti join, no rows will be output in probe phase.
+        output_block->clear();
+        return Status::OK();
+    } else {
+        if constexpr (is_anti_join) {
+            // flip the mark column
+            for (size_t i = 0; i != row_count; ++i) {
+                mark_filter_data[i] ^= 1; // not null/ null
+            }
         }
-    }
 
-    auto result_column_id = output_block->columns();
-    output_block->insert(
-            {std::move(filter_column), 
std::make_shared<vectorized::DataTypeUInt8>(), ""});
-    return vectorized::Block::filter_block(output_block, result_column_id, 
result_column_id);
+        auto result_column_id = output_block->columns();
+        output_block->insert(
+                {std::move(filter_column), 
std::make_shared<vectorized::DataTypeUInt8>(), ""});
+        return vectorized::Block::filter_block(output_block, result_column_id, 
result_column_id);
+    }
 }
 
 template <int JoinOpType>
@@ -637,7 +679,8 @@ Status 
ProcessHashTableProbe<JoinOpType>::finish_probing(HashTableType& hash_tab
     if (block_size) {
         if (mcol.size() < _right_col_len + _right_col_idx) {
             return Status::InternalError(
-                    "output block invalid, mcol.size()={}, _right_col_len={}, 
_right_col_idx={}",
+                    "output block invalid, mcol.size()={}, _right_col_len={}, "
+                    "_right_col_idx={}",
                     mcol.size(), _right_col_len, _right_col_idx);
         }
         for (size_t j = 0; j < _right_col_len; ++j) {
@@ -646,8 +689,31 @@ Status 
ProcessHashTableProbe<JoinOpType>::finish_probing(HashTableType& hash_tab
                                                           _build_indexs.data() 
+ block_size);
         }
 
+        if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN ||
+                      JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
+            if (is_mark_join) {
+                if (mark_join_flags.empty()) {
+                    mark_join_flags.resize(hash_table_ctx.hash_table->size(), 
0);
+                }
+
+                // mark column is nullable
+                auto* mark_column = assert_cast<vectorized::ColumnNullable*>(
+                        mcol[_parent->_mark_column_id].get());
+                mark_column->resize(block_size);
+                auto* null_map = mark_column->get_null_map_data().data();
+                auto* data = 
assert_cast<vectorized::ColumnUInt8&>(mark_column->get_nested_column())
+                                     .get_data()
+                                     .data();
+                for (size_t i = 0; i != block_size; ++i) {
+                    const auto build_index = _build_indexs[i];
+                    null_map[i] = mark_join_flags[build_index] == -1;
+                    data[i] = mark_join_flags[build_index] == 1;
+                }
+            }
+        }
+
         // just resize the left table column in case with other conjunct to 
make block size is not zero
-        if (_is_right_semi_anti && _have_other_join_conjunct) {
+        if (_is_right_semi_anti && _right_col_idx != 0) {
             for (int i = 0; i < _right_col_idx; ++i) {
                 mcol[i]->resize(block_size);
             }
diff --git a/be/src/vec/common/hash_table/join_hash_table.h 
b/be/src/vec/common/hash_table/join_hash_table.h
index 317987541cd..88695ef9b43 100644
--- a/be/src/vec/common/hash_table/join_hash_table.h
+++ b/be/src/vec/common/hash_table/join_hash_table.h
@@ -101,24 +101,23 @@ public:
             }
         }
 
-        if constexpr (with_other_conjuncts ||
-                      (is_mark_join && JoinOpType != 
TJoinOp::RIGHT_SEMI_JOIN)) {
-            if constexpr (!with_other_conjuncts) {
-                constexpr bool is_null_aware_join =
-                        JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
-                        JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
-                constexpr bool is_left_half_join = JoinOpType == 
TJoinOp::LEFT_SEMI_JOIN ||
-                                                   JoinOpType == 
TJoinOp::LEFT_ANTI_JOIN;
-
-                /// For null aware join or left half(semi/anti) join without 
other conjuncts and without
-                /// mark join conjunct.
-                /// If one row on probe side has one match in build side, we 
should stop searching the
-                /// hash table for this row.
-                if (is_null_aware_join || (is_left_half_join && 
!has_mark_join_conjunct)) {
-                    return _find_batch_conjunct<JoinOpType, need_judge_null, 
true>(
-                            keys, build_idx_map, probe_idx, build_idx, 
probe_rows, probe_idxs,
-                            build_idxs);
-                }
+        if constexpr (with_other_conjuncts) {
+            return _find_batch_conjunct<JoinOpType, need_judge_null, false>(
+                    keys, build_idx_map, probe_idx, build_idx, probe_rows, 
probe_idxs, build_idxs);
+        } else if constexpr (is_mark_join) {
+            constexpr bool is_null_aware_join = JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
+                                                JoinOpType == 
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
+            constexpr bool is_left_half_join =
+                    JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType == 
TJoinOp::LEFT_ANTI_JOIN;
+
+            /// For null aware join or left half(semi/anti) join without other 
conjuncts and without
+            /// mark join conjunct.
+            /// If one row on probe side has one match in build side, we 
should stop searching the
+            /// hash table for this row.
+            if (is_null_aware_join || (is_left_half_join && 
!has_mark_join_conjunct)) {
+                return _find_batch_conjunct<JoinOpType, need_judge_null, true>(
+                        keys, build_idx_map, probe_idx, build_idx, probe_rows, 
probe_idxs,
+                        build_idxs);
             }
 
             return _find_batch_conjunct<JoinOpType, need_judge_null, false>(
@@ -339,14 +338,7 @@ private:
 
         auto do_the_probe = [&]() {
             while (build_idx && matched_cnt < batch_size) {
-                if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN ||
-                              JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
-                    if (!visited[build_idx] && keys[probe_idx] == 
build_keys[build_idx]) {
-                        probe_idxs[matched_cnt] = probe_idx;
-                        build_idxs[matched_cnt] = build_idx;
-                        matched_cnt++;
-                    }
-                } else if constexpr (need_judge_null) {
+                if constexpr (need_judge_null) {
                     if (build_idx == bucket_size) {
                         build_idxs[matched_cnt] = build_idx;
                         probe_idxs[matched_cnt] = probe_idx;
diff --git a/regression-test/data/query_p0/join/mark_join/mark_join.out 
b/regression-test/data/query_p0/join/mark_join/mark_join.out
index ed3575d0e14..ea3b7ddd6a6 100644
Binary files a/regression-test/data/query_p0/join/mark_join/mark_join.out and 
b/regression-test/data/query_p0/join/mark_join/mark_join.out differ
diff --git 
a/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out 
b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out
new file mode 100644
index 00000000000..e00e19be11e
Binary files /dev/null and 
b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out differ
diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy 
b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
index 9759a0e9b4c..289c9d5b06b 100644
--- a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
+++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
@@ -16,10 +16,17 @@
 // under the License.
 
 suite("mark_join") {
-    sql "drop table if exists t1;"
-    sql "drop table if exists t2;"
+    String suiteName = "mark_join"
+    String table_t1 = "${suiteName}_table_t1"
+    String table_t2 = "${suiteName}_table_t2"
+    String table_tbl1 = "${suiteName}_table_tbl1"
+    String table_tbl2 = "${suiteName}_table_tbl2"
+    String table_tbl3 = "${suiteName}_table_tbl3"
+    
+    sql "drop table if exists ${table_t1};"
+    sql "drop table if exists ${table_t2};"
     sql """
-        create table t1 (
+        create table ${table_t1} (
             k1 int null,
             k2 int null,
             k3 bigint null,
@@ -31,7 +38,7 @@ suite("mark_join") {
     """
 
     sql """
-        create table t2 (
+        create table ${table_t2} (
             k1 int null,
             k2 int null,
             k3 bigint null,
@@ -42,23 +49,237 @@ suite("mark_join") {
         properties("replication_num" = "1");
     """
 
-    sql "insert into t1 select 1,1,1,'a';"
-    sql "insert into t1 select 2,2,2,'b';"
-    sql "insert into t1 select 3,-3,null,'c';"
-    sql "insert into t1 select 3,3,null,'c';"
+    sql "insert into ${table_t1} select 1,1,1,'a';"
+    sql "insert into ${table_t1} select 2,2,2,'b';"
+    sql "insert into ${table_t1} select 3,-3,null,'c';"
+    sql "insert into ${table_t1} select 3,3,null,'c';"
 
-    sql "insert into t2 select 1,1,1,'a';"
-    sql "insert into t2 select 2,2,2,'b';"
-    sql "insert into t2 select 3,-3,null,'c';"
-    sql "insert into t2 select 3,3,null,'c';"
+    sql "insert into ${table_t2} select 1,1,1,'a';"
+    sql "insert into ${table_t2} select 2,2,2,'b';"
+    sql "insert into ${table_t2} select 3,-3,null,'c';"
+    sql "insert into ${table_t2} select 3,3,null,'c';"
 
     qt_test """
-    select * from t1 where exists (select t2.k3 from t2 where t1.k2 = t2.k2) 
or k1 < 10 order by k1, k2;
+    select * from ${table_t1} where exists (select ${table_t2}.k3 from 
${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2;
     """
     qt_test """
-    select * from t1 where not exists (select t2.k3 from t2 where t1.k2 = 
t2.k2) or k1 < 10 order by k1, k2;
+    select * from ${table_t1} where not exists (select ${table_t2}.k3 from 
${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2;
     """
     qt_test """
-    select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 = 
t1.k2) or k1 < 10 order by k1, k2;
+    select * from ${table_t1} where ${table_t1}.k1 not in (select 
${table_t2}.k3 from ${table_t2} where ${table_t2}.k2 = ${table_t1}.k2) or k1 < 
10 order by k1, k2;
+    """
+
+    sql "drop table if exists ${table_tbl1};"
+    sql "drop table if exists ${table_tbl2};"
+    sql "drop table if exists ${table_tbl3};"
+
+    sql """
+        CREATE TABLE `${table_tbl1}` (
+            `unit_name` varchar(1080) NULL,
+            `cur_unit_name` varchar(1080) NOT NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`unit_name`)
+        DISTRIBUTED BY RANDOM BUCKETS AUTO
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    sql """
+        CREATE TABLE `${table_tbl2}` (
+            `org_code` varchar(150) NOT NULL ,
+            `org_name` varchar(300) NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`org_code`)
+        DISTRIBUTED BY HASH(`org_code`) BUCKETS 4
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    sql """
+        CREATE TABLE `${table_tbl3}` (
+            `id` bigint NOT NULL,
+            `acntm_name` varchar(500) NULL ,
+            `vendor_name` varchar(500) NULL
+        ) ENGINE=OLAP
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY HASH(`id`) BUCKETS AUTO
+        PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1"
+        );
+    """
+
+    sql """
+        insert into ${table_tbl1} (unit_name, cur_unit_name) values
+            ('v1', 'o1'),
+            ('v2', 'o2'),
+            ('v3', 'o3'),
+            ('v4', 'o4'),
+            ('v5', 'o5'),
+            (null, 'o1'),
+            ('v1', 'o1'),
+            ('v2', 'o2'),
+            ('v3', 'o3'),
+            ('v4', 'o4'),
+            ('v5', 'o5'),
+            (null, 'o1'),
+            (null, 'o2'),
+            (null, 'o3'),
+            (null, 'o4'),
+            (null, 'o5'),
+            ('v1', 'o1'),
+            ('v2', 'o2'),
+            ('v3', 'o3'),
+            ('v4', 'o4'),
+            ('v5', 'o5');
+    """
+
+    sql """
+        insert into ${table_tbl2}(org_code, org_name) values
+            ('v1', 'o1'),
+            ('v2', 'o2'),
+            ('v3', 'o3'),
+            ('v4', 'o4'),
+            ('v5', 'o5'),
+            ('v1', null),
+            ('v2', null),
+            ('v3', null),
+            ('v4', null),
+            ('v5', null);
+    """
+
+    sql """
+        insert into ${table_tbl3} (id, vendor_name, acntm_name)
+            values(1, 'o1', 'v1'),
+            (2, 'o2', 'v2'),
+            (3, 'o3', 'v3'),
+            (4, 'o4', 'v4'),
+            (5, 'o5', 'v5'),
+            (6, null, 'v1'),
+            (7, null, 'v2'),
+            (8, null, 'v3'),
+            (9, null, 'v4'),
+            (10, null, 'v5');
+    """
+
+    sql " analyze table ${table_tbl1} with sync;"
+    sql " analyze table ${table_tbl2} with sync;"
+    sql " analyze table ${table_tbl3} with sync;"
+
+    sql "set disable_join_reorder=0;"
+    qt_test_right_semi_mark_join """
+        select
+            ${table_tbl3}.id,
+            ${table_tbl3}.acntm_name,
+            ${table_tbl3}.vendor_name,
+            ${table_tbl3}.vendor_name in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+            ) v1,
+            ${table_tbl3}.vendor_name not in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+            ) v2
+        from
+            ${table_tbl3} order by 1,2,3,4,5;
+    """
+
+    sql "set disable_join_reorder=1;"
+    qt_test_right_semi_mark_join_2 """
+        select
+            ${table_tbl3}.id,
+            ${table_tbl3}.acntm_name,
+            ${table_tbl3}.vendor_name,
+            ${table_tbl3}.vendor_name in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+            ) v1,
+            ${table_tbl3}.vendor_name not in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+            ) v2
+        from
+            ${table_tbl3} order by 1,2,3,4,5;
+    """
+
+    sql "set disable_join_reorder=0;"
+    qt_test_right_semi_mark_join_no_null """
+        select
+            ${table_tbl3}.id,
+            ${table_tbl3}.acntm_name,
+            ${table_tbl3}.vendor_name,
+            ${table_tbl3}.vendor_name in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+                    and ${table_tbl1}.unit_name is not null
+            ) v1,
+            ${table_tbl3}.vendor_name not in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+                    and ${table_tbl1}.unit_name is not null
+            ) v2
+        from
+            ${table_tbl3} order by 1,2,3,4,5;
+    """
+
+    sql "set disable_join_reorder=1;"
+    qt_test_right_semi_mark_join_no_null_2 """
+        select
+            ${table_tbl3}.id,
+            ${table_tbl3}.acntm_name,
+            ${table_tbl3}.vendor_name,
+            ${table_tbl3}.vendor_name in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+                    and ${table_tbl1}.unit_name is not null
+            ) v1,
+            ${table_tbl3}.vendor_name not in (
+                select
+                    ${table_tbl1}.unit_name
+                from
+                    ${table_tbl2}
+                    join ${table_tbl1} on ${table_tbl1}.cur_unit_name = 
${table_tbl2}.org_name
+                where
+                    ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+                    and ${table_tbl1}.unit_name is not null
+            ) v2
+        from
+            ${table_tbl3} order by 1,2,3,4,5;
     """
 }
diff --git 
a/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy 
b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy
new file mode 100644
index 00000000000..cc287f1a6f7
--- /dev/null
+++ b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("right_semi_mark_join") {
+    String suiteName = "right_semi_mark_join"
+    String table_tbl1 = "${suiteName}_table_tbl1"
+    String table_tbl2 = "${suiteName}_table_tbl2"
+    String table_tbl3 = "${suiteName}_table_tbl3"
+    
+    sql "drop table if exists ${table_tbl1};"
+    sql "drop table if exists ${table_tbl2};"
+    sql "drop table if exists ${table_tbl3};"
+
+    sql """
+        create table ${table_tbl1} (pk int, col1 bigint, col2 bigint) engine = 
olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10 
properties("replication_num" = "1");
+    """
+
+    sql """
+        insert into
+            ${table_tbl1}(pk, col1, col2)
+        values
+            (0, null, 18332),  (1, 788547, null), (2, 4644959, -56),  (3, 
8364628, 72),  (4, null, -5581),
+            (5, 2344024, -62), (6, -2689177, 22979),  (7, 1320, -41), (8, 
null, -54),  (9, 12, -6236),
+            (10, -8321648, null), (11, 153691, null), (12, -8056, null), (13, 
-12, -2343514), (14, -35, -3361960);
+    """
+
+    sql """
+        create table ${table_tbl2} (
+            pk int, col1 bigint, col2 bigint
+        ) engine = olap 
+        distributed by hash(pk) buckets 4
+        properties("replication_num" = "1");
+    """
+
+    sql """
+        insert into
+            ${table_tbl2}(pk, col1, col2)
+        values
+            (0, 108, 31161), (1, 1479175, 6764263), (2, 110, 25), (3, 110, 
-18656), (4, null, -51),
+            (5, 21, 27), (6, -6950217, 1585978), (7, null, null), (8, null, 
3453467),  (9, null, -6701140);
+    """
+    
+    sql """
+        create table ${table_tbl3} (
+            pk int, col1 bigint, col2 bigint, col3 bigint
+        ) engine = olap 
+        DUPLICATE KEY(pk) distributed by hash(pk) buckets 10
+        properties("replication_num" = "1");
+    """
+
+    sql """
+        insert into
+            ${table_tbl3}(pk, col1, col2)
+        values
+            (0, 55, -58), (1, 49, 29792), (2, 95, 32361),  (3, 31243, -27428), 
(4, -27400, null),
+            (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, 
-21951), (9, 13186, 24466),
+            (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, 
-18), (14, 21681, 14079),
+            (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, 
null, 20682), (19, 31243, -98),
+            (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, 
null, 20822), (77, 31243, -27428),
+            (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 
31243, 4077), (82, null, 114),
+            (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 
32361), (87, 26607, -27428),
+            (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243, 
-21951), (9, 13186, 24466),
+            (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, 
-18), (14, 21681, 14079),
+            (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, 
null, 20682), (19, 31243, -98),
+            (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, 
null, 20822), (77, 31243, -27428),
+            (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 
31243, 4077), (82, null, 114),
+            (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 
32361), (87, 26607, -27428),
+            (10, null, -8), (11, null, null), (12, -18, 32361), (13, null, 
-18), (14, 21681, 14079),
+            (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18, 
null, 20682), (19, 31243, -98),
+            (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76, 
null, 20822), (77, 31243, -27428),
+            (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81, 
31243, 4077), (82, null, 114),
+            (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984, 
32361), (87, 26607, -27428);
+    """
+
+    qt_test """
+        SELECT
+            T1.pk AS C1,
+            T1.col2 AS C2
+        FROM
+            ${table_tbl1} AS T1 FULL
+            OUTER JOIN ${table_tbl2} AS T2 ON T1.col1 <= T2.col2
+            OR T2.col1 IN (
+                SELECT
+                    T3.col2
+                FROM
+                    ${table_tbl3} AS T3
+                WHERE
+                    T2.col2 = T3.col1
+            )
+        ORDER BY
+            C1,
+            C2 DESC;
+    """
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]


Reply via email to