This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new ea52c3f83bd [fix](join)Consider mark join when computing
right_col_idx(#50720) (#51156)
ea52c3f83bd is described below
commit ea52c3f83bd4d6195c6bac4030c287113947c2ca
Author: Jerry Hu <[email protected]>
AuthorDate: Wed Jun 11 11:14:30 2025 +0800
[fix](join)Consider mark join when computing right_col_idx(#50720) (#51156)
### What problem does this PR solve?
Pick #50720 #50993 #51124
Co-authored-by: zhangdong <[email protected]>
---
be/src/pipeline/exec/hashjoin_probe_operator.cpp | 31 ++-
be/src/pipeline/exec/hashjoin_probe_operator.h | 6 +
.../pipeline/exec/join/process_hash_table_probe.h | 10 +-
.../exec/join/process_hash_table_probe_impl.h | 144 ++++++++----
be/src/vec/common/hash_table/join_hash_table.h | 44 ++--
.../data/query_p0/join/mark_join/mark_join.out | Bin 239 -> 1007 bytes
.../join/mark_join/right_semi_mark_join.out | Bin 0 -> 568 bytes
.../query_p0/join/mark_join/mark_join.groovy | 251 +++++++++++++++++++--
.../join/mark_join/right_semi_mark_join.groovy | 109 +++++++++
9 files changed, 509 insertions(+), 86 deletions(-)
diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp
b/be/src/pipeline/exec/hashjoin_probe_operator.cpp
index c969f60fa72..82e4b732193 100644
--- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp
+++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp
@@ -73,6 +73,7 @@ Status HashJoinProbeLocalState::open(RuntimeState* state) {
RETURN_IF_ERROR(JoinProbeLocalState::open(state));
auto& p = _parent->cast<HashJoinProbeOperatorX>();
+ _right_col_idx = p._right_col_idx;
std::visit(
[&](auto&& join_op_variants, auto have_other_join_conjunct) {
using JoinOpType = std::decay_t<decltype(join_op_variants)>;
@@ -511,7 +512,7 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode,
RuntimeState* state)
const bool probe_dispose_null =
_match_all_probe || _build_unique || _join_op ==
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
_join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op ==
TJoinOp::LEFT_ANTI_JOIN ||
- _join_op == TJoinOp::LEFT_SEMI_JOIN;
+ _join_op == TJoinOp::LEFT_SEMI_JOIN || _is_mark_join;
const std::vector<TEqJoinCondition>& eq_join_conjuncts =
tnode.hash_join_node.eq_join_conjuncts;
std::vector<bool> probe_not_ignore_null(eq_join_conjuncts.size());
size_t conjuncts_index = 0;
@@ -642,14 +643,34 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) {
}
}
- const int right_col_idx =
- (_is_right_semi_anti && !_have_other_join_conjunct) ? 0 :
_left_table_data_types.size();
+ _right_col_idx = (_is_right_semi_anti && !_have_other_join_conjunct &&
+ (!_is_mark_join || _mark_join_conjuncts.empty()))
+ ? 0
+ : _left_table_data_types.size();
size_t idx = 0;
for (const auto* slot : slots_to_check) {
auto data_type = slot->get_data_type_ptr();
- const auto slot_on_left = idx < right_col_idx;
+ const auto slot_on_left = idx < _right_col_idx;
+
+ if (slot_on_left) {
+ if (idx >= _left_table_data_types.size()) {
+ return Status::InternalError(
+ "Join node(id={}, OP={}) intermediate slot({}, #{})'s
on left table "
+ "idx out bound of _left_table_data_types: {} vs {}",
+ _node_id, _join_op, slot->col_name(), slot->id(), idx,
+ _left_table_data_types.size());
+ }
+ } else if (idx - _right_col_idx >= _right_table_data_types.size()) {
+ return Status::InternalError(
+ "Join node(id={}, OP={}) intermediate slot({}, #{})'s on
right table "
+ "idx out bound of _right_table_data_types: {} vs {}(idx =
{}, _right_col_idx = "
+ "{})",
+ _node_id, _join_op, slot->col_name(), slot->id(), idx -
_right_col_idx,
+ _right_table_data_types.size(), idx, _right_col_idx);
+ }
+
auto target_data_type = slot_on_left ? _left_table_data_types[idx]
- : _right_table_data_types[idx -
right_col_idx];
+ : _right_table_data_types[idx -
_right_col_idx];
++idx;
if (data_type->equals(*target_data_type)) {
continue;
diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h
b/be/src/pipeline/exec/hashjoin_probe_operator.h
index 66d709e6541..ee0bb78ee4f 100644
--- a/be/src/pipeline/exec/hashjoin_probe_operator.h
+++ b/be/src/pipeline/exec/hashjoin_probe_operator.h
@@ -116,6 +116,9 @@ private:
std::unique_ptr<HashTableCtxVariants> _process_hashtable_ctx_variants =
std::make_unique<HashTableCtxVariants>();
+ // Index of column(slot) from right table in the `_intermediate_row_desc`.
+ size_t _right_col_idx;
+
RuntimeProfile::Counter* _probe_expr_call_timer = nullptr;
RuntimeProfile::Counter* _probe_side_output_timer = nullptr;
RuntimeProfile::HighWaterMarkCounter* _probe_arena_memory_usage = nullptr;
@@ -185,6 +188,9 @@ private:
std::vector<bool> _right_output_slot_flags;
std::vector<std::string> _right_table_column_names;
const std::vector<TExpr> _partition_exprs;
+
+ // Index of column(slot) from right table in the `_intermediate_row_desc`.
+ size_t _right_col_idx;
};
} // namespace pipeline
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h
b/be/src/pipeline/exec/join/process_hash_table_probe.h
index 7a5c34fb845..4e1a22a9fab 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe.h
@@ -23,6 +23,7 @@
#include "vec/columns/column.h"
#include "vec/columns/columns_number.h"
#include "vec/common/arena.h"
+#include "vec/common/custom_allocator.h"
namespace doris {
namespace vectorized {
@@ -137,8 +138,15 @@ struct ProcessHashTableProbe {
RuntimeProfile::Counter* _probe_side_output_timer = nullptr;
RuntimeProfile::Counter* _finish_probe_phase_timer = nullptr;
- int _right_col_idx;
+ // See `HashJoinProbeOperatorX::_right_col_idx`
+ const int _right_col_idx;
+
int _right_col_len;
+
+ // For right semi with mark join conjunct, we need to store the mark join
flags
+ // in the hash table.
+ // -1 means null, 0 means false, 1 means true
+ DorisVector<int8_t> mark_join_flags;
};
} // namespace pipeline
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 1f9d127549a..b88d1f42541 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -56,9 +56,7 @@
ProcessHashTableProbe<JoinOpType>::ProcessHashTableProbe(HashJoinProbeLocalState
_build_side_output_timer(parent->_build_side_output_timer),
_probe_side_output_timer(parent->_probe_side_output_timer),
_finish_probe_phase_timer(parent->_finish_probe_phase_timer),
- _right_col_idx((_is_right_semi_anti && !_have_other_join_conjunct)
- ? 0
- : _parent->left_table_data_types().size()),
+ _right_col_idx(parent->_right_col_idx),
_right_col_len(_parent->right_table_data_types().size()) {}
template <int JoinOpType>
@@ -66,11 +64,10 @@ void
ProcessHashTableProbe<JoinOpType>::build_side_output_column(
vectorized::MutableColumns& mcol, const std::vector<bool>&
output_slot_flags, int size,
bool have_other_join_conjunct, bool is_mark_join) {
SCOPED_TIMER(_build_side_output_timer);
- constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::RIGHT_ANTI_JOIN
||
- JoinOpType == TJoinOp::RIGHT_SEMI_JOIN
||
- JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
+ constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
+ JoinOpType == TJoinOp::LEFT_SEMI_JOIN ||
JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
- JoinOpType == TJoinOp::LEFT_SEMI_JOIN;
+ JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
constexpr auto probe_all =
JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType ==
TJoinOp::FULL_OUTER_JOIN;
@@ -209,7 +206,7 @@ Status
ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c
(JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN ||
JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ||
JoinOpType ==
doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
- (is_mark_join && JoinOpType !=
doris::TJoinOp::RIGHT_SEMI_JOIN)));
+ (is_mark_join)));
}
auto& mcol = mutable_block.mutable_columns();
@@ -268,8 +265,8 @@ Status
ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c
build_side_output_column(mcol, *_right_output_slot_flags, current_offset,
with_other_conjuncts,
is_mark_join);
- if constexpr (with_other_conjuncts || (JoinOpType !=
TJoinOp::RIGHT_SEMI_JOIN &&
- JoinOpType !=
TJoinOp::RIGHT_ANTI_JOIN)) {
+ if (with_other_conjuncts || !_parent->_mark_join_conjuncts.empty() ||
+ (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType !=
TJoinOp::RIGHT_ANTI_JOIN)) {
auto check_all_match_one = [](const std::vector<uint32_t>& vecs,
uint32_t probe_idx,
int size) {
if (!size || vecs[0] != probe_idx || vecs[size - 1] != probe_idx +
size - 1) {
@@ -291,7 +288,14 @@ Status
ProcessHashTableProbe<JoinOpType>::do_process(HashTableType& hash_table_c
output_block->swap(mutable_block.to_block());
- if constexpr (is_mark_join && JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) {
+ if constexpr (is_mark_join) {
+ if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN ||
+ JoinOpType == TJoinOp::RIGHT_ANTI_JOIN) {
+ if (mark_join_flags.empty()) {
+ mark_join_flags.resize(hash_table_ctx.hash_table->size(), 0);
+ }
+ }
+
return do_mark_join_conjuncts<with_other_conjuncts>(
output_block, hash_table_ctx.hash_table->get_bucket_size());
} else if constexpr (with_other_conjuncts) {
@@ -363,15 +367,19 @@ template <int JoinOpType>
template <bool with_other_conjuncts>
Status
ProcessHashTableProbe<JoinOpType>::do_mark_join_conjuncts(vectorized::Block*
output_block,
size_t
hash_table_bucket_size) {
- DCHECK(JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
- JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
- JoinOpType == TJoinOp::LEFT_SEMI_JOIN ||
- JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN);
+ if (JoinOpType != TJoinOp::LEFT_ANTI_JOIN && JoinOpType !=
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN &&
+ JoinOpType != TJoinOp::LEFT_SEMI_JOIN && JoinOpType !=
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN &&
+ JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType !=
TJoinOp::RIGHT_ANTI_JOIN) {
+ return Status::InternalError("join type {} is not supported",
JoinOpType);
+ }
constexpr bool is_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN ||
- JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN;
+ JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
+ JoinOpType == TJoinOp::RIGHT_ANTI_JOIN;
constexpr bool is_null_aware_join = JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN ||
JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN;
+ constexpr bool is_right_half_join =
+ JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || JoinOpType ==
TJoinOp::RIGHT_ANTI_JOIN;
const auto row_count = output_block->rows();
if (!row_count) {
@@ -458,37 +466,71 @@ Status
ProcessHashTableProbe<JoinOpType>::do_mark_join_conjuncts(vectorized::Blo
const bool should_be_null_if_build_side_has_null =
*_has_null_in_build_side && is_null_aware_join &&
!with_other_conjuncts;
for (size_t i = 0; i != row_count; ++i) {
- bool not_matched_before = _parent->_last_probe_match !=
_probe_indexs[i];
- if (_build_indexs[i] == 0) {
- bool has_null_mark_value = _parent->_last_probe_null_mark ==
_probe_indexs[i];
- if (not_matched_before) {
- filter_map[i] = true;
- mark_null_map[i] = has_null_mark_value ||
should_be_null_if_build_side_has_null;
- mark_filter_data[i] = false;
+ if constexpr (is_right_half_join) {
+ const auto& build_index = _build_indexs[i];
+ if (build_index == 0) {
+ continue;
+ }
+
+ if (mark_join_flags[build_index] == 1) {
+ continue;
+ }
+
+ if (mark_null_map[i]) {
+ mark_join_flags[build_index] = -1;
+ } else if (mark_filter_data[i]) {
+ mark_join_flags[build_index] = 1;
}
} else {
- if (mark_null_map[i]) { // is null
- _parent->_last_probe_null_mark = _probe_indexs[i];
- } else {
- if (mark_filter_data[i] && not_matched_before) {
- _parent->_last_probe_match = _probe_indexs[i];
+ bool not_matched_before = _parent->_last_probe_match !=
_probe_indexs[i];
+ if (_build_indexs[i] == 0) {
+ bool has_null_mark_value = _parent->_last_probe_null_mark ==
_probe_indexs[i];
+ if (not_matched_before) {
filter_map[i] = true;
+ mark_null_map[i] = has_null_mark_value ||
should_be_null_if_build_side_has_null;
+ mark_filter_data[i] = false;
+ }
+ } else {
+ if (mark_null_map[i]) { // is null
+ _parent->_last_probe_null_mark = _probe_indexs[i];
+ } else {
+ if (mark_filter_data[i] && not_matched_before) {
+ _parent->_last_probe_match = _probe_indexs[i];
+ filter_map[i] = true;
+ }
}
}
}
}
- if constexpr (is_anti_join) {
- // flip the mark column
- for (size_t i = 0; i != row_count; ++i) {
- mark_filter_data[i] ^= 1; // not null/ null
+ if constexpr (is_right_half_join) {
+ if constexpr (is_anti_join) {
+ // flip the mark column
+ for (size_t i = 0; i != row_count; ++i) {
+ if (mark_join_flags[i] == -1) {
+ // -1 means null.
+ continue;
+ }
+
+ mark_join_flags[i] ^= 1;
+ }
+ }
+ // For right semi/anti join, no rows will be output in probe phase.
+ output_block->clear();
+ return Status::OK();
+ } else {
+ if constexpr (is_anti_join) {
+ // flip the mark column
+ for (size_t i = 0; i != row_count; ++i) {
+ mark_filter_data[i] ^= 1; // not null/ null
+ }
}
- }
- auto result_column_id = output_block->columns();
- output_block->insert(
- {std::move(filter_column),
std::make_shared<vectorized::DataTypeUInt8>(), ""});
- return vectorized::Block::filter_block(output_block, result_column_id,
result_column_id);
+ auto result_column_id = output_block->columns();
+ output_block->insert(
+ {std::move(filter_column),
std::make_shared<vectorized::DataTypeUInt8>(), ""});
+ return vectorized::Block::filter_block(output_block, result_column_id,
result_column_id);
+ }
}
template <int JoinOpType>
@@ -637,7 +679,8 @@ Status
ProcessHashTableProbe<JoinOpType>::finish_probing(HashTableType& hash_tab
if (block_size) {
if (mcol.size() < _right_col_len + _right_col_idx) {
return Status::InternalError(
- "output block invalid, mcol.size()={}, _right_col_len={},
_right_col_idx={}",
+ "output block invalid, mcol.size()={}, _right_col_len={}, "
+ "_right_col_idx={}",
mcol.size(), _right_col_len, _right_col_idx);
}
for (size_t j = 0; j < _right_col_len; ++j) {
@@ -646,8 +689,31 @@ Status
ProcessHashTableProbe<JoinOpType>::finish_probing(HashTableType& hash_tab
_build_indexs.data()
+ block_size);
}
+ if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN ||
+ JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
+ if (is_mark_join) {
+ if (mark_join_flags.empty()) {
+ mark_join_flags.resize(hash_table_ctx.hash_table->size(),
0);
+ }
+
+ // mark column is nullable
+ auto* mark_column = assert_cast<vectorized::ColumnNullable*>(
+ mcol[_parent->_mark_column_id].get());
+ mark_column->resize(block_size);
+ auto* null_map = mark_column->get_null_map_data().data();
+ auto* data =
assert_cast<vectorized::ColumnUInt8&>(mark_column->get_nested_column())
+ .get_data()
+ .data();
+ for (size_t i = 0; i != block_size; ++i) {
+ const auto build_index = _build_indexs[i];
+ null_map[i] = mark_join_flags[build_index] == -1;
+ data[i] = mark_join_flags[build_index] == 1;
+ }
+ }
+ }
+
// just resize the left table column in case with other conjunct to
make block size is not zero
- if (_is_right_semi_anti && _have_other_join_conjunct) {
+ if (_is_right_semi_anti && _right_col_idx != 0) {
for (int i = 0; i < _right_col_idx; ++i) {
mcol[i]->resize(block_size);
}
diff --git a/be/src/vec/common/hash_table/join_hash_table.h
b/be/src/vec/common/hash_table/join_hash_table.h
index 317987541cd..88695ef9b43 100644
--- a/be/src/vec/common/hash_table/join_hash_table.h
+++ b/be/src/vec/common/hash_table/join_hash_table.h
@@ -101,24 +101,23 @@ public:
}
}
- if constexpr (with_other_conjuncts ||
- (is_mark_join && JoinOpType !=
TJoinOp::RIGHT_SEMI_JOIN)) {
- if constexpr (!with_other_conjuncts) {
- constexpr bool is_null_aware_join =
- JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
- JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
- constexpr bool is_left_half_join = JoinOpType ==
TJoinOp::LEFT_SEMI_JOIN ||
- JoinOpType ==
TJoinOp::LEFT_ANTI_JOIN;
-
- /// For null aware join or left half(semi/anti) join without
other conjuncts and without
- /// mark join conjunct.
- /// If one row on probe side has one match in build side, we
should stop searching the
- /// hash table for this row.
- if (is_null_aware_join || (is_left_half_join &&
!has_mark_join_conjunct)) {
- return _find_batch_conjunct<JoinOpType, need_judge_null,
true>(
- keys, build_idx_map, probe_idx, build_idx,
probe_rows, probe_idxs,
- build_idxs);
- }
+ if constexpr (with_other_conjuncts) {
+ return _find_batch_conjunct<JoinOpType, need_judge_null, false>(
+ keys, build_idx_map, probe_idx, build_idx, probe_rows,
probe_idxs, build_idxs);
+ } else if constexpr (is_mark_join) {
+ constexpr bool is_null_aware_join = JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN ||
+ JoinOpType ==
TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN;
+ constexpr bool is_left_half_join =
+ JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType ==
TJoinOp::LEFT_ANTI_JOIN;
+
+ /// For null aware join or left half(semi/anti) join without other
conjuncts and without
+ /// mark join conjunct.
+ /// If one row on probe side has one match in build side, we
should stop searching the
+ /// hash table for this row.
+ if (is_null_aware_join || (is_left_half_join &&
!has_mark_join_conjunct)) {
+ return _find_batch_conjunct<JoinOpType, need_judge_null, true>(
+ keys, build_idx_map, probe_idx, build_idx, probe_rows,
probe_idxs,
+ build_idxs);
}
return _find_batch_conjunct<JoinOpType, need_judge_null, false>(
@@ -339,14 +338,7 @@ private:
auto do_the_probe = [&]() {
while (build_idx && matched_cnt < batch_size) {
- if constexpr (JoinOpType == TJoinOp::RIGHT_ANTI_JOIN ||
- JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
- if (!visited[build_idx] && keys[probe_idx] ==
build_keys[build_idx]) {
- probe_idxs[matched_cnt] = probe_idx;
- build_idxs[matched_cnt] = build_idx;
- matched_cnt++;
- }
- } else if constexpr (need_judge_null) {
+ if constexpr (need_judge_null) {
if (build_idx == bucket_size) {
build_idxs[matched_cnt] = build_idx;
probe_idxs[matched_cnt] = probe_idx;
diff --git a/regression-test/data/query_p0/join/mark_join/mark_join.out
b/regression-test/data/query_p0/join/mark_join/mark_join.out
index ed3575d0e14..ea3b7ddd6a6 100644
Binary files a/regression-test/data/query_p0/join/mark_join/mark_join.out and
b/regression-test/data/query_p0/join/mark_join/mark_join.out differ
diff --git
a/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out
b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out
new file mode 100644
index 00000000000..e00e19be11e
Binary files /dev/null and
b/regression-test/data/query_p0/join/mark_join/right_semi_mark_join.out differ
diff --git a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
index 9759a0e9b4c..289c9d5b06b 100644
--- a/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
+++ b/regression-test/suites/query_p0/join/mark_join/mark_join.groovy
@@ -16,10 +16,17 @@
// under the License.
suite("mark_join") {
- sql "drop table if exists t1;"
- sql "drop table if exists t2;"
+ String suiteName = "mark_join"
+ String table_t1 = "${suiteName}_table_t1"
+ String table_t2 = "${suiteName}_table_t2"
+ String table_tbl1 = "${suiteName}_table_tbl1"
+ String table_tbl2 = "${suiteName}_table_tbl2"
+ String table_tbl3 = "${suiteName}_table_tbl3"
+
+ sql "drop table if exists ${table_t1};"
+ sql "drop table if exists ${table_t2};"
sql """
- create table t1 (
+ create table ${table_t1} (
k1 int null,
k2 int null,
k3 bigint null,
@@ -31,7 +38,7 @@ suite("mark_join") {
"""
sql """
- create table t2 (
+ create table ${table_t2} (
k1 int null,
k2 int null,
k3 bigint null,
@@ -42,23 +49,237 @@ suite("mark_join") {
properties("replication_num" = "1");
"""
- sql "insert into t1 select 1,1,1,'a';"
- sql "insert into t1 select 2,2,2,'b';"
- sql "insert into t1 select 3,-3,null,'c';"
- sql "insert into t1 select 3,3,null,'c';"
+ sql "insert into ${table_t1} select 1,1,1,'a';"
+ sql "insert into ${table_t1} select 2,2,2,'b';"
+ sql "insert into ${table_t1} select 3,-3,null,'c';"
+ sql "insert into ${table_t1} select 3,3,null,'c';"
- sql "insert into t2 select 1,1,1,'a';"
- sql "insert into t2 select 2,2,2,'b';"
- sql "insert into t2 select 3,-3,null,'c';"
- sql "insert into t2 select 3,3,null,'c';"
+ sql "insert into ${table_t2} select 1,1,1,'a';"
+ sql "insert into ${table_t2} select 2,2,2,'b';"
+ sql "insert into ${table_t2} select 3,-3,null,'c';"
+ sql "insert into ${table_t2} select 3,3,null,'c';"
qt_test """
- select * from t1 where exists (select t2.k3 from t2 where t1.k2 = t2.k2)
or k1 < 10 order by k1, k2;
+ select * from ${table_t1} where exists (select ${table_t2}.k3 from
${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2;
"""
qt_test """
- select * from t1 where not exists (select t2.k3 from t2 where t1.k2 =
t2.k2) or k1 < 10 order by k1, k2;
+ select * from ${table_t1} where not exists (select ${table_t2}.k3 from
${table_t2} where ${table_t1}.k2 = ${table_t2}.k2) or k1 < 10 order by k1, k2;
"""
qt_test """
- select * from t1 where t1.k1 not in (select t2.k3 from t2 where t2.k2 =
t1.k2) or k1 < 10 order by k1, k2;
+ select * from ${table_t1} where ${table_t1}.k1 not in (select
${table_t2}.k3 from ${table_t2} where ${table_t2}.k2 = ${table_t1}.k2) or k1 <
10 order by k1, k2;
+ """
+
+ sql "drop table if exists ${table_tbl1};"
+ sql "drop table if exists ${table_tbl2};"
+ sql "drop table if exists ${table_tbl3};"
+
+ sql """
+ CREATE TABLE `${table_tbl1}` (
+ `unit_name` varchar(1080) NULL,
+ `cur_unit_name` varchar(1080) NOT NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`unit_name`)
+ DISTRIBUTED BY RANDOM BUCKETS AUTO
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """
+ CREATE TABLE `${table_tbl2}` (
+ `org_code` varchar(150) NOT NULL ,
+ `org_name` varchar(300) NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`org_code`)
+ DISTRIBUTED BY HASH(`org_code`) BUCKETS 4
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """
+ CREATE TABLE `${table_tbl3}` (
+ `id` bigint NOT NULL,
+ `acntm_name` varchar(500) NULL ,
+ `vendor_name` varchar(500) NULL
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`id`)
+ DISTRIBUTED BY HASH(`id`) BUCKETS AUTO
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """
+ insert into ${table_tbl1} (unit_name, cur_unit_name) values
+ ('v1', 'o1'),
+ ('v2', 'o2'),
+ ('v3', 'o3'),
+ ('v4', 'o4'),
+ ('v5', 'o5'),
+ (null, 'o1'),
+ ('v1', 'o1'),
+ ('v2', 'o2'),
+ ('v3', 'o3'),
+ ('v4', 'o4'),
+ ('v5', 'o5'),
+ (null, 'o1'),
+ (null, 'o2'),
+ (null, 'o3'),
+ (null, 'o4'),
+ (null, 'o5'),
+ ('v1', 'o1'),
+ ('v2', 'o2'),
+ ('v3', 'o3'),
+ ('v4', 'o4'),
+ ('v5', 'o5');
+ """
+
+ sql """
+ insert into ${table_tbl2}(org_code, org_name) values
+ ('v1', 'o1'),
+ ('v2', 'o2'),
+ ('v3', 'o3'),
+ ('v4', 'o4'),
+ ('v5', 'o5'),
+ ('v1', null),
+ ('v2', null),
+ ('v3', null),
+ ('v4', null),
+ ('v5', null);
+ """
+
+ sql """
+ insert into ${table_tbl3} (id, vendor_name, acntm_name)
+ values(1, 'o1', 'v1'),
+ (2, 'o2', 'v2'),
+ (3, 'o3', 'v3'),
+ (4, 'o4', 'v4'),
+ (5, 'o5', 'v5'),
+ (6, null, 'v1'),
+ (7, null, 'v2'),
+ (8, null, 'v3'),
+ (9, null, 'v4'),
+ (10, null, 'v5');
+ """
+
+ sql " analyze table ${table_tbl1} with sync;"
+ sql " analyze table ${table_tbl2} with sync;"
+ sql " analyze table ${table_tbl3} with sync;"
+
+ sql "set disable_join_reorder=0;"
+ qt_test_right_semi_mark_join """
+ select
+ ${table_tbl3}.id,
+ ${table_tbl3}.acntm_name,
+ ${table_tbl3}.vendor_name,
+ ${table_tbl3}.vendor_name in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ ) v1,
+ ${table_tbl3}.vendor_name not in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ ) v2
+ from
+ ${table_tbl3} order by 1,2,3,4,5;
+ """
+
+ sql "set disable_join_reorder=1;"
+ qt_test_right_semi_mark_join_2 """
+ select
+ ${table_tbl3}.id,
+ ${table_tbl3}.acntm_name,
+ ${table_tbl3}.vendor_name,
+ ${table_tbl3}.vendor_name in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ ) v1,
+ ${table_tbl3}.vendor_name not in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ ) v2
+ from
+ ${table_tbl3} order by 1,2,3,4,5;
+ """
+
+ sql "set disable_join_reorder=0;"
+ qt_test_right_semi_mark_join_no_null """
+ select
+ ${table_tbl3}.id,
+ ${table_tbl3}.acntm_name,
+ ${table_tbl3}.vendor_name,
+ ${table_tbl3}.vendor_name in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ and ${table_tbl1}.unit_name is not null
+ ) v1,
+ ${table_tbl3}.vendor_name not in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ and ${table_tbl1}.unit_name is not null
+ ) v2
+ from
+ ${table_tbl3} order by 1,2,3,4,5;
+ """
+
+ sql "set disable_join_reorder=1;"
+ qt_test_right_semi_mark_join_no_null_2 """
+ select
+ ${table_tbl3}.id,
+ ${table_tbl3}.acntm_name,
+ ${table_tbl3}.vendor_name,
+ ${table_tbl3}.vendor_name in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ and ${table_tbl1}.unit_name is not null
+ ) v1,
+ ${table_tbl3}.vendor_name not in (
+ select
+ ${table_tbl1}.unit_name
+ from
+ ${table_tbl2}
+ join ${table_tbl1} on ${table_tbl1}.cur_unit_name =
${table_tbl2}.org_name
+ where
+ ${table_tbl2}.org_code = ${table_tbl3}.acntm_name
+ and ${table_tbl1}.unit_name is not null
+ ) v2
+ from
+ ${table_tbl3} order by 1,2,3,4,5;
"""
}
diff --git
a/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy
b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy
new file mode 100644
index 00000000000..cc287f1a6f7
--- /dev/null
+++ b/regression-test/suites/query_p0/join/mark_join/right_semi_mark_join.groovy
@@ -0,0 +1,109 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("right_semi_mark_join") {
+ String suiteName = "right_semi_mark_join"
+ String table_tbl1 = "${suiteName}_table_tbl1"
+ String table_tbl2 = "${suiteName}_table_tbl2"
+ String table_tbl3 = "${suiteName}_table_tbl3"
+
+ sql "drop table if exists ${table_tbl1};"
+ sql "drop table if exists ${table_tbl2};"
+ sql "drop table if exists ${table_tbl3};"
+
+ sql """
+ create table ${table_tbl1} (pk int, col1 bigint, col2 bigint) engine =
olap DUPLICATE KEY(pk) distributed by hash(pk) buckets 10
properties("replication_num" = "1");
+ """
+
+ sql """
+ insert into
+ ${table_tbl1}(pk, col1, col2)
+ values
+ (0, null, 18332), (1, 788547, null), (2, 4644959, -56), (3,
8364628, 72), (4, null, -5581),
+ (5, 2344024, -62), (6, -2689177, 22979), (7, 1320, -41), (8,
null, -54), (9, 12, -6236),
+ (10, -8321648, null), (11, 153691, null), (12, -8056, null), (13,
-12, -2343514), (14, -35, -3361960);
+ """
+
+ sql """
+ create table ${table_tbl2} (
+ pk int, col1 bigint, col2 bigint
+ ) engine = olap
+ distributed by hash(pk) buckets 4
+ properties("replication_num" = "1");
+ """
+
+ sql """
+ insert into
+ ${table_tbl2}(pk, col1, col2)
+ values
+ (0, 108, 31161), (1, 1479175, 6764263), (2, 110, 25), (3, 110,
-18656), (4, null, -51),
+ (5, 21, 27), (6, -6950217, 1585978), (7, null, null), (8, null,
3453467), (9, null, -6701140);
+ """
+
+ sql """
+ create table ${table_tbl3} (
+ pk int, col1 bigint, col2 bigint, col3 bigint
+ ) engine = olap
+ DUPLICATE KEY(pk) distributed by hash(pk) buckets 10
+ properties("replication_num" = "1");
+ """
+
+ sql """
+ insert into
+ ${table_tbl3}(pk, col1, col2)
+ values
+ (0, 55, -58), (1, 49, 29792), (2, 95, 32361), (3, 31243, -27428),
(4, -27400, null),
+ (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243,
-21951), (9, 13186, 24466),
+ (10, null, -8), (11, null, null), (12, -18, 32361), (13, null,
-18), (14, 21681, 14079),
+ (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18,
null, 20682), (19, 31243, -98),
+ (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76,
null, 20822), (77, 31243, -27428),
+ (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81,
31243, 4077), (82, null, 114),
+ (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984,
32361), (87, 26607, -27428),
+ (5, 31243, null), (6, null, -27428), (7, null, 7), (8, 31243,
-21951), (9, 13186, 24466),
+ (10, null, -8), (11, null, null), (12, -18, 32361), (13, null,
-18), (14, 21681, 14079),
+ (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18,
null, 20682), (19, 31243, -98),
+ (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76,
null, 20822), (77, 31243, -27428),
+ (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81,
31243, 4077), (82, null, 114),
+ (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984,
32361), (87, 26607, -27428),
+ (10, null, -8), (11, null, null), (12, -18, 32361), (13, null,
-18), (14, 21681, 14079),
+ (15, 31241, -17653), (16, 5825, 13559), (17, null, -10508), (18,
null, 20682), (19, 31243, -98),
+ (73, -32480, 24424), (74, 31, -27428), (75, 31243, -718), (76,
null, 20822), (77, 31243, -27428),
+ (78, -15934, null), (79, 78, -27428), (80, 8572, -27428), (81,
31243, 4077), (82, null, 114),
+ (83, 10, -71), (84, -32489, 32361), (85, null, null), (86, -22984,
32361), (87, 26607, -27428);
+ """
+
+ qt_test """
+ SELECT
+ T1.pk AS C1,
+ T1.col2 AS C2
+ FROM
+ ${table_tbl1} AS T1 FULL
+ OUTER JOIN ${table_tbl2} AS T2 ON T1.col1 <= T2.col2
+ OR T2.col1 IN (
+ SELECT
+ T3.col2
+ FROM
+ ${table_tbl3} AS T3
+ WHERE
+ T2.col2 = T3.col1
+ )
+ ORDER BY
+ C1,
+ C2 DESC;
+ """
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]