jackwener commented on code in PR #4377:
URL: https://github.com/apache/arrow-datafusion/pull/4377#discussion_r1032933725
##########
datafusion/core/src/physical_plan/joins/hash_join.rs:
##########
@@ -1441,44 +1181,147 @@ fn equal_rows(
err.unwrap_or(Ok(res))
}
-// Produces a batch for left-side rows that have/have not been matched during
the whole join
-fn produce_from_matched(
- visited_left_side: &BooleanBufferBuilder,
- schema: &SchemaRef,
- column_indices: &[ColumnIndex],
- left_data: &JoinLeftData,
- unmatched: bool,
-) -> ArrowResult<RecordBatch> {
- let indices = if unmatched {
- UInt64Array::from_iter_values(
- (0..visited_left_side.len())
- .filter_map(|v| (!visited_left_side.get_bit(v)).then_some(v as
u64)),
- )
+// The input is the matched indices for left and right.
+// Adjust the indices according to the join type
+fn adjust_indices_by_join_type(
+ left_indices: UInt64Array,
+ right_indices: UInt32Array,
+ count_right_batch: usize,
+ join_type: JoinType,
+) -> (UInt64Array, UInt32Array) {
+ match join_type {
+ JoinType::Inner => {
+ // matched
+ (left_indices, right_indices)
+ }
+ JoinType::Left => {
+ // matched
+ (left_indices, right_indices)
+ // unmatched left row will be produced in the end of loop, and it
has been set in the left visited bitmap
+ }
+ JoinType::Right | JoinType::Full => {
+ // matched
+ // unmatched right row will be produced in this batch
+ let right_null_indices = get_anti_indices(count_right_batch,
&right_indices);
+ // combine the matched and unmatched right result together
+ append_right_indices(left_indices, right_indices,
right_null_indices)
+ }
+ JoinType::RightSemi => {
+ // need to remove the duplicated record in the right side
+ let right_indices = get_semi_indices(count_right_batch,
&right_indices);
+ // the left_indices will not be used later for the `right semi`
join
+ (left_indices, right_indices)
+ }
+ JoinType::RightAnti => {
+ // need to remove the duplicated record in the right side
+ // get the anti index for the right side
+ let right_indices = get_anti_indices(count_right_batch,
&right_indices);
+ // the left_indices will not be used later for the `right anti`
join
+ (left_indices, right_indices)
+ }
+ JoinType::LeftSemi | JoinType::LeftAnti => {
+ // matched or unmatched left row will be produced in the end of
loop
+ (
+ UInt64Array::from_iter_values(vec![]),
+ UInt32Array::from_iter_values(vec![]),
+ )
Review Comment:
We could add a TODO here noting a possible optimization,
because a semi join doesn't need to wait until the end of the loop.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]