korowa commented on code in PR #8020:
URL: https://github.com/apache/arrow-datafusion/pull/8020#discussion_r1460933408
##########
datafusion/physical-plan/src/joins/hash_join.rs:
##########
@@ -1039,76 +1058,32 @@ pub fn build_equal_condition_join_indices<T:
JoinHashMapType>(
.into_array(build_input_buffer.num_rows())
})
.collect::<Result<Vec<_>>>()?;
- hashes_buffer.clear();
- hashes_buffer.resize(probe_batch.num_rows(), 0);
- let hash_values = create_hashes(&keys_values, random_state,
hashes_buffer)?;
- // In case build-side input has not been inverted while JoinHashMap
creation, the chained list algorithm
- // will return build indices for each probe row in a reverse order as such:
- // Build Indices: [5, 4, 3]
- // Probe Indices: [1, 1, 1]
- //
- // This affects the output sequence. Hypothetically, it's possible to
preserve the lexicographic order on the build side.
- // Let's consider probe rows [0,1] as an example:
- //
- // When the probe iteration sequence is reversed, the following pairings
can be derived:
- //
- // For probe row 1:
- // (5, 1)
- // (4, 1)
- // (3, 1)
- //
- // For probe row 0:
- // (5, 0)
- // (4, 0)
- // (3, 0)
- //
- // After reversing both sets of indices, we obtain reversed indices:
- //
- // (3,0)
- // (4,0)
- // (5,0)
- // (3,1)
- // (4,1)
- // (5,1)
- //
- // With this approach, the lexicographic order on both the probe side and
the build side is preserved.
- let (mut probe_indices, mut build_indices) = if fifo_hashmap {
- build_hashmap.get_matched_indices(hash_values.iter().enumerate(),
deleted_offset)
- } else {
- let (mut matched_probe, mut matched_build) = build_hashmap
- .get_matched_indices(hash_values.iter().enumerate().rev(),
deleted_offset);
-
- matched_probe.as_slice_mut().reverse();
- matched_build.as_slice_mut().reverse();
+ let mut hashes_buffer = vec![0; probe_batch.num_rows()];
Review Comment:
After precalculating the hashes (a minor cause of the degradation, ~10ms)
and removing all iterator-related constructs from the lookup function (this
function was indeed the major cause of the execution slowdown, ~30-40ms), I was
able to obtain the following results for q18:
```
Comparing master and hash_join_batch_size
--------------------
Benchmark tpch.json
--------------------
┏━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ Query ┃ master ┃ hash_join_batch_size ┃ Change ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ QQuery 18 │ 508.30ms │ 510.62ms │ no change │
└──────────────┴──────────┴──────────────────────┴───────────┘
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Benchmark Summary ┃ ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Total Time (master) │ 508.30ms │
│ Total Time (hash_join_batch_size) │ 510.62ms │
│ Average Time (master) │ 508.30ms │
│ Average Time (hash_join_batch_size) │ 510.62ms │
│ Queries Faster │ 0 │
│ Queries Slower │ 0 │
│ Queries with No Change │ 1 │
└─────────────────────────────────────┴──────────┘
--------------------
Benchmark tpch_mem.json
--------------------
┏━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ Query ┃ master ┃ hash_join_batch_size ┃ Change ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ QQuery 18 │ 455.00ms │ 476.05ms │ no change │
└──────────────┴──────────┴──────────────────────┴───────────┘
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ Benchmark Summary ┃ ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
│ Total Time (master) │ 455.00ms │
│ Total Time (hash_join_batch_size) │ 476.05ms │
│ Average Time (master) │ 455.00ms │
│ Average Time (hash_join_batch_size) │ 476.05ms │
│ Queries Faster │ 0 │
│ Queries Slower │ 0 │
│ Queries with No Change │ 1 │
└─────────────────────────────────────┴──────────┘
```
(with `--iterations 50 --partitions 4 --query 18`), which shows a ~20-30ms
speedup compared to what I've seen before for tpch-mem, but I don't have an
exact explanation for it yet. If you have (or come up with) any additional
comments / ideas / thoughts, that would be great.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]