UBarney commented on code in PR #16443:
URL: https://github.com/apache/datafusion/pull/16443#discussion_r2275332927
##########
datafusion/physical-plan/src/joins/utils.rs:
##########
@@ -843,24 +844,56 @@ pub(crate) fn apply_join_filter_to_indices(
probe_indices: UInt32Array,
filter: &JoinFilter,
build_side: JoinSide,
+ max_intermediate_size: Option<usize>,
) -> Result<(UInt64Array, UInt32Array)> {
if build_indices.is_empty() && probe_indices.is_empty() {
return Ok((build_indices, probe_indices));
};
- let intermediate_batch = build_batch_from_indices(
- filter.schema(),
- build_input_buffer,
- probe_batch,
- &build_indices,
- &probe_indices,
- filter.column_indices(),
- build_side,
- )?;
- let filter_result = filter
- .expression()
- .evaluate(&intermediate_batch)?
- .into_array(intermediate_batch.num_rows())?;
+ let filter_result = if let Some(max_size) = max_intermediate_size {
+ let mut filter_results =
+ Vec::with_capacity(build_indices.len().div_ceil(max_size));
+
+ for i in (0..build_indices.len()).step_by(max_size) {
+ let end = min(build_indices.len(), i + max_size);
+ let len = end - i;
+ let intermediate_batch = build_batch_from_indices(
+ filter.schema(),
+ build_input_buffer,
+ probe_batch,
+ &build_indices.slice(i, len),
+ &probe_indices.slice(i, len),
+ filter.column_indices(),
+ build_side,
+ )?;
+ let filter_result = filter
Review Comment:
This optimization boosts performance with a much higher IPC of 1.75 (vs
0.87), achieved by dramatically cutting LLC misses from 109M to 25M, even with
a similar L1 miss rate.
<details>
```
sudo perf stat -e
cycles,instructions,L1-dcache-load-misses,L1-dcache-loads,LLC-loads,LLC-load-misses
./limit_batch_size@36991aca -c 'select t1.value from range(100) t1 join
range(819200) t2 on (t1.value + t2.value) % 1000 = 0; ' --maxrows 1
sudo perf stat -e
cycles,instructions,L1-dcache-load-misses,L1-dcache-loads,LLC-loads,LLC-load-misses
./join_base@6965fd32 -c 'select t1.value from range(100) t1 join range(819200)
t2 on (t1.value + t2.value) % 1000 = 0; ' --maxrows 1
DataFusion CLI v48.0.0
+-------+
| value |
+-------+
| 40 |
| . |
| . |
| . |
+-------+
81901 row(s) fetched. (First 1 displayed. Use --maxrows to adjust)
Elapsed 0.067 seconds.
Performance counter stats for './limit_batch_size@36991aca -c select
t1.value from range(100) t1 join range(819200) t2 on (t1.value + t2.value) %
1000 = 0; --maxrows 1':
1901401922 cycles
3325776634 instructions # 1.75
insn per cycle
32419611 L1-dcache-load-misses # 5.27% of
all L1-dcache accesses
614645891 L1-dcache-loads
<not supported> LLC-loads
<not supported> LLC-load-misses
0.073244586 seconds time elapsed
0.448238000 seconds user
0.044823000 seconds sys
DataFusion CLI v48.0.0
+-------+
| value |
+-------+
| 99 |
| . |
| . |
| . |
+-------+
81901 row(s) fetched. (First 1 displayed. Use --maxrows to adjust)
Elapsed 0.131 seconds.
Performance counter stats for './join_base@6965fd32 -c select t1.value
from range(100) t1 join range(819200) t2 on (t1.value + t2.value) % 1000 = 0;
--maxrows 1':
3696196789 cycles
3201132508 instructions # 0.87
insn per cycle
21781750 L1-dcache-load-misses # 3.68% of
all L1-dcache accesses
592094439 L1-dcache-loads
<not supported> LLC-loads
<not supported> LLC-load-misses
0.139081088 seconds time elapsed
0.835575000 seconds user
0.111277000 seconds sys
(venv) √ devhomeinsp ~/c/d/t/release > valgrind --cache-sim=yes
--tool=cachegrind ./join_base@6965fd32 -c 'select t1.value from range(8192) t1
join range(8192) t2 on t1.value + t2.value > t1.value * t2.value;' --maxrows 1
==94454== Cachegrind, a high-precision tracing profiler
==94454== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas
Nethercote et al.
==94454== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright
info
==94454== Command: ./join_base@6965fd32 -c select\ t1.value\ from\
range(8192)\ t1\ join\ range(8192)\ t2\ on\ t1.value\ +\ t2.value\ \>\
t1.value\ *\ t2.value; --maxrows 1
==94454==
--94454-- warning: L3 cache found, using its data for the LL simulation.
--94454-- warning: specified LL cache: line_size 64 assoc 12
total_size 31,457,280
--94454-- warning: simulated LL cache: line_size 64 assoc 15
total_size 31,457,280
DataFusion CLI v48.0.0
+-------+
| value |
+-------+
| 1 |
| . |
| . |
| . |
+-------+
32763 row(s) fetched. (First 1 displayed. Use --maxrows to adjust)
Elapsed 7.948 seconds.
==94454==
==94454== I refs: 3,555,994,712
==94454== I1 misses: 66,444
==94454== LLi misses: 26,028
==94454== I1 miss rate: 0.00%
==94454== LLi miss rate: 0.00%
==94454==
==94454== D refs: 813,250,121 (475,263,085 rd + 337,987,036
wr)
==94454== D1 misses: 118,285,307 ( 71,937,864 rd + 46,347,443
wr)
==94454== LLd misses: 109,455,796 ( 63,122,399 rd + 46,333,397
wr)
==94454== D1 miss rate: 14.5% ( 15.1% +
13.7% )
==94454== LLd miss rate: 13.5% ( 13.3% +
13.7% )
==94454==
==94454== LL refs: 118,351,751 ( 72,004,308 rd + 46,347,443
wr)
==94454== LL misses: 109,481,824 ( 63,148,427 rd + 46,333,397
wr)
==94454== LL miss rate: 2.5% ( 1.6% +
13.7% )
(venv) √ devhomeinsp ~/c/d/t/release > valgrind --cache-sim=yes
--tool=cachegrind ./limit_batch_size@36991aca -c 'select t1.value from
range(8192) t1 join range(8192) t2 on t1.value + t2.value > t1.value *
t2.value;' --maxrows 1
==96086== Cachegrind, a high-precision tracing profiler
==96086== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas
Nethercote et al.
==96086== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright
info
==96086== Command: ./limit_batch_size@36991aca -c select\ t1.value\
from\ range(8192)\ t1\ join\ range(8192)\ t2\ on\ t1.value\ +\ t2.value\ \>\
t1.value\ *\ t2.value; --maxrows 1
==96086==
--96086-- warning: L3 cache found, using its data for the LL simulation.
--96086-- warning: specified LL cache: line_size 64 assoc 12
total_size 31,457,280
--96086-- warning: simulated LL cache: line_size 64 assoc 15
total_size 31,457,280
DataFusion CLI v48.0.0
+-------+
| value |
+-------+
| 1 |
| . |
| . |
| . |
+-------+
32763 row(s) fetched. (First 1 displayed. Use --maxrows to adjust)
Elapsed 8.163 seconds.
==96086==
==96086== I refs: 3,663,944,959
==96086== I1 misses: 944,257
==96086== LLi misses: 27,378
==96086== I1 miss rate: 0.03%
==96086== LLi miss rate: 0.00%
==96086==
==96086== D refs: 847,265,289 (495,073,876 rd + 352,191,413
wr)
==96086== D1 misses: 122,392,985 ( 74,620,328 rd + 47,772,657
wr)
==96086== LLd misses: 25,750,761 ( 12,815,239 rd + 12,935,522
wr)
==96086== D1 miss rate: 14.4% ( 15.1% +
13.6% )
==96086== LLd miss rate: 3.0% ( 2.6% +
3.7% )
==96086==
==96086== LL refs: 123,337,242 ( 75,564,585 rd + 47,772,657
wr)
==96086== LL misses: 25,778,139 ( 12,842,617 rd + 12,935,522
wr)
==96086== LL miss rate: 0.6% ( 0.3% +
3.7% )
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]