UBarney commented on code in PR #16443: URL: https://github.com/apache/datafusion/pull/16443#discussion_r2275332927
########## datafusion/physical-plan/src/joins/utils.rs: ########## @@ -843,24 +844,56 @@ pub(crate) fn apply_join_filter_to_indices( probe_indices: UInt32Array, filter: &JoinFilter, build_side: JoinSide, + max_intermediate_size: Option<usize>, ) -> Result<(UInt64Array, UInt32Array)> { if build_indices.is_empty() && probe_indices.is_empty() { return Ok((build_indices, probe_indices)); }; - let intermediate_batch = build_batch_from_indices( - filter.schema(), - build_input_buffer, - probe_batch, - &build_indices, - &probe_indices, - filter.column_indices(), - build_side, - )?; - let filter_result = filter - .expression() - .evaluate(&intermediate_batch)? - .into_array(intermediate_batch.num_rows())?; + let filter_result = if let Some(max_size) = max_intermediate_size { + let mut filter_results = + Vec::with_capacity(build_indices.len().div_ceil(max_size)); + + for i in (0..build_indices.len()).step_by(max_size) { + let end = min(build_indices.len(), i + max_size); + let len = end - i; + let intermediate_batch = build_batch_from_indices( + filter.schema(), + build_input_buffer, + probe_batch, + &build_indices.slice(i, len), + &probe_indices.slice(i, len), + filter.column_indices(), + build_side, + )?; + let filter_result = filter Review Comment: This optimization boosts performance with a much higher IPC of 1.75 (vs 0.87), achieved by dramatically cutting LLC misses from 109M to 25M, even with a similar L1 miss rate. <details> ``` sudo perf stat -e cycles,instructions,L1-dcache-load-misses,L1-dcache-loads,LLC-loads,LLC-load-misses ./limit_batch_size@36991aca -c 'select t1.value from range(100) t1 join range(819200) t2 on (t1.value + t2.value) % 1000 = 0; ' --maxrows 1 sudo perf stat -e cycles,instructions,L1-dcache-load-misses,L1-dcache-loads,LLC-loads,LLC-load-misses ./join_base@6965fd32 -c 'select t1.value from range(100) t1 join range(819200) t2 on (t1.value + t2.value) % 1000 = 0; ' --maxrows 1 DataFusion CLI v48.0.0 +-------+ | value | +-------+ | 40 | | . | | . | | . | +-------+ 81901 row(s) fetched. (First 1 displayed. Use --maxrows to adjust) Elapsed 0.067 seconds. Performance counter stats for './limit_batch_size@36991aca -c select t1.value from range(100) t1 join range(819200) t2 on (t1.value + t2.value) % 1000 = 0; --maxrows 1': 1901401922 cycles 3325776634 instructions # 1.75 insn per cycle 32419611 L1-dcache-load-misses # 5.27% of all L1-dcache accesses 614645891 L1-dcache-loads <not supported> LLC-loads <not supported> LLC-load-misses 0.073244586 seconds time elapsed 0.448238000 seconds user 0.044823000 seconds sys DataFusion CLI v48.0.0 +-------+ | value | +-------+ | 99 | | . | | . | | . | +-------+ 81901 row(s) fetched. (First 1 displayed. Use --maxrows to adjust) Elapsed 0.131 seconds. Performance counter stats for './join_base@6965fd32 -c select t1.value from range(100) t1 join range(819200) t2 on (t1.value + t2.value) % 1000 = 0; --maxrows 1': 3696196789 cycles 3201132508 instructions # 0.87 insn per cycle 21781750 L1-dcache-load-misses # 3.68% of all L1-dcache accesses 592094439 L1-dcache-loads <not supported> LLC-loads <not supported> LLC-load-misses 0.139081088 seconds time elapsed 0.835575000 seconds user 0.111277000 seconds sys (venv) √ devhomeinsp ~/c/d/t/release > valgrind --cache-sim=yes --tool=cachegrind ./join_base@6965fd32 -c 'select t1.value from range(8192) t1 join range(8192) t2 on t1.value + t2.value > t1.value * t2.value;' --maxrows 1 ==94454== Cachegrind, a high-precision tracing profiler ==94454== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al. ==94454== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info ==94454== Command: ./join_base@6965fd32 -c select\ t1.value\ from\ range(8192)\ t1\ join\ range(8192)\ t2\ on\ t1.value\ +\ t2.value\ \>\ t1.value\ *\ t2.value; --maxrows 1 ==94454== --94454-- warning: L3 cache found, using its data for the LL simulation. --94454-- warning: specified LL cache: line_size 64 assoc 12 total_size 31,457,280 --94454-- warning: simulated LL cache: line_size 64 assoc 15 total_size 31,457,280 DataFusion CLI v48.0.0 +-------+ | value | +-------+ | 1 | | . | | . | | . | +-------+ 32763 row(s) fetched. (First 1 displayed. Use --maxrows to adjust) Elapsed 7.948 seconds. ==94454== ==94454== I refs: 3,555,994,712 ==94454== I1 misses: 66,444 ==94454== LLi misses: 26,028 ==94454== I1 miss rate: 0.00% ==94454== LLi miss rate: 0.00% ==94454== ==94454== D refs: 813,250,121 (475,263,085 rd + 337,987,036 wr) ==94454== D1 misses: 118,285,307 ( 71,937,864 rd + 46,347,443 wr) ==94454== LLd misses: 109,455,796 ( 63,122,399 rd + 46,333,397 wr) ==94454== D1 miss rate: 14.5% ( 15.1% + 13.7% ) ==94454== LLd miss rate: 13.5% ( 13.3% + 13.7% ) ==94454== ==94454== LL refs: 118,351,751 ( 72,004,308 rd + 46,347,443 wr) ==94454== LL misses: 109,481,824 ( 63,148,427 rd + 46,333,397 wr) ==94454== LL miss rate: 2.5% ( 1.6% + 13.7% ) (venv) √ devhomeinsp ~/c/d/t/release > valgrind --cache-sim=yes --tool=cachegrind ./limit_batch_size@36991aca -c 'select t1.value from range(8192) t1 join range(8192) t2 on t1.value + t2.value > t1.value * t2.value;' --maxrows 1 ==96086== Cachegrind, a high-precision tracing profiler ==96086== Copyright (C) 2002-2017, and GNU GPL'd, by Nicholas Nethercote et al. ==96086== Using Valgrind-3.22.0 and LibVEX; rerun with -h for copyright info ==96086== Command: ./limit_batch_size@36991aca -c select\ t1.value\ from\ range(8192)\ t1\ join\ range(8192)\ t2\ on\ t1.value\ +\ t2.value\ \>\ t1.value\ *\ t2.value; --maxrows 1 ==96086== --96086-- warning: L3 cache found, using its data for the LL simulation. --96086-- warning: specified LL cache: line_size 64 assoc 12 total_size 31,457,280 --96086-- warning: simulated LL cache: line_size 64 assoc 15 total_size 31,457,280 DataFusion CLI v48.0.0 +-------+ | value | +-------+ | 1 | | . | | . | | . | +-------+ 32763 row(s) fetched. (First 1 displayed. Use --maxrows to adjust) Elapsed 8.163 seconds. ==96086== ==96086== I refs: 3,663,944,959 ==96086== I1 misses: 944,257 ==96086== LLi misses: 27,378 ==96086== I1 miss rate: 0.03% ==96086== LLi miss rate: 0.00% ==96086== ==96086== D refs: 847,265,289 (495,073,876 rd + 352,191,413 wr) ==96086== D1 misses: 122,392,985 ( 74,620,328 rd + 47,772,657 wr) ==96086== LLd misses: 25,750,761 ( 12,815,239 rd + 12,935,522 wr) ==96086== D1 miss rate: 14.4% ( 15.1% + 13.6% ) ==96086== LLd miss rate: 3.0% ( 2.6% + 3.7% ) ==96086== ==96086== LL refs: 123,337,242 ( 75,564,585 rd + 47,772,657 wr) ==96086== LL misses: 25,778,139 ( 12,842,617 rd + 12,935,522 wr) ==96086== LL miss rate: 0.6% ( 0.3% + 3.7% ) ``` </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org