Re: [PR] perf(parquet): Vectorize dict-index bounds check in RleDecoder::get_batch_with_dict (up to -7.9%) [arrow-rs]

via GitHub Fri, 17 Apr 2026 02:44:39 -0700


Dandandan commented on code in PR #9746:
URL: https://github.com/apache/arrow-rs/pull/9746#discussion_r3099285744



##########
parquet/src/encodings/rle.rs:
##########
@@ -514,16 +514,28 @@ impl RleDecoder {
                         break;
                     }
                     {
+                        #[cold]
+                        #[inline(never)]
+                        fn oob(max_idx: u32, dict_len: usize) -> ! {
+                            panic!(
+                                "dictionary index out of bounds: the len is 
{dict_len} but the index is {max_idx}"
+                            )
+                        }
+                        const CHUNK: usize = 16;
                         let out = &mut buffer[values_read..values_read + 
num_values];
                         let idx = &index_buf[..num_values];
-                        let mut out_chunks = out.chunks_exact_mut(8);
-                        let idx_chunks = idx.chunks_exact(8);
+                        let dict_len = dict.len();
+                        let mut out_chunks = out.chunks_exact_mut(CHUNK);
+                        let idx_chunks = idx.chunks_exact(CHUNK);
                         for (out_chunk, idx_chunk) in 
out_chunks.by_ref().zip(idx_chunks) {
-                            let dict_len = dict.len();
-                            assert!(
-                                idx_chunk.iter().all(|&i| (i as usize) < 
dict_len),
-                                "dictionary index out of bounds"
-                            );
+                            // u32 max-reduction instead of `.all(|&i| ..)`: 
`.all`

Review Comment:
   Oof :) 
   
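   I have now also compared it against `.fold(true, |a, &i| a & ((i as u32) < 
dict_len_u32))`. It looks like computing the max first and comparing once 
(max-reduce) also generates better code (and benchmarks better) than comparing 
every index against the bound and AND-reducing the results (AND-reduce):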
   
   ```
     max-reduce:
     ldp  q1, q0, [x13, #32]
     ldp  q3, q2, [x13]
     umax.4s  v4, v2, v0
     umax.4s  v3, v3, v1
     umax.4s  v3, v3, v4
     umaxv.4s s3, v3
     fmov     w8, s3
     cmp      x21, x8
     b.ls     LBB2735_48

     AND-reduce:
     ldp  q1, q0, [x8, #32]
     ldp  q3, q2, [x8]
     cmhs.4s  v4, v2, v6
     cmhs.4s  v3, v3, v6
     uzp1.8h  v3, v3, v4
     cmhs.4s  v4, v1, v6
     cmhs.4s  v5, v0, v6
     uzp1.8h  v4, v4, v5
     uzp1.16b v3, v3, v4
     umaxv.16b b3, v3
     fmov     w14, s3
     tbnz     w14, #0, LBB2735_48
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] perf(parquet): Vectorize dict-index bounds check in RleDecoder::get_batch_with_dict (up to -7.9%) [arrow-rs]

Reply via email to