etseidl commented on code in PR #9967:
URL: https://github.com/apache/arrow-rs/pull/9967#discussion_r3237130632


##########
parquet/src/arrow/arrow_writer/levels.rs:
##########
@@ -645,17 +645,58 @@ impl LevelInfoBuilder {
             match &info.logical_nulls {
                 Some(nulls) => {
                     assert!(range.end <= nulls.len());
-                    let nulls = nulls.inner();
-                    info.def_levels.extend_from_iter(range.clone().map(|i| {
-                        // Safety: range.end was asserted to be in bounds earlier
-                        let valid = unsafe { nulls.value_unchecked(i) };
-                        max_def_level - (!valid as i16)
-                    }));
-                    info.non_null_indices.reserve(len);
-                    info.non_null_indices.extend(
-                        BitIndexIterator::new(nulls.inner(), nulls.offset() + range.start, len)
-                            .map(|i| i + range.start),
-                    );
+                    // Bulk-fill fast path. Gated on:
+                    //   - `len >= 64`: per-call slice/popcount/iterator overhead only
+                    //     amortizes on sizable sub-ranges. List/struct paths call
+                    //     write_leaf many times with tiny ranges (avg list length 1-5);
+                    //     paying any per-call popcount there would regress them. A
+                    //     threshold sweep at T={0,16,32,64,128,256} on Ryzen 9 9950X
+                    //     shows the regression floor settles by T=32 and the choice of
+                    //     64 gives ~12x margin over avg list length without losing the
+                    //     flat-primitive wins.
+                    //   - `nulls.null_count() * 2 >= nulls.len()`: cached `null_count()`
+                    //     is O(1), so this check is free. We use the buffer-level density
+                    //     as a heuristic for the sub-range; for full-array writes (the
+                    //     primary target, flat primitive columns) it's exact.
+                    // Note: even when this gate skips the fast path, evaluating the gate
+                    // itself across high-frequency call sites (~10K calls in some list
+                    // benchmarks) is a small structural cost (~+1-2% on list-sparse
+                    // cases). It's the price of having any gate at all on this hot path;
+                    // reducing it further would require hoisting the decision into the
+                    // caller. The wins on the targeted shapes (-35% sparse-primitive,
+                    // -66% all-null primitive) far outweigh it.
+                    if len >= 64 && nulls.null_count() * 2 >= nulls.len() {
+                        // Bulk-fill the null level (vectorized memset) and overwrite
+                        // only the non-null positions. Correct for any null distribution
+                        // in the range; the gate above only controls when it's profitable.
+                        let range_nulls = nulls.slice(range.start, len);
+                        let valid_in_range = len - range_nulls.null_count();
+                        let null_def_level = max_def_level - 1;
+                        let buf = info
+                            .def_levels
+                            .materialize_mut()
+                            .expect("definition levels present");
+                        let base = buf.len();
+                        buf.resize(base + len, null_def_level);
+                        for i in range_nulls.valid_indices() {
+                            buf[base + i] = max_def_level;
+                        }
+                        info.non_null_indices.reserve(valid_in_range);
+                        info.non_null_indices
+                            .extend(range_nulls.valid_indices().map(|i| i + range.start));
+                    } else {
+                        let bits = nulls.inner();
+                        info.def_levels.extend_from_iter(range.clone().map(|i| {
+                            // Safety: range.end was asserted to be in bounds earlier
+                            let valid = unsafe { bits.value_unchecked(i) };
+                            max_def_level - (!valid as i16)
+                        }));
+                        info.non_null_indices.reserve(len);
+                        info.non_null_indices.extend(
+                            BitIndexIterator::new(bits.inner(), bits.offset() + range.start, len)
+                                .map(|i| i + range.start),
+                        );

Review Comment:
   > Did you have in mind slicing first the way the bulk-fill branch a few lines up does, i.e. `let range_nulls = nulls.slice(range.start, len)` and then iterating `range_nulls`? That would also let `non_null_indices` use `range_nulls.valid_indices().map(|i| i + range.start)`, which I think still needs to happen for the downstream value path.
   
   Yes, I think this is what I had in mind.
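   
   For concreteness, here is a rough sketch of what that could look like for the `else` branch (untested; it reuses the bindings already in scope in this method: `info`, `nulls`, `range`, `len`, and `max_def_level`):
   
   ```rust
   // Slice once so the slow path indexes the sub-range the same way the
   // bulk-fill branch does.
   let range_nulls = nulls.slice(range.start, len);
   // Definition levels: one entry per position in the range, null or not.
   info.def_levels.extend_from_iter(
       range_nulls
           .iter()
           .map(|valid| max_def_level - (!valid as i16)),
   );
   // Non-null indices: slice-relative valid positions mapped back to
   // array coordinates for the downstream value path.
   info.non_null_indices.reserve(len);
   info.non_null_indices
       .extend(range_nulls.valid_indices().map(|i| i + range.start));
   ```
   
   A nice side effect is that the `unsafe` bit access goes away, since the slice is already covered by the `assert!` at the top of the match arm.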


