etseidl commented on code in PR #9967:
URL: https://github.com/apache/arrow-rs/pull/9967#discussion_r3237130632
##########
parquet/src/arrow/arrow_writer/levels.rs:
##########
@@ -645,17 +645,58 @@ impl LevelInfoBuilder {
 match &info.logical_nulls {
     Some(nulls) => {
         assert!(range.end <= nulls.len());
-        let nulls = nulls.inner();
-        info.def_levels.extend_from_iter(range.clone().map(|i| {
-            // Safety: range.end was asserted to be in bounds earlier
-            let valid = unsafe { nulls.value_unchecked(i) };
-            max_def_level - (!valid as i16)
-        }));
-        info.non_null_indices.reserve(len);
-        info.non_null_indices.extend(
-            BitIndexIterator::new(nulls.inner(), nulls.offset() + range.start, len)
-                .map(|i| i + range.start),
-        );
+        // Bulk-fill fast path. Gated on:
+        // - `len >= 64`: per-call slice/popcount/iterator overhead only
+        //   amortizes on sizable sub-ranges. List/struct paths call
+        //   write_leaf many times with tiny ranges (avg list length
+        //   1-5); paying any per-call popcount there would regress
+        //   them. A threshold sweep at T={0,16,32,64,128,256} on Ryzen
+        //   9 9950X shows the regression floor settles by T=32 and the
+        //   choice of 64 gives ~12x margin over avg list length without
+        //   losing the flat-primitive wins.
+        // - `nulls.null_count() * 2 >= nulls.len()`: cached
+        //   `null_count()` is O(1), so this check is free. We use the
+        //   buffer-level density as a heuristic for the sub-range; for
+        //   full-array writes (the primary target, flat primitive
+        //   columns) it's exact.
+        // Note: even when this gate skips the fast path, evaluating the
+        // gate itself across high-frequency call sites (~10K calls in
+        // some list benchmarks) is a small structural cost (~+1-2% on
+        // list-sparse cases). It's the price of having any gate at all
+        // on this hot path; reducing it further would require hoisting
+        // the decision into the caller. The wins on the targeted shapes
+        // (-35% sparse-primitive, -66% all-null primitive) far outweigh
+        // it.
+        if len >= 64 && nulls.null_count() * 2 >= nulls.len() {
+            // Bulk-fill the null level (vectorized memset) and
+            // overwrite only the non-null positions. Correct for any
+            // null distribution in the range; the gate above only
+            // controls when it's profitable.
+            let range_nulls = nulls.slice(range.start, len);
+            let valid_in_range = len - range_nulls.null_count();
+            let null_def_level = max_def_level - 1;
+            let buf = info
+                .def_levels
+                .materialize_mut()
+                .expect("definition levels present");
+            let base = buf.len();
+            buf.resize(base + len, null_def_level);
+            for i in range_nulls.valid_indices() {
+                buf[base + i] = max_def_level;
+            }
+            info.non_null_indices.reserve(valid_in_range);
+            info.non_null_indices
+                .extend(range_nulls.valid_indices().map(|i| i + range.start));
+        } else {
+            let bits = nulls.inner();
+            info.def_levels.extend_from_iter(range.clone().map(|i| {
+                // Safety: range.end was asserted to be in bounds earlier
+                let valid = unsafe { bits.value_unchecked(i) };
+                max_def_level - (!valid as i16)
+            }));
+            info.non_null_indices.reserve(len);
+            info.non_null_indices.extend(
+                BitIndexIterator::new(bits.inner(), bits.offset() + range.start, len)
+                    .map(|i| i + range.start),
+            );
Review Comment:
> Did you have in mind slicing first the way the bulk-fill branch a few lines up does, i.e. `let range_nulls = nulls.slice(range.start, len)`, and then iterating `range_nulls`? That would also let `non_null_indices` use `range_nulls.valid_indices().map(|i| i + range.start)`, which I think still needs to happen for the downstream value path.
Yes, that's what I had in mind.
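
Roughly something like the following, as an untested sketch. It assumes `def_levels.extend_from_iter` accepts any `Iterator<Item = i16>` (which the hunk above already relies on) and that iterating the slice's inner `BooleanBuffer` yields one `bool` per slot:

```rust
// Untested sketch of the sliced slow path:
let range_nulls = nulls.slice(range.start, len);
info.def_levels.extend_from_iter(
    range_nulls
        .inner()
        .iter()
        .map(|valid| max_def_level - (!valid as i16)),
);
info.non_null_indices.reserve(len);
info.non_null_indices
    .extend(range_nulls.valid_indices().map(|i| i + range.start));
```

That drops the `unsafe` read and the manual `offset() + range.start` bookkeeping, at the cost of one `slice` per call; given the tiny-range list workloads the gate comment worries about, it would be worth re-running those benchmarks.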
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]