HippoBaro commented on code in PR #9967:
URL: https://github.com/apache/arrow-rs/pull/9967#discussion_r3231347135
##########
parquet/src/arrow/arrow_writer/levels.rs:
##########
@@ -646,12 +646,38 @@ impl LevelInfoBuilder {
Some(nulls) => {
assert!(range.end <= nulls.len());
let nulls = nulls.inner();
- info.def_levels.extend_from_iter(range.clone().map(|i| {
- // Safety: range.end was asserted to be in bounds
earlier
- let valid = unsafe { nulls.value_unchecked(i) };
- max_def_level - (!valid as i16)
- }));
- info.non_null_indices.reserve(len);
+ // Count valid values in `range` with a single popcount
pass over the
+ // null mask (O(len / 64)). When the slice is
majority-null, building the
+ // definition levels by bulk-filling the null level (a
vectorized memset)
+ // and overwriting only the non-null positions is far
cheaper than the
+ // per-row map used in the general case.
+ let valid_in_range = nulls
+ .inner()
+ .count_set_bits_offset(nulls.offset() + range.start,
len);
+ if len - valid_in_range >= valid_in_range {
+ let null_def_level = max_def_level - 1;
+ let buf = info
+ .def_levels
+ .materialize_mut()
+ .expect("definition levels present");
+ let base = buf.len();
+ buf.resize(base + len, null_def_level);
+ for i in
+ BitIndexIterator::new(nulls.inner(),
nulls.offset() + range.start, len)
+ {
+ // Safety: BitIndexIterator yields i in 0..len,
and the buffer was
+ // just resized to base + len, so base + i is in
bounds.
+ debug_assert!(base + i < buf.len());
+ unsafe { *buf.get_unchecked_mut(base + i) =
max_def_level };
+ }
+ } else {
+ info.def_levels.extend_from_iter(range.clone().map(|i|
{
+ // Safety: range.end was asserted to be in bounds
earlier
+ let valid = unsafe { nulls.value_unchecked(i) };
+ max_def_level - (!valid as i16)
+ }));
+ }
+ info.non_null_indices.reserve(len - valid_in_range);
Review Comment:
This is the null count right? We should reserve `valid_in_range` directly I
think?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]