This is an automated email from the ASF dual-hosted git repository.
mbrobbel pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new ba22a214b3 Fix "Incorrect Behavior of Collecting a filtered iterator
to a BooleanArray" (#8543)
ba22a214b3 is described below
commit ba22a214b3c469da5466f60a74ab201a268bc0fc
Author: Tobias Schwarzinger <[email protected]>
AuthorDate: Tue Oct 7 10:18:13 2025 +0200
Fix "Incorrect Behavior of Collecting a filtered iterator to a
BooleanArray" (#8543)
# Which issue does this PR close?
- Closes #8505 .
# Rationale for this change
Fix the bug and align `BooleanArray::from_iter` to
`PrimitiveArray::from_iter`
In `BooleanArray::from_iter`:
Collecting to a `Vec` and then using `from_trusted_len_iter` was almost
double as fast as using `BooleanBufferBuilder` on my machine.
# What changes are included in this PR?
- Use builders in `BooleanArray::from_iter` to fix the wrong behavior
- Introduce `BooleanArray::from_trusted_len_iter` for a more performant
version (The old version of `BooleanArray::from_iter`, just with unsafe
flavor of `bit_util::set_bit_raw`)
- Add `BooleanAdapter`, inspired by `NativeAdapter` from the
`PrimitiveArray`. This allows also doing `BooleanArray::from_iter([true,
false].into_iter())`.
# Are these changes tested?
- New test to cover the initial bug
- New test to cover `BooleanArray::from_trusted_len_iter` directly (old
`BooleanArray::from_iter` also cover it indirectly)
- New test to document that you can directly collect `[false, true,
...]` (no `Option`)
# Are there any user-facing changes?
- `BooleanArray::from_iter` has a "slight" performance regression that
users could observe.
- Allow directly collecting bools to a `BooleanArray`
- `BooleanArray::from_trusted_len_iter`
---
arrow-array/src/array/boolean_array.rs | 128 +++++++++++++++++++++++++++--
arrow-array/src/builder/boolean_builder.rs | 9 +-
arrow/benches/array_from.rs | 10 ++-
3 files changed, 135 insertions(+), 12 deletions(-)
diff --git a/arrow-array/src/array/boolean_array.rs
b/arrow-array/src/array/boolean_array.rs
index d75b33f7e4..7967084aa7 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -436,11 +436,84 @@ impl<'a> BooleanArray {
}
}
-impl<Ptr: std::borrow::Borrow<Option<bool>>> FromIterator<Ptr> for
BooleanArray {
+/// An optional boolean value
+///
+/// This struct is used as an adapter when creating `BooleanArray` from an
iterator.
+/// `FromIterator` for `BooleanArray` takes an iterator where the elements can
be `into`
+/// this struct. So once implementing `From` or `Into` trait for a type, an
iterator of
+/// the type can be collected to `BooleanArray`.
+///
+/// See also [NativeAdapter](crate::array::NativeAdapter).
+#[derive(Debug)]
+struct BooleanAdapter {
+ /// Corresponding Rust native type if available
+ pub native: Option<bool>,
+}
+
+impl From<bool> for BooleanAdapter {
+ fn from(value: bool) -> Self {
+ BooleanAdapter {
+ native: Some(value),
+ }
+ }
+}
+
+impl From<&bool> for BooleanAdapter {
+ fn from(value: &bool) -> Self {
+ BooleanAdapter {
+ native: Some(*value),
+ }
+ }
+}
+
+impl From<Option<bool>> for BooleanAdapter {
+ fn from(value: Option<bool>) -> Self {
+ BooleanAdapter { native: value }
+ }
+}
+
+impl From<&Option<bool>> for BooleanAdapter {
+ fn from(value: &Option<bool>) -> Self {
+ BooleanAdapter { native: *value }
+ }
+}
+
+impl<Ptr: Into<BooleanAdapter>> FromIterator<Ptr> for BooleanArray {
fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
let iter = iter.into_iter();
- let (_, data_len) = iter.size_hint();
- let data_len = data_len.expect("Iterator must be sized"); // panic if
no upper bound.
+ let capacity = match iter.size_hint() {
+ (lower, Some(upper)) if lower == upper => lower,
+ _ => 0,
+ };
+ let mut builder = BooleanBuilder::with_capacity(capacity);
+ builder.extend(iter.map(|item| item.into().native));
+ builder.finish()
+ }
+}
+
+impl BooleanArray {
+ /// Creates a [`BooleanArray`] from an iterator of trusted length.
+ ///
+ /// # Safety
+ ///
+ /// The iterator must be
[`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
+ /// I.e. that `size_hint().1` correctly reports its length. Note that this
is a stronger
+ /// guarantee that `ExactSizeIterator` provides which could still report a
wrong length.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the iterator does not report an upper bound on `size_hint()`.
+ #[inline]
+ #[allow(
+ private_bounds,
+ reason = "We will expose BooleanAdapter if there is a need"
+ )]
+ pub unsafe fn from_trusted_len_iter<I, P>(iter: I) -> Self
+ where
+ P: Into<BooleanAdapter>,
+ I: ExactSizeIterator<Item = P>,
+ {
+ let data_len = iter.len();
let num_bytes = bit_util::ceil(data_len, 8);
let mut null_builder = MutableBuffer::from_len_zeroed(num_bytes);
@@ -450,10 +523,14 @@ impl<Ptr: std::borrow::Borrow<Option<bool>>>
FromIterator<Ptr> for BooleanArray
let null_slice = null_builder.as_slice_mut();
iter.enumerate().for_each(|(i, item)| {
- if let Some(a) = item.borrow() {
- bit_util::set_bit(null_slice, i);
- if *a {
- bit_util::set_bit(data, i);
+ if let Some(a) = item.into().native {
+ unsafe {
+ // SAFETY: There will be enough space in the buffers due
to the trusted len size
+ // hint
+ bit_util::set_bit_raw(null_slice.as_mut_ptr(), i);
+ if a {
+ bit_util::set_bit_raw(data.as_mut_ptr(), i);
+ }
}
}
});
@@ -599,6 +676,20 @@ mod tests {
}
}
+ #[test]
+ fn test_boolean_array_from_non_nullable_iter() {
+ let v = vec![true, false, true];
+ let arr = v.into_iter().collect::<BooleanArray>();
+ assert_eq!(3, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(0, arr.null_count());
+ assert!(arr.nulls().is_none());
+
+ assert!(arr.value(0));
+ assert!(!arr.value(1));
+ assert!(arr.value(2));
+ }
+
#[test]
fn test_boolean_array_from_nullable_iter() {
let v = vec![Some(true), None, Some(false), None];
@@ -617,6 +708,29 @@ mod tests {
assert!(!arr.value(2));
}
+ #[test]
+ fn test_boolean_array_from_nullable_trusted_len_iter() {
+ // Should exhibit the same behavior as `from_iter`, which is tested
above.
+ let v = vec![Some(true), None, Some(false), None];
+ let expected = v.clone().into_iter().collect::<BooleanArray>();
+ let actual = unsafe {
+ // SAFETY: `v` has trusted length
+ BooleanArray::from_trusted_len_iter(v.into_iter())
+ };
+ assert_eq!(expected, actual);
+ }
+
+ #[test]
+ fn test_boolean_array_from_iter_with_larger_upper_bound() {
+ // See https://github.com/apache/arrow-rs/issues/8505
+ // This returns an upper size hint of 4
+ let iterator = vec![Some(true), None, Some(false), None]
+ .into_iter()
+ .filter(Option::is_some);
+ let arr = iterator.collect::<BooleanArray>();
+ assert_eq!(2, arr.len());
+ }
+
#[test]
fn test_boolean_array_builder() {
// Test building a boolean array with ArrayData builder and offset
diff --git a/arrow-array/src/builder/boolean_builder.rs
b/arrow-array/src/builder/boolean_builder.rs
index a0bd5745d2..275aa8c9e5 100644
--- a/arrow-array/src/builder/boolean_builder.rs
+++ b/arrow-array/src/builder/boolean_builder.rs
@@ -234,9 +234,12 @@ impl ArrayBuilder for BooleanBuilder {
impl Extend<Option<bool>> for BooleanBuilder {
#[inline]
fn extend<T: IntoIterator<Item = Option<bool>>>(&mut self, iter: T) {
- for v in iter {
- self.append_option(v)
- }
+ let buffered = iter.into_iter().collect::<Vec<_>>();
+ let array = unsafe {
+ // SAFETY: std::vec::IntoIter implements TrustedLen
+ BooleanArray::from_trusted_len_iter(buffered.into_iter())
+ };
+ self.append_array(&array)
}
}
diff --git a/arrow/benches/array_from.rs b/arrow/benches/array_from.rs
index 3af605ef4a..575a8280f6 100644
--- a/arrow/benches/array_from.rs
+++ b/arrow/benches/array_from.rs
@@ -15,13 +15,12 @@
// specific language governing permissions and limitations
// under the License.
+extern crate arrow;
#[macro_use]
extern crate criterion;
use criterion::Criterion;
-extern crate arrow;
-
use arrow::array::*;
use arrow_buffer::i256;
use rand::Rng;
@@ -236,6 +235,13 @@ fn from_iter_benchmark(c: &mut Criterion) {
let values = gen_option_vector(true, ITER_LEN);
b.iter(|| hint::black_box(BooleanArray::from_iter(values.iter())));
});
+ c.bench_function("BooleanArray::from_trusted_len_iter", |b| {
+ let values = gen_option_vector(true, ITER_LEN);
+ b.iter(|| unsafe {
+ // SAFETY: values.iter() is a TrustedLenIterator
+ hint::black_box(BooleanArray::from_trusted_len_iter(values.iter()))
+ });
+ });
}
criterion_group!(