This is an automated email from the ASF dual-hosted git repository.

mbrobbel pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new ba22a214b3 Fix "Incorrect Behavior of Collecting a filtered iterator 
to a BooleanArray" (#8543)
ba22a214b3 is described below

commit ba22a214b3c469da5466f60a74ab201a268bc0fc
Author: Tobias Schwarzinger <[email protected]>
AuthorDate: Tue Oct 7 10:18:13 2025 +0200

    Fix "Incorrect Behavior of Collecting a filtered iterator to a 
BooleanArray" (#8543)
    
    # Which issue does this PR close?
    
    - Closes #8505 .
    
    # Rationale for this change
    
    Fix the bug and align `BooleanArray::from_iter` to
    `PrimitiveArray::from_iter`
    
    In `BooleanArray::from_iter`:
    Collecting to a `Vec` and then using `from_trusted_len_iter` was almost
    double as fast as using `BooleanBufferBuilder` on my machine.
    
    # What changes are included in this PR?
    
    - Use builders in `BooleanArray::from_iter` to fix the wrong behavior
    - Introduce `BooleanArray::from_trusted_len_iter` for a more performant
    version (The old version of `BooleanArray::from_iter`, just with unsafe
    flavor of `bit_util::set_bit_raw`)
    - Add `BooleanAdapter`, inspired by `NativeAdapter` from the
    `PrimitiveArray`. This allows also doing `BooleanArray::from_iter([true,
    false].into_iter())`.
    
    # Are these changes tested?
    
    - New test to cover the initial bug
    - New test to cover `BooleanArray::from_trusted_len_iter` directly (old
    `BooleanArray::from_iter` also cover it indirectly)
    - New test to document that you can directly collect `[false, true,
    ...]` (no `Option`)
    
    # Are there any user-facing changes?
    
    - `BooleanArray::from_iter` has a "slight" performance regression that
    users could observe.
    - Allow directly collecting bools to a `BooleanArray`
    - `BooleanArray::from_trusted_len_iter`
---
 arrow-array/src/array/boolean_array.rs     | 128 +++++++++++++++++++++++++++--
 arrow-array/src/builder/boolean_builder.rs |   9 +-
 arrow/benches/array_from.rs                |  10 ++-
 3 files changed, 135 insertions(+), 12 deletions(-)

diff --git a/arrow-array/src/array/boolean_array.rs 
b/arrow-array/src/array/boolean_array.rs
index d75b33f7e4..7967084aa7 100644
--- a/arrow-array/src/array/boolean_array.rs
+++ b/arrow-array/src/array/boolean_array.rs
@@ -436,11 +436,84 @@ impl<'a> BooleanArray {
     }
 }
 
-impl<Ptr: std::borrow::Borrow<Option<bool>>> FromIterator<Ptr> for 
BooleanArray {
+/// An optional boolean value
+///
+/// This struct is used as an adapter when creating `BooleanArray` from an 
iterator.
+/// `FromIterator` for `BooleanArray` takes an iterator where the elements can 
be `into`
+/// this struct. So once implementing `From` or `Into` trait for a type, an 
iterator of
+/// the type can be collected to `BooleanArray`.
+///
+/// See also [NativeAdapter](crate::array::NativeAdapter).
+#[derive(Debug)]
+struct BooleanAdapter {
+    /// Corresponding Rust native type if available
+    pub native: Option<bool>,
+}
+
+impl From<bool> for BooleanAdapter {
+    fn from(value: bool) -> Self {
+        BooleanAdapter {
+            native: Some(value),
+        }
+    }
+}
+
+impl From<&bool> for BooleanAdapter {
+    fn from(value: &bool) -> Self {
+        BooleanAdapter {
+            native: Some(*value),
+        }
+    }
+}
+
+impl From<Option<bool>> for BooleanAdapter {
+    fn from(value: Option<bool>) -> Self {
+        BooleanAdapter { native: value }
+    }
+}
+
+impl From<&Option<bool>> for BooleanAdapter {
+    fn from(value: &Option<bool>) -> Self {
+        BooleanAdapter { native: *value }
+    }
+}
+
+impl<Ptr: Into<BooleanAdapter>> FromIterator<Ptr> for BooleanArray {
     fn from_iter<I: IntoIterator<Item = Ptr>>(iter: I) -> Self {
         let iter = iter.into_iter();
-        let (_, data_len) = iter.size_hint();
-        let data_len = data_len.expect("Iterator must be sized"); // panic if 
no upper bound.
+        let capacity = match iter.size_hint() {
+            (lower, Some(upper)) if lower == upper => lower,
+            _ => 0,
+        };
+        let mut builder = BooleanBuilder::with_capacity(capacity);
+        builder.extend(iter.map(|item| item.into().native));
+        builder.finish()
+    }
+}
+
+impl BooleanArray {
+    /// Creates a [`BooleanArray`] from an iterator of trusted length.
+    ///
+    /// # Safety
+    ///
+    /// The iterator must be 
[`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
+    /// I.e. that `size_hint().1` correctly reports its length. Note that this 
is a stronger
+    /// guarantee that `ExactSizeIterator` provides which could still report a 
wrong length.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the iterator does not report an upper bound on `size_hint()`.
+    #[inline]
+    #[allow(
+        private_bounds,
+        reason = "We will expose BooleanAdapter if there is a need"
+    )]
+    pub unsafe fn from_trusted_len_iter<I, P>(iter: I) -> Self
+    where
+        P: Into<BooleanAdapter>,
+        I: ExactSizeIterator<Item = P>,
+    {
+        let data_len = iter.len();
 
         let num_bytes = bit_util::ceil(data_len, 8);
         let mut null_builder = MutableBuffer::from_len_zeroed(num_bytes);
@@ -450,10 +523,14 @@ impl<Ptr: std::borrow::Borrow<Option<bool>>> 
FromIterator<Ptr> for BooleanArray
 
         let null_slice = null_builder.as_slice_mut();
         iter.enumerate().for_each(|(i, item)| {
-            if let Some(a) = item.borrow() {
-                bit_util::set_bit(null_slice, i);
-                if *a {
-                    bit_util::set_bit(data, i);
+            if let Some(a) = item.into().native {
+                unsafe {
+                    // SAFETY: There will be enough space in the buffers due 
to the trusted len size
+                    // hint
+                    bit_util::set_bit_raw(null_slice.as_mut_ptr(), i);
+                    if a {
+                        bit_util::set_bit_raw(data.as_mut_ptr(), i);
+                    }
                 }
             }
         });
@@ -599,6 +676,20 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_boolean_array_from_non_nullable_iter() {
+        let v = vec![true, false, true];
+        let arr = v.into_iter().collect::<BooleanArray>();
+        assert_eq!(3, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count());
+        assert!(arr.nulls().is_none());
+
+        assert!(arr.value(0));
+        assert!(!arr.value(1));
+        assert!(arr.value(2));
+    }
+
     #[test]
     fn test_boolean_array_from_nullable_iter() {
         let v = vec![Some(true), None, Some(false), None];
@@ -617,6 +708,29 @@ mod tests {
         assert!(!arr.value(2));
     }
 
+    #[test]
+    fn test_boolean_array_from_nullable_trusted_len_iter() {
+        // Should exhibit the same behavior as `from_iter`, which is tested 
above.
+        let v = vec![Some(true), None, Some(false), None];
+        let expected = v.clone().into_iter().collect::<BooleanArray>();
+        let actual = unsafe {
+            // SAFETY: `v` has trusted length
+            BooleanArray::from_trusted_len_iter(v.into_iter())
+        };
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn test_boolean_array_from_iter_with_larger_upper_bound() {
+        // See https://github.com/apache/arrow-rs/issues/8505
+        // This returns an upper size hint of 4
+        let iterator = vec![Some(true), None, Some(false), None]
+            .into_iter()
+            .filter(Option::is_some);
+        let arr = iterator.collect::<BooleanArray>();
+        assert_eq!(2, arr.len());
+    }
+
     #[test]
     fn test_boolean_array_builder() {
         // Test building a boolean array with ArrayData builder and offset
diff --git a/arrow-array/src/builder/boolean_builder.rs 
b/arrow-array/src/builder/boolean_builder.rs
index a0bd5745d2..275aa8c9e5 100644
--- a/arrow-array/src/builder/boolean_builder.rs
+++ b/arrow-array/src/builder/boolean_builder.rs
@@ -234,9 +234,12 @@ impl ArrayBuilder for BooleanBuilder {
 impl Extend<Option<bool>> for BooleanBuilder {
     #[inline]
     fn extend<T: IntoIterator<Item = Option<bool>>>(&mut self, iter: T) {
-        for v in iter {
-            self.append_option(v)
-        }
+        let buffered = iter.into_iter().collect::<Vec<_>>();
+        let array = unsafe {
+            // SAFETY: std::vec::IntoIter implements TrustedLen
+            BooleanArray::from_trusted_len_iter(buffered.into_iter())
+        };
+        self.append_array(&array)
     }
 }
 
diff --git a/arrow/benches/array_from.rs b/arrow/benches/array_from.rs
index 3af605ef4a..575a8280f6 100644
--- a/arrow/benches/array_from.rs
+++ b/arrow/benches/array_from.rs
@@ -15,13 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.
 
+extern crate arrow;
 #[macro_use]
 extern crate criterion;
 
 use criterion::Criterion;
 
-extern crate arrow;
-
 use arrow::array::*;
 use arrow_buffer::i256;
 use rand::Rng;
@@ -236,6 +235,13 @@ fn from_iter_benchmark(c: &mut Criterion) {
         let values = gen_option_vector(true, ITER_LEN);
         b.iter(|| hint::black_box(BooleanArray::from_iter(values.iter())));
     });
+    c.bench_function("BooleanArray::from_trusted_len_iter", |b| {
+        let values = gen_option_vector(true, ITER_LEN);
+        b.iter(|| unsafe {
+            // SAFETY: values.iter() is a TrustedLenIterator
+            hint::black_box(BooleanArray::from_trusted_len_iter(values.iter()))
+        });
+    });
 }
 
 criterion_group!(

Reply via email to