This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 6c3e588118 fix: `zip` now treats nulls as false in provided mask 
regardless of the underlying bit value (#8711)
6c3e588118 is described below

commit 6c3e5881185bf74b56ce96233135bb540826554b
Author: Raz Luvaton <[email protected]>
AuthorDate: Mon Oct 27 22:34:29 2025 +0200

    fix: `zip` now treats nulls as false in provided mask regardless of the 
underlying bit value (#8711)
    
    # Which issue does this PR close?
    
    - closes https://github.com/apache/arrow-rs/issues/8721
    
    # Rationale for this change
    
    mask `nulls` should be treated as `false` (even if the underlying values
    are not 0) as described in the docs for zip
    
    # What changes are included in this PR?
    
    used `prep_null_mask_filter` before iterating over the mask,
    added tests for both scalar and non scalar (to prepare for #8653)
    
    # Are these changes tested?
    Yes
    
    # Are there any user-facing changes?
    
    Kinda
---
 arrow-select/src/filter.rs |   8 ++-
 arrow-select/src/zip.rs    | 125 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index cd132de39d..6a5ba13c95 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -58,7 +58,13 @@ pub struct SlicesIterator<'a>(BitSliceIterator<'a>);
 impl<'a> SlicesIterator<'a> {
     /// Creates a new iterator from a [BooleanArray]
     pub fn new(filter: &'a BooleanArray) -> Self {
-        Self(filter.values().set_slices())
+        filter.values().into()
+    }
+}
+
+impl<'a> From<&'a BooleanBuffer> for SlicesIterator<'a> {
+    fn from(filter: &'a BooleanBuffer) -> Self {
+        Self(filter.set_slices())
     }
 }
 
diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs
index 2efd2e7499..c202be6b62 100644
--- a/arrow-select/src/zip.rs
+++ b/arrow-select/src/zip.rs
@@ -17,8 +17,9 @@
 
 //! [`zip`]: Combine values from two arrays based on boolean mask
 
-use crate::filter::SlicesIterator;
+use crate::filter::{SlicesIterator, prep_null_mask_filter};
 use arrow_array::*;
+use arrow_buffer::BooleanBuffer;
 use arrow_data::transform::MutableArrayData;
 use arrow_schema::ArrowError;
 
@@ -127,7 +128,8 @@ pub fn zip(
     // keep track of how much is filled
     let mut filled = 0;
 
-    SlicesIterator::new(mask).for_each(|(start, end)| {
+    let mask = maybe_prep_null_mask_filter(mask);
+    SlicesIterator::from(&mask).for_each(|(start, end)| {
         // the gap needs to be filled with falsy values
         if start > filled {
             if falsy_is_scalar {
@@ -166,9 +168,22 @@ pub fn zip(
     Ok(make_array(data))
 }
 
+fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer {
+    // Nulls are treated as false
+    if predicate.null_count() == 0 {
+        predicate.values().clone()
+    } else {
+        let cleaned = prep_null_mask_filter(predicate);
+        let (boolean_buffer, _) = cleaned.into_parts();
+        boolean_buffer
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
+    use arrow_array::cast::AsArray;
+    use arrow_buffer::{BooleanBuffer, NullBuffer};
 
     #[test]
     fn test_zip_kernel_one() {
@@ -279,4 +294,110 @@ mod test {
         let expected = Int32Array::from(vec![None, None, Some(42), Some(42), 
None]);
         assert_eq!(actual, &expected);
     }
+
+    #[test]
+    fn 
test_zip_primitive_array_with_nulls_is_mask_should_be_treated_as_false() {
+        let truthy = Int32Array::from_iter_values(vec![1, 2, 3, 4, 5, 6]);
+        let falsy = Int32Array::from_iter_values(vec![7, 8, 9, 10, 11, 12]);
+
+        let mask = {
+            let booleans = BooleanBuffer::from(vec![true, true, false, true, 
false, false]);
+            let nulls = NullBuffer::from(vec![
+                true, true, true,
+                false, // null treated as false even though in the original 
mask it was true
+                true, true,
+            ]);
+            BooleanArray::new(booleans, Some(nulls))
+        };
+        let out = zip(&mask, &truthy, &falsy).unwrap();
+        let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
+        let expected = Int32Array::from(vec![
+            Some(1),
+            Some(2),
+            Some(9),
+            Some(10), // true in mask but null
+            Some(11),
+            Some(12),
+        ]);
+        assert_eq!(actual, &expected);
+    }
+
+    #[test]
+    fn 
test_zip_kernel_primitive_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
+     {
+        let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1));
+        let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1));
+
+        let mask = {
+            let booleans = BooleanBuffer::from(vec![true, true, false, true, 
false, false]);
+            let nulls = NullBuffer::from(vec![
+                true, true, true,
+                false, // null treated as false even though in the original 
mask it was true
+                true, true,
+            ]);
+            BooleanArray::new(booleans, Some(nulls))
+        };
+        let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap();
+        let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
+        let expected = Int32Array::from(vec![
+            Some(42),
+            Some(42),
+            Some(123),
+            Some(123), // true in mask but null
+            Some(123),
+            Some(123),
+        ]);
+        assert_eq!(actual, &expected);
+    }
+
+    #[test]
+    fn test_zip_string_array_with_nulls_is_mask_should_be_treated_as_false() {
+        let truthy = StringArray::from_iter_values(vec!["1", "2", "3", "4", 
"5", "6"]);
+        let falsy = StringArray::from_iter_values(vec!["7", "8", "9", "10", 
"11", "12"]);
+
+        let mask = {
+            let booleans = BooleanBuffer::from(vec![true, true, false, true, 
false, false]);
+            let nulls = NullBuffer::from(vec![
+                true, true, true,
+                false, // null treated as false even though in the original 
mask it was true
+                true, true,
+            ]);
+            BooleanArray::new(booleans, Some(nulls))
+        };
+        let out = zip(&mask, &truthy, &falsy).unwrap();
+        let actual = out.as_string::<i32>();
+        let expected = StringArray::from_iter_values(vec![
+            "1", "2", "9", "10", // true in mask but null
+            "11", "12",
+        ]);
+        assert_eq!(actual, &expected);
+    }
+
+    #[test]
+    fn 
test_zip_kernel_large_string_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
+     {
+        let scalar_truthy = 
Scalar::new(LargeStringArray::from_iter_values(["test"]));
+        let scalar_falsy = 
Scalar::new(LargeStringArray::from_iter_values(["something else"]));
+
+        let mask = {
+            let booleans = BooleanBuffer::from(vec![true, true, false, true, 
false, false]);
+            let nulls = NullBuffer::from(vec![
+                true, true, true,
+                false, // null treated as false even though in the original 
mask it was true
+                true, true,
+            ]);
+            BooleanArray::new(booleans, Some(nulls))
+        };
+        let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap();
+        let actual = out.as_any().downcast_ref::<LargeStringArray>().unwrap();
+        let expected = LargeStringArray::from_iter(vec![
+            Some("test"),
+            Some("test"),
+            Some("something else"),
+            Some("something else"), // true in mask but null
+            Some("something else"),
+            Some("something else"),
+        ]);
+        assert_eq!(actual, &expected);
+    }
 }

Reply via email to