This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 6c3e588118 fix: `zip` now treats nulls as false in provided mask
regardless of the underlying bit value (#8711)
6c3e588118 is described below
commit 6c3e5881185bf74b56ce96233135bb540826554b
Author: Raz Luvaton <[email protected]>
AuthorDate: Mon Oct 27 22:34:29 2025 +0200
fix: `zip` now treats nulls as false in provided mask regardless of the
underlying bit value (#8711)
# Which issue does this PR close?
- closes https://github.com/apache/arrow-rs/issues/8721
# Rationale for this change
mask `nulls` should be treated as `false` (even if the underlying values
are not 0) as described in the docs for zip
# What changes are included in this PR?
used `prep_null_mask_filter` before iterating over the mask,
added tests for both scalar and non scalar (to prepare for #8653)
# Are these changes tested?
Yes
# Are there any user-facing changes?
Kinda
---
arrow-select/src/filter.rs | 8 ++-
arrow-select/src/zip.rs | 125 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 130 insertions(+), 3 deletions(-)
diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs
index cd132de39d..6a5ba13c95 100644
--- a/arrow-select/src/filter.rs
+++ b/arrow-select/src/filter.rs
@@ -58,7 +58,13 @@ pub struct SlicesIterator<'a>(BitSliceIterator<'a>);
impl<'a> SlicesIterator<'a> {
/// Creates a new iterator from a [BooleanArray]
pub fn new(filter: &'a BooleanArray) -> Self {
- Self(filter.values().set_slices())
+ filter.values().into()
+ }
+}
+
+impl<'a> From<&'a BooleanBuffer> for SlicesIterator<'a> {
+ fn from(filter: &'a BooleanBuffer) -> Self {
+ Self(filter.set_slices())
}
}
diff --git a/arrow-select/src/zip.rs b/arrow-select/src/zip.rs
index 2efd2e7499..c202be6b62 100644
--- a/arrow-select/src/zip.rs
+++ b/arrow-select/src/zip.rs
@@ -17,8 +17,9 @@
//! [`zip`]: Combine values from two arrays based on boolean mask
-use crate::filter::SlicesIterator;
+use crate::filter::{SlicesIterator, prep_null_mask_filter};
use arrow_array::*;
+use arrow_buffer::BooleanBuffer;
use arrow_data::transform::MutableArrayData;
use arrow_schema::ArrowError;
@@ -127,7 +128,8 @@ pub fn zip(
// keep track of how much is filled
let mut filled = 0;
- SlicesIterator::new(mask).for_each(|(start, end)| {
+ let mask = maybe_prep_null_mask_filter(mask);
+ SlicesIterator::from(&mask).for_each(|(start, end)| {
// the gap needs to be filled with falsy values
if start > filled {
if falsy_is_scalar {
@@ -166,9 +168,22 @@ pub fn zip(
Ok(make_array(data))
}
+fn maybe_prep_null_mask_filter(predicate: &BooleanArray) -> BooleanBuffer {
+ // Nulls are treated as false
+ if predicate.null_count() == 0 {
+ predicate.values().clone()
+ } else {
+ let cleaned = prep_null_mask_filter(predicate);
+ let (boolean_buffer, _) = cleaned.into_parts();
+ boolean_buffer
+ }
+}
+
#[cfg(test)]
mod test {
use super::*;
+ use arrow_array::cast::AsArray;
+ use arrow_buffer::{BooleanBuffer, NullBuffer};
#[test]
fn test_zip_kernel_one() {
@@ -279,4 +294,110 @@ mod test {
let expected = Int32Array::from(vec![None, None, Some(42), Some(42),
None]);
assert_eq!(actual, &expected);
}
+
+ #[test]
+ fn
test_zip_primitive_array_with_nulls_is_mask_should_be_treated_as_false() {
+ let truthy = Int32Array::from_iter_values(vec![1, 2, 3, 4, 5, 6]);
+ let falsy = Int32Array::from_iter_values(vec![7, 8, 9, 10, 11, 12]);
+
+ let mask = {
+ let booleans = BooleanBuffer::from(vec![true, true, false, true,
false, false]);
+ let nulls = NullBuffer::from(vec![
+ true, true, true,
+ false, // null treated as false even though in the original
mask it was true
+ true, true,
+ ]);
+ BooleanArray::new(booleans, Some(nulls))
+ };
+ let out = zip(&mask, &truthy, &falsy).unwrap();
+ let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
+ let expected = Int32Array::from(vec![
+ Some(1),
+ Some(2),
+ Some(9),
+ Some(10), // true in mask but null
+ Some(11),
+ Some(12),
+ ]);
+ assert_eq!(actual, &expected);
+ }
+
+ #[test]
+ fn
test_zip_kernel_primitive_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
+ {
+ let scalar_truthy = Scalar::new(Int32Array::from_value(42, 1));
+ let scalar_falsy = Scalar::new(Int32Array::from_value(123, 1));
+
+ let mask = {
+ let booleans = BooleanBuffer::from(vec![true, true, false, true,
false, false]);
+ let nulls = NullBuffer::from(vec![
+ true, true, true,
+ false, // null treated as false even though in the original
mask it was true
+ true, true,
+ ]);
+ BooleanArray::new(booleans, Some(nulls))
+ };
+ let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap();
+ let actual = out.as_any().downcast_ref::<Int32Array>().unwrap();
+ let expected = Int32Array::from(vec![
+ Some(42),
+ Some(42),
+ Some(123),
+ Some(123), // true in mask but null
+ Some(123),
+ Some(123),
+ ]);
+ assert_eq!(actual, &expected);
+ }
+
+ #[test]
+ fn test_zip_string_array_with_nulls_is_mask_should_be_treated_as_false() {
+ let truthy = StringArray::from_iter_values(vec!["1", "2", "3", "4",
"5", "6"]);
+ let falsy = StringArray::from_iter_values(vec!["7", "8", "9", "10",
"11", "12"]);
+
+ let mask = {
+ let booleans = BooleanBuffer::from(vec![true, true, false, true,
false, false]);
+ let nulls = NullBuffer::from(vec![
+ true, true, true,
+ false, // null treated as false even though in the original
mask it was true
+ true, true,
+ ]);
+ BooleanArray::new(booleans, Some(nulls))
+ };
+ let out = zip(&mask, &truthy, &falsy).unwrap();
+ let actual = out.as_string::<i32>();
+ let expected = StringArray::from_iter_values(vec![
+ "1", "2", "9", "10", // true in mask but null
+ "11", "12",
+ ]);
+ assert_eq!(actual, &expected);
+ }
+
+ #[test]
+ fn
test_zip_kernel_large_string_scalar_with_boolean_array_mask_with_nulls_should_be_treated_as_false()
+ {
+ let scalar_truthy =
Scalar::new(LargeStringArray::from_iter_values(["test"]));
+ let scalar_falsy =
Scalar::new(LargeStringArray::from_iter_values(["something else"]));
+
+ let mask = {
+ let booleans = BooleanBuffer::from(vec![true, true, false, true,
false, false]);
+ let nulls = NullBuffer::from(vec![
+ true, true, true,
+ false, // null treated as false even though in the original
mask it was true
+ true, true,
+ ]);
+ BooleanArray::new(booleans, Some(nulls))
+ };
+ let out = zip(&mask, &scalar_truthy, &scalar_falsy).unwrap();
+ let actual = out.as_any().downcast_ref::<LargeStringArray>().unwrap();
+ let expected = LargeStringArray::from_iter(vec![
+ Some("test"),
+ Some("test"),
+ Some("something else"),
+ Some("something else"), // true in mask but null
+ Some("something else"),
+ Some("something else"),
+ ]);
+ assert_eq!(actual, &expected);
+ }
}