This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 909608aa5f fix: Fix bug in `array_has` scalar path with sliced arrays
(#20677)
909608aa5f is described below
commit 909608aa5f2e6872f81a692c5ac3281e3888dda7
Author: Neil Conway <[email protected]>
AuthorDate: Wed Mar 4 11:27:30 2026 -0500
fix: Fix bug in `array_has` scalar path with sliced arrays (#20677)
## Which issue does this PR close?
N/A
## Rationale for this change
In #20374, `array_has` with a scalar needle was optimized to reconstruct
matches more efficiently. Unfortunately, that code was incorrect for
sliced arrays: `values()` returns the entire value buffer (including
elements outside the visible slice), so we need to skip the
corresponding indexes in the result bitmap.
We could fix this by just skipping indexes, but it seems more robust and
efficient to arrange to not compare the needle against elements outside
the visible range in the first place.
`array_position` has a similar behavior (introduced in #20532): it
didn't have the buggy behavior, but it still did extra work for sliced
arrays by comparing against elements outside the visible range.
Benchmarking the revised code, there is no performance regression for
unsliced arrays.
## What changes are included in this PR?
* Fix `array_has` bug for sliced arrays with scalar needle
* Improve `array_has` and `array_position` to not compare against
elements outside the visible range of a sliced array
* Add unit test for `array_has` bug
* Add unit test to increase confidence in `array_position` behavior for
sliced arrays
## Are these changes tested?
Yes.
## Are there any user-facing changes?
No.
---
datafusion/functions-nested/src/array_has.rs | 74 ++++++++++++++++++++++---
datafusion/functions-nested/src/position.rs | 81 +++++++++++++++++++++++++---
2 files changed, 140 insertions(+), 15 deletions(-)
diff --git a/datafusion/functions-nested/src/array_has.rs
b/datafusion/functions-nested/src/array_has.rs
index ace69de66f..76cf786c95 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -352,8 +352,6 @@ fn array_has_dispatch_for_scalar(
haystack: ArrayWrapper<'_>,
needle: &dyn Datum,
) -> Result<ArrayRef> {
- let values = haystack.values();
- let is_nested = values.data_type().is_nested();
// If first argument is empty list (second argument is non-null), return
false
// i.e. array_has([], non-null element) -> false
if haystack.len() == 0 {
@@ -362,7 +360,17 @@ fn array_has_dispatch_for_scalar(
None,
)));
}
- let eq_array = compare_with_eq(values, needle, is_nested)?;
+
+ // For sliced ListArrays, values() returns the full underlying array but
+ // only elements between the first and last offset are visible.
+ let offsets: Vec<usize> = haystack.offsets().collect();
+ let first_offset = offsets[0];
+ let visible_values = haystack
+ .values()
+ .slice(first_offset, offsets[offsets.len() - 1] - first_offset);
+
+ let is_nested = visible_values.data_type().is_nested();
+ let eq_array = compare_with_eq(&visible_values, needle, is_nested)?;
// When a haystack element is null, `eq()` returns null (not false).
// In Arrow, a null BooleanArray entry has validity=0 but an
@@ -382,10 +390,14 @@ fn array_has_dispatch_for_scalar(
ArrayWrapper::LargeList(arr) => arr.nulls(),
};
let mut matches = eq_bits.set_indices().peekable();
- let mut values = BooleanBufferBuilder::new(haystack.len());
- values.append_n(haystack.len(), false);
+ let mut result = BooleanBufferBuilder::new(haystack.len());
+ result.append_n(haystack.len(), false);
+
+ // Match positions are relative to visible_values (0-based), so
+ // subtract first_offset from each offset when comparing.
+ for (i, window) in offsets.windows(2).enumerate() {
+ let end = window[1] - first_offset;
- for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() {
let has_match = matches.peek().is_some_and(|&p| p < end);
// Advance past all match positions in this row's range.
@@ -394,14 +406,14 @@ fn array_has_dispatch_for_scalar(
}
if has_match && validity.is_none_or(|v| v.is_valid(i)) {
- values.set_bit(i, true);
+ result.set_bit(i, true);
}
}
// A null haystack row always produces a null output, so we can
// reuse the haystack's null buffer directly.
Ok(Arc::new(BooleanArray::new(
- values.finish(),
+ result.finish(),
validity.cloned(),
)))
}
@@ -1066,6 +1078,52 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_array_has_sliced_list() -> Result<(), DataFusionError> {
+ // [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30,
40], [50, 60]]
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+ Some(vec![Some(10), Some(20)]),
+ Some(vec![Some(30), Some(40)]),
+ Some(vec![Some(50), Some(60)]),
+ Some(vec![Some(70), Some(80)]),
+ ]);
+ let sliced = list.slice(1, 2);
+ let haystack_field =
+ Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
+ let return_field = Arc::new(Field::new("return", DataType::Boolean,
true));
+
+ // Search for elements that exist only in sliced-away rows:
+ // 10 is in the prefix row, 70 is in the suffix row.
+ let invoke = |needle: i32| -> Result<ArrayRef, DataFusionError> {
+ ArrayHas::new()
+ .invoke_with_args(ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(Arc::new(sliced.clone())),
+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+ ],
+ arg_fields: vec![
+ Arc::clone(&haystack_field),
+ Arc::clone(&needle_field),
+ ],
+ number_rows: 2,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::new(ConfigOptions::default()),
+ })?
+ .into_array(2)
+ };
+
+ let output = invoke(10)?.as_boolean().clone();
+ assert!(!output.value(0));
+ assert!(!output.value(1));
+
+ let output = invoke(70)?.as_boolean().clone();
+ assert!(!output.value(0));
+ assert!(!output.value(1));
+
+ Ok(())
+ }
+
#[test]
fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
let haystack_field = Arc::new(Field::new("haystack", DataType::Null,
true));
diff --git a/datafusion/functions-nested/src/position.rs
b/datafusion/functions-nested/src/position.rs
index ba16d08538..0214b1552b 100644
--- a/datafusion/functions-nested/src/position.rs
+++ b/datafusion/functions-nested/src/position.rs
@@ -230,26 +230,36 @@ fn array_position_scalar<O: OffsetSizeTrait>(
"array_position",
&[list_array.values(), element_array],
)?;
- let element_datum = Scalar::new(Arc::clone(element_array));
-
- let offsets = list_array.offsets();
- let validity = list_array.nulls();
if list_array.len() == 0 {
return Ok(Arc::new(UInt64Array::new_null(0)));
}
+ let element_datum = Scalar::new(Arc::clone(element_array));
+ let validity = list_array.nulls();
+
+ // Only compare the visible portion of the values buffer, which avoids
+ // wasted work for sliced ListArrays.
+ let offsets = list_array.offsets();
+ let first_offset = offsets[0].as_usize();
+ let last_offset = offsets[list_array.len()].as_usize();
+ let visible_values = list_array
+ .values()
+ .slice(first_offset, last_offset - first_offset);
+
// `not_distinct` treats NULL=NULL as true, matching the semantics of
// `array_position`
- let eq_array = arrow_ord::cmp::not_distinct(list_array.values(),
&element_datum)?;
+ let eq_array = arrow_ord::cmp::not_distinct(&visible_values,
&element_datum)?;
let eq_bits = eq_array.values();
let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
let mut matches = eq_bits.set_indices().peekable();
+ // Match positions are relative to visible_values (0-based), so
+ // subtract first_offset from each offset when comparing.
for i in 0..list_array.len() {
- let start = offsets[i].as_usize();
- let end = offsets[i + 1].as_usize();
+ let start = offsets[i].as_usize() - first_offset;
+ let end = offsets[i + 1].as_usize() - first_offset;
if validity.is_some_and(|v| v.is_null(i)) {
// Null row -> null output; advance past matches in range
@@ -474,3 +484,60 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
ListArray::from_iter_primitive::<UInt64Type, _, _>(data),
))
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use arrow::array::AsArray;
+ use arrow::datatypes::Int32Type;
+ use datafusion_common::config::ConfigOptions;
+ use datafusion_expr::ScalarFunctionArgs;
+
+ #[test]
+ fn test_array_position_sliced_list() -> Result<()> {
+ // [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30,
40], [50, 60]]
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+ Some(vec![Some(10), Some(20)]),
+ Some(vec![Some(30), Some(40)]),
+ Some(vec![Some(50), Some(60)]),
+ Some(vec![Some(70), Some(80)]),
+ ]);
+ let sliced = list.slice(1, 2);
+ let haystack_field =
+ Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
+ let return_field = Arc::new(Field::new("return", UInt64, true));
+
+ // Search for elements that exist only in sliced-away rows:
+ // 10 is in the prefix row, 70 is in the suffix row.
+ let invoke = |needle: i32| -> Result<ArrayRef> {
+ ArrayPosition::new()
+ .invoke_with_args(ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(Arc::new(sliced.clone())),
+
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+ ],
+ arg_fields: vec![
+ Arc::clone(&haystack_field),
+ Arc::clone(&needle_field),
+ ],
+ number_rows: 2,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::new(ConfigOptions::default()),
+ })?
+ .into_array(2)
+ };
+
+ let output = invoke(10)?;
+ let output = output.as_primitive::<UInt64Type>();
+ assert!(output.is_null(0));
+ assert!(output.is_null(1));
+
+ let output = invoke(70)?;
+ let output = output.as_primitive::<UInt64Type>();
+ assert!(output.is_null(0));
+ assert!(output.is_null(1));
+
+ Ok(())
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]