This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 909608aa5f fix: Fix bug in `array_has` scalar path with sliced arrays 
(#20677)
909608aa5f is described below

commit 909608aa5f2e6872f81a692c5ac3281e3888dda7
Author: Neil Conway <[email protected]>
AuthorDate: Wed Mar 4 11:27:30 2026 -0500

    fix: Fix bug in `array_has` scalar path with sliced arrays (#20677)
    
    ## Which issue does this PR close?
    
    N/A
    
    ## Rationale for this change
    
    In #20374, `array_has` with a scalar needle was optimized to reconstruct
    matches more efficiently. Unfortunately, that code was incorrect for
    sliced arrays: `values()` returns the entire value buffer (including
    elements outside the visible slice), so we need to skip the
    corresponding indexes in the result bitmap.
    
    We could fix this by just skipping indexes, but it seems more robust and
    efficient to arrange to not compare the needle against elements outside
    the visible range in the first place.
    
    `array_position` has a similar behavior (introduced in #20532): it
    didn't have the buggy behavior, but it still did extra work for sliced
    arrays by comparing against elements outside the visible range.
    
    Benchmarking the revised code, there is no performance regression for
    unsliced arrays.
    
    ## What changes are included in this PR?
    
    * Fix `array_has` bug for sliced arrays with scalar needle
    * Improve `array_has` and `array_position` to not compare against
    elements outside the visible range of a sliced array
    * Add unit test for `array_has` bug
    * Add unit test to increase confidence in `array_position` behavior for
    sliced arrays
    
    ## Are these changes tested?
    
    Yes.
    
    ## Are there any user-facing changes?
    
    No.
---
 datafusion/functions-nested/src/array_has.rs | 74 ++++++++++++++++++++++---
 datafusion/functions-nested/src/position.rs  | 81 +++++++++++++++++++++++++---
 2 files changed, 140 insertions(+), 15 deletions(-)

diff --git a/datafusion/functions-nested/src/array_has.rs 
b/datafusion/functions-nested/src/array_has.rs
index ace69de66f..76cf786c95 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -352,8 +352,6 @@ fn array_has_dispatch_for_scalar(
     haystack: ArrayWrapper<'_>,
     needle: &dyn Datum,
 ) -> Result<ArrayRef> {
-    let values = haystack.values();
-    let is_nested = values.data_type().is_nested();
     // If first argument is empty list (second argument is non-null), return 
false
     // i.e. array_has([], non-null element) -> false
     if haystack.len() == 0 {
@@ -362,7 +360,17 @@ fn array_has_dispatch_for_scalar(
             None,
         )));
     }
-    let eq_array = compare_with_eq(values, needle, is_nested)?;
+
+    // For sliced ListArrays, values() returns the full underlying array but
+    // only elements between the first and last offset are visible.
+    let offsets: Vec<usize> = haystack.offsets().collect();
+    let first_offset = offsets[0];
+    let visible_values = haystack
+        .values()
+        .slice(first_offset, offsets[offsets.len() - 1] - first_offset);
+
+    let is_nested = visible_values.data_type().is_nested();
+    let eq_array = compare_with_eq(&visible_values, needle, is_nested)?;
 
     // When a haystack element is null, `eq()` returns null (not false).
     // In Arrow, a null BooleanArray entry has validity=0 but an
@@ -382,10 +390,14 @@ fn array_has_dispatch_for_scalar(
         ArrayWrapper::LargeList(arr) => arr.nulls(),
     };
     let mut matches = eq_bits.set_indices().peekable();
-    let mut values = BooleanBufferBuilder::new(haystack.len());
-    values.append_n(haystack.len(), false);
+    let mut result = BooleanBufferBuilder::new(haystack.len());
+    result.append_n(haystack.len(), false);
+
+    // Match positions are relative to visible_values (0-based), so
+    // subtract first_offset from each offset when comparing.
+    for (i, window) in offsets.windows(2).enumerate() {
+        let end = window[1] - first_offset;
 
-    for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() {
         let has_match = matches.peek().is_some_and(|&p| p < end);
 
         // Advance past all match positions in this row's range.
@@ -394,14 +406,14 @@ fn array_has_dispatch_for_scalar(
         }
 
         if has_match && validity.is_none_or(|v| v.is_valid(i)) {
-            values.set_bit(i, true);
+            result.set_bit(i, true);
         }
     }
 
     // A null haystack row always produces a null output, so we can
     // reuse the haystack's null buffer directly.
     Ok(Arc::new(BooleanArray::new(
-        values.finish(),
+        result.finish(),
         validity.cloned(),
     )))
 }
@@ -1066,6 +1078,52 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_array_has_sliced_list() -> Result<(), DataFusionError> {
+        // [[10, 20], [30, 40], [50, 60], [70, 80]]  →  slice(1,2)  →  [[30, 
40], [50, 60]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(10), Some(20)]),
+            Some(vec![Some(30), Some(40)]),
+            Some(vec![Some(50), Some(60)]),
+            Some(vec![Some(70), Some(80)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, 
true));
+        let return_field = Arc::new(Field::new("return", DataType::Boolean, 
true));
+
+        // Search for elements that exist only in sliced-away rows:
+        // 10 is in the prefix row, 70 is in the suffix row.
+        let invoke = |needle: i32| -> Result<ArrayRef, DataFusionError> {
+            ArrayHas::new()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Array(Arc::new(sliced.clone())),
+                        
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+                    ],
+                    arg_fields: vec![
+                        Arc::clone(&haystack_field),
+                        Arc::clone(&needle_field),
+                    ],
+                    number_rows: 2,
+                    return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
+                })?
+                .into_array(2)
+        };
+
+        let output = invoke(10)?.as_boolean().clone();
+        assert!(!output.value(0));
+        assert!(!output.value(1));
+
+        let output = invoke(70)?.as_boolean().clone();
+        assert!(!output.value(0));
+        assert!(!output.value(1));
+
+        Ok(())
+    }
+
     #[test]
     fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
         let haystack_field = Arc::new(Field::new("haystack", DataType::Null, 
true));
diff --git a/datafusion/functions-nested/src/position.rs 
b/datafusion/functions-nested/src/position.rs
index ba16d08538..0214b1552b 100644
--- a/datafusion/functions-nested/src/position.rs
+++ b/datafusion/functions-nested/src/position.rs
@@ -230,26 +230,36 @@ fn array_position_scalar<O: OffsetSizeTrait>(
         "array_position",
         &[list_array.values(), element_array],
     )?;
-    let element_datum = Scalar::new(Arc::clone(element_array));
-
-    let offsets = list_array.offsets();
-    let validity = list_array.nulls();
 
     if list_array.len() == 0 {
         return Ok(Arc::new(UInt64Array::new_null(0)));
     }
 
+    let element_datum = Scalar::new(Arc::clone(element_array));
+    let validity = list_array.nulls();
+
+    // Only compare the visible portion of the values buffer, which avoids
+    // wasted work for sliced ListArrays.
+    let offsets = list_array.offsets();
+    let first_offset = offsets[0].as_usize();
+    let last_offset = offsets[list_array.len()].as_usize();
+    let visible_values = list_array
+        .values()
+        .slice(first_offset, last_offset - first_offset);
+
     // `not_distinct` treats NULL=NULL as true, matching the semantics of
     // `array_position`
-    let eq_array = arrow_ord::cmp::not_distinct(list_array.values(), 
&element_datum)?;
+    let eq_array = arrow_ord::cmp::not_distinct(&visible_values, 
&element_datum)?;
     let eq_bits = eq_array.values();
 
     let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
     let mut matches = eq_bits.set_indices().peekable();
 
+    // Match positions are relative to visible_values (0-based), so
+    // subtract first_offset from each offset when comparing.
     for i in 0..list_array.len() {
-        let start = offsets[i].as_usize();
-        let end = offsets[i + 1].as_usize();
+        let start = offsets[i].as_usize() - first_offset;
+        let end = offsets[i + 1].as_usize() - first_offset;
 
         if validity.is_some_and(|v| v.is_null(i)) {
             // Null row -> null output; advance past matches in range
@@ -474,3 +484,60 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
         ListArray::from_iter_primitive::<UInt64Type, _, _>(data),
     ))
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::AsArray;
+    use arrow::datatypes::Int32Type;
+    use datafusion_common::config::ConfigOptions;
+    use datafusion_expr::ScalarFunctionArgs;
+
+    #[test]
+    fn test_array_position_sliced_list() -> Result<()> {
+        // [[10, 20], [30, 40], [50, 60], [70, 80]]  →  slice(1,2)  →  [[30, 
40], [50, 60]]
+        let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+            Some(vec![Some(10), Some(20)]),
+            Some(vec![Some(30), Some(40)]),
+            Some(vec![Some(50), Some(60)]),
+            Some(vec![Some(70), Some(80)]),
+        ]);
+        let sliced = list.slice(1, 2);
+        let haystack_field =
+            Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+        let needle_field = Arc::new(Field::new("needle", DataType::Int32, 
true));
+        let return_field = Arc::new(Field::new("return", UInt64, true));
+
+        // Search for elements that exist only in sliced-away rows:
+        // 10 is in the prefix row, 70 is in the suffix row.
+        let invoke = |needle: i32| -> Result<ArrayRef> {
+            ArrayPosition::new()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: vec![
+                        ColumnarValue::Array(Arc::new(sliced.clone())),
+                        
ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+                    ],
+                    arg_fields: vec![
+                        Arc::clone(&haystack_field),
+                        Arc::clone(&needle_field),
+                    ],
+                    number_rows: 2,
+                    return_field: Arc::clone(&return_field),
+                    config_options: Arc::new(ConfigOptions::default()),
+                })?
+                .into_array(2)
+        };
+
+        let output = invoke(10)?;
+        let output = output.as_primitive::<UInt64Type>();
+        assert!(output.is_null(0));
+        assert!(output.is_null(1));
+
+        let output = invoke(70)?;
+        let output = output.as_primitive::<UInt64Type>();
+        assert!(output.is_null(0));
+        assert!(output.is_null(1));
+
+        Ok(())
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to