This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 114beec770 Fix array_has simplification with null argument (#18186)
114beec770 is described below
commit 114beec770dfc7f12e581a5e178c897104b96c70
Author: Georgi Krastev <[email protected]>
AuthorDate: Wed Oct 22 11:33:19 2025 +0200
Fix array_has simplification with null argument (#18186)
## Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax. For example
`Closes #123` indicates that this PR will close issue #123.
-->
- Closes #.
## Rationale for this change
According to three-valued logic, `array_has` should return `null` when its
array argument is `null`; that is also what already happens when the argument
is not a constant, as seen in the test.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
## What changes are included in this PR?
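Updated `ArrayHas::simplify` to explicitly handle a `null` array argument, and
updated `invoke_with_args` to return `null` when the first argument has the
`Null` data type.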
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
## Are these changes tested?
Updated the `array_has` SQL test and added unit tests
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
## Are there any user-facing changes?
Yes, a minor change in behaviour with respect to `null`: `array_has` called
with a constant `null` array argument now returns `null`.
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
-->
<!--
If there are any breaking changes to public APIs, please add the `api
change` label.
-->
---
datafusion/functions-nested/src/array_has.rs | 93 ++++++++++++++++++++++++----
datafusion/sqllogictest/test_files/array.slt | 10 +--
2 files changed, 86 insertions(+), 17 deletions(-)
diff --git a/datafusion/functions-nested/src/array_has.rs
b/datafusion/functions-nested/src/array_has.rs
index f34fea0c4b..080b2f16d9 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -132,23 +132,26 @@ impl ScalarUDFImpl for ArrayHas {
// if the haystack is a constant list, we can use an inlist expression
which is more
// efficient because the haystack is not varying per-row
match haystack {
+ Expr::Literal(scalar, _) if scalar.is_null() => {
+ return Ok(ExprSimplifyResult::Simplified(Expr::Literal(
+ ScalarValue::Boolean(None),
+ None,
+ )))
+ }
Expr::Literal(
// FixedSizeList gets coerced to List
scalar @ ScalarValue::List(_) | scalar @
ScalarValue::LargeList(_),
_,
) => {
- let array = scalar.to_array().unwrap(); // guarantee of
ScalarValue
if let Ok(scalar_values) =
- ScalarValue::convert_array_to_scalar_vec(&array)
+
ScalarValue::convert_array_to_scalar_vec(&scalar.to_array()?)
{
assert_eq!(scalar_values.len(), 1);
let list = scalar_values
.into_iter()
- // If the vec is a singular null, `list` will be empty
due to this flatten().
- // It would be more clear if we handled the None
separately, but this is more performant.
.flatten()
.flatten()
- .map(|v| Expr::Literal(v.clone(), None))
+ .map(|v| Expr::Literal(v, None))
.collect();
return Ok(ExprSimplifyResult::Simplified(in_list(
@@ -178,6 +181,12 @@ impl ScalarUDFImpl for ArrayHas {
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
let [first_arg, second_arg] = take_function_args(self.name(),
&args.args)?;
+ if first_arg.data_type().is_null() {
+ // Always return null if the first argument is null
+ // i.e. array_has(null, element) -> null
+ return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+ }
+
match &second_arg {
ColumnarValue::Array(array_needle) => {
// the needle is already an array, convert the haystack to an
array of the same length
@@ -663,6 +672,7 @@ fn general_array_has_all_and_any_kernel(
mod tests {
use std::sync::Arc;
+ use arrow::datatypes::Int32Type;
use arrow::{
array::{create_array, Array, ArrayRef, AsArray, Int32Array, ListArray},
buffer::OffsetBuffer,
@@ -733,6 +743,40 @@ mod tests {
);
}
+ #[test]
+ fn test_simplify_array_has_with_null_to_null() {
+ let haystack = Expr::Literal(ScalarValue::Null, None);
+ let needle = col("c");
+
+ let props = ExecutionProps::new();
+ let context = datafusion_expr::simplify::SimplifyContext::new(&props);
+ let Ok(ExprSimplifyResult::Simplified(simplified)) =
+ ArrayHas::new().simplify(vec![haystack, needle], &context)
+ else {
+ panic!("Expected simplified expression");
+ };
+
+ assert_eq!(simplified, Expr::Literal(ScalarValue::Boolean(None),
None));
+ }
+
+ #[test]
+ fn test_simplify_array_has_with_null_list_to_null() {
+ let haystack =
+ ListArray::from_iter_primitive::<Int32Type, [Option<i32>; 0],
_>([None]);
+ let haystack = Expr::Literal(ScalarValue::List(Arc::new(haystack)),
None);
+ let needle = col("c");
+
+ let props = ExecutionProps::new();
+ let context = datafusion_expr::simplify::SimplifyContext::new(&props);
+ let Ok(ExprSimplifyResult::Simplified(simplified)) =
+ ArrayHas::new().simplify(vec![haystack, needle], &context)
+ else {
+ panic!("Expected simplified expression");
+ };
+
+ assert_eq!(simplified, Expr::Literal(ScalarValue::Boolean(None),
None));
+ }
+
#[test]
fn test_array_has_complex_list_not_simplified() {
let haystack = col("c1");
@@ -757,13 +801,9 @@ mod tests {
Field::new_list("", Field::new("", DataType::Int32, true), true),
true,
));
- let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
- let return_field = Arc::new(Field::new_list(
- "return",
- Field::new("", DataType::Boolean, true),
- true,
- ));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
+ let return_field = Arc::new(Field::new("return", DataType::Boolean,
true));
let haystack = ListArray::new(
Field::new_list_field(DataType::Int32, true).into(),
OffsetBuffer::new(vec![0, 0].into()),
@@ -773,7 +813,6 @@ mod tests {
let haystack = ColumnarValue::Array(Arc::new(haystack));
let needle = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
-
let result = ArrayHas::new().invoke_with_args(ScalarFunctionArgs {
args: vec![haystack, needle],
arg_fields: vec![haystack_field, needle_field],
@@ -789,4 +828,34 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
+ let haystack_field = Arc::new(Field::new("haystack", DataType::Null,
true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32,
true));
+ let return_field = Arc::new(Field::new("return", DataType::Boolean,
true));
+ let haystack =
+ ListArray::from_iter_primitive::<Int32Type, [Option<i32>; 0], _>([
+ None, None, None,
+ ]);
+
+ let haystack = ColumnarValue::Array(Arc::new(haystack));
+ let needle = ColumnarValue::Scalar(ScalarValue::Int32(Some(1)));
+ let result = ArrayHas::new().invoke_with_args(ScalarFunctionArgs {
+ args: vec![haystack, needle],
+ arg_fields: vec![haystack_field, needle_field],
+ number_rows: 1,
+ return_field,
+ config_options: Arc::new(ConfigOptions::default()),
+ })?;
+
+ let output = result.into_array(1)?;
+ let output = output.as_boolean();
+ assert_eq!(output.len(), 3);
+ for i in 0..3 {
+ assert!(output.is_null(i));
+ }
+
+ Ok(())
+ }
}
diff --git a/datafusion/sqllogictest/test_files/array.slt
b/datafusion/sqllogictest/test_files/array.slt
index f488204d6d..43899642a9 100644
--- a/datafusion/sqllogictest/test_files/array.slt
+++ b/datafusion/sqllogictest/test_files/array.slt
@@ -6040,13 +6040,13 @@ false
# array_has([1, 3, 5], 1) -> true (array contains element)
# array_has([], 1) -> false (empty array, not null)
# array_has(null, 1) -> null (null array)
-query B
-select array_has(column1, column2)
+query BB
+select array_has(column1, column2), array_has(null, column2)
from array_has_table_empty;
----
-true
-false
-NULL
+true NULL
+false NULL
+NULL NULL
# Test for issue: array_has should return false for empty arrays, not null
# This test demonstrates the correct behavior with COALESCE to show the
distinction
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]