alamb commented on code in PR #12125: URL: https://github.com/apache/datafusion/pull/12125#discussion_r1729938966
########## datafusion/functions-nested/src/array_has.rs: ########## @@ -95,25 +96,138 @@ impl ScalarUDFImpl for ArrayHas { } fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { - make_scalar_function(array_has_inner)(args) + // Always return null if the second argumet is null + // i.e. array_has(array, null) -> null + if let ColumnarValue::Scalar(s) = &args[1] { + if s.is_null() { + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); + } + } + + // first, identify if any of the arguments is an Array. If yes, store its `len`, + // as any scalar will need to be converted to an array of len `len`. + let len = args + .iter() + .fold(Option::<usize>::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + + let result = match args[1] { + ColumnarValue::Array(_) => { + let args = ColumnarValue::values_to_arrays(args)?; + array_has_inner_for_array(&args[0], &args[1]) + } + ColumnarValue::Scalar(_) => { + let haystack = args[0].to_owned().into_array(1)?; + let needle = args[1].to_owned().into_array(1)?; + let needle = Scalar::new(needle); + arrat_has_inner_for_scalar(&haystack, &needle) + } + }; + + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } } fn aliases(&self) -> &[String] { &self.aliases } } -fn array_has_inner(args: &[ArrayRef]) -> Result<ArrayRef> { - match args[0].data_type() { - DataType::List(_) => array_has_dispatch::<i32>(&args[0], &args[1]), - DataType::LargeList(_) => array_has_dispatch::<i64>(&args[0], &args[1]), +fn arrat_has_inner_for_scalar( Review Comment: typo: ```suggestion fn array_has_inner_for_scalar( ``` ########## datafusion/sqllogictest/test_files/array.slt: ########## @@ -4972,11 +4972,19 @@ NULL 1 1 ## array_has/array_has_all/array_has_any -query BB +# If lhs is empty, return false +query B +select array_has([], 1); +---- +false + +# If rhs is Null, we returns Null +query BBB select array_has([], null), - array_has([1, 2, 3], null); + array_has([1, 2, 3], null), + array_has([null, 1], null); ---- -false false +NULL NULL NULL #TODO: array_has_all and array_has_any cannot handle NULL Review Comment: Is there coverage in this file for `array_has` when the first argument is an actual column (not a scalar)? I didn't see such tests but maybe I missed them ########## datafusion/functions-nested/src/array_has.rs: ########## @@ -95,25 +96,138 @@ impl ScalarUDFImpl for ArrayHas { } fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { - make_scalar_function(array_has_inner)(args) + // Always return null if the second argumet is null + // i.e. array_has(array, null) -> null + if let ColumnarValue::Scalar(s) = &args[1] { + if s.is_null() { + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); + } + } + + // first, identify if any of the arguments is an Array. If yes, store its `len`, + // as any scalar will need to be converted to an array of len `len`. + let len = args + .iter() + .fold(Option::<usize>::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + + let result = match args[1] { + ColumnarValue::Array(_) => { + let args = ColumnarValue::values_to_arrays(args)?; + array_has_inner_for_array(&args[0], &args[1]) + } + ColumnarValue::Scalar(_) => { + let haystack = args[0].to_owned().into_array(1)?; + let needle = args[1].to_owned().into_array(1)?; + let needle = Scalar::new(needle); + arrat_has_inner_for_scalar(&haystack, &needle) + } + }; + + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } } fn aliases(&self) -> &[String] { &self.aliases } } -fn array_has_inner(args: &[ArrayRef]) -> Result<ArrayRef> { - match args[0].data_type() { - DataType::List(_) => array_has_dispatch::<i32>(&args[0], &args[1]), - DataType::LargeList(_) => array_has_dispatch::<i64>(&args[0], &args[1]), +fn arrat_has_inner_for_scalar( + haystack: &ArrayRef, + needle: &dyn Datum, +) -> Result<ArrayRef> { + match haystack.data_type() { + DataType::List(_) => array_has_dispatch_for_scalar::<i32>(haystack, needle), + DataType::LargeList(_) => array_has_dispatch_for_scalar::<i64>(haystack, needle), _ => exec_err!( "array_has does not support type '{:?}'.", - args[0].data_type() + haystack.data_type() + ), + } +} + +fn array_has_inner_for_array(haystack: &ArrayRef, needle: &ArrayRef) -> Result<ArrayRef> { + match haystack.data_type() { + DataType::List(_) => array_has_dispatch_for_array::<i32>(haystack, needle), + DataType::LargeList(_) => array_has_dispatch_for_array::<i64>(haystack, needle), + _ => exec_err!( + "array_has does not support type '{:?}'.", + haystack.data_type() ), } } +fn array_has_dispatch_for_array<O: OffsetSizeTrait>( + haystack: &ArrayRef, + needle: &ArrayRef, +) -> Result<ArrayRef> { + let haystack = as_generic_list_array::<O>(haystack)?; + let mut boolean_builder = BooleanArray::builder(haystack.len()); + + for (i, arr) in haystack.iter().enumerate() { + if arr.is_none() || needle.is_null(i) { + boolean_builder.append_null(); + continue; + } + let arr = arr.unwrap(); + let needle_row = Scalar::new(needle.slice(i, 1)); + let eq_array = compare_op_for_nested(Operator::Eq, &arr, &needle_row)?; + let is_contained = eq_array.true_count() > 0; + boolean_builder.append_value(is_contained) + } + + Ok(Arc::new(boolean_builder.finish())) +} + +fn array_has_dispatch_for_scalar<O: OffsetSizeTrait>( + haystack: &ArrayRef, + needle: &dyn Datum, +) -> Result<ArrayRef> { + let haystack = as_generic_list_array::<O>(haystack)?; + let values = haystack.values(); + let offsets = haystack.value_offsets(); + // If first argument is empty list (second argument is non-null), return false + // i.e. array_has([], non-null element) -> false + if values.len() == 0 { + return Ok(Arc::new(BooleanArray::from(vec![Some(false)]))); + } + let eq_array = compare_op_for_nested(Operator::Eq, values, needle)?; + let mut final_contained = vec![None; haystack.len()]; + for (i, offset) in offsets.windows(2).enumerate() { + let start = offset[0].to_usize().unwrap(); + let end = offset[1].to_usize().unwrap(); + let length = end - start; + // For non-nested list, length is 0 for null + if length == 0 { + continue; + } + // For nested lsit, check number of nulls + let null_count = eq_array.slice(start, length).null_count(); + if null_count == length { + continue; + } + + let number_of_true = eq_array.slice(start, length).true_count(); Review Comment: I think you could reuse the call to `slice` above rather than making the same slice twice. It might also be faster to directly iterate over the relevant parts of the eq_array rather than creating a slice. But perhaps that could be done in a follow on PR ########## datafusion/functions-nested/src/array_has.rs: ########## @@ -95,25 +96,138 @@ impl ScalarUDFImpl for ArrayHas { } fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { - make_scalar_function(array_has_inner)(args) + // Always return null if the second argumet is null + // i.e. array_has(array, null) -> null + if let ColumnarValue::Scalar(s) = &args[1] { + if s.is_null() { + return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None))); + } + } + + // first, identify if any of the arguments is an Array. If yes, store its `len`, + // as any scalar will need to be converted to an array of len `len`. + let len = args + .iter() + .fold(Option::<usize>::None, |acc, arg| match arg { + ColumnarValue::Scalar(_) => acc, + ColumnarValue::Array(a) => Some(a.len()), + }); + + let is_scalar = len.is_none(); + + let result = match args[1] { + ColumnarValue::Array(_) => { + let args = ColumnarValue::values_to_arrays(args)?; + array_has_inner_for_array(&args[0], &args[1]) + } + ColumnarValue::Scalar(_) => { + let haystack = args[0].to_owned().into_array(1)?; + let needle = args[1].to_owned().into_array(1)?; + let needle = Scalar::new(needle); + arrat_has_inner_for_scalar(&haystack, &needle) + } + }; + + if is_scalar { + // If all inputs are scalar, keeps output as scalar + let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0)); + result.map(ColumnarValue::Scalar) + } else { + result.map(ColumnarValue::Array) + } } fn aliases(&self) -> &[String] { &self.aliases } } -fn array_has_inner(args: &[ArrayRef]) -> Result<ArrayRef> { - match args[0].data_type() { - DataType::List(_) => array_has_dispatch::<i32>(&args[0], &args[1]), - DataType::LargeList(_) => array_has_dispatch::<i64>(&args[0], &args[1]), +fn arrat_has_inner_for_scalar( + haystack: &ArrayRef, + needle: &dyn Datum, +) -> Result<ArrayRef> { + match haystack.data_type() { + DataType::List(_) => array_has_dispatch_for_scalar::<i32>(haystack, needle), + DataType::LargeList(_) => array_has_dispatch_for_scalar::<i64>(haystack, needle), _ => exec_err!( "array_has does not support type '{:?}'.", - args[0].data_type() + haystack.data_type() + ), + } +} + +fn array_has_inner_for_array(haystack: &ArrayRef, needle: &ArrayRef) -> Result<ArrayRef> { + match haystack.data_type() { + DataType::List(_) => array_has_dispatch_for_array::<i32>(haystack, needle), + DataType::LargeList(_) => array_has_dispatch_for_array::<i64>(haystack, needle), + _ => exec_err!( + "array_has does not support type '{:?}'.", + haystack.data_type() ), } } +fn array_has_dispatch_for_array<O: OffsetSizeTrait>( + haystack: &ArrayRef, + needle: &ArrayRef, +) -> Result<ArrayRef> { + let haystack = as_generic_list_array::<O>(haystack)?; + let mut boolean_builder = BooleanArray::builder(haystack.len()); + + for (i, arr) in haystack.iter().enumerate() { + if arr.is_none() || needle.is_null(i) { + boolean_builder.append_null(); + continue; + } + let arr = arr.unwrap(); + let needle_row = Scalar::new(needle.slice(i, 1)); + let eq_array = compare_op_for_nested(Operator::Eq, &arr, &needle_row)?; + let is_contained = eq_array.true_count() > 0; + boolean_builder.append_value(is_contained) + } + + Ok(Arc::new(boolean_builder.finish())) +} + +fn array_has_dispatch_for_scalar<O: OffsetSizeTrait>( + haystack: &ArrayRef, + needle: &dyn Datum, +) -> Result<ArrayRef> { + let haystack = as_generic_list_array::<O>(haystack)?; + let values = haystack.values(); + let offsets = haystack.value_offsets(); + // If first argument is empty list (second argument is non-null), return false + // i.e. array_has([], non-null element) -> false + if values.len() == 0 { + return Ok(Arc::new(BooleanArray::from(vec![Some(false)]))); + } + let eq_array = compare_op_for_nested(Operator::Eq, values, needle)?; + let mut final_contained = vec![None; haystack.len()]; + for (i, offset) in offsets.windows(2).enumerate() { + let start = offset[0].to_usize().unwrap(); + let end = offset[1].to_usize().unwrap(); + let length = end - start; + // For non-nested list, length is 0 for null + if length == 0 { + continue; + } + // For nested lsit, check number of nulls Review Comment: ```suggestion // For nested list, check number of nulls ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org