jayzhan211 commented on code in PR #12125:
URL: https://github.com/apache/datafusion/pull/12125#discussion_r1729948604


##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -95,25 +96,138 @@ impl ScalarUDFImpl for ArrayHas {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        make_scalar_function(array_has_inner)(args)
+        // Always return null if the second argumet is null
+        // i.e. array_has(array, null) -> null
+        if let ColumnarValue::Scalar(s) = &args[1] {
+            if s.is_null() {
+                return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+            }
+        }
+
+        // first, identify if any of the arguments is an Array. If yes, store 
its `len`,
+        // as any scalar will need to be converted to an array of len `len`.
+        let len = args
+            .iter()
+            .fold(Option::<usize>::None, |acc, arg| match arg {
+                ColumnarValue::Scalar(_) => acc,
+                ColumnarValue::Array(a) => Some(a.len()),
+            });
+
+        let is_scalar = len.is_none();
+
+        let result = match args[1] {
+            ColumnarValue::Array(_) => {
+                let args = ColumnarValue::values_to_arrays(args)?;
+                array_has_inner_for_array(&args[0], &args[1])
+            }
+            ColumnarValue::Scalar(_) => {
+                let haystack = args[0].to_owned().into_array(1)?;
+                let needle = args[1].to_owned().into_array(1)?;
+                let needle = Scalar::new(needle);
+                arrat_has_inner_for_scalar(&haystack, &needle)
+            }
+        };
+
+        if is_scalar {
+            // If all inputs are scalar, keeps output as scalar
+            let result = result.and_then(|arr| 
ScalarValue::try_from_array(&arr, 0));
+            result.map(ColumnarValue::Scalar)
+        } else {
+            result.map(ColumnarValue::Array)
+        }
     }
 
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
 }
 
-fn array_has_inner(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        DataType::List(_) => array_has_dispatch::<i32>(&args[0], &args[1]),
-        DataType::LargeList(_) => array_has_dispatch::<i64>(&args[0], 
&args[1]),
+fn arrat_has_inner_for_scalar(
+    haystack: &ArrayRef,
+    needle: &dyn Datum,
+) -> Result<ArrayRef> {
+    match haystack.data_type() {
+        DataType::List(_) => array_has_dispatch_for_scalar::<i32>(haystack, 
needle),
+        DataType::LargeList(_) => 
array_has_dispatch_for_scalar::<i64>(haystack, needle),
         _ => exec_err!(
             "array_has does not support type '{:?}'.",
-            args[0].data_type()
+            haystack.data_type()
+        ),
+    }
+}
+
+fn array_has_inner_for_array(haystack: &ArrayRef, needle: &ArrayRef) -> 
Result<ArrayRef> {
+    match haystack.data_type() {
+        DataType::List(_) => array_has_dispatch_for_array::<i32>(haystack, 
needle),
+        DataType::LargeList(_) => 
array_has_dispatch_for_array::<i64>(haystack, needle),
+        _ => exec_err!(
+            "array_has does not support type '{:?}'.",
+            haystack.data_type()
         ),
     }
 }
 
+fn array_has_dispatch_for_array<O: OffsetSizeTrait>(
+    haystack: &ArrayRef,
+    needle: &ArrayRef,
+) -> Result<ArrayRef> {
+    let haystack = as_generic_list_array::<O>(haystack)?;
+    let mut boolean_builder = BooleanArray::builder(haystack.len());
+
+    for (i, arr) in haystack.iter().enumerate() {
+        if arr.is_none() || needle.is_null(i) {
+            boolean_builder.append_null();
+            continue;
+        }
+        let arr = arr.unwrap();
+        let needle_row = Scalar::new(needle.slice(i, 1));
+        let eq_array = compare_op_for_nested(Operator::Eq, &arr, &needle_row)?;
+        let is_contained = eq_array.true_count() > 0;
+        boolean_builder.append_value(is_contained)
+    }
+
+    Ok(Arc::new(boolean_builder.finish()))
+}
+
+fn array_has_dispatch_for_scalar<O: OffsetSizeTrait>(
+    haystack: &ArrayRef,
+    needle: &dyn Datum,
+) -> Result<ArrayRef> {
+    let haystack = as_generic_list_array::<O>(haystack)?;
+    let values = haystack.values();
+    let offsets = haystack.value_offsets();
+    // If first argument is empty list (second argument is non-null), return 
false
+    // i.e. array_has([], non-null element) -> false
+    if values.len() == 0 {
+        return Ok(Arc::new(BooleanArray::from(vec![Some(false)])));
+    }
+    let eq_array = compare_op_for_nested(Operator::Eq, values, needle)?;
+    let mut final_contained = vec![None; haystack.len()];
+    for (i, offset) in offsets.windows(2).enumerate() {
+        let start = offset[0].to_usize().unwrap();
+        let end = offset[1].to_usize().unwrap();
+        let length = end - start;
+        // For non-nested list, length is 0 for null
+        if length == 0 {
+            continue;
+        }
+        // For nested lsit, check number of nulls
+        let null_count = eq_array.slice(start, length).null_count();
+        if null_count == length {
+            continue;
+        }
+
+        let number_of_true = eq_array.slice(start, length).true_count();

Review Comment:
   I didn't find iterate over directly being faster, I guess `null_count` and 
`true_count` specialization is much better over the overhead of creating slice. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to