comphead commented on code in PR #19529:
URL: https://github.com/apache/datafusion/pull/19529#discussion_r2649910125


##########
datafusion/functions/src/string/contains.rs:
##########
@@ -89,51 +89,82 @@ impl ScalarUDFImpl for ContainsFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
-        make_scalar_function(contains, vec![])(&args.args)
+        make_scalar_function_columnar(contains)(&args.args)
     }
 
     fn documentation(&self) -> Option<&Documentation> {
         self.doc()
     }
 }
 
+/// Converts a `ColumnarValue` to a value that implements `Datum` for use with 
arrow kernels.
+/// If the value is a scalar, wraps the single-element array in `Scalar` to 
signal to arrow
+/// that this is a scalar value (enabling optimized code paths).
+fn columnar_to_datum(value: &ColumnarValue) -> Result<(ArrayRef, bool)> {
+    match value {
+        ColumnarValue::Array(array) => Ok((Arc::clone(array), false)),
+        ColumnarValue::Scalar(scalar) => Ok((scalar.to_array()?, true)),
+    }
+}
+
+/// Helper to call arrow_contains with proper Datum handling.
+/// When an argument is marked as scalar, we wrap it in `Scalar` to tell 
arrow's
+/// kernel to use the optimized single-value code path instead of iterating.
+fn call_arrow_contains(
+    haystack: &ArrayRef,
+    haystack_is_scalar: bool,
+    needle: &ArrayRef,
+    needle_is_scalar: bool,
+) -> Result<ColumnarValue> {
+    // Arrow's Datum trait is implemented for ArrayRef, Arc<dyn Array>, and 
Scalar<T>
+    // We pass ArrayRef directly when not scalar, or wrap in Scalar when it is
+    let result = match (haystack_is_scalar, needle_is_scalar) {
+        (false, false) => arrow_contains(haystack, needle)?,
+        (false, true) => arrow_contains(haystack, 
&Scalar::new(Arc::clone(needle)))?,
+        (true, false) => arrow_contains(&Scalar::new(Arc::clone(haystack)), 
needle)?,
+        (true, true) => arrow_contains(
+            &Scalar::new(Arc::clone(haystack)),
+            &Scalar::new(Arc::clone(needle)),
+        )?,
+    };
+
+    // If both inputs were scalar, return a scalar result
+    if haystack_is_scalar && needle_is_scalar {
+        let scalar = datafusion_common::ScalarValue::try_from_array(&result, 
0)?;
+        Ok(ColumnarValue::Scalar(scalar))
+    } else {
+        Ok(ColumnarValue::Array(Arc::new(result)))
+    }
+}
+
 /// use `arrow::compute::contains` to do the calculation for contains
-fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
+fn contains(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    let (haystack, haystack_is_scalar) = columnar_to_datum(&args[0])?;
+    let (needle, needle_is_scalar) = columnar_to_datum(&args[1])?;
+
     if let Some(coercion_data_type) =
-        string_coercion(args[0].data_type(), args[1].data_type()).or_else(|| {
-            binary_to_string_coercion(args[0].data_type(), args[1].data_type())
+        string_coercion(haystack.data_type(), needle.data_type()).or_else(|| {

Review Comment:
   another potential optimizations is to call coercion/datatype stuff only 
once, rather than per every batch



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to