alamb commented on a change in pull request #9376:
URL: https://github.com/apache/arrow/pull/9376#discussion_r568168403



##########
File path: rust/datafusion/examples/simple_udf.rs
##########
@@ -54,50 +58,76 @@ fn create_context() -> Result<ExecutionContext> {
     Ok(ctx)
 }
 
+// a small utility function to compute pow(base, exponent)
+fn maybe_pow(base: &Option<f64>, exponent: &Option<f64>) -> Option<f64> {
+    match (base, exponent) {
+        // in arrow, any value can be null.
+        // Here we decide to make our UDF to return null when either base or 
exponent is null.
+        (Some(base), Some(exponent)) => Some(base.powf(*exponent)),
+        _ => None,
+    }
+}
+
+fn pow_array(base: &dyn Array, exponent: &dyn Array) -> Result<ArrayRef> {
+    // 1. cast both arguments to f64. These casts MUST be aligned with the 
signature or this function panics!
+    let base = base
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .expect("cast failed");
+    let exponent = exponent
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .expect("cast failed");
+
+    // this is guaranteed by DataFusion. We place it just to make it obvious.
+    assert_eq!(exponent.len(), base.len());
+
+    // 2. perform the computation
+    let array = base
+        .iter()
+        .zip(exponent.iter())
+        .map(|(base, exponent)| maybe_pow(&base, &exponent))
+        .collect::<Float64Array>();
+
+    // `Ok` because no error occurred during the calculation (we should add 
one if exponent was [0, 1[ and the base < 0 because that panics!)

Review comment:
       ```suggestion
       // `Ok` because no error occurred during the calculation (we should add 
one if exponent was [0, 1] and the base < 0 because that panics!)
   ```

##########
File path: rust/datafusion/src/physical_plan/datetime_expressions.rs
##########
@@ -167,152 +175,166 @@ fn naive_datetime_to_timestamp(s: &str, datetime: 
NaiveDateTime) -> Result<i64>
     }
 }
 
-/// convert an array of strings into `Timestamp(Nanosecond, None)`
-pub fn to_timestamp(args: &[ArrayRef]) -> Result<TimestampNanosecondArray> {
-    let num_rows = args[0].len();
-    let string_args =
-        &args[0]
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(
-                    "could not cast to_timestamp input to 
StringArray".to_string(),
-                )
-            })?;
-
-    let result = (0..num_rows)
-        .map(|i| {
-            if string_args.is_null(i) {
-                // NB: Since we use the same null bitset as the input,
-                // the output for this value will be ignored, but we
-                // need some value in the array we are building.
-                Ok(0)
-            } else {
-                string_to_timestamp_nanos(string_args.value(i))
+pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>(
+    args: &[&'a dyn Array],
+    op: F,
+    name: &str,
+) -> Result<PrimitiveArray<O>>
+where
+    O: ArrowPrimitiveType,
+    T: StringOffsetSizeTrait,
+    F: Fn(&'a str) -> Result<O::Native>,
+{
+    if args.len() != 1 {
+        return Err(DataFusionError::Internal(format!(
+            "{:?} args were supplied but {} takes exactly one argument",
+            args.len(),
+            name,
+        )));
+    }
+
+    let array = args[0]
+        .as_any()
+        .downcast_ref::<GenericStringArray<T>>()
+        .unwrap();
+
+    // first map is the iterator, second is for the `Option<_>`
+    array.iter().map(|x| x.map(|x| op(x)).transpose()).collect()
+}
+
+fn handle<'a, O, F, S>(

Review comment:
       I may be missing it -- but this function only seems to be invoked once 
with the same set of type arguments -- does it need to be generic? Or more 
broadly, can we hoist this seemingly repeated `handle` pattern somewhere it can 
be  reused (and reduce the cognitive load on people writing new functions?)

##########
File path: rust/datafusion/src/physical_plan/datetime_expressions.rs
##########
@@ -167,152 +175,166 @@ fn naive_datetime_to_timestamp(s: &str, datetime: 
NaiveDateTime) -> Result<i64>
     }
 }
 
-/// convert an array of strings into `Timestamp(Nanosecond, None)`
-pub fn to_timestamp(args: &[ArrayRef]) -> Result<TimestampNanosecondArray> {
-    let num_rows = args[0].len();
-    let string_args =
-        &args[0]
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(
-                    "could not cast to_timestamp input to 
StringArray".to_string(),
-                )
-            })?;
-
-    let result = (0..num_rows)
-        .map(|i| {
-            if string_args.is_null(i) {
-                // NB: Since we use the same null bitset as the input,
-                // the output for this value will be ignored, but we
-                // need some value in the array we are building.
-                Ok(0)
-            } else {
-                string_to_timestamp_nanos(string_args.value(i))
+pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>(

Review comment:
       I think adding an example of using this pattern for implementing UDfs 
might be really helpful

##########
File path: rust/arrow/src/conversions.rs
##########
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review comment:
       this module might be more specifically named `datetime_conversions.rs`

##########
File path: rust/datafusion/src/physical_plan/string_expressions.rs
##########
@@ -17,27 +17,100 @@
 
 //! String expressions
 
-use crate::error::{DataFusionError, Result};
-use arrow::array::{
-    Array, ArrayRef, GenericStringArray, StringArray, StringBuilder,
-    StringOffsetSizeTrait,
+use std::sync::Arc;
+
+use crate::{
+    error::{DataFusionError, Result},
+    scalar::ScalarValue,
+};
+use arrow::{
+    array::{Array, GenericStringArray, StringArray, StringOffsetSizeTrait},
+    datatypes::DataType,
 };
 
-macro_rules! downcast_vec {
-    ($ARGS:expr, $ARRAY_TYPE:ident) => {{
-        $ARGS
-            .iter()
-            .map(|e| match e.as_any().downcast_ref::<$ARRAY_TYPE>() {
-                Some(array) => Ok(array),
-                _ => Err(DataFusionError::Internal("failed to 
downcast".to_string())),
-            })
-    }};
+use super::ColumnarValue;
+
+pub(crate) fn unary_string_function<'a, T, O, F, R>(

Review comment:
       this function is effectively applying a `str->str` function `op` to all 
values in `args`? Some comments might be helpful




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to