jorgecarleitao commented on a change in pull request #9376:
URL: https://github.com/apache/arrow/pull/9376#discussion_r568342573



##########
File path: rust/arrow/src/conversions.rs
##########
@@ -0,0 +1,127 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review comment:
       It was renamed on a different PR to `temporal_conversions`. I rebased 
this PR so that we no longer have these changes.

##########
File path: rust/datafusion/examples/simple_udf.rs
##########
@@ -54,50 +58,76 @@ fn create_context() -> Result<ExecutionContext> {
     Ok(ctx)
 }
 
+// a small utility function to compute pow(base, exponent)
+fn maybe_pow(base: &Option<f64>, exponent: &Option<f64>) -> Option<f64> {
+    match (base, exponent) {
+        // in arrow, any value can be null.
+        // Here we decide to make our UDF to return null when either base or 
exponent is null.
+        (Some(base), Some(exponent)) => Some(base.powf(*exponent)),
+        _ => None,
+    }
+}
+
+fn pow_array(base: &dyn Array, exponent: &dyn Array) -> Result<ArrayRef> {
+    // 1. cast both arguments to f64. These casts MUST be aligned with the 
signature or this function panics!
+    let base = base
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .expect("cast failed");
+    let exponent = exponent
+        .as_any()
+        .downcast_ref::<Float64Array>()
+        .expect("cast failed");
+
+    // this is guaranteed by DataFusion. We place it just to make it obvious.
+    assert_eq!(exponent.len(), base.len());
+
+    // 2. perform the computation
+    let array = base
+        .iter()
+        .zip(exponent.iter())
+        .map(|(base, exponent)| maybe_pow(&base, &exponent))
+        .collect::<Float64Array>();
+
+    // `Ok` because no error occurred during the calculation (we should add 
one if exponent was [0, 1[ and the base < 0 because that panics!)

Review comment:
       This represents an open interval: `0 <= exponent < 1` because `(x)^1 = 
x` for all x (including negative ones).

##########
File path: rust/datafusion/src/physical_plan/datetime_expressions.rs
##########
@@ -167,152 +175,166 @@ fn naive_datetime_to_timestamp(s: &str, datetime: 
NaiveDateTime) -> Result<i64>
     }
 }
 
-/// convert an array of strings into `Timestamp(Nanosecond, None)`
-pub fn to_timestamp(args: &[ArrayRef]) -> Result<TimestampNanosecondArray> {
-    let num_rows = args[0].len();
-    let string_args =
-        &args[0]
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(
-                    "could not cast to_timestamp input to 
StringArray".to_string(),
-                )
-            })?;
-
-    let result = (0..num_rows)
-        .map(|i| {
-            if string_args.is_null(i) {
-                // NB: Since we use the same null bitset as the input,
-                // the output for this value will be ignored, but we
-                // need some value in the array we are building.
-                Ok(0)
-            } else {
-                string_to_timestamp_nanos(string_args.value(i))
+pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>(
+    args: &[&'a dyn Array],
+    op: F,
+    name: &str,
+) -> Result<PrimitiveArray<O>>
+where
+    O: ArrowPrimitiveType,
+    T: StringOffsetSizeTrait,
+    F: Fn(&'a str) -> Result<O::Native>,
+{
+    if args.len() != 1 {
+        return Err(DataFusionError::Internal(format!(
+            "{:?} args were supplied but {} takes exactly one argument",
+            args.len(),
+            name,
+        )));
+    }
+
+    let array = args[0]
+        .as_any()
+        .downcast_ref::<GenericStringArray<T>>()
+        .unwrap();
+
+    // first map is the iterator, second is for the `Option<_>`
+    array.iter().map(|x| x.map(|x| op(x)).transpose()).collect()
+}
+
+fn handle<'a, O, F, S>(

Review comment:
       The pattern depends on the input and output of the function. I.e. when 
input is `&str`, then `Utf8/LargeUtf8`. When output is a `Native`, then the 
output is `PrimitiveArray<O::Native>`. In general this construct depends on the 
what the user is trying to achieve (wrt to input and output types).
   
   I placed this here because it allows to decouple the pattern (of handling 
Scalar and Array) from the implementation of the logic 
(`string_to_timestamp_nanos` in this case).
   
   In crypto_expressions we have a similar pattern, but in this case the 
function is `&str -> AsRef<[u8]>`, which allowed to write all cripto `sha` in a 
succinct manner. However, in that case, the output type is always `Binary` 
instead of `LargeBinary` for `LargeStringArray`, because the hashes are always 
smaller than `i32::MAX`.
   
   

##########
File path: rust/datafusion/src/physical_plan/datetime_expressions.rs
##########
@@ -167,152 +175,166 @@ fn naive_datetime_to_timestamp(s: &str, datetime: 
NaiveDateTime) -> Result<i64>
     }
 }
 
-/// convert an array of strings into `Timestamp(Nanosecond, None)`
-pub fn to_timestamp(args: &[ArrayRef]) -> Result<TimestampNanosecondArray> {
-    let num_rows = args[0].len();
-    let string_args =
-        &args[0]
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(
-                    "could not cast to_timestamp input to 
StringArray".to_string(),
-                )
-            })?;
-
-    let result = (0..num_rows)
-        .map(|i| {
-            if string_args.is_null(i) {
-                // NB: Since we use the same null bitset as the input,
-                // the output for this value will be ignored, but we
-                // need some value in the array we are building.
-                Ok(0)
-            } else {
-                string_to_timestamp_nanos(string_args.value(i))
+pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>(
+    args: &[&'a dyn Array],
+    op: F,
+    name: &str,
+) -> Result<PrimitiveArray<O>>
+where
+    O: ArrowPrimitiveType,
+    T: StringOffsetSizeTrait,
+    F: Fn(&'a str) -> Result<O::Native>,
+{
+    if args.len() != 1 {
+        return Err(DataFusionError::Internal(format!(
+            "{:?} args were supplied but {} takes exactly one argument",
+            args.len(),
+            name,
+        )));
+    }
+
+    let array = args[0]
+        .as_any()
+        .downcast_ref::<GenericStringArray<T>>()
+        .unwrap();
+
+    // first map is the iterator, second is for the `Option<_>`
+    array.iter().map(|x| x.map(|x| op(x)).transpose()).collect()
+}
+
+fn handle<'a, O, F, S>(

Review comment:
       The pattern depends on the input and output of the function. I.e. when 
input is `&str`, then `Utf8/LargeUtf8`. When output is a `Native`, then the 
output is `PrimitiveArray<O::Native>`. In general this construct depends on the 
what the user is trying to achieve (wrt to input and output types).
   
   I placed this here because it allows to decouple the pattern (of handling 
Scalar and Array) from the implementation of the logic 
(`string_to_timestamp_nanos` in this case).
   
   In `crypto_expressions` we have a similar pattern, but in this case the 
function is `&str -> AsRef<[u8]>`, which allowed to write all cripto `sha` in a 
succinct manner. However, in that case, the output type is always `Binary` 
instead of `LargeBinary` for `LargeStringArray`, because the hashes are always 
smaller than `i32::MAX`. All of this was already written, I just expanded it 
for the two variants (scalar and vector).
   
   Note that `crypto_expressions::handle` and `crypto_expressions::md5` are 
very similar, but their return types are different: `unary_binary_function` 
receives a `GenericStringArray`, but returns a `BinaryArray`. This is because 
`MD5`'s signature is `string -> string`, while `sha` is `string -> binary`.
   

##########
File path: rust/datafusion/src/physical_plan/datetime_expressions.rs
##########
@@ -167,152 +175,166 @@ fn naive_datetime_to_timestamp(s: &str, datetime: 
NaiveDateTime) -> Result<i64>
     }
 }
 
-/// convert an array of strings into `Timestamp(Nanosecond, None)`
-pub fn to_timestamp(args: &[ArrayRef]) -> Result<TimestampNanosecondArray> {
-    let num_rows = args[0].len();
-    let string_args =
-        &args[0]
-            .as_any()
-            .downcast_ref::<StringArray>()
-            .ok_or_else(|| {
-                DataFusionError::Internal(
-                    "could not cast to_timestamp input to 
StringArray".to_string(),
-                )
-            })?;
-
-    let result = (0..num_rows)
-        .map(|i| {
-            if string_args.is_null(i) {
-                // NB: Since we use the same null bitset as the input,
-                // the output for this value will be ignored, but we
-                // need some value in the array we are building.
-                Ok(0)
-            } else {
-                string_to_timestamp_nanos(string_args.value(i))
+pub(crate) fn unary_string_to_primitive_function<'a, T, O, F>(
+    args: &[&'a dyn Array],
+    op: F,
+    name: &str,
+) -> Result<PrimitiveArray<O>>
+where
+    O: ArrowPrimitiveType,
+    T: StringOffsetSizeTrait,
+    F: Fn(&'a str) -> Result<O::Native>,
+{
+    if args.len() != 1 {
+        return Err(DataFusionError::Internal(format!(
+            "{:?} args were supplied but {} takes exactly one argument",
+            args.len(),
+            name,
+        )));
+    }
+
+    let array = args[0]
+        .as_any()
+        .downcast_ref::<GenericStringArray<T>>()
+        .unwrap();
+
+    // first map is the iterator, second is for the `Option<_>`
+    array.iter().map(|x| x.map(|x| op(x)).transpose()).collect()
+}
+
+fn handle<'a, O, F, S>(

Review comment:
       The pattern depends on the input and output of the function. I.e. when 
input is `&str`, then `Utf8/LargeUtf8`. When output is a `Native`, then the 
output is `PrimitiveArray<O::Native>`. In general this construct depends on the 
what the user is trying to achieve (wrt to input and output types).
   
   I placed this here because it allows to decouple the pattern (of handling 
Scalar and Array) from the implementation of the logic 
(`string_to_timestamp_nanos` in this case).
   
   In `crypto_expressions` we have a similar pattern, but in this case the 
function is `&str -> AsRef<[u8]>`, which allowed to write all cripto `sha` in a 
succinct manner. However, in that case, the output type is always `Binary` 
instead of `LargeBinary` for `LargeStringArray`, because the hashes are always 
smaller than `i32::MAX`. All of this was already written, I just expanded it 
for the two variants (scalar and vector).
   
   Note that `crypto_expressions::handle` and `crypto_expressions::md5` are 
very similar, but their return types are different: `handle` receives a 
`GenericStringArray`, but returns a `BinaryArray`. This is because `MD5`'s 
signature is `string -> string`, while `sha` is `string -> binary`.
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to