alamb commented on code in PR #8886:
URL: https://github.com/apache/arrow-datafusion/pull/8886#discussion_r1457650837
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -121,58 +291,220 @@ where
}
}
+// given an function that maps a `&str`, `&str` to an arrow native type,
Review Comment:
This is a very cool function. Maybe we can pull it into a shared
function/utility at some point
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -84,7 +172,96 @@ where
array.iter().map(|x| x.map(&op).transpose()).collect()
}
-// given an function that maps a `&str` to a arrow native type,
+/// given a function `op` that maps `&str`, `&str` to the first successful
Result
+/// of an arrow native type, returns a `PrimitiveArray` after the application
of the
+/// function to `args` and the subsequence application of the `op2` function
to any
+/// successful result. This function calls the `op` function with the first
and second
+/// argument and if not successful continues with first and third, first and
fourth,
+/// etc until the result was successful or no more arguments are present.
+/// # Errors
+/// This function errors iff:
+/// * the number of arguments is not > 1 or
+/// * the array arguments are not castable to a `GenericStringArray` or
+/// * the function `op` errors for all input
+pub(crate) fn strings_to_primitive_function<'a, T, O, F, F2>(
+ args: &'a [ColumnarValue],
+ op: F,
+ op2: F2,
+ name: &str,
+) -> Result<PrimitiveArray<O>>
+where
+ O: ArrowPrimitiveType,
+ T: OffsetSizeTrait,
+ F: Fn(&'a str, &'a str) -> Result<O::Native>,
+ F2: Fn(O::Native) -> O::Native,
+{
+ if args.len() < 2 {
+ return internal_err!(
+ "{:?} args were supplied but {} takes 2 or more arguments",
+ args.len(),
+ name
+ );
+ }
+
+ // this will throw the error if any of the array args are not castable to
GenericStringArray
+ let data = args
+ .iter()
+ .map(|a| match a {
+ ColumnarValue::Array(a) => {
+ Ok(Either::Left(as_generic_string_array::<T>(a.as_ref())?))
+ }
+ ColumnarValue::Scalar(s) => match s {
+ ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) =>
Ok(Either::Right(a)),
+ other => internal_err!(
+ "Unexpected scalar type encountered '{other}' for function
'{name}'"
+ ),
+ },
+ })
+ .collect::<Result<Vec<Either<&GenericStringArray<T>,
&Option<String>>>>>()?;
+
+ let first_arg = &data.first().unwrap().left().unwrap();
+
+ first_arg
+ .iter()
+ .enumerate()
+ .map(|(pos, x)| {
+ let mut val = None;
+
+ if let Some(x) = x {
+ let param_args = data.iter().skip(1);
+
+ // go through the args and find the first successful result.
Only the last
+ // failure will be returned if no successful result was
received.
+ for param_arg in param_args {
+ // param_arg is an array, use the corresponding index into
the array as the arg
+ // we're currently parsing
+ let p = *param_arg;
+ let r = if p.is_left() {
+ let p = p.left().unwrap();
+ op(x, p.value(pos))
+ }
+ // args is a scalar, use it directly
+ else if let Some(p) = p.right().unwrap() {
+ op(x, p.as_str())
+ } else {
+ continue;
+ };
+
+ if r.is_ok() {
Review Comment:
One thing to consider here is that this pattern will allocate a string to
generate an error message that will get discarded if there are more arguments
to go.
The alternate is to pass back an Option and only generate an `Error` when it
actually needs to be returned
This is a potential optimization for the future, I think, not needed for
this PR
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -121,58 +291,220 @@ where
}
}
+// given an function that maps a `&str`, `&str` to an arrow native type,
+// returns a `ColumnarValue` where the function is applied to either a
`ArrayRef` or `ScalarValue`
+// depending on the `args`'s variant.
+fn handle_multiple<'a, O, F, S, M>(
+ args: &'a [ColumnarValue],
+ op: F,
+ op2: M,
+ name: &str,
+) -> Result<ColumnarValue>
+where
+ O: ArrowPrimitiveType,
+ S: ScalarType<O::Native>,
+ F: Fn(&'a str, &'a str) -> Result<O::Native>,
+ M: Fn(O::Native) -> O::Native,
+{
+ match &args[0] {
+ ColumnarValue::Array(a) => match a.data_type() {
+ DataType::Utf8 | DataType::LargeUtf8 => {
+ // validate the column types
+ for (pos, arg) in args.iter().enumerate() {
+ match arg {
+ ColumnarValue::Array(arg) => match arg.data_type() {
+ DataType::Utf8 | DataType::LargeUtf8 => {
+ // all good
+ },
+ other => return internal_err!("Unsupported
data type {other:?} for function {name}, arg # {pos}"),
+ },
+ ColumnarValue::Scalar(arg) => { match arg.data_type() {
+ DataType::Utf8 | DataType::LargeUtf8 => {
+ // all good
+ },
+ other => return internal_err!("Unsupported data
type {other:?} for function {name}, arg # {pos}"),
+ }}
+ }
+ }
+
+ Ok(ColumnarValue::Array(Arc::new(
+ strings_to_primitive_function::<i32, O, _, _>(args, op,
op2, name)?,
+ )))
+ }
+ other => {
+ internal_err!("Unsupported data type {other:?} for function
{name}")
+ }
+ },
+ // if the first argument is a scalar utf8 all arguments are expected
to be scalar utf8
+ ColumnarValue::Scalar(scalar) => match scalar {
+ ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
+ let mut val: Option<Result<ColumnarValue>> = None;
+ let mut err: Option<DataFusionError> = None;
+
+ match a {
+ Some(a) => {
+ // enumerate all the values finding the first one that
returns an Ok result
+ for (pos, v) in args.iter().enumerate().skip(1) {
+ if let ColumnarValue::Scalar(s) = v {
+ if let ScalarValue::Utf8(x) |
ScalarValue::LargeUtf8(x) =
+ s
+ {
+ if let Some(s) = x {
+ match op(a.as_str(), s.as_str()) {
+ Ok(r) => {
+ val =
Some(Ok(ColumnarValue::Scalar(
+ S::scalar(Some(op2(r))),
+ )));
+ break;
+ }
+ Err(e) => {
+ err = Some(e);
+ }
+ }
+ }
+ } else {
+ return internal_err!("Unsupported data
type {s:?} for function {name}, arg # {pos}");
Review Comment:
We don't have to support invoking this pattern, but I don't think it should
generate an internal error - maybe we can add a test for this too
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]