[GitHub] [arrow] alamb commented on a change in pull request #8080: ARROW-9887: [Rust] [DataFusion] Added support for complex return types for built-in functions

GitBox Sun, 30 Aug 2020 04:29:38 -0700


alamb commented on a change in pull request #8080:
URL: https://github.com/apache/arrow/pull/8080#discussion_r479755529




##########
File path: rust/datafusion/src/execution/physical_plan/functions.rs
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Declaration of built-in (scalar) functions.
+//! This module contains built-in functions' enumeration and metadata.
+//!
+//! Generally, a function has:
+//! * a set of valid argument types
+//! * a return type function of an incoming argument types
+//! * the computation valid for all sets of valid argument types
+//!
+//! * Argument types: the number of arguments and set of valid types. For 
example, [[f32, f32], [f64, f64]] is a function of two arguments only accepting 
f32 or f64 on each of its arguments.
+//! * Return type: a function `(arg_types) -> return_type`. E.g. for sqrt, 
([f32]) -> f32, ([f64]) -> f64.
+//!
+//! Currently, this implementation supports only a single argument and a 
single signature.
+//!
+//! This module also has a set of coercion rules to improve user experience: 
if an argument i32 is passed
+//! to a function that supports f64, it is coerced to f64.
+
+use super::{
+    expressions::{cast, is_numeric},
+    PhysicalExpr,
+};
+use crate::error::{ExecutionError, Result};
+use crate::execution::physical_plan::math_expressions;
+use crate::execution::physical_plan::udf;
+use arrow::{
+    compute::kernels::length::length,
+    datatypes::{DataType, Schema},
+};
+use std::{fmt, str::FromStr, sync::Arc};
+use udf::ScalarUdf;
+
+/// Enum of all built-in scalar functions
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ScalarFunction {
+    /// sqrt
+    Sqrt,
+    /// sin
+    Sin,
+    /// cos
+    Cos,
+    /// tan
+    Tan,
+    /// asin
+    Asin,
+    /// acos
+    Acos,
+    /// atan
+    Atan,
+    /// exp
+    Exp,
+    /// log, also known as ln
+    Log,
+    /// log2
+    Log2,
+    /// log10
+    Log10,
+    /// floor
+    Floor,
+    /// ceil
+    Ceil,
+    /// round
+    Round,
+    /// trunc
+    Trunc,
+    /// abs
+    Abs,
+    /// signum
+    Signum,
+    /// length
+    Length,
+}
+
+impl fmt::Display for ScalarFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // lowercase of the debug.
+        write!(f, "{}", format!("{:?}", self).to_lowercase())
+    }
+}
+
+impl FromStr for ScalarFunction {
+    type Err = ExecutionError;
+    fn from_str(name: &str) -> Result<ScalarFunction> {
+        Ok(match name {
+            "sqrt" => ScalarFunction::Sqrt,
+            "sin" => ScalarFunction::Sin,
+            "cos" => ScalarFunction::Cos,
+            "tan" => ScalarFunction::Tan,
+            "asin" => ScalarFunction::Asin,
+            "acos" => ScalarFunction::Acos,
+            "atan" => ScalarFunction::Atan,
+            "exp" => ScalarFunction::Exp,
+            "log" => ScalarFunction::Log,
+            "log2" => ScalarFunction::Log2,
+            "log10" => ScalarFunction::Log10,
+            "floor" => ScalarFunction::Floor,
+            "ceil" => ScalarFunction::Ceil,
+            "round" => ScalarFunction::Round,
+            "truc" => ScalarFunction::Trunc,
+            "abs" => ScalarFunction::Abs,
+            "signum" => ScalarFunction::Signum,
+            "length" => ScalarFunction::Length,
+            _ => {
+                return Err(ExecutionError::General(format!(
+                    "There is no built-in function named {}",
+                    name
+                )))
+            }
+        })
+    }
+}
+
+/// Returns the datatype of the scalar function
+pub fn return_type(fun: &ScalarFunction, arg_types: &Vec<DataType>) -> 
Result<DataType> {
+    // Note that this function *must* return the same type that the respective 
physical expression returns
+    // or the execution panics.
+
+    if arg_types.len() != 1 {
+        // for now, every function expects a single argument, and thus this is 
enough
+        return Err(ExecutionError::General(format!(
+            "The function \"{}\" expected 1 argument, but received \"{}\"",
+            fun,
+            arg_types.len()
+        )));
+    }
+
+    // verify that this is a valid type for this function
+    coerce(fun, &arg_types[0])?;
+
+    // the return type after coercion.
+    // for now, this is type-independent, but there will be built-in functions 
whose return type

Review comment:
       ```suggestion
       // for now, this is type-independent, but there may be built-in 
functions whose return type
   ```

##########
File path: rust/datafusion/src/execution/physical_plan/functions.rs
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Declaration of built-in (scalar) functions.
+//! This module contains built-in functions' enumeration and metadata.
+//!
+//! Generally, a function has:
+//! * a set of valid argument types
+//! * a return type function of an incoming argument types
+//! * the computation valid for all sets of valid argument types
+//!
+//! * Argument types: the number of arguments and set of valid types. For 
example, [[f32, f32], [f64, f64]] is a function of two arguments only accepting 
f32 or f64 on each of its arguments.
+//! * Return type: a function `(arg_types) -> return_type`. E.g. for sqrt, 
([f32]) -> f32, ([f64]) -> f64.
+//!
+//! Currently, this implementation supports only a single argument and a 
single signature.
+//!
+//! This module also has a set of coercion rules to improve user experience: 
if an argument i32 is passed
+//! to a function that supports f64, it is coerced to f64.
+
+use super::{
+    expressions::{cast, is_numeric},
+    PhysicalExpr,
+};
+use crate::error::{ExecutionError, Result};
+use crate::execution::physical_plan::math_expressions;
+use crate::execution::physical_plan::udf;
+use arrow::{
+    compute::kernels::length::length,
+    datatypes::{DataType, Schema},
+};
+use std::{fmt, str::FromStr, sync::Arc};
+use udf::ScalarUdf;
+
+/// Enum of all built-in scalar functions
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ScalarFunction {
+    /// sqrt
+    Sqrt,
+    /// sin
+    Sin,
+    /// cos
+    Cos,
+    /// tan
+    Tan,
+    /// asin
+    Asin,
+    /// acos
+    Acos,
+    /// atan
+    Atan,
+    /// exp
+    Exp,
+    /// log, also known as ln
+    Log,
+    /// log2
+    Log2,
+    /// log10
+    Log10,
+    /// floor
+    Floor,
+    /// ceil
+    Ceil,
+    /// round
+    Round,
+    /// trunc
+    Trunc,
+    /// abs
+    Abs,
+    /// signum
+    Signum,
+    /// length
+    Length,
+}
+
+impl fmt::Display for ScalarFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // lowercase of the debug.
+        write!(f, "{}", format!("{:?}", self).to_lowercase())
+    }
+}
+
+impl FromStr for ScalarFunction {
+    type Err = ExecutionError;
+    fn from_str(name: &str) -> Result<ScalarFunction> {
+        Ok(match name {
+            "sqrt" => ScalarFunction::Sqrt,
+            "sin" => ScalarFunction::Sin,
+            "cos" => ScalarFunction::Cos,
+            "tan" => ScalarFunction::Tan,
+            "asin" => ScalarFunction::Asin,
+            "acos" => ScalarFunction::Acos,
+            "atan" => ScalarFunction::Atan,
+            "exp" => ScalarFunction::Exp,
+            "log" => ScalarFunction::Log,
+            "log2" => ScalarFunction::Log2,
+            "log10" => ScalarFunction::Log10,
+            "floor" => ScalarFunction::Floor,
+            "ceil" => ScalarFunction::Ceil,
+            "round" => ScalarFunction::Round,
+            "truc" => ScalarFunction::Trunc,
+            "abs" => ScalarFunction::Abs,
+            "signum" => ScalarFunction::Signum,
+            "length" => ScalarFunction::Length,
+            _ => {
+                return Err(ExecutionError::General(format!(
+                    "There is no built-in function named {}",
+                    name
+                )))
+            }
+        })
+    }
+}
+
+/// Returns the datatype of the scalar function
+pub fn return_type(fun: &ScalarFunction, arg_types: &Vec<DataType>) -> 
Result<DataType> {
+    // Note that this function *must* return the same type that the respective 
physical expression returns
+    // or the execution panics.
+
+    if arg_types.len() != 1 {
+        // for now, every function expects a single argument, and thus this is 
enough
+        return Err(ExecutionError::General(format!(
+            "The function \"{}\" expected 1 argument, but received \"{}\"",
+            fun,
+            arg_types.len()
+        )));
+    }
+
+    // verify that this is a valid type for this function
+    coerce(fun, &arg_types[0])?;
+
+    // the return type after coercion.
+    // for now, this is type-independent, but there will be built-in functions 
whose return type
+    // depends on the incoming type.
+    match fun {
+        ScalarFunction::Length => Ok(DataType::UInt32),
+        _ => Ok(DataType::Float64),
+    }
+}
+
+/// Create a physical (function) expression.
+/// This function errors when `args`' can't be coerced to a valid argument 
type of the function.
+pub fn function(
+    fun: &ScalarFunction,
+    args: &Vec<Arc<dyn PhysicalExpr>>,
+    input_schema: &Schema,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    let fun_expr: ScalarUdf = Arc::new(match fun {
+        ScalarFunction::Sqrt => math_expressions::sqrt,
+        ScalarFunction::Sin => math_expressions::sin,
+        ScalarFunction::Cos => math_expressions::cos,
+        ScalarFunction::Tan => math_expressions::tan,
+        ScalarFunction::Asin => math_expressions::asin,
+        ScalarFunction::Acos => math_expressions::acos,
+        ScalarFunction::Atan => math_expressions::atan,
+        ScalarFunction::Exp => math_expressions::exp,
+        ScalarFunction::Log => math_expressions::ln,
+        ScalarFunction::Log2 => math_expressions::log2,
+        ScalarFunction::Log10 => math_expressions::log10,
+        ScalarFunction::Floor => math_expressions::floor,
+        ScalarFunction::Ceil => math_expressions::ceil,
+        ScalarFunction::Round => math_expressions::round,
+        ScalarFunction::Trunc => math_expressions::trunc,
+        ScalarFunction::Abs => math_expressions::abs,
+        ScalarFunction::Signum => math_expressions::signum,
+        ScalarFunction::Length => |args| 
Ok(Arc::new(length(args[0].as_ref())?)),
+    });
+    let data_types = args
+        .iter()
+        .map(|e| e.data_type(input_schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    // coerce type
+    // for now, this supports a single type.
+    assert!(args.len() == 1);
+    assert!(data_types.len() == 1);
+    let arg_type = coerce(fun, &data_types[0])?;
+    let args = vec![cast(args[0].clone(), input_schema, arg_type.clone())?];
+
+    Ok(Arc::new(udf::ScalarFunctionExpr::new(
+        &format!("{}", fun),
+        fun_expr,
+        args,
+        &return_type(&fun, &vec![arg_type])?,
+    )))
+}
+
+/// the type supported by the function `fun`.
+fn valid_type(fun: &ScalarFunction) -> DataType {

Review comment:
       ```suggestion
   fn argument_type(fun: &ScalarFunction) -> DataType {
   ```
   
   

##########
File path: rust/datafusion/src/execution/physical_plan/functions.rs
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Declaration of built-in (scalar) functions.
+//! This module contains built-in functions' enumeration and metadata.
+//!
+//! Generally, a function has:
+//! * a set of valid argument types
+//! * a return type function of an incoming argument types
+//! * the computation valid for all sets of valid argument types
+//!
+//! * Argument types: the number of arguments and set of valid types. For 
example, [[f32, f32], [f64, f64]] is a function of two arguments only accepting 
f32 or f64 on each of its arguments.
+//! * Return type: a function `(arg_types) -> return_type`. E.g. for sqrt, 
([f32]) -> f32, ([f64]) -> f64.
+//!
+//! Currently, this implementation supports only a single argument and a 
single signature.
+//!
+//! This module also has a set of coercion rules to improve user experience: 
if an argument i32 is passed
+//! to a function that supports f64, it is coerced to f64.
+
+use super::{
+    expressions::{cast, is_numeric},
+    PhysicalExpr,
+};
+use crate::error::{ExecutionError, Result};
+use crate::execution::physical_plan::math_expressions;
+use crate::execution::physical_plan::udf;
+use arrow::{
+    compute::kernels::length::length,
+    datatypes::{DataType, Schema},
+};
+use std::{fmt, str::FromStr, sync::Arc};
+use udf::ScalarUdf;
+
+/// Enum of all built-in scalar functions
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ScalarFunction {
+    /// sqrt
+    Sqrt,
+    /// sin
+    Sin,
+    /// cos
+    Cos,
+    /// tan
+    Tan,
+    /// asin
+    Asin,
+    /// acos
+    Acos,
+    /// atan
+    Atan,
+    /// exp
+    Exp,
+    /// log, also known as ln
+    Log,
+    /// log2
+    Log2,
+    /// log10
+    Log10,
+    /// floor
+    Floor,
+    /// ceil
+    Ceil,
+    /// round
+    Round,
+    /// trunc
+    Trunc,
+    /// abs
+    Abs,
+    /// signum
+    Signum,
+    /// length
+    Length,
+}
+
+impl fmt::Display for ScalarFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // lowercase of the debug.
+        write!(f, "{}", format!("{:?}", self).to_lowercase())
+    }
+}
+
+impl FromStr for ScalarFunction {
+    type Err = ExecutionError;
+    fn from_str(name: &str) -> Result<ScalarFunction> {
+        Ok(match name {
+            "sqrt" => ScalarFunction::Sqrt,
+            "sin" => ScalarFunction::Sin,
+            "cos" => ScalarFunction::Cos,
+            "tan" => ScalarFunction::Tan,
+            "asin" => ScalarFunction::Asin,
+            "acos" => ScalarFunction::Acos,
+            "atan" => ScalarFunction::Atan,
+            "exp" => ScalarFunction::Exp,
+            "log" => ScalarFunction::Log,
+            "log2" => ScalarFunction::Log2,
+            "log10" => ScalarFunction::Log10,
+            "floor" => ScalarFunction::Floor,
+            "ceil" => ScalarFunction::Ceil,
+            "round" => ScalarFunction::Round,
+            "truc" => ScalarFunction::Trunc,
+            "abs" => ScalarFunction::Abs,
+            "signum" => ScalarFunction::Signum,
+            "length" => ScalarFunction::Length,
+            _ => {
+                return Err(ExecutionError::General(format!(
+                    "There is no built-in function named {}",
+                    name
+                )))
+            }
+        })
+    }
+}
+
+/// Returns the datatype of the scalar function
+pub fn return_type(fun: &ScalarFunction, arg_types: &Vec<DataType>) -> 
Result<DataType> {
+    // Note that this function *must* return the same type that the respective 
physical expression returns
+    // or the execution panics.
+
+    if arg_types.len() != 1 {
+        // for now, every function expects a single argument, and thus this is 
enough
+        return Err(ExecutionError::General(format!(
+            "The function \"{}\" expected 1 argument, but received \"{}\"",
+            fun,
+            arg_types.len()
+        )));
+    }
+
+    // verify that this is a valid type for this function
+    coerce(fun, &arg_types[0])?;
+
+    // the return type after coercion.
+    // for now, this is type-independent, but there will be built-in functions 
whose return type
+    // depends on the incoming type.
+    match fun {
+        ScalarFunction::Length => Ok(DataType::UInt32),
+        _ => Ok(DataType::Float64),
+    }
+}
+
+/// Create a physical (function) expression.
+/// This function errors when `args`' can't be coerced to a valid argument 
type of the function.
+pub fn function(

Review comment:
       ```suggestion
   pub fn to_physical_expr(
   ```
   
   I personally find `function` to be a little confusing as the term is so 
generic and could apply to many things

##########
File path: rust/datafusion/src/execution/physical_plan/functions.rs
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Declaration of built-in (scalar) functions.
+//! This module contains built-in functions' enumeration and metadata.
+//!
+//! Generally, a function has:
+//! * a set of valid argument types
+//! * a return type function of an incoming argument types
+//! * the computation valid for all sets of valid argument types
+//!
+//! * Argument types: the number of arguments and set of valid types. For 
example, [[f32, f32], [f64, f64]] is a function of two arguments only accepting 
f32 or f64 on each of its arguments.
+//! * Return type: a function `(arg_types) -> return_type`. E.g. for sqrt, 
([f32]) -> f32, ([f64]) -> f64.
+//!
+//! Currently, this implementation supports only a single argument and a 
single signature.
+//!
+//! This module also has a set of coercion rules to improve user experience: 
if an argument i32 is passed
+//! to a function that supports f64, it is coerced to f64.
+
+use super::{
+    expressions::{cast, is_numeric},
+    PhysicalExpr,
+};
+use crate::error::{ExecutionError, Result};
+use crate::execution::physical_plan::math_expressions;
+use crate::execution::physical_plan::udf;
+use arrow::{
+    compute::kernels::length::length,
+    datatypes::{DataType, Schema},
+};
+use std::{fmt, str::FromStr, sync::Arc};
+use udf::ScalarUdf;
+
+/// Enum of all built-in scalar functions
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ScalarFunction {
+    /// sqrt
+    Sqrt,
+    /// sin
+    Sin,
+    /// cos
+    Cos,
+    /// tan
+    Tan,
+    /// asin
+    Asin,
+    /// acos
+    Acos,
+    /// atan
+    Atan,
+    /// exp
+    Exp,
+    /// log, also known as ln
+    Log,
+    /// log2
+    Log2,
+    /// log10
+    Log10,
+    /// floor
+    Floor,
+    /// ceil
+    Ceil,
+    /// round
+    Round,
+    /// trunc
+    Trunc,
+    /// abs
+    Abs,
+    /// signum
+    Signum,
+    /// length
+    Length,
+}
+
+impl fmt::Display for ScalarFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // lowercase of the debug.
+        write!(f, "{}", format!("{:?}", self).to_lowercase())
+    }
+}
+
+impl FromStr for ScalarFunction {
+    type Err = ExecutionError;
+    fn from_str(name: &str) -> Result<ScalarFunction> {
+        Ok(match name {
+            "sqrt" => ScalarFunction::Sqrt,
+            "sin" => ScalarFunction::Sin,
+            "cos" => ScalarFunction::Cos,
+            "tan" => ScalarFunction::Tan,
+            "asin" => ScalarFunction::Asin,
+            "acos" => ScalarFunction::Acos,
+            "atan" => ScalarFunction::Atan,
+            "exp" => ScalarFunction::Exp,
+            "log" => ScalarFunction::Log,
+            "log2" => ScalarFunction::Log2,
+            "log10" => ScalarFunction::Log10,
+            "floor" => ScalarFunction::Floor,
+            "ceil" => ScalarFunction::Ceil,
+            "round" => ScalarFunction::Round,
+            "truc" => ScalarFunction::Trunc,
+            "abs" => ScalarFunction::Abs,
+            "signum" => ScalarFunction::Signum,
+            "length" => ScalarFunction::Length,
+            _ => {
+                return Err(ExecutionError::General(format!(
+                    "There is no built-in function named {}",
+                    name
+                )))
+            }
+        })
+    }
+}
+
+/// Returns the datatype of the scalar function
+pub fn return_type(fun: &ScalarFunction, arg_types: &Vec<DataType>) -> 
Result<DataType> {
+    // Note that this function *must* return the same type that the respective 
physical expression returns
+    // or the execution panics.
+
+    if arg_types.len() != 1 {
+        // for now, every function expects a single argument, and thus this is 
enough
+        return Err(ExecutionError::General(format!(
+            "The function \"{}\" expected 1 argument, but received \"{}\"",
+            fun,
+            arg_types.len()
+        )));
+    }
+
+    // verify that this is a valid type for this function
+    coerce(fun, &arg_types[0])?;
+
+    // the return type after coercion.
+    // for now, this is type-independent, but there will be built-in functions 
whose return type
+    // depends on the incoming type.
+    match fun {
+        ScalarFunction::Length => Ok(DataType::UInt32),
+        _ => Ok(DataType::Float64),
+    }
+}
+
+/// Create a physical (function) expression.
+/// This function errors when `args`' can't be coerced to a valid argument 
type of the function.
+pub fn function(
+    fun: &ScalarFunction,
+    args: &Vec<Arc<dyn PhysicalExpr>>,
+    input_schema: &Schema,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    let fun_expr: ScalarUdf = Arc::new(match fun {
+        ScalarFunction::Sqrt => math_expressions::sqrt,
+        ScalarFunction::Sin => math_expressions::sin,
+        ScalarFunction::Cos => math_expressions::cos,
+        ScalarFunction::Tan => math_expressions::tan,
+        ScalarFunction::Asin => math_expressions::asin,
+        ScalarFunction::Acos => math_expressions::acos,
+        ScalarFunction::Atan => math_expressions::atan,
+        ScalarFunction::Exp => math_expressions::exp,
+        ScalarFunction::Log => math_expressions::ln,
+        ScalarFunction::Log2 => math_expressions::log2,
+        ScalarFunction::Log10 => math_expressions::log10,
+        ScalarFunction::Floor => math_expressions::floor,
+        ScalarFunction::Ceil => math_expressions::ceil,
+        ScalarFunction::Round => math_expressions::round,
+        ScalarFunction::Trunc => math_expressions::trunc,
+        ScalarFunction::Abs => math_expressions::abs,
+        ScalarFunction::Signum => math_expressions::signum,
+        ScalarFunction::Length => |args| 
Ok(Arc::new(length(args[0].as_ref())?)),
+    });
+    let data_types = args
+        .iter()
+        .map(|e| e.data_type(input_schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    // coerce type
+    // for now, this supports a single type.
+    assert!(args.len() == 1);
+    assert!(data_types.len() == 1);
+    let arg_type = coerce(fun, &data_types[0])?;
+    let args = vec![cast(args[0].clone(), input_schema, arg_type.clone())?];
+
+    Ok(Arc::new(udf::ScalarFunctionExpr::new(
+        &format!("{}", fun),
+        fun_expr,
+        args,
+        &return_type(&fun, &vec![arg_type])?,
+    )))
+}
+
+/// the type supported by the function `fun`.

Review comment:
       ```suggestion
   /// the type of argument required by the function `fun`.
   ```

##########
File path: rust/datafusion/src/sql/planner.rs
##########
@@ -592,7 +607,7 @@ mod tests {
     fn select_scalar_func_with_literal_no_relation() {
         quick_test(
             "SELECT sqrt(9)",
-            "Projection: sqrt(CAST(Int64(9) AS Float64))\
+            "Projection: sqrt(Int64(9))\

Review comment:
       This change would imply to me that `sqrt` can take an `int` which is not 
actually what happens, right? At some point something adds a `Cast` to float...

##########
File path: rust/datafusion/src/logicalplan.rs
##########
@@ -190,7 +190,14 @@ fn create_name(e: &Expr, input_schema: &Schema) -> 
Result<String> {
             let expr = create_name(expr, input_schema)?;
             Ok(format!("CAST({} as {:?})", expr, data_type))
         }
-        Expr::ScalarFunction { name, args, .. } => {
+        Expr::ScalarFunction { fun, args, .. } => {
+            let mut names = Vec::with_capacity(args.len());

Review comment:
       I don't know how much you care about using "idomatic" rust, but you can 
probably do the same kind of thing here with iterators:
   
   ```
   let names = args.iter()
     .map(|e| create_name(e, input_schema)
     .collect::<Result<Vec_>>()?;
   ```

##########
File path: rust/datafusion/src/execution/physical_plan/functions.rs
##########
@@ -45,6 +43,22 @@ use arrow::{
 use std::{fmt, str::FromStr, sync::Arc};
 use udf::ScalarUdf;
 
+/// A function's signature, which defines the function's supported argument 
types.
+#[derive(Debug)]
+pub enum Signature {
+    /// arbitrary number of arguments of an uniform type out of a list of 
valid types
+    // A function such as `concat` is `Many(vec![DataType::Utf8, 
DataType::LargeUtf8])`
+    Many(Vec<DataType>),
+    /// arbitrary number of arguments of an arbitrary but uniform type
+    // A function such as `array` is `ManyUniform`
+    // The first argument decides the type used for coercion
+    ManyUniform,

Review comment:
       the idea of a `ManyUniform` number of arguments perhaps is what is 
making this so complicated. `ManyUniform` sounds a lot like the notion of 
"varargs" (variadic functions) from C (aka how you can do `printf(char *msg, 
...)`, though that does not restrict to a single type. 

##########
File path: rust/datafusion/src/execution/physical_plan/functions.rs
##########
@@ -0,0 +1,319 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Declaration of built-in (scalar) functions.
+//! This module contains built-in functions' enumeration and metadata.
+//!
+//! Generally, a function has:
+//! * a set of valid argument types
+//! * a return type function of an incoming argument types
+//! * the computation valid for all sets of valid argument types
+//!
+//! * Argument types: the number of arguments and set of valid types. For 
example, [[f32, f32], [f64, f64]] is a function of two arguments only accepting 
f32 or f64 on each of its arguments.
+//! * Return type: a function `(arg_types) -> return_type`. E.g. for sqrt, 
([f32]) -> f32, ([f64]) -> f64.
+//!
+//! Currently, this implementation supports only a single argument and a 
single signature.
+//!
+//! This module also has a set of coercion rules to improve user experience: 
if an argument i32 is passed
+//! to a function that supports f64, it is coerced to f64.
+
+use super::{
+    expressions::{cast, is_numeric},
+    PhysicalExpr,
+};
+use crate::error::{ExecutionError, Result};
+use crate::execution::physical_plan::math_expressions;
+use crate::execution::physical_plan::udf;
+use arrow::{
+    compute::kernels::length::length,
+    datatypes::{DataType, Schema},
+};
+use std::{fmt, str::FromStr, sync::Arc};
+use udf::ScalarUdf;
+
+/// Enum of all built-in scalar functions
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ScalarFunction {
+    /// sqrt
+    Sqrt,
+    /// sin
+    Sin,
+    /// cos
+    Cos,
+    /// tan
+    Tan,
+    /// asin
+    Asin,
+    /// acos
+    Acos,
+    /// atan
+    Atan,
+    /// exp
+    Exp,
+    /// log, also known as ln
+    Log,
+    /// log2
+    Log2,
+    /// log10
+    Log10,
+    /// floor
+    Floor,
+    /// ceil
+    Ceil,
+    /// round
+    Round,
+    /// trunc
+    Trunc,
+    /// abs
+    Abs,
+    /// signum
+    Signum,
+    /// length
+    Length,
+}
+
+impl fmt::Display for ScalarFunction {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // lowercase of the debug.
+        write!(f, "{}", format!("{:?}", self).to_lowercase())
+    }
+}
+
+impl FromStr for ScalarFunction {
+    type Err = ExecutionError;
+    fn from_str(name: &str) -> Result<ScalarFunction> {
+        Ok(match name {
+            "sqrt" => ScalarFunction::Sqrt,
+            "sin" => ScalarFunction::Sin,
+            "cos" => ScalarFunction::Cos,
+            "tan" => ScalarFunction::Tan,
+            "asin" => ScalarFunction::Asin,
+            "acos" => ScalarFunction::Acos,
+            "atan" => ScalarFunction::Atan,
+            "exp" => ScalarFunction::Exp,
+            "log" => ScalarFunction::Log,
+            "log2" => ScalarFunction::Log2,
+            "log10" => ScalarFunction::Log10,
+            "floor" => ScalarFunction::Floor,
+            "ceil" => ScalarFunction::Ceil,
+            "round" => ScalarFunction::Round,
+            "truc" => ScalarFunction::Trunc,
+            "abs" => ScalarFunction::Abs,
+            "signum" => ScalarFunction::Signum,
+            "length" => ScalarFunction::Length,
+            _ => {
+                return Err(ExecutionError::General(format!(
+                    "There is no built-in function named {}",
+                    name
+                )))
+            }
+        })
+    }
+}
+
+/// Returns the datatype of the scalar function
+pub fn return_type(fun: &ScalarFunction, arg_types: &Vec<DataType>) -> 
Result<DataType> {
+    // Note that this function *must* return the same type that the respective 
physical expression returns
+    // or the execution panics.
+
+    if arg_types.len() != 1 {
+        // for now, every function expects a single argument, and thus this is 
enough
+        return Err(ExecutionError::General(format!(
+            "The function \"{}\" expected 1 argument, but received \"{}\"",
+            fun,
+            arg_types.len()
+        )));
+    }
+
+    // verify that this is a valid type for this function
+    coerce(fun, &arg_types[0])?;
+
+    // the return type after coercion.
+    // for now, this is type-independent, but there will be built-in functions 
whose return type
+    // depends on the incoming type.
+    match fun {
+        ScalarFunction::Length => Ok(DataType::UInt32),
+        _ => Ok(DataType::Float64),
+    }
+}
+
+/// Create a physical (function) expression.
+/// This function errors when `args`' can't be coerced to a valid argument 
type of the function.
+pub fn function(
+    fun: &ScalarFunction,
+    args: &Vec<Arc<dyn PhysicalExpr>>,
+    input_schema: &Schema,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    let fun_expr: ScalarUdf = Arc::new(match fun {
+        ScalarFunction::Sqrt => math_expressions::sqrt,
+        ScalarFunction::Sin => math_expressions::sin,
+        ScalarFunction::Cos => math_expressions::cos,
+        ScalarFunction::Tan => math_expressions::tan,
+        ScalarFunction::Asin => math_expressions::asin,
+        ScalarFunction::Acos => math_expressions::acos,
+        ScalarFunction::Atan => math_expressions::atan,
+        ScalarFunction::Exp => math_expressions::exp,
+        ScalarFunction::Log => math_expressions::ln,
+        ScalarFunction::Log2 => math_expressions::log2,
+        ScalarFunction::Log10 => math_expressions::log10,
+        ScalarFunction::Floor => math_expressions::floor,
+        ScalarFunction::Ceil => math_expressions::ceil,
+        ScalarFunction::Round => math_expressions::round,
+        ScalarFunction::Trunc => math_expressions::trunc,
+        ScalarFunction::Abs => math_expressions::abs,
+        ScalarFunction::Signum => math_expressions::signum,
+        ScalarFunction::Length => |args| 
Ok(Arc::new(length(args[0].as_ref())?)),
+    });
+    let data_types = args
+        .iter()
+        .map(|e| e.data_type(input_schema))
+        .collect::<Result<Vec<_>>>()?;
+
+    // coerce type
+    // for now, this supports a single type.
+    assert!(args.len() == 1);
+    assert!(data_types.len() == 1);
+    let arg_type = coerce(fun, &data_types[0])?;
+    let args = vec![cast(args[0].clone(), input_schema, arg_type.clone())?];
+
+    Ok(Arc::new(udf::ScalarFunctionExpr::new(
+        &format!("{}", fun),
+        fun_expr,
+        args,
+        &return_type(&fun, &vec![arg_type])?,
+    )))
+}
+
+/// the type supported by the function `fun`.
+fn valid_type(fun: &ScalarFunction) -> DataType {
+    // note: the physical expression must accept the type returned by this 
function or the execution panics.
+
+    // for now, the list is small, as we do not have many built-in functions.
+    match fun {
+        ScalarFunction::Length => DataType::Utf8,
+        // math expressions expect f64
+        _ => DataType::Float64,
+    }
+}
+
+/// coercion rules for all built-in functions.
+/// Returns a DataType coerced from `arg_type` that is accepted by `fun`.
+/// Errors when `arg_type` can't be coerced to a valid return type of `fun`.
+fn coerce(fun: &ScalarFunction, arg_type: &DataType) -> Result<DataType> {

Review comment:
       ```suggestion
   fn is_argument_compatible(fun: &ScalarFunction, arg_type: &DataType) -> 
Result<DataType> {
   ```
   
   I don't think this function is "coercing" the input -- it is effectively 
encoding "what coercions would be possible" I think... 




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] alamb commented on a change in pull request #8080: ARROW-9887: [Rust] [DataFusion] Added support for complex return types for built-in functions

Reply via email to