timsaucer commented on code in PR #771: URL: https://github.com/apache/datafusion-python/pull/771#discussion_r1693012963
########## python/datafusion/functions.py: ########## @@ -1480,31 +1481,26 @@ def last_value( ) -def bit_and(*args: Expr, distinct: bool = False) -> Expr: +def bit_and(arg: Expr, distinct: bool = False) -> Expr: """Computes the bitwise AND of the argument.""" - args = [arg.expr for arg in args] - return Expr(f.bit_and(*args, distinct=distinct)) + return Expr(f.bit_and(arg.expr, distinct=distinct)) -def bit_or(*args: Expr, distinct: bool = False) -> Expr: +def bit_or(arg: Expr, distinct: bool = False) -> Expr: """Computes the bitwise OR of the argument.""" - args = [arg.expr for arg in args] - return Expr(f.bit_or(*args, distinct=distinct)) + return Expr(f.bit_or(arg.expr, distinct=distinct)) -def bit_xor(*args: Expr, distinct: bool = False) -> Expr: +def bit_xor(arg: Expr, distinct: bool = False) -> Expr: """Computes the bitwise XOR of the argument.""" - args = [arg.expr for arg in args] - return Expr(f.bit_xor(*args, distinct=distinct)) + return Expr(f.bit_xor(arg.expr, distinct=distinct)) -def bool_and(*args: Expr, distinct: bool = False) -> Expr: +def bool_and(arg: Expr, distinct: bool = False) -> Expr: """Computes the boolean AND of the arugment.""" - args = [arg.expr for arg in args] - return Expr(f.bool_and(*args, distinct=distinct)) + return Expr(f.bool_and(arg.expr, distinct=distinct)) -def bool_or(*args: Expr, distinct: bool = False) -> Expr: +def bool_or(arg: Expr, distinct: bool = False) -> Expr: """Computes the boolean OR of the arguement.""" - args = [arg.expr for arg in args] - return Expr(f.bool_or(*args, distinct=distinct)) + return Expr(f.bool_or(arg.expr, distinct=distinct)) Review Comment: Thank you. I think these are on me - I should have realized they should only have one argument. ########## src/functions.rs: ########## @@ -293,21 +569,23 @@ fn col(name: &str) -> PyResult<PyExpr> { }) } +// TODO: should we just expose this in python? /// Create a COUNT(1) aggregate expression #[pyfunction] -fn count_star() -> PyResult<PyExpr> { - Ok(PyExpr { - expr: Expr::AggregateFunction(AggregateFunction { - func_def: datafusion_expr::expr::AggregateFunctionDefinition::BuiltIn( - aggregate_function::AggregateFunction::Count, - ), - args: vec![lit(1)], - distinct: false, - filter: None, - order_by: None, - null_treatment: None, - }), - }) +fn count_star() -> PyExpr { + functions_aggregate::expr_fn::count(lit(1)).into() +} + Review Comment: Per the question: I think it's worth adding some guidance about what to put in python vs what to put in rust for this repo. In my mind the things that should go into the python side are - Trivial aliases (this is a good example) - Simple type conversion, like path -> string of the path or number to lit(number) - More complex type conversion *if* it makes sense to do in python. For example, in the `named_struct` where sending in a dictionary on the python side makes a lot more sense and isn't as simple to do via pyo3. And then I'd lean towards everything else sitting on the rust side. ########## src/functions.rs: ########## @@ -30,13 +31,147 @@ use datafusion::functions_aggregate; use datafusion_common::{Column, ScalarValue, TableReference}; use datafusion_expr::expr::Alias; use datafusion_expr::{ - aggregate_function, expr::{ find_df_window_func, AggregateFunction, AggregateFunctionDefinition, Sort, WindowFunction, }, lit, Expr, WindowFunctionDefinition, }; +#[pyfunction] +pub fn approx_distinct(expression: PyExpr) -> PyExpr { + functions_aggregate::expr_fn::approx_distinct::approx_distinct(expression.expr).into() +} + +#[pyfunction] +pub fn approx_median(expression: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::approx_median(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} Review Comment: It feels like these are a lot of work. Is it too difficult to instead update `aggregate_function!` macro? ########## src/functions.rs: ########## @@ -75,47 +230,168 @@ pub fn var(y: PyExpr) -> PyExpr { } #[pyfunction] -#[pyo3(signature = (*args, distinct = false, filter = None, order_by = None, null_treatment = None))] +pub fn var_pop(expression: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::var_pop(expression.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_avgx(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_avgx(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_avgy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_avgy(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_count(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_count(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_intercept(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_intercept(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_r2(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_r2(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_slope(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_slope(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_sxx(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_sxx(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_sxy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_sxy(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +pub fn regr_syy(expr_y: PyExpr, expr_x: PyExpr, distinct: bool) -> PyResult<PyExpr> { + let expr = functions_aggregate::expr_fn::regr_syy(expr_y.expr, expr_x.expr); + if distinct { + Ok(expr.distinct().build()?.into()) + } else { + Ok(expr.into()) + } +} + +#[pyfunction] +#[pyo3(signature = (expr, distinct = false, filter = None, order_by = None, null_treatment = None))] pub fn first_value( - args: Vec<PyExpr>, + expr: PyExpr, distinct: bool, filter: Option<PyExpr>, order_by: Option<Vec<PyExpr>>, null_treatment: Option<NullTreatment>, -) -> PyExpr { - let null_treatment = null_treatment.map(Into::into); - let args = args.into_iter().map(|x| x.expr).collect::<Vec<_>>(); - let order_by = order_by.map(|x| x.into_iter().map(|x| x.expr).collect::<Vec<_>>()); - functions_aggregate::expr_fn::first_value( - args, - distinct, - filter.map(|x| Box::new(x.expr)), - order_by, - null_treatment, - ) - .into() +) -> PyResult<PyExpr> { + // If we initialize the UDAF with order_by directly, then it gets over-written by the builder + let agg_fn = functions_aggregate::expr_fn::first_value(expr.expr, None); + + // luckily, I can guarantee initializing a builder with an `order_by` default of empty vec + let order_by = order_by + .map(|x| x.into_iter().map(|x| x.expr).collect::<Vec<_>>()) + .unwrap_or_default(); + let mut builder = agg_fn.order_by(order_by); + + if distinct { + builder = builder.distinct(); + } + + if let Some(filter) = filter { + builder = builder.filter(filter.expr); + } + + if let Some(null_treatment) = null_treatment { + builder = builder.null_treatment(null_treatment.into()) + } + + Ok(builder.build()?.into()) } #[pyfunction] -#[pyo3(signature = (*args, distinct = false, filter = None, order_by = None, null_treatment = None))] +#[pyo3(signature = (expr, distinct = false, filter = None, order_by = None, null_treatment = None))] Review Comment: With the wrappers now in place, I'm wondering if we should just remove all the pyo3 signatures just to reduce clutter in the repo. That shouldn't hold up this PR though. What do you think? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org