alamb commented on code in PR #10644: URL: https://github.com/apache/datafusion/pull/10644#discussion_r1614539590
########## datafusion/functions-aggregate/src/median.rs: ########## @@ -15,71 +15,105 @@ // specific language governing permissions and limitations // under the License. -//! # Median - -use crate::aggregate::utils::{down_cast_any_ref, Hashable}; -use crate::expressions::format_state_name; -use crate::{AggregateExpr, PhysicalExpr}; -use arrow::array::{Array, ArrayRef}; -use arrow::datatypes::{DataType, Field}; -use arrow_array::cast::AsArray; -use arrow_array::{downcast_integer, ArrowNativeTypeOp, ArrowNumericType}; -use arrow_buffer::ArrowNativeType; -use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::Accumulator; -use std::any::Any; use std::collections::HashSet; use std::fmt::Formatter; -use std::sync::Arc; +use std::{fmt::Debug, sync::Arc}; + +use arrow::array::{downcast_integer, ArrowNumericType}; +use arrow::{ + array::{ArrayRef, AsArray}, + datatypes::{ + DataType, Decimal128Type, Decimal256Type, Field, Float16Type, Float32Type, + Float64Type, + }, +}; + +use arrow::array::Array; +use arrow::array::ArrowNativeTypeOp; +use arrow::datatypes::ArrowNativeType; + +use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::function::StateFieldsArgs; +use datafusion_expr::{ + function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, + Signature, Volatility, +}; +use datafusion_physical_expr_common::aggregate::utils::Hashable; + +make_udaf_expr_and_func!( + Median, + median, + expression, + "Computes the median of a set of numbers", + median_udaf +); -/// MEDIAN aggregate expression. If using the non-distinct variation, then this uses a Review Comment: I think this comment still holds and would be valuable to bring to the new `Median` UDF Specifically that median uses substantial memory ########## datafusion/expr/src/signature.rs: ########## @@ -119,6 +119,8 @@ pub enum TypeSignature { OneOf(Vec<TypeSignature>), /// Specifies Signatures for array functions ArraySignature(ArrayFunctionSignature), + /// Fixed number of arguments of numeric types Review Comment: Could we please document (or link to documentation) about what types are "numeric"? Is it https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#method.is_numeric ? ########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -257,6 +257,56 @@ impl OptimizerRule for SingleDistinctToGroupBy { ))) } } + Expr::AggregateFunction(AggregateFunction { Review Comment: Can we please add a test (if not already done) showing that the distinct aggregate is rewritten? Or perhaps there is already a test for median 🤔 ########## datafusion/functions-aggregate/Cargo.toml: ########## @@ -39,6 +39,7 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-schema = { workspace = true } Review Comment: is the issue that not re-exported in arrow? ########## datafusion/optimizer/src/single_distinct_to_groupby.rs: ########## @@ -257,6 +257,56 @@ impl OptimizerRule for SingleDistinctToGroupBy { ))) } } + Expr::AggregateFunction(AggregateFunction { Review Comment: From what I can tell this is now a general optimization (as in it will rewrite *any* distinct user defined aggregate as well). If so, that is quite cool. Is it correct? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org