jayzhan211 commented on code in PR #10644: URL: https://github.com/apache/datafusion/pull/10644#discussion_r1613540288
########## datafusion/functions-aggregate/src/median.rs: ########## @@ -15,71 +15,105 @@ // specific language governing permissions and limitations // under the License. -//! # Median - -use crate::aggregate::utils::{down_cast_any_ref, Hashable}; -use crate::expressions::format_state_name; -use crate::{AggregateExpr, PhysicalExpr}; -use arrow::array::{Array, ArrayRef}; -use arrow::datatypes::{DataType, Field}; -use arrow_array::cast::AsArray; -use arrow_array::{downcast_integer, ArrowNativeTypeOp, ArrowNumericType}; -use arrow_buffer::ArrowNativeType; -use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::Accumulator; -use std::any::Any; use std::collections::HashSet; use std::fmt::Formatter; -use std::sync::Arc; +use std::{fmt::Debug, sync::Arc}; + +use arrow::array::{downcast_integer, ArrowNumericType}; +use arrow::{ + array::{ArrayRef, AsArray}, + datatypes::{ + DataType, Decimal128Type, Decimal256Type, Field, Float16Type, Float32Type, + Float64Type, + }, +}; + +use arrow::array::Array; +use arrow::array::ArrowNativeTypeOp; +use arrow::datatypes::ArrowNativeType; + +use datafusion_common::{DataFusionError, Result, ScalarValue}; +use datafusion_expr::function::StateFieldsArgs; +use datafusion_expr::{ + function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, + Signature, Volatility, +}; +use datafusion_physical_expr_common::aggregate::utils::Hashable; + +make_udaf_expr_and_func!( + Median, + median, + expression, + "Computes the median of a set of numbers", + median_udaf +); -/// MEDIAN aggregate expression. If using the non-distinct variation, then this uses a -/// lot of memory because all values need to be stored in memory before a result can be -/// computed. If an approximation is sufficient then APPROX_MEDIAN provides a much more -/// efficient solution. -/// -/// If using the distinct variation, the memory usage will be similarly high if the -/// cardinality is high as it stores all distinct values in memory before computing the -/// result, but if cardinality is low then memory usage will also be lower. -#[derive(Debug)] pub struct Median { - name: String, - expr: Arc<dyn PhysicalExpr>, - data_type: DataType, - distinct: bool, + signature: Signature, + aliases: Vec<String>, +} + +impl Debug for Median { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + f.debug_struct("Median") + .field("name", &self.name()) + .field("signature", &self.signature) + .finish() + } +} + +impl Default for Median { + fn default() -> Self { + Self::new() + } } impl Median { - /// Create a new MEDIAN aggregate function - pub fn new( - expr: Arc<dyn PhysicalExpr>, - name: impl Into<String>, - data_type: DataType, - distinct: bool, - ) -> Self { + pub fn new() -> Self { Self { - name: name.into(), - expr, - data_type, - distinct, + aliases: vec!["median".to_string()], + signature: Signature::numeric(1, Volatility::Immutable), Review Comment: I introduce this signature that computes the final coerce types instead of generating tons of valid types with the possible same type. The numeric signature considers decimal types too, which is needed but not included in the Numeric types array. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org