Jefffrey commented on code in PR #18663:
URL: https://github.com/apache/datafusion/pull/18663#discussion_r2522808990


##########
datafusion/functions-aggregate/src/percentile_cont.rs:
##########
@@ -141,10 +142,17 @@ impl Default for PercentileCont {
 impl PercentileCont {
     pub fn new() -> Self {
         let mut variants = Vec::with_capacity(NUMERICS.len());
-        // Accept any numeric value paired with a float64 percentile
+
+        let list_f64 =
+            DataType::List(Arc::new(Field::new("item", DataType::Float64, 
true)));
+
         for num in NUMERICS {
+            // Accept any numeric value paired with a float64 percentile
             variants.push(TypeSignature::Exact(vec![num.clone(), 
DataType::Float64]));
+            //Accept any numeric value paired with a list of float64 
percentiles
+            variants.push(TypeSignature::Exact(vec![num.clone(), 
list_f64.clone()]));
         }
+

Review Comment:
   Hmm this might not be ideal, but I don't think our current signature API 
does have a clean way of representing what we want here 🤔 
   
   Might need some further work on the signature API to make this nicer (not to 
mention hopefully cleaning up the usage of `NUMERICS` here,  which I've been 
slowly working on #18092)
   
   Perhaps we go with the `user_defined` approach for now 🤔 



##########
datafusion/functions-aggregate/src/percentile_cont.rs:
##########
@@ -747,69 +811,82 @@ impl<T: ArrowNumericType> Accumulator for 
DistinctPercentileContAccumulator<T> {
 /// For percentile p and n values:
 /// - If p * (n-1) is an integer, return the value at that position
 /// - Otherwise, interpolate between the two closest values
-fn calculate_percentile<T: ArrowNumericType>(
+fn calculate_percentiles<T: ArrowNumericType>(
     mut values: Vec<T::Native>,
-    percentile: f64,
-) -> Option<T::Native> {
+    percentiles: &[f64],
+) -> Vec<Option<T::Native>> {
     let cmp = |x: &T::Native, y: &T::Native| x.compare(*y);
 
     let len = values.len();
     if len == 0 {
-        None
+        vec![None; percentiles.len()]
     } else if len == 1 {
-        Some(values[0])
-    } else if percentile == 0.0 {
+        vec![Some(values[0]); percentiles.len()]
+    } else if percentiles[0] == 0.0 {
         // Get minimum value
-        Some(
-            *values
-                .iter()
-                .min_by(|a, b| cmp(a, b))
-                .expect("we checked for len > 0 a few lines above"),
-        )
-    } else if percentile == 1.0 {
+        vec![
+            Some(
+                *values
+                    .iter()
+                    .min_by(|a, b| cmp(a, b))
+                    .expect("we checked for len > 0 a few lines above"),
+            );
+            percentiles.len()
+        ]
+    } else if percentiles[0] == 1.0 {
         // Get maximum value
-        Some(
-            *values
-                .iter()
-                .max_by(|a, b| cmp(a, b))
-                .expect("we checked for len > 0 a few lines above"),
-        )
-    } else {
-        // Calculate the index using the formula: p * (n - 1)
-        let index = percentile * ((len - 1) as f64);
-        let lower_index = index.floor() as usize;
-        let upper_index = index.ceil() as usize;
-
-        if lower_index == upper_index {
-            // Exact index, return the value at that position
-            let (_, value, _) = values.select_nth_unstable_by(lower_index, 
cmp);
-            Some(*value)
-        } else {
-            // Need to interpolate between two values
-            // First, partition at lower_index to get the lower value
-            let (_, lower_value, _) = 
values.select_nth_unstable_by(lower_index, cmp);
-            let lower_value = *lower_value;
-
-            // Then partition at upper_index to get the upper value
-            let (_, upper_value, _) = 
values.select_nth_unstable_by(upper_index, cmp);
-            let upper_value = *upper_value;
-
-            // Linear interpolation using wrapping arithmetic
-            // We use wrapping operations here (matching the approach in 
median.rs) because:
-            // 1. Both values come from the input data, so diff is bounded by 
the value range
-            // 2. fraction is between 0 and 1, and INTERPOLATION_PRECISION is 
small enough
-            //    to prevent overflow when combined with typical numeric ranges
-            // 3. The result is guaranteed to be between lower_value and 
upper_value
-            // 4. For floating-point types, wrapping ops behave the same as 
standard ops
-            let fraction = index - (lower_index as f64);
-            let diff = upper_value.sub_wrapping(lower_value);
-            let interpolated = lower_value.add_wrapping(
-                diff.mul_wrapping(T::Native::usize_as(
-                    (fraction * INTERPOLATION_PRECISION as f64) as usize,
-                ))
-                .div_wrapping(T::Native::usize_as(INTERPOLATION_PRECISION)),
+        vec![
+            Some(
+                *values
+                    .iter()
+                    .max_by(|a, b| cmp(a, b))
+                    .expect("we checked for len > 0 a few lines above"),
             );
-            Some(interpolated)
-        }
+            percentiles.len()
+        ]
+    } else {
+        percentiles
+            .iter()
+            .map(|percentile| {

Review Comment:
   This logic might be inefficient; I don't know how postgres does it but 
perhaps they just sort the values once up front allowing each percentile to be 
calculated O(1) on the sorted output.
   
   Maybe we need a separate path for when we have a single percentile (do it 
the old way) and multiple percentiles (sort input then calculate all 
percentiles at once)



##########
datafusion/functions-aggregate/src/percentile_cont.rs:
##########
@@ -747,69 +811,82 @@ impl<T: ArrowNumericType> Accumulator for 
DistinctPercentileContAccumulator<T> {
 /// For percentile p and n values:
 /// - If p * (n-1) is an integer, return the value at that position
 /// - Otherwise, interpolate between the two closest values
-fn calculate_percentile<T: ArrowNumericType>(
+fn calculate_percentiles<T: ArrowNumericType>(
     mut values: Vec<T::Native>,
-    percentile: f64,
-) -> Option<T::Native> {
+    percentiles: &[f64],
+) -> Vec<Option<T::Native>> {
     let cmp = |x: &T::Native, y: &T::Native| x.compare(*y);
 
     let len = values.len();
     if len == 0 {
-        None
+        vec![None; percentiles.len()]
     } else if len == 1 {
-        Some(values[0])
-    } else if percentile == 0.0 {
+        vec![Some(values[0]); percentiles.len()]
+    } else if percentiles[0] == 0.0 {

Review Comment:
   This needs to consider length of `percentiles`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to