Re: [PR] Convert ApproxPercentileCont and ApproxPercentileContWithWeight to UDAF [datafusion]

via GitHub Sun, 16 Jun 2024 04:51:37 -0700


goldmedal commented on code in PR #10917:
URL: https://github.com/apache/datafusion/pull/10917#discussion_r1641808518



##########
datafusion/functions-aggregate/src/approx_percentile_cont.rs:
##########
@@ -15,19 +15,254 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::any::Any;
+use std::fmt::{Debug, Formatter};
+
 use arrow::{
     array::{
         ArrayRef, Float32Array, Float64Array, Int16Array, Int32Array, 
Int64Array,
         Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
     },
     datatypes::DataType,
 };
+use arrow_schema::Field;
 
-use datafusion_common::{downcast_value, internal_err, DataFusionError, 
ScalarValue};
-use datafusion_expr::Accumulator;
+use datafusion_common::{
+    downcast_value, internal_err, not_impl_err, plan_err, DataFusionError, 
ScalarValue,
+};
+use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs};
+use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS};
+use datafusion_expr::utils::format_state_name;
+use datafusion_expr::{
+    Accumulator, AggregateUDFImpl, Expr, Signature, TypeSignature, Volatility,
+};
 use datafusion_physical_expr_common::aggregate::tdigest::{
     TDigest, TryIntoF64, DEFAULT_MAX_SIZE,
 };
+use datafusion_physical_expr_common::aggregate::utils::down_cast_any_ref;
+
+make_udaf_expr_and_func!(
+    ApproxPercentileCont,
+    approx_percentile_cont,
+    expression percentile,
+    "Computes the approximate percentile continuous of a set of numbers",
+    approx_percentile_cont_udaf
+);
+
+pub struct ApproxPercentileCont {
+    signature: Signature,
+}
+
+impl Debug for ApproxPercentileCont {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        f.debug_struct("ApproxPercentileCont")
+            .field("name", &self.name())
+            .field("signature", &self.signature)
+            .finish()
+    }
+}
+
+impl Default for ApproxPercentileCont {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ApproxPercentileCont {
+    /// Create a new [`ApproxPercentileCont`] aggregate function.
+    pub fn new() -> Self {
+        let mut variants = Vec::with_capacity(NUMERICS.len() * (INTEGERS.len() 
+ 1));
+        // Accept any numeric value paired with a float64 percentile
+        for num in NUMERICS {
+            variants.push(TypeSignature::Exact(vec![num.clone(), 
DataType::Float64]));
+            // Additionally accept an integer number of centroids for T-Digest
+            for int in INTEGERS {
+                variants.push(TypeSignature::Exact(vec![
+                    num.clone(),
+                    DataType::Float64,
+                    int.clone(),
+                ]))
+            }
+        }
+        Self {
+            signature: Signature::one_of(variants, Volatility::Immutable),

Review Comment:
   I found we can't use `Signature::numeric`. I tried to use it, but I 
discovered that all the input arguments will have the same type as the first 
argument. For example, in a SQL query like:
   ```
   select approx_percentile_cont(float64_col, 0.5, 100) from t
   ```
   the input arguments will be `vec![Float64, Float64, Float64]`. However, the 
last argument should be `int64`. This will cause a failure.
   
   I'm not sure if it's related, but I found something weird. Why is 
`AccumulatorArgs#input_type` not a `vec` but a single datatype? If the number 
of arguments is greater than 1, how do we handle the other arguments?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] Convert ApproxPercentileCont and ApproxPercentileContWithWeight to UDAF [datafusion]

Reply via email to