icexelloss commented on code in PR #35514:
URL: https://github.com/apache/arrow/pull/35514#discussion_r1213365848


##########
python/pyarrow/conftest.py:
##########
@@ -278,3 +278,59 @@ def unary_function(ctx, x):
                                 {"array": pa.int64()},
                                 pa.int64())
     return unary_function, func_name
+
+
+@pytest.fixture(scope="session")
+def unary_agg_func_fixture():
+    """
+    Register a unary aggregate function
+    """
+    from pyarrow import compute as pc
+    import numpy as np
+
+    def func(ctx, x):
+        return pa.scalar(np.nanmean(x))
+
+    func_name = "y=avg(x)"
+    func_doc = {"summary": "y=avg(x)",
+                "description": "find mean of x"}
+
+    pc.register_aggregate_function(func,
+                                   func_name,
+                                   func_doc,
+                                   {
+                                       "x": pa.float64(),
+                                   },
+                                   pa.float64()
+                                   )
+    return func, func_name
+
+
+@pytest.fixture(scope="session")
+def varargs_agg_func_fixture():
+    """
+    Register a unary aggregate function
+    """
+    from pyarrow import compute as pc
+    import numpy as np
+
+    def func(ctx, *args):
+        sum = 0.0
+        for arg in args:
+            sum += np.nanmean(arg)
+        return pa.scalar(sum)
+
+    func_name = "y=sum_mean(x...)"
+    func_doc = {"summary": "Varargs aggregate",
+                "description": "Varargs aggregate"}
+
+    pc.register_aggregate_function(func,
+                                   func_name,
+                                   func_doc,
+                                   {
+                                       "x": pa.int64(),
+                                       "y": pa.float64()

Review Comment:
   Admittedly this is weird/confusing but here is why:
   
   This is not a truly "varargs" function, as it must take exactly two 
arguments, x and y, with the specified types. 
   This test case matches how we would use it internally. The end user would 
define something like
   
   def foo(v: pd.Series):
       return np.nanmean(v)
   
   summarize(table, agg=foo, columns=['v'], by='time')
   
   We would then wrap foo in a function of the form that Acero expects (a 
varargs UDF)
   
   def get_acero_func(func):
         # This wraps func, turning it into what Acero is expecting
         def acero_func(ctx, *args):
               return pa.scalar(func(*[arg.to_pandas() for arg in args]))
               
         return acero_func
   
   And also register it in Acero on the fly.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to