westonpace commented on code in PR #14682:
URL: https://github.com/apache/arrow/pull/14682#discussion_r1040625791
##########
python/pyarrow/_compute.pyx:
##########
@@ -141,6 +153,38 @@ cdef wrap_hash_aggregate_kernel(const
CHashAggregateKernel* c_kernel):
return kernel
+cdef class RecordBatchIterator(_Weakrefable):
Review Comment:
Is there any way we can use a RecordBatchReader instead of introducing a new
type which seems pretty similar?
##########
python/pyarrow/_compute.pyx:
##########
@@ -2558,8 +2602,31 @@ def _get_scalar_udf_context(memory_pool, batch_length):
return context
-def register_scalar_function(func, function_name, function_doc, in_types,
- out_type):
+ctypedef CStatus (*CRegisterScalarLikeFunction)(PyObject* function,
+ function[CallbackUdf] wrapper,
const CScalarUdfOptions& options,
+ CFunctionRegistry* registry)
+
+cdef class RegisterScalarLikeFunction(_Weakrefable):
+ cdef CRegisterScalarLikeFunction register_func
+
+ cdef void init(self, const CRegisterScalarLikeFunction register_func):
+ self.register_func = register_func
+
+
+cdef GetRegisterScalarFunction():
Review Comment:
Nit: Should probably be `get_register_scalar_function`
##########
python/pyarrow/_compute.pyx:
##########
@@ -2678,5 +2831,37 @@ def register_scalar_function(func, function_name,
function_doc, in_types,
c_options.input_types = c_in_types
c_options.output_type = c_out_type
- check_status(RegisterScalarFunction(c_function,
- <function[CallbackUdf]>
&_scalar_udf_callback, c_options))
+ if func_registry is None:
+ c_func_registry = NULL
+ else:
+ c_func_registry = (<FunctionRegistry>func_registry).registry
+
+ c_register_func = (<RegisterScalarLikeFunction>register_func).register_func
+
+ check_status(c_register_func(c_function,
+ <function[CallbackUdf]> &_scalar_udf_callback,
+ c_options, c_func_registry))
+
+
+def get_record_batches_from_tabular_function(function_name,
func_registry=None):
Review Comment:
Is this the table UDF equivalent of `call_function`? In other words, a way
to use the function outside of an exec plan?
##########
python/pyarrow/_compute.pyx:
##########
@@ -2558,8 +2602,31 @@ def _get_scalar_udf_context(memory_pool, batch_length):
return context
-def register_scalar_function(func, function_name, function_doc, in_types,
- out_type):
+ctypedef CStatus (*CRegisterScalarLikeFunction)(PyObject* function,
+ function[CallbackUdf] wrapper,
const CScalarUdfOptions& options,
+ CFunctionRegistry* registry)
+
+cdef class RegisterScalarLikeFunction(_Weakrefable):
+ cdef CRegisterScalarLikeFunction register_func
+
+ cdef void init(self, const CRegisterScalarLikeFunction register_func):
+ self.register_func = register_func
+
+
+cdef GetRegisterScalarFunction():
+ cdef RegisterScalarLikeFunction reg =
RegisterScalarLikeFunction.__new__(RegisterScalarLikeFunction)
+ reg.register_func = RegisterScalarFunction
+ return reg
+
+
+cdef GetRegisterTabularFunction():
Review Comment:
Nit: Should probably be `get_register_tabular_function`
##########
python/pyarrow/_compute.pyx:
##########
@@ -2629,14 +2718,98 @@ def register_scalar_function(func, function_name,
function_doc, in_types,
21
]
"""
+ return register_scalar_like_function(GetRegisterScalarFunction(),
+ func, function_name, function_doc,
in_types,
+ out_type, func_registry)
+
+
+def register_tabular_function(func, function_name, function_doc, in_types,
out_type,
+ func_registry=None):
+ """
+ Register a user-defined tabular function.
+
+ A tabular function is one accepting a context argument of type
+ ScalarUdfContext and returning a generator of struct arrays.
+ The in_types argument must be empty and the out_type argument
+ specifies a schema. Each struct array must have field types
+ corresponding to the schema.
+
+ Parameters
+ ----------
+ func : callable
+ A callable implementing the user-defined function.
+ The only argument is the context argument of type
+ ScalarUdfContext. It must return a callable that
+ returns on each invocation a StructArray matching
+ the out_type, where an empty array indicates end.
+ function_name : str
+ Name of the function. This name must be globally unique.
+ function_doc : dict
+ A dictionary object with keys "summary" (str),
+ and "description" (str).
+ in_types : Dict[str, DataType]
+ Must be an empty dictionary.
Review Comment:
Maybe `Must be an empty dictionary (planned for future use)`?
##########
python/pyarrow/_compute.pyx:
##########
@@ -2629,14 +2718,98 @@ def register_scalar_function(func, function_name,
function_doc, in_types,
21
]
"""
+ return register_scalar_like_function(GetRegisterScalarFunction(),
+ func, function_name, function_doc,
in_types,
+ out_type, func_registry)
+
+
+def register_tabular_function(func, function_name, function_doc, in_types,
out_type,
+ func_registry=None):
+ """
+ Register a user-defined tabular function.
+
+ A tabular function is one accepting a context argument of type
+ ScalarUdfContext and returning a generator of struct arrays.
+ The in_types argument must be empty and the out_type argument
+ specifies a schema. Each struct array must have field types
+ corresponding to the schema.
+
+ Parameters
+ ----------
+ func : callable
+ A callable implementing the user-defined function.
+ The only argument is the context argument of type
+ ScalarUdfContext. It must return a callable that
+ returns on each invocation a StructArray matching
+ the out_type, where an empty array indicates end.
+ function_name : str
+ Name of the function. This name must be globally unique.
+ function_doc : dict
+ A dictionary object with keys "summary" (str),
+ and "description" (str).
+ in_types : Dict[str, DataType]
+ Must be an empty dictionary.
+ out_type : DataType
+ Output type of the function.
Review Comment:
Can it be a schema? That would be simpler for users, I think. We could do
the conversion to a struct type ourselves.
##########
python/pyarrow/_compute.pyx:
##########
@@ -2629,14 +2718,98 @@ def register_scalar_function(func, function_name,
function_doc, in_types,
21
]
"""
+ return register_scalar_like_function(GetRegisterScalarFunction(),
+ func, function_name, function_doc,
in_types,
+ out_type, func_registry)
+
+
+def register_tabular_function(func, function_name, function_doc, in_types,
out_type,
+ func_registry=None):
+ """
+ Register a user-defined tabular function.
+
+ A tabular function is one accepting a context argument of type
+ ScalarUdfContext and returning a generator of struct arrays.
+ The in_types argument must be empty and the out_type argument
+ specifies a schema. Each struct array must have field types
+ corresponding to the schema.
+
+ Parameters
+ ----------
+ func : callable
+ A callable implementing the user-defined function.
+ The only argument is the context argument of type
+ ScalarUdfContext. It must return a callable that
+ returns on each invocation a StructArray matching
+ the out_type, where an empty array indicates end.
+ function_name : str
+ Name of the function. This name must be globally unique.
+ function_doc : dict
+ A dictionary object with keys "summary" (str),
+ and "description" (str).
+ in_types : Dict[str, DataType]
+ Must be an empty dictionary.
+ out_type : DataType
+ Output type of the function.
+ func_registry : FunctionRegistry
+ Optional function registry to use instead of the default global one.
+ """
+ return register_scalar_like_function(GetRegisterTabularFunction(),
+ func, function_name, function_doc,
in_types,
+ out_type, func_registry)
+
+
+def register_scalar_like_function(register_func, func, function_name,
function_doc, in_types,
+ out_type, func_registry=None):
+ """
+ Register a user-defined scalar-like function.
+
+ A scalar-like function is a callable accepting a first
+ context argument of type ScalarUdfContext as well as
+ possibly additional Arrow arguments, and returning
+ an Arrow result appropriate for the kind of function.
+ A scalar function and a tabular function are examples
+ for scalar-like functions.
+ This function is normally not called directly but via
+ register_scalar_function or register_tabular_function.
Review Comment:
They could create a new kind of UDX but is there any point if there is no
node that consumes those types of functions? At the moment I would agree to
making this internal (we can keep the API docs, I think it's just a matter of
adding a `_` in front of the name)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]