[GitHub] [arrow] lidavidm commented on a diff in pull request #12590: ARROW-15639 [C++][Python] UDF Scalar Function Implementation

GitBox Thu, 21 Apr 2022 05:22:07 -0700


lidavidm commented on code in PR #12590:
URL: https://github.com/apache/arrow/pull/12590#discussion_r855117341



##########
cpp/src/arrow/datum.cc:
##########
@@ -279,4 +279,23 @@ void PrintTo(const Datum& datum, std::ostream* os) {
   }
 }
 
+std::string ToString(Datum::Kind kind) {
+  switch (kind) {
+    case Datum::NONE:
+      return "None";
+    case Datum::SCALAR:
+      return "Scalar";
+    case Datum::ARRAY:
+      return "Array";
+    case Datum::CHUNKED_ARRAY:
+      return "ChunkedArray";
+    case Datum::RECORD_BATCH:
+      return "RecordBatch";
+    case Datum::TABLE:
+      return "Table";
+    default:
+      return NULL;

Review Comment:
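   Returning `NULL` here constructs a `std::string` from a null pointer, which is undefined behavior.
   Let's return "(unknown)" or possibly `std::to_string(static_cast<int>(kind))` instead.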



##########
cpp/src/arrow/python/udf.cc:
##########
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/udf.h"
+
+#include <cstddef>
+#include <memory>
+#include <sstream>
+
+#include "arrow/compute/function.h"
+#include "arrow/python/common.h"
+
+namespace arrow {
+
+namespace py {
+
+Status CheckOutputType(const DataType& expected, const DataType& actual) {
+  if (!expected.Equals(actual)) {
+    return Status::TypeError("Expected output type, ", expected.name(),
+                             ", but function returned type ", actual.name());
+  }
+  return Status::OK();
+}
+
+struct PythonUdf {
+  ScalarUdfWrapperCallback cb;
+  std::shared_ptr<OwnedRefNoGIL> function;
+  compute::OutputType output_type;
+
+  // function needs to be destroyed at process exit
+  // and Python may no longer be initialized.
+  ~PythonUdf() {
+    if (_Py_IsFinalizing()) {
+      function->detach();
+    }
+  }
+
+  Status operator()(compute::KernelContext* ctx, const compute::ExecBatch& 
batch,
+                    Datum* out) {
+    return SafeCallIntoPython([=]() -> Status { return Execute(ctx, batch, 
out); });
+  }
+
+  Status Execute(compute::KernelContext* ctx, const compute::ExecBatch& batch,
+                 Datum* out) {
+    const auto num_args = batch.values.size();
+    ScalarUdfContext udf_context{ctx->memory_pool(), 
static_cast<int64_t>(num_args)};
+    PyObject* arg_tuple = PyTuple_New(num_args);
+    for (size_t arg_id = 0; arg_id < num_args; arg_id++) {
+      switch (batch[arg_id].kind()) {
+        case Datum::SCALAR: {
+          auto c_data = batch[arg_id].scalar();
+          PyObject* data = wrap_scalar(c_data);
+          PyTuple_SetItem(arg_tuple, arg_id, data);
+          break;
+        }
+        case Datum::ARRAY: {
+          auto c_data = batch[arg_id].make_array();
+          PyObject* data = wrap_array(c_data);
+          PyTuple_SetItem(arg_tuple, arg_id, data);
+          break;
+        }
+        default:
+          auto datum = batch[arg_id];
+          return Status::NotImplemented(
+              "User-defined-functions are not supported for the datum kind ",
+              ToString(batch[arg_id].kind()));
+      }
+    }
+    PyObject* result;
+    result = cb(function->obj(), udf_context, arg_tuple);
+    RETURN_NOT_OK(CheckPyError());
+    if (result == Py_None) {
+      return Status::Invalid("Output is None, but expected an array");

Review Comment:
   This error message isn't correct (the output could also be a scalar).
   Also, shouldn't this case already be caught by the is_scalar/is_array checks below?



##########
python/pyarrow/_compute.pyx:
##########
@@ -2251,3 +2338,219 @@ cdef CExpression _bind(Expression filter, Schema 
schema) except *:
 
     return GetResultValue(filter.unwrap().Bind(
         deref(pyarrow_unwrap_schema(schema).get())))
+
+
+cdef class ScalarUdfContext:
+    """A container to hold user-defined-function related
+    entities. `batch_length` and `MemoryPool` are important
+    entities in defining functions which require these details. 
+
+    Example
+    -------
+
+    ScalarUdfContext is used with the scalar user-defined-functions. 
+    When defining such a function, the first parameter must be a
+    ScalarUdfContext object. This object can be used to hold important
+    information. This can be further enhanced depending on the use 
+    cases of user-defined-functions. 
+
+    >>> def random(context, one, two):
+            return pc.add(one, two, memory_pool=context.memory_pool)
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly"
+                        .format(self.__class__.__name__))
+
+    cdef void init(self, const CScalarUdfContext &c_context):
+        self.c_context = c_context
+
+    @property
+    def batch_length(self):
+        """
+        Returns the length of the batch associated with the
+        user-defined-function. Useful when the batch_length
+        is required to do computations specially when scalars
+        are parameters of the user-defined-function.
+
+        Returns
+        -------
+        batch_length : int
+            The number of batches used when calling 
+            user-defined-function. 
+        """
+        return self.c_context.batch_length
+
+    @property
+    def memory_pool(self):
+        """
+        Returns the MemoryPool associated with the 
+        user-defined-function. An already initialized
+        MemoryPool can be used within the
+        user-defined-function. 
+
+        Returns
+        -------
+        memory_pool : MemoryPool
+            MemoryPool is obtained from the KernelContext
+            and passed to the ScalarUdfContext.

Review Comment:
   ```suggestion
           A memory pool for allocations (:class:`MemoryPool`).
   ```



##########
python/pyarrow/_compute.pyx:
##########
@@ -2251,3 +2338,219 @@ cdef CExpression _bind(Expression filter, Schema 
schema) except *:
 
     return GetResultValue(filter.unwrap().Bind(
         deref(pyarrow_unwrap_schema(schema).get())))
+
+
+cdef class ScalarUdfContext:
+    """A container to hold user-defined-function related
+    entities. `batch_length` and `MemoryPool` are important
+    entities in defining functions which require these details. 
+
+    Example
+    -------
+
+    ScalarUdfContext is used with the scalar user-defined-functions. 
+    When defining such a function, the first parameter must be a
+    ScalarUdfContext object. This object can be used to hold important
+    information. This can be further enhanced depending on the use 
+    cases of user-defined-functions. 
+
+    >>> def random(context, one, two):
+            return pc.add(one, two, memory_pool=context.memory_pool)

Review Comment:
   Docstrings should have a one-line summary and optionally a description. 
Also, most of this isn't relevant to a user; I don't think we need to say too 
much here.
   
   ```suggestion
       """
       Per-invocation function context/state.
       
       This object will always be the first argument to a user-defined
       function. It should not be used outside of a call to the function. 
   ```



##########
python/pyarrow/_compute.pyx:
##########
@@ -2251,3 +2338,219 @@ cdef CExpression _bind(Expression filter, Schema 
schema) except *:
 
     return GetResultValue(filter.unwrap().Bind(
         deref(pyarrow_unwrap_schema(schema).get())))
+
+
+cdef class ScalarUdfContext:
+    """A container to hold user-defined-function related
+    entities. `batch_length` and `MemoryPool` are important
+    entities in defining functions which require these details. 
+
+    Example
+    -------
+
+    ScalarUdfContext is used with the scalar user-defined-functions. 
+    When defining such a function, the first parameter must be a
+    ScalarUdfContext object. This object can be used to hold important
+    information. This can be further enhanced depending on the use 
+    cases of user-defined-functions. 
+
+    >>> def random(context, one, two):
+            return pc.add(one, two, memory_pool=context.memory_pool)
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly"
+                        .format(self.__class__.__name__))
+
+    cdef void init(self, const CScalarUdfContext &c_context):
+        self.c_context = c_context
+
+    @property
+    def batch_length(self):
+        """
+        Returns the length of the batch associated with the
+        user-defined-function. Useful when the batch_length
+        is required to do computations specially when scalars
+        are parameters of the user-defined-function.
+
+        Returns
+        -------
+        batch_length : int
+            The number of batches used when calling 
+            user-defined-function. 
+        """
+        return self.c_context.batch_length
+
+    @property
+    def memory_pool(self):
+        """
+        Returns the MemoryPool associated with the 
+        user-defined-function. An already initialized
+        MemoryPool can be used within the
+        user-defined-function. 
+
+        Returns
+        -------
+        memory_pool : MemoryPool
+            MemoryPool is obtained from the KernelContext
+            and passed to the ScalarUdfContext.
+        """
+        return box_memory_pool(self.c_context.pool)
+
+
+cdef inline CFunctionDoc _make_function_doc(dict func_doc) except *:
+    """
+    Helper function to generate the FunctionDoc
+    This function accepts a dictionary and expect the 
+    summary(str), description(str) and arg_names(List[str]) keys. 
+    """
+    cdef:
+        CFunctionDoc f_doc
+        vector[c_string] c_arg_names
+
+    f_doc.summary = tobytes(func_doc["summary"])
+    f_doc.description = tobytes(func_doc["description"])
+    for arg_name in func_doc["arg_names"]:
+        c_arg_names.push_back(tobytes(arg_name))
+    f_doc.arg_names = c_arg_names
+    # UDFOptions integration:
+    # TODO: https://issues.apache.org/jira/browse/ARROW-16041
+    f_doc.options_class = tobytes("")
+    f_doc.options_required = False
+    return f_doc
+
+cdef _scalar_udf_callback(user_function, const CScalarUdfContext& c_context, 
inputs):
+    """
+    Helper callback function used to wrap the ScalarUdfContext from Python to 
C++
+    execution.
+    """
+    cdef ScalarUdfContext context = ScalarUdfContext.__new__(ScalarUdfContext)
+    context.init(c_context)
+    return user_function(context, *inputs)
+
+
+def register_scalar_function(func, func_name, function_doc, in_types,
+                             out_type):
+    """
+    Register a user-defined scalar function. 
+
+    A scalar function is a function that executes elementwise
+    operations on arrays or scalars, and therefore whose results
+    generally do not depend on the order of the values in the
+    arguments. Accepts and returns arrays that are all of the
+    same size. These functions roughly correspond to the functions
+    used in SQL expressions.

Review Comment:
   "In other words, all argument arrays have the same length, and the output 
array is of the same length as the arguments. Scalar functions are the only 
functions allowed in query engine expressions."



##########
python/pyarrow/_compute.pyx:
##########
@@ -2251,3 +2338,219 @@ cdef CExpression _bind(Expression filter, Schema 
schema) except *:
 
     return GetResultValue(filter.unwrap().Bind(
         deref(pyarrow_unwrap_schema(schema).get())))
+
+
+cdef class ScalarUdfContext:
+    """A container to hold user-defined-function related
+    entities. `batch_length` and `MemoryPool` are important
+    entities in defining functions which require these details. 
+
+    Example
+    -------
+
+    ScalarUdfContext is used with the scalar user-defined-functions. 
+    When defining such a function, the first parameter must be a
+    ScalarUdfContext object. This object can be used to hold important
+    information. This can be further enhanced depending on the use 
+    cases of user-defined-functions. 
+
+    >>> def random(context, one, two):
+            return pc.add(one, two, memory_pool=context.memory_pool)
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly"
+                        .format(self.__class__.__name__))
+
+    cdef void init(self, const CScalarUdfContext &c_context):
+        self.c_context = c_context
+
+    @property
+    def batch_length(self):
+        """
+        Returns the length of the batch associated with the
+        user-defined-function. Useful when the batch_length
+        is required to do computations specially when scalars
+        are parameters of the user-defined-function.
+
+        Returns
+        -------
+        batch_length : int
+            The number of batches used when calling 
+            user-defined-function. 
+        """
+        return self.c_context.batch_length
+
+    @property
+    def memory_pool(self):
+        """
+        Returns the MemoryPool associated with the 
+        user-defined-function. An already initialized
+        MemoryPool can be used within the
+        user-defined-function. 
+
+        Returns
+        -------
+        memory_pool : MemoryPool
+            MemoryPool is obtained from the KernelContext
+            and passed to the ScalarUdfContext.
+        """
+        return box_memory_pool(self.c_context.pool)
+
+
+cdef inline CFunctionDoc _make_function_doc(dict func_doc) except *:
+    """
+    Helper function to generate the FunctionDoc
+    This function accepts a dictionary and expect the 
+    summary(str), description(str) and arg_names(List[str]) keys. 
+    """
+    cdef:
+        CFunctionDoc f_doc
+        vector[c_string] c_arg_names
+
+    f_doc.summary = tobytes(func_doc["summary"])
+    f_doc.description = tobytes(func_doc["description"])
+    for arg_name in func_doc["arg_names"]:
+        c_arg_names.push_back(tobytes(arg_name))
+    f_doc.arg_names = c_arg_names
+    # UDFOptions integration:
+    # TODO: https://issues.apache.org/jira/browse/ARROW-16041
+    f_doc.options_class = tobytes("")
+    f_doc.options_required = False
+    return f_doc
+
+cdef _scalar_udf_callback(user_function, const CScalarUdfContext& c_context, 
inputs):
+    """
+    Helper callback function used to wrap the ScalarUdfContext from Python to 
C++
+    execution.
+    """
+    cdef ScalarUdfContext context = ScalarUdfContext.__new__(ScalarUdfContext)
+    context.init(c_context)
+    return user_function(context, *inputs)
+
+
+def register_scalar_function(func, func_name, function_doc, in_types,
+                             out_type):
+    """
+    Register a user-defined scalar function. 
+
+    A scalar function is a function that executes elementwise
+    operations on arrays or scalars, and therefore whose results
+    generally do not depend on the order of the values in the

Review Comment:
   Nit: the results must not (and cannot) depend on the order; or rather, each
   output row must be computed only from its corresponding input row.



##########
python/pyarrow/_compute.pyx:
##########
@@ -2251,3 +2338,219 @@ cdef CExpression _bind(Expression filter, Schema 
schema) except *:
 
     return GetResultValue(filter.unwrap().Bind(
         deref(pyarrow_unwrap_schema(schema).get())))
+
+
+cdef class ScalarUdfContext:
+    """A container to hold user-defined-function related
+    entities. `batch_length` and `MemoryPool` are important
+    entities in defining functions which require these details. 
+
+    Example
+    -------
+
+    ScalarUdfContext is used with the scalar user-defined-functions. 
+    When defining such a function, the first parameter must be a
+    ScalarUdfContext object. This object can be used to hold important
+    information. This can be further enhanced depending on the use 
+    cases of user-defined-functions. 
+
+    >>> def random(context, one, two):
+            return pc.add(one, two, memory_pool=context.memory_pool)
+    """
+
+    def __init__(self):
+        raise TypeError("Do not call {}'s constructor directly"
+                        .format(self.__class__.__name__))
+
+    cdef void init(self, const CScalarUdfContext &c_context):
+        self.c_context = c_context
+
+    @property
+    def batch_length(self):
+        """
+        Returns the length of the batch associated with the
+        user-defined-function. Useful when the batch_length
+        is required to do computations specially when scalars
+        are parameters of the user-defined-function.
+
+        Returns
+        -------
+        batch_length : int
+            The number of batches used when calling 
+            user-defined-function. 

Review Comment:
   As noted in #12901, it seems Sphinx expects a particular syntax.
   
   ```suggestion
           """
           The common length of all input arguments (int).
           
           In the case that all arguments are scalars, this value
           is used to pass the "actual length" of the arguments,
           e.g. because the scalar values are encoding a column
           with a constant value.
   ```



##########
cpp/src/arrow/python/udf.h:
##########
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+namespace py {
+
+/// TODO: TODO(ARROW-16041): UDF Options are not exposed to the Python
+/// users. This feature will be included when extending to provide advanced
+/// options for the users.
+class ARROW_PYTHON_EXPORT ScalarUdfOptions {

Review Comment:
   Ping here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] lidavidm commented on a diff in pull request #12590: ARROW-15639 [C++][Python] UDF Scalar Function Implementation

Reply via email to