westonpace commented on code in PR #12590:
URL: https://github.com/apache/arrow/pull/12590#discussion_r847916856


##########
cpp/src/arrow/python/udf.cc:
##########
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/udf.h"
+
+#include <cstddef>
+#include <memory>
+#include <sstream>
+
+#include "arrow/compute/function.h"
+#include "arrow/python/common.h"
+
+namespace arrow {
+
+namespace py {
+
+Status ExecuteFunction(const compute::ExecBatch& batch, PyObject* function,
+                       const compute::OutputType& exp_out_type, Datum* out) {
+  int num_args = static_cast<int64_t>(batch.values.size());
+  PyObject* arg_tuple = PyTuple_New(num_args);
+  // wrap exec_batch objects into Python objects based on the datum type
+  for (int arg_id = 0; arg_id < num_args; arg_id++) {
+    switch (batch[arg_id].kind()) {
+      case Datum::SCALAR: {
+        auto c_data = batch[arg_id].scalar();
+        PyObject* data = wrap_scalar(c_data);
+        PyTuple_SetItem(arg_tuple, arg_id, data);
+        break;
+      }
+      case Datum::ARRAY: {
+        auto c_data = batch[arg_id].make_array();
+        PyObject* data = wrap_array(c_data);
+        PyTuple_SetItem(arg_tuple, arg_id, data);
+        break;
+      }
+      default:
+        return Status::NotImplemented(
+            "User-defined-functions are not supported for the datum kind ",
+            batch[arg_id].kind());
+    }
+  }
+  // call to Python executing the function
+  PyObject* result;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    result = PyObject_CallObject(function, arg_tuple);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  if (result == nullptr) {
+    return Status::ExecutionError("Output is null, but expected an array");
+  }
+  // wrapping the output for expected output type
+  if (is_scalar(result)) {
+    ARROW_ASSIGN_OR_RAISE(auto val, unwrap_scalar(result));
+    if (!exp_out_type.type()->Equals(val->type)) {
+      return Status::Invalid("Expected output type, ", 
exp_out_type.type()->name(),
+                             ", but function returned type ", 
val->type->name());
+    }
+    *out = Datum(val);
+    return Status::OK();
+  } else if (is_array(result)) {
+    ARROW_ASSIGN_OR_RAISE(auto val, unwrap_array(result));
+    if (!exp_out_type.type()->Equals(val->type())) {
+      return Status::Invalid("Expected output type, ", 
exp_out_type.type()->name(),
+                             ", but function returned type ", 
val->type()->name());
+    }
+    *out = Datum(val);
+    return Status::OK();
+  } else {
+    return Status::Invalid("Not supported output type");

Review Comment:
   ```suggestion
       return Status::Invalid("Not a supported output type");
   ```
   Minor nit: Would be nice if we could get a string description of the type 
here and include it with the error message.  For example, `Not a supported 
output type: str` instead of just `Not a supported output type`.



##########
cpp/examples/arrow/udf_example.cc:
##########
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <arrow/api.h>
+#include <arrow/compute/api.h>
+
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+// Demonstrate registering a user-defined Arrow compute function outside of 
the Arrow
+// source tree
+
+namespace cp = ::arrow::compute;
+
+#define ABORT_ON_FAILURE(expr)                     \
+  do {                                             \
+    arrow::Status status_ = (expr);                \
+    if (!status_.ok()) {                           \
+      std::cerr << status_.message() << std::endl; \
+      abort();                                     \
+    }                                              \
+  } while (0);
+
+template <typename TYPE,
+          typename = typename 
std::enable_if<arrow::is_number_type<TYPE>::value |
+                                             
arrow::is_boolean_type<TYPE>::value |
+                                             
arrow::is_temporal_type<TYPE>::value>::type>
+arrow::Result<std::shared_ptr<arrow::Array>> GetArrayDataSample(
+    const std::vector<typename TYPE::c_type>& values) {
+  using ArrowBuilderType = typename arrow::TypeTraits<TYPE>::BuilderType;
+  ArrowBuilderType builder;
+  ARROW_RETURN_NOT_OK(builder.Reserve(values.size()));
+  ARROW_RETURN_NOT_OK(builder.AppendValues(values));
+  return builder.Finish();
+}
+
+const cp::FunctionDoc func_doc{
+    "User-defined-function usage to demonstrate registering an out-of-tree 
function",
+    "returns x + y + z",
+    {"x", "y", "z"},
+    "UDFOptions"};
+
+arrow::Status SampleFunction(cp::KernelContext* ctx, const cp::ExecBatch& 
batch,
+                             arrow::Datum* out) {
+  // temp = x + y; return temp + z
+  ARROW_ASSIGN_OR_RAISE(auto temp, cp::CallFunction("add", {batch[0], 
batch[1]}));
+  ARROW_ASSIGN_OR_RAISE(*out, cp::CallFunction("add", {temp, batch[2]}));
+  return arrow::Status::OK();
+}

Review Comment:
   ```suggestion
   Result<arrow::Datum> SampleFunction(cp::KernelContext* ctx, const 
cp::ExecBatch& batch) {
     // temp = x + y; return temp + z
     ARROW_ASSIGN_OR_RAISE(auto temp, cp::CallFunction("add", {batch[0], 
batch[1]}));
     return cp::CallFunction("add", {temp, batch[2]});
   }
   ```
   
   Minor nit:  Avoid out parameters if possible.



##########
cpp/src/arrow/compute/function.h:
##########
@@ -205,7 +205,7 @@ class ARROW_EXPORT Function {
   const Arity& arity() const { return arity_; }
 
   /// \brief Return the function documentation
-  const FunctionDoc& doc() const { return *doc_; }
+  const FunctionDoc doc() const { return doc_; }

Review Comment:
   I agree with David.  We should be returning a reference here.



##########
cpp/src/arrow/python/udf.cc:
##########
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/udf.h"
+
+#include <cstddef>
+#include <memory>
+#include <sstream>
+
+#include "arrow/compute/function.h"
+#include "arrow/python/common.h"
+
+namespace arrow {
+
+namespace py {
+
+Status ExecuteFunction(const compute::ExecBatch& batch, PyObject* function,
+                       const compute::OutputType& exp_out_type, Datum* out) {
+  int num_args = static_cast<int64_t>(batch.values.size());
+  PyObject* arg_tuple = PyTuple_New(num_args);
+  // wrap exec_batch objects into Python objects based on the datum type
+  for (int arg_id = 0; arg_id < num_args; arg_id++) {
+    switch (batch[arg_id].kind()) {
+      case Datum::SCALAR: {
+        auto c_data = batch[arg_id].scalar();
+        PyObject* data = wrap_scalar(c_data);
+        PyTuple_SetItem(arg_tuple, arg_id, data);
+        break;
+      }
+      case Datum::ARRAY: {
+        auto c_data = batch[arg_id].make_array();
+        PyObject* data = wrap_array(c_data);
+        PyTuple_SetItem(arg_tuple, arg_id, data);
+        break;
+      }
+      default:
+        return Status::NotImplemented(
+            "User-defined-functions are not supported for the datum kind ",
+            batch[arg_id].kind());
+    }
+  }
+  // call to Python executing the function
+  PyObject* result;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    result = PyObject_CallObject(function, arg_tuple);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  if (result == nullptr) {
+    return Status::ExecutionError("Output is null, but expected an array");

Review Comment:
   I wonder if we should say `None` instead of `null` here.



##########
python/pyarrow/_compute.pyx:
##########
@@ -2251,3 +2333,180 @@ cdef CExpression _bind(Expression filter, Schema 
schema) except *:
 
     return GetResultValue(filter.unwrap().Bind(
         deref(pyarrow_unwrap_schema(schema).get())))
+
+
+cdef CFunctionDoc _make_function_doc(dict func_doc) except *:
+    """
+    Helper function to generate the FunctionDoc
+    This function accepts a dictionary and expect the 
+    summary(str), description(str) and arg_names(List[str]) keys. 
+    """
+    cdef:
+        CFunctionDoc f_doc
+        vector[c_string] c_arg_names
+
+    if len(func_doc) <= 1:
+        raise ValueError(
+            "Function doc must contain a summary, a description and arg_names")
+
+    if not "summary" in func_doc.keys():
+        raise ValueError("Function doc must contain a summary")
+
+    if not "description" in func_doc.keys():
+        raise ValueError("Function doc must contain a description")
+
+    if not "arg_names" in func_doc.keys():
+        raise ValueError("Function doc must contain arg_names")
+
+    f_doc.summary = tobytes(func_doc["summary"])
+    f_doc.description = tobytes(func_doc["description"])
+    for arg_name in func_doc["arg_names"]:
+        c_arg_names.push_back(tobytes(arg_name))
+    f_doc.arg_names = c_arg_names
+    # UDFOptions integration:
+    # TODO: https://issues.apache.org/jira/browse/ARROW-16041
+    f_doc.options_class = tobytes("None")
+    f_doc.options_required = False
+    return f_doc
+
+
+def register_scalar_function(func_name, function_doc, in_types,
+                             out_type, function):
+    """
+    Register a user-defined scalar function. 
+
+    A scalar function is a function that executes elementwise
+    operations on arrays or scalars, and therefore whose results
+    generally do not depend on the order of the values in the
+    arguments. Accepts and returns arrays that are all of the
+    same size. These functions roughly correspond to the functions
+    used in SQL expressions.
+
+    Parameters
+    ----------
+    func_name : str
+        Name of the function. This name must be globally unique. 
+    function_doc : dict
+        A dictionary object with keys "summary" (str),
+        and "description" (str).
+    in_types : Dict[str, InputType]
+        Dictionary containing items with input type name, InputType
+        objects which defines the arguments to the function.
+        The number of arguments specified here determines the
+        function arity.

Review Comment:
   I don't think users will understand what `input type name` means.  Maybe 
something like...
   
   ```
   Dictionary containing items with input label, InputType objects which 
defines the arguments to the function.  The input label is a str that will be 
used to generate documentation for the function.  The number of arguments 
specified here determines the function arity.
   ```
   
   Also, the last sentence (the number of arguments...arity) is a little 
superfluous but we can keep it in if you prefer.  The user doesn't really have 
to know what "arity" is to use UDFs.



##########
cpp/src/arrow/python/udf.h:
##########
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/compute/api_scalar.h"
+#include "arrow/compute/cast.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+#include "arrow/datum.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+namespace py {
+
+/// TODO: TODO(ARROW-16041): UDF Options are not exposed to the Python
+/// users. This feature will be included when extending to provide advanced
+/// options for the users.
+class ARROW_PYTHON_EXPORT ScalarUdfOptions {
+ public:
+  ScalarUdfOptions(const std::string func_name, const compute::Arity arity,
+                   const compute::FunctionDoc func_doc,
+                   const std::vector<compute::InputType> in_types,
+                   const compute::OutputType out_type)
+      : func_name_(func_name),
+        kind_(compute::Function::SCALAR),
+        arity_(arity),
+        func_doc_(std::move(func_doc)),
+        in_types_(std::move(in_types)),
+        out_type_(out_type) {}
+
+  const std::string& name() const { return func_name_; }
+
+  compute::Function::Kind kind() { return kind_; }
+
+  const compute::Arity& arity() const { return arity_; }
+
+  const compute::FunctionDoc doc() const { return func_doc_; }
+
+  const std::vector<compute::InputType>& input_types() const { return 
in_types_; }
+
+  const compute::OutputType& output_type() const { return out_type_; }
+
+ private:
+  std::string func_name_;
+  compute::Function::Kind kind_;
+  compute::Arity arity_;
+  const compute::FunctionDoc func_doc_;
+  std::vector<compute::InputType> in_types_;
+  compute::OutputType out_type_;
+};
+
+class ARROW_PYTHON_EXPORT UdfBuilder {
+ public:
+  UdfBuilder() {}
+};
+
+class ARROW_PYTHON_EXPORT ScalarUdfBuilder : public UdfBuilder {
+ public:
+  ScalarUdfBuilder() : UdfBuilder() {}
+
+  Status MakeFunction(PyObject* function, ScalarUdfOptions* options);
+
+ private:
+  OwnedRefNoGIL function_;
+  std::shared_ptr<compute::ScalarFunction> scalar_func_;
+};

Review Comment:
   ```suggestion
   Status ARROW_PYTHON_EXPORT MakeFunction(PyObject* function, 
ScalarUdfOptions* options);
   ```
   
   We can get rid of `UdfBuilder` and `ScalarUdfBuilder` now.  Notice that both 
`function_` and `scalar_func_` are only used within the method and by the time 
the method exits:
   
    * `function_` is a redundant copy of the owned ref.  The only copy that 
really matters is captured into the lambda.
    * `scalar_func_` has been moved out of and is no longer valid anyways.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to