edponce commented on a change in pull request #11082:
URL: https://github.com/apache/arrow/pull/11082#discussion_r703078818
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -420,6 +426,205 @@ struct StringTransformExecWithState
}
};
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the
second parameter
+/// is generic. It supports executions of the form:
+/// * Scalar, Scalar
+/// * Array, Scalar - scalar is broadcasted and paired with all values of
array
+/// * Array, Array - arrays are processed element-wise
+/// * Scalar, Array - not supported by default
+// TODO(edponce): For when second parameter is an array, need to specify a
corresponding
+// iterator/visitor.
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+ using offset_type = typename Type1::offset_type;
+ using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+ using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+ static Status Execute(KernelContext* ctx, StringTransform* transform,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.num_values() != 2) {
+ return Status::Invalid("Invalid arity for binary string transform");
+ }
+
+ if (batch[0].is_array()) {
+ if (batch[1].is_array()) {
+ return ExecArrayArray(ctx, transform, batch[0].array(),
batch[1].array(), out);
+ } else if (batch[1].is_scalar()) {
+ return ExecArrayScalar(ctx, transform, batch[0].array(),
batch[1].scalar(), out);
+ }
+ } else if (batch[0].is_scalar()) {
+ if (batch[1].is_array()) {
+ return ExecScalarArray(ctx, transform, batch[0].scalar(),
batch[1].array(), out);
+ } else if (batch[1].is_scalar()) {
+ return ExecScalarScalar(ctx, transform, batch[0].scalar(),
batch[1].scalar(),
+ out);
+ }
+ }
+ return Status::Invalid("Invalid ExecBatch kind for binary string
transform");
+ }
+
+ private:
+ static Status ExecScalarScalar(KernelContext* ctx, StringTransform*
transform,
+ const std::shared_ptr<Scalar>& scalar1,
+ const std::shared_ptr<Scalar>& scalar2,
Datum* out) {
+ const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+ // TODO(edponce): How to validate inputs? For some kernels, returning null
is ok
+ // (ascii_lower) but others not necessarily (concatenate)
Review comment:
Defining "string transforms" as functions where the input string is
operated on and the output is a string of possibly different size, these are
the only string binary transforms I found:
* string repeat
* remove suffix (not implemented)
* remove prefix (not implemented)
From these, only for the `remove suffix/prefix` it makes sense to have
`ExecScalarArray`, so you are correct in that string binary transforms are not
too generalizable as numeric ones. I suggest to keep this PR for
`StringBinaryTransformExecBase` defining only `ExecScalarScalar`,
`ExecArrayScalar`, and `ExecArrayArray`, so that it can be used by the 3
kernels listed above.
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -420,6 +426,205 @@ struct StringTransformExecWithState
}
};
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the
second parameter
+/// is generic. It supports executions of the form:
+/// * Scalar, Scalar
+/// * Array, Scalar - scalar is broadcasted and paired with all values of
array
+/// * Array, Array - arrays are processed element-wise
+/// * Scalar, Array - not supported by default
+// TODO(edponce): For when second parameter is an array, need to specify a
corresponding
+// iterator/visitor.
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+ using offset_type = typename Type1::offset_type;
+ using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+ using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+ static Status Execute(KernelContext* ctx, StringTransform* transform,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.num_values() != 2) {
+ return Status::Invalid("Invalid arity for binary string transform");
+ }
+
+ if (batch[0].is_array()) {
+ if (batch[1].is_array()) {
+ return ExecArrayArray(ctx, transform, batch[0].array(),
batch[1].array(), out);
+ } else if (batch[1].is_scalar()) {
+ return ExecArrayScalar(ctx, transform, batch[0].array(),
batch[1].scalar(), out);
+ }
+ } else if (batch[0].is_scalar()) {
+ if (batch[1].is_array()) {
+ return ExecScalarArray(ctx, transform, batch[0].scalar(),
batch[1].array(), out);
+ } else if (batch[1].is_scalar()) {
+ return ExecScalarScalar(ctx, transform, batch[0].scalar(),
batch[1].scalar(),
+ out);
+ }
+ }
+ return Status::Invalid("Invalid ExecBatch kind for binary string
transform");
+ }
+
+ private:
+ static Status ExecScalarScalar(KernelContext* ctx, StringTransform*
transform,
+ const std::shared_ptr<Scalar>& scalar1,
+ const std::shared_ptr<Scalar>& scalar2,
Datum* out) {
+ const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+ // TODO(edponce): How to validate inputs? For some kernels, returning null
is ok
+ // (ascii_lower) but others not necessarily (concatenate)
Review comment:
BTW, even though this PR is complete and passing, it is a draft because
I am working on a design proposal to refactor some parts of the compute layer
and this falls into that. We can do one of the following:
* Accept this PR, knowing that there is a possibility of it being modified
(if proposal is accepted by community)
* Close this PR and make each string binary kernel explicitly define the
`PreExec` and `Exec` methods
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]