edponce commented on a change in pull request #11023:
URL: https://github.com/apache/arrow/pull/11023#discussion_r706345457
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
}
};
+struct StringBinaryTransformBase {
+ virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum*
out) {
+ return Status::OK();
+ }
+
+ // Return the maximum total size of the output in codeunits (i.e. bytes)
+ // given input characteristics.
+ virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+ const std::shared_ptr<Scalar>& input2) {
+ return input_ncodeunits;
+ }
+
+ // Return the maximum total size of the output in codeunits (i.e. bytes)
+ // given input characteristics.
+ virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+ const std::shared_ptr<ArrayData>& data2) {
+ return input_ncodeunits;
+ }
+
+ virtual Status InvalidStatus() {
+ return Status::Invalid("Invalid UTF8 sequence in input");
+ }
+};
+
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the
second parameter
+/// is generic. It supports executions of the form:
+/// * Scalar, Scalar
+/// * Array, Scalar - scalar is broadcasted and paired with all values of
array
+/// * Array, Array - arrays are processed element-wise
+/// * Scalar, Array - not supported by default
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+ using offset_type = typename Type1::offset_type;
+ using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+ using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+ static Status Execute(KernelContext* ctx, StringTransform* transform,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.num_values() != 2) {
+ return Status::Invalid("Invalid arity for binary string transform");
+ }
+
+ if (batch[0].is_array()) {
+ if (batch[1].is_array()) {
+ return ExecArrayArray(ctx, transform, batch[0].array(),
batch[1].array(), out);
+ } else if (batch[1].is_scalar()) {
+ return ExecArrayScalar(ctx, transform, batch[0].array(),
batch[1].scalar(), out);
+ }
+ } else if (batch[0].is_scalar()) {
+ if (batch[1].is_array()) {
+ return ExecScalarArray(ctx, transform, batch[0].scalar(),
batch[1].array(), out);
+ } else if (batch[1].is_scalar()) {
+ return ExecScalarScalar(ctx, transform, batch[0].scalar(),
batch[1].scalar(),
+ out);
+ }
+ }
+ return Status::Invalid("Invalid ExecBatch kind for binary string
transform");
+ }
+
+ static Status ExecScalarScalar(KernelContext* ctx, StringTransform*
transform,
+ const std::shared_ptr<Scalar>& scalar1,
+ const std::shared_ptr<Scalar>& scalar2,
Datum* out) {
+ if (!scalar1->is_valid || !scalar2->is_valid) {
+ return Status::OK();
+ }
+
+ const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+ auto input_ncodeunits = input1.value->size();
+ auto input_nstrings = 1;
+ auto output_ncodeunits_max =
+ transform->MaxCodeunits(input_nstrings, input_ncodeunits, scalar2);
+ if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ return Status::CapacityError(
+ "Result might not fit in a 32bit utf8 array, convert to large_utf8");
Review comment:
I copied the same message as in the "unary" `StringTransformExecBase`. Note
that the term _array_ refers to the buffer holding the string value, which is
allocated with a size of `MaxCodeunits()`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]