[GitHub] [arrow] pitrou commented on a change in pull request #11023: ARROW-12712: [C++] String repeat kernel

GitBox Thu, 09 Sep 2021 07:08:01 -0700


pitrou commented on a change in pull request #11023:
URL: https://github.com/apache/arrow/pull/11023#discussion_r705355207




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
   }
 };
 
+struct StringBinaryTransformBase {
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<Scalar>& input2) {
+    return input_ncodeunits;
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<ArrayData>& data2) {
+    return input_ncodeunits;
+  }
+
+  virtual Status InvalidStatus() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+};
+
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the 
second parameter
+/// is generic. It supports executions of the form:
+///   * Scalar, Scalar
+///   * Array, Scalar - scalar is broadcasted and paired with all values of 
array
+///   * Array, Array - arrays are processed element-wise
+///   * Scalar, Array - not supported by default
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch.num_values() != 2) {
+      return Status::Invalid("Invalid arity for binary string transform");
+    }
+
+    if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(), out);
+      }
+    } else if (batch[0].is_scalar()) {
+      if (batch[1].is_array()) {
+        return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                out);
+      }
+    }
+    return Status::Invalid("Invalid ExecBatch kind for binary string 
transform");
+  }
+
+  static Status ExecScalarScalar(KernelContext* ctx, StringTransform* 
transform,
+                                 const std::shared_ptr<Scalar>& scalar1,
+                                 const std::shared_ptr<Scalar>& scalar2, 
Datum* out) {
+    if (!scalar1->is_valid || !scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+    auto input_ncodeunits = input1.value->size();
+    auto input_nstrings = 1;
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input_nstrings, input_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto value_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    auto result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+    result->is_valid = true;
+    result->value = value_buffer;
+    auto output_str = value_buffer->mutable_data();
+
+    auto input1_string = input1.value->data();
+    auto encoded_nbytes = static_cast<offset_type>(
+        transform->Transform(input1_string, input_ncodeunits, scalar2, 
output_str));
+    if (encoded_nbytes < 0) {
+      return transform->InvalidStatus();
+    }
+    DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+    return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+  }
+
+  static Status ExecArrayScalar(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<ArrayData>& data1,
+                                const std::shared_ptr<Scalar>& scalar2, Datum* 
out) {
+    if (!scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    ArrayType1 input1(data1);
+    auto input1_ncodeunits = input1.total_values_length();
+    auto input1_nstrings = input1.length();
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input1_nstrings, input1_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ArrayData* output = out->mutable_array();
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    output->buffers[2] = values_buffer;
+
+    // String offsets are preallocated
+    auto output_string_offsets = output->GetMutableValues<offset_type>(1);
+    auto output_str = output->buffers[2]->mutable_data();
+    output_string_offsets[0] = 0;
+
+    offset_type output_ncodeunits = 0;
+    for (int64_t i = 0; i < input1_nstrings; ++i) {
+      if (!input1.IsNull(i)) {
+        offset_type input1_string_ncodeunits;
+        auto input1_string = input1.GetValue(i, &input1_string_ncodeunits);
+        auto encoded_nbytes = static_cast<offset_type>(
+            transform->Transform(input1_string, input1_string_ncodeunits, 
scalar2,
+                                 output_str + output_ncodeunits));
+        if (encoded_nbytes < 0) {
+          return transform->InvalidStatus();
+        }
+        output_ncodeunits += encoded_nbytes;
+      }
+      output_string_offsets[i + 1] = output_ncodeunits;
+    }
+    DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+    // Trim the codepoint buffer, since we allocated too much
+    return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+    return Status::OK();

Review comment:
       This line is dead code: it is unreachable because the previous line already returns.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
   }
 };
 
+struct StringBinaryTransformBase {
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<Scalar>& input2) {
+    return input_ncodeunits;
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<ArrayData>& data2) {
+    return input_ncodeunits;
+  }
+
+  virtual Status InvalidStatus() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+};
+
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the 
second parameter
+/// is generic. It supports executions of the form:
+///   * Scalar, Scalar
+///   * Array, Scalar - scalar is broadcasted and paired with all values of 
array
+///   * Array, Array - arrays are processed element-wise
+///   * Scalar, Array - not supported by default
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch.num_values() != 2) {
+      return Status::Invalid("Invalid arity for binary string transform");
+    }
+
+    if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(), out);
+      }
+    } else if (batch[0].is_scalar()) {
+      if (batch[1].is_array()) {
+        return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                out);
+      }
+    }
+    return Status::Invalid("Invalid ExecBatch kind for binary string 
transform");
+  }
+
+  static Status ExecScalarScalar(KernelContext* ctx, StringTransform* 
transform,
+                                 const std::shared_ptr<Scalar>& scalar1,
+                                 const std::shared_ptr<Scalar>& scalar2, 
Datum* out) {
+    if (!scalar1->is_valid || !scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+    auto input_ncodeunits = input1.value->size();
+    auto input_nstrings = 1;
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input_nstrings, input_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto value_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    auto result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+    result->is_valid = true;
+    result->value = value_buffer;
+    auto output_str = value_buffer->mutable_data();
+
+    auto input1_string = input1.value->data();
+    auto encoded_nbytes = static_cast<offset_type>(
+        transform->Transform(input1_string, input_ncodeunits, scalar2, 
output_str));
+    if (encoded_nbytes < 0) {
+      return transform->InvalidStatus();
+    }
+    DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+    return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+  }
+
+  static Status ExecArrayScalar(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<ArrayData>& data1,
+                                const std::shared_ptr<Scalar>& scalar2, Datum* 
out) {
+    if (!scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    ArrayType1 input1(data1);
+    auto input1_ncodeunits = input1.total_values_length();
+    auto input1_nstrings = input1.length();
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input1_nstrings, input1_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ArrayData* output = out->mutable_array();
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    output->buffers[2] = values_buffer;
+
+    // String offsets are preallocated
+    auto output_string_offsets = output->GetMutableValues<offset_type>(1);
+    auto output_str = output->buffers[2]->mutable_data();
+    output_string_offsets[0] = 0;
+
+    offset_type output_ncodeunits = 0;
+    for (int64_t i = 0; i < input1_nstrings; ++i) {
+      if (!input1.IsNull(i)) {
+        offset_type input1_string_ncodeunits;
+        auto input1_string = input1.GetValue(i, &input1_string_ncodeunits);
+        auto encoded_nbytes = static_cast<offset_type>(
+            transform->Transform(input1_string, input1_string_ncodeunits, 
scalar2,
+                                 output_str + output_ncodeunits));
+        if (encoded_nbytes < 0) {
+          return transform->InvalidStatus();
+        }
+        output_ncodeunits += encoded_nbytes;
+      }
+      output_string_offsets[i + 1] = output_ncodeunits;
+    }
+    DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+    // Trim the codepoint buffer, since we allocated too much
+    return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+    return Status::OK();
+  }
+
+  static Status ExecScalarArray(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<Scalar>& scalar1,
+                                const std::shared_ptr<ArrayData>& data2, 
Datum* out) {
+    return Status::NotImplemented(
+        "Binary string transforms with (scalar, array) inputs are not 
supported for the "
+        "general case");
+  }
+
+  static Status ExecArrayArray(KernelContext* ctx, StringTransform* transform,
+                               const std::shared_ptr<ArrayData>& data1,
+                               const std::shared_ptr<ArrayData>& data2, Datum* 
out) {
+    ArrayType1 input1(data1);
+    ArrayType2 input2(data2);
+
+    auto input1_ncodeunits = input1.total_values_length();
+    auto input1_nstrings = input1.length();
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input1_nstrings, input1_ncodeunits, data2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ArrayData* output = out->mutable_array();
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    output->buffers[2] = values_buffer;
+
+    // String offsets are preallocated
+    auto output_string_offsets = output->GetMutableValues<offset_type>(1);
+    auto output_str = output->buffers[2]->mutable_data();
+    output_string_offsets[0] = 0;
+
+    offset_type output_ncodeunits = 0;
+    for (int64_t i = 0; i < input1_nstrings; ++i) {
+      if (!input1.IsNull(i) || !input2.IsNull(i)) {

Review comment:
       You could perhaps use `VisitTwoBitBlocksVoid` to make this slightly 
faster.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
   }
 };
 
+struct StringBinaryTransformBase {
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<Scalar>& input2) {
+    return input_ncodeunits;
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<ArrayData>& data2) {
+    return input_ncodeunits;
+  }
+
+  virtual Status InvalidStatus() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+};
+
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the 
second parameter
+/// is generic. It supports executions of the form:
+///   * Scalar, Scalar
+///   * Array, Scalar - scalar is broadcasted and paired with all values of 
array
+///   * Array, Array - arrays are processed element-wise
+///   * Scalar, Array - not supported by default
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch.num_values() != 2) {
+      return Status::Invalid("Invalid arity for binary string transform");
+    }
+
+    if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(), out);
+      }
+    } else if (batch[0].is_scalar()) {
+      if (batch[1].is_array()) {
+        return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                out);
+      }
+    }
+    return Status::Invalid("Invalid ExecBatch kind for binary string 
transform");
+  }
+
+  static Status ExecScalarScalar(KernelContext* ctx, StringTransform* 
transform,
+                                 const std::shared_ptr<Scalar>& scalar1,
+                                 const std::shared_ptr<Scalar>& scalar2, 
Datum* out) {
+    if (!scalar1->is_valid || !scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+    auto input_ncodeunits = input1.value->size();
+    auto input_nstrings = 1;
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input_nstrings, input_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto value_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    auto result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+    result->is_valid = true;
+    result->value = value_buffer;
+    auto output_str = value_buffer->mutable_data();
+
+    auto input1_string = input1.value->data();
+    auto encoded_nbytes = static_cast<offset_type>(
+        transform->Transform(input1_string, input_ncodeunits, scalar2, 
output_str));
+    if (encoded_nbytes < 0) {
+      return transform->InvalidStatus();
+    }
+    DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+    return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+  }
+
+  static Status ExecArrayScalar(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<ArrayData>& data1,
+                                const std::shared_ptr<Scalar>& scalar2, Datum* 
out) {
+    if (!scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    ArrayType1 input1(data1);
+    auto input1_ncodeunits = input1.total_values_length();
+    auto input1_nstrings = input1.length();
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input1_nstrings, input1_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ArrayData* output = out->mutable_array();
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    output->buffers[2] = values_buffer;
+
+    // String offsets are preallocated
+    auto output_string_offsets = output->GetMutableValues<offset_type>(1);
+    auto output_str = output->buffers[2]->mutable_data();
+    output_string_offsets[0] = 0;
+
+    offset_type output_ncodeunits = 0;
+    for (int64_t i = 0; i < input1_nstrings; ++i) {
+      if (!input1.IsNull(i)) {
+        offset_type input1_string_ncodeunits;
+        auto input1_string = input1.GetValue(i, &input1_string_ncodeunits);
+        auto encoded_nbytes = static_cast<offset_type>(
+            transform->Transform(input1_string, input1_string_ncodeunits, 
scalar2,
+                                 output_str + output_ncodeunits));
+        if (encoded_nbytes < 0) {
+          return transform->InvalidStatus();
+        }
+        output_ncodeunits += encoded_nbytes;
+      }
+      output_string_offsets[i + 1] = output_ncodeunits;
+    }
+    DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+    // Trim the codepoint buffer, since we allocated too much
+    return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+    return Status::OK();
+  }
+
+  static Status ExecScalarArray(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<Scalar>& scalar1,
+                                const std::shared_ptr<ArrayData>& data2, 
Datum* out) {
+    return Status::NotImplemented(
+        "Binary string transforms with (scalar, array) inputs are not 
supported for the "
+        "general case");
+  }
+
+  static Status ExecArrayArray(KernelContext* ctx, StringTransform* transform,
+                               const std::shared_ptr<ArrayData>& data1,
+                               const std::shared_ptr<ArrayData>& data2, Datum* 
out) {
+    ArrayType1 input1(data1);
+    ArrayType2 input2(data2);
+
+    auto input1_ncodeunits = input1.total_values_length();
+    auto input1_nstrings = input1.length();
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input1_nstrings, input1_ncodeunits, data2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ArrayData* output = out->mutable_array();
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    output->buffers[2] = values_buffer;
+
+    // String offsets are preallocated
+    auto output_string_offsets = output->GetMutableValues<offset_type>(1);
+    auto output_str = output->buffers[2]->mutable_data();
+    output_string_offsets[0] = 0;
+
+    offset_type output_ncodeunits = 0;
+    for (int64_t i = 0; i < input1_nstrings; ++i) {
+      if (!input1.IsNull(i) || !input2.IsNull(i)) {
+        offset_type input1_string_ncodeunits;
+        auto input1_string = input1.GetValue(i, &input1_string_ncodeunits);
+        auto scalar2 = *input2.GetScalar(i);

Review comment:
       Hmm, that will be very inefficient :-( I hope we can find a better way 
of doing this. Perhaps use `input2.GetView(i)`.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2357,6 +2584,79 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+template <typename Type1, typename Type2>
+struct StrRepeatTransform : public StringBinaryTransformBase {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<Scalar>& input2) override {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    return std::max(input_ncodeunits * nrepeats, int64_t(0));
+  }
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<ArrayData>& data2) override {
+    ArrayType2 array2(data2);
+    // Ideally, we would like to calculate the exact output size by iterating 
over
+    // all strings offsets and summing each length multiplied by the 
corresponding repeat
+    // value, but this requires traversing the data twice (now and during 
transform).
+    // The upper limit is to assume that all strings are repeated the max 
number of
+    // times knowing that a resize operation is performed at end of execution.
+    auto max_nrepeats =
+        static_cast<int64_t>(**std::max_element(array2.begin(), array2.end()));
+    return std::max(input_ncodeunits * max_nrepeats, int64_t(0));
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    const std::shared_ptr<Scalar>& input2, uint8_t* output) {

Review comment:
       This could take an `int64_t` instead of a `std::shared_ptr<Scalar>` for 
the second input...

##########
File path: docs/source/cpp/compute.rst
##########
@@ -694,45 +694,47 @@ The third set of functions examines string elements on a 
byte-per-byte basis:
 String transforms
 ~~~~~~~~~~~~~~~~~
 
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| Function name           | Arity | Input types            | Output type       
     | Options class                     | Notes |
-+=========================+=======+========================+========================+===================================+=======+
-| ascii_capitalize        | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_lower             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_reverse           | Unary | String-like            | String-like       
     |                                   | \(2)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_swapcase          | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_title             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_upper             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| binary_length           | Unary | Binary- or String-like | Int32 or Int64    
     |                                   | \(3)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| binary_replace_slice    | Unary | String-like            | Binary- or 
String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| replace_substring       | Unary | String-like            | String-like       
     | :struct:`ReplaceSubstringOptions` | \(5)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| replace_substring_regex | Unary | String-like            | String-like       
     | :struct:`ReplaceSubstringOptions` | \(6)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_capitalize         | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_length             | Unary | String-like            | Int32 or Int64    
     |                                   | \(7)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_lower              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_replace_slice      | Unary | String-like            | String-like       
     | :struct:`ReplaceSliceOptions`     | \(4)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_reverse            | Unary | String-like            | String-like       
     |                                   | \(9)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_swapcase           | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_title              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_upper              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| Function name           | Arity  | Input types            | Output type      
      | Options class                     | Notes |
++=========================+========+========================+========================+===================================+=======+
+| ascii_capitalize        | Unary  | String-like            | String-like      
      |                                   | \(1)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_lower             | Unary  | String-like            | String-like      
      |                                   | \(1)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_reverse           | Unary  | String-like            | String-like      
      |                                   | \(2)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_swapcase          | Unary  | String-like            | String-like      
      |                                   | \(1)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_title             | Unary  | String-like            | String-like      
      |                                   | \(1)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| ascii_upper             | Unary  | String-like            | String-like      
      |                                   | \(1)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| binary_length           | Unary  | Binary- or String-like | Int32 or Int64   
      |                                   | \(3)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| binary_replace_slice    | Unary  | String-like            | Binary- or 
String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| replace_substring       | Unary  | String-like            | String-like      
      | :struct:`ReplaceSubstringOptions` | \(5)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| replace_substring_regex | Unary  | String-like            | String-like      
      | :struct:`ReplaceSubstringOptions` | \(6)  |
++-------------------------+--------+------------------------+------------------------+-----------------------------------+-------+
+| str_repeat              | Binary | String-like            | String-like      
      |                                   |       |

Review comment:
       The second input type should be "Integer".

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2357,6 +2584,79 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+template <typename Type1, typename Type2>
+struct StrRepeatTransform : public StringBinaryTransformBase {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<Scalar>& input2) override {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    return std::max(input_ncodeunits * nrepeats, int64_t(0));
+  }
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<ArrayData>& data2) override {
+    ArrayType2 array2(data2);
+    // Ideally, we would like to calculate the exact output size by iterating 
over
+    // all strings offsets and summing each length multiplied by the 
corresponding repeat
+    // value, but this requires traversing the data twice (now and during 
transform).
+    // The upper limit is to assume that all strings are repeated the max 
number of
+    // times knowing that a resize operation is performed at end of execution.
+    auto max_nrepeats =
+        static_cast<int64_t>(**std::max_element(array2.begin(), array2.end()));
+    return std::max(input_ncodeunits * max_nrepeats, int64_t(0));
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    const std::shared_ptr<Scalar>& input2, uint8_t* output) {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    uint8_t* output_start = output;
+    if (nrepeats > 0) {
+      // log2(k) approach

Review comment:
       This comment is a bit misleading (memcpy is not an O(1) operation), 
though I understand the underlying idea.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
   }
 };
 
+struct StringBinaryTransformBase {
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<Scalar>& input2) {
+    return input_ncodeunits;
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<ArrayData>& data2) {
+    return input_ncodeunits;
+  }
+
+  virtual Status InvalidStatus() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+};
+
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the 
second parameter
+/// is generic. It supports executions of the form:
+///   * Scalar, Scalar
+///   * Array, Scalar - scalar is broadcasted and paired with all values of 
array
+///   * Array, Array - arrays are processed element-wise
+///   * Scalar, Array - not supported by default
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch.num_values() != 2) {
+      return Status::Invalid("Invalid arity for binary string transform");
+    }
+
+    if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(), out);
+      }
+    } else if (batch[0].is_scalar()) {
+      if (batch[1].is_array()) {
+        return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                out);
+      }
+    }
+    return Status::Invalid("Invalid ExecBatch kind for binary string 
transform");
+  }
+
+  static Status ExecScalarScalar(KernelContext* ctx, StringTransform* 
transform,
+                                 const std::shared_ptr<Scalar>& scalar1,
+                                 const std::shared_ptr<Scalar>& scalar2, 
Datum* out) {
+    if (!scalar1->is_valid || !scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+    auto input_ncodeunits = input1.value->size();
+    auto input_nstrings = 1;
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input_nstrings, input_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");

Review comment:
       "scalar" rather than "array"?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
   }
 };
 
+struct StringBinaryTransformBase {
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<Scalar>& input2) {
+    return input_ncodeunits;
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<ArrayData>& data2) {
+    return input_ncodeunits;
+  }
+
+  virtual Status InvalidStatus() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+};
+
+/// Kernel exec generator for binary string transforms.
+/// The first parameter is expected to always be a string type while the 
second parameter
+/// is generic. It supports executions of the form:
+///   * Scalar, Scalar
+///   * Array, Scalar - scalar is broadcasted and paired with all values of 
array
+///   * Array, Array - arrays are processed element-wise
+///   * Scalar, Array - not supported by default
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch.num_values() != 2) {
+      return Status::Invalid("Invalid arity for binary string transform");
+    }
+
+    if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(), out);
+      }
+    } else if (batch[0].is_scalar()) {
+      if (batch[1].is_array()) {
+        return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(), out);
+      } else if (batch[1].is_scalar()) {
+        return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                out);
+      }
+    }
+    return Status::Invalid("Invalid ExecBatch kind for binary string 
transform");
+  }
+
+  static Status ExecScalarScalar(KernelContext* ctx, StringTransform* 
transform,
+                                 const std::shared_ptr<Scalar>& scalar1,
+                                 const std::shared_ptr<Scalar>& scalar2, 
Datum* out) {
+    if (!scalar1->is_valid || !scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    const auto& input1 = checked_cast<const BaseBinaryScalar&>(*scalar1);
+    auto input_ncodeunits = input1.value->size();
+    auto input_nstrings = 1;
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input_nstrings, input_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto value_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    auto result = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+    result->is_valid = true;
+    result->value = value_buffer;
+    auto output_str = value_buffer->mutable_data();
+
+    auto input1_string = input1.value->data();
+    auto encoded_nbytes = static_cast<offset_type>(
+        transform->Transform(input1_string, input_ncodeunits, scalar2, 
output_str));
+    if (encoded_nbytes < 0) {
+      return transform->InvalidStatus();
+    }
+    DCHECK_LE(encoded_nbytes, output_ncodeunits_max);
+    return value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true);
+  }
+
+  static Status ExecArrayScalar(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<ArrayData>& data1,
+                                const std::shared_ptr<Scalar>& scalar2, Datum* 
out) {
+    if (!scalar2->is_valid) {
+      return Status::OK();
+    }
+
+    ArrayType1 input1(data1);
+    auto input1_ncodeunits = input1.total_values_length();
+    auto input1_nstrings = input1.length();
+    auto output_ncodeunits_max =
+        transform->MaxCodeunits(input1_nstrings, input1_ncodeunits, scalar2);
+    if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+      return Status::CapacityError(
+          "Result might not fit in a 32bit utf8 array, convert to large_utf8");
+    }
+
+    ArrayData* output = out->mutable_array();
+    ARROW_ASSIGN_OR_RAISE(auto values_buffer, 
ctx->Allocate(output_ncodeunits_max));
+    output->buffers[2] = values_buffer;
+
+    // String offsets are preallocated
+    auto output_string_offsets = output->GetMutableValues<offset_type>(1);
+    auto output_str = output->buffers[2]->mutable_data();
+    output_string_offsets[0] = 0;
+
+    offset_type output_ncodeunits = 0;
+    for (int64_t i = 0; i < input1_nstrings; ++i) {
+      if (!input1.IsNull(i)) {
+        offset_type input1_string_ncodeunits;
+        auto input1_string = input1.GetValue(i, &input1_string_ncodeunits);
+        auto encoded_nbytes = static_cast<offset_type>(
+            transform->Transform(input1_string, input1_string_ncodeunits, 
scalar2,
+                                 output_str + output_ncodeunits));
+        if (encoded_nbytes < 0) {
+          return transform->InvalidStatus();
+        }
+        output_ncodeunits += encoded_nbytes;
+      }
+      output_string_offsets[i + 1] = output_ncodeunits;
+    }
+    DCHECK_LE(output_ncodeunits, output_ncodeunits_max);
+
+    // Trim the codepoint buffer, since we allocated too much
+    return values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true);
+    return Status::OK();
+  }
+
+  static Status ExecScalarArray(KernelContext* ctx, StringTransform* transform,
+                                const std::shared_ptr<Scalar>& scalar1,
+                                const std::shared_ptr<ArrayData>& data2, 
Datum* out) {
+    return Status::NotImplemented(
+        "Binary string transforms with (scalar, array) inputs are not 
supported for the "
+        "general case");

Review comment:
       This wouldn't be too difficult to implement, would it?
   (note: perhaps some repetition can be avoided by factoring out common pieces 
of code between the four `ExecXXX` variants, though I'm not sure how easy that 
is)

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -417,6 +419,231 @@ struct StringTransformExecWithState
   }
 };
 
+struct StringBinaryTransformBase {
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics.
+  virtual int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits,
+                               const std::shared_ptr<Scalar>& input2) {
+    return input_ncodeunits;

Review comment:
       This default implementation looks arbitrary. IMHO it would be safer to 
make it pure virtual.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2357,6 +2584,79 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+template <typename Type1, typename Type2>
+struct StrRepeatTransform : public StringBinaryTransformBase {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<Scalar>& input2) override {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    return std::max(input_ncodeunits * nrepeats, int64_t(0));
+  }
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<ArrayData>& data2) override {
+    ArrayType2 array2(data2);
+    // Ideally, we would like to calculate the exact output size by iterating 
over
+    // all strings offsets and summing each length multiplied by the 
corresponding repeat
+    // value, but this requires traversing the data twice (now and during 
transform).
+    // The upper limit is to assume that all strings are repeated the max 
number of
+    // times knowing that a resize operation is performed at end of execution.
+    auto max_nrepeats =
+        static_cast<int64_t>(**std::max_element(array2.begin(), array2.end()));
+    return std::max(input_ncodeunits * max_nrepeats, int64_t(0));
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    const std::shared_ptr<Scalar>& input2, uint8_t* output) {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    uint8_t* output_start = output;
+    if (nrepeats > 0) {
+      // log2(k) approach

Review comment:
       The irony is that for small nrepeats, this may be slower than the more 
straightforward approach, of course :-)
   That said, I'm not sure this kernel is really performance-critical.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2357,6 +2584,79 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+template <typename Type1, typename Type2>
+struct StrRepeatTransform : public StringBinaryTransformBase {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<Scalar>& input2) override {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    return std::max(input_ncodeunits * nrepeats, int64_t(0));
+  }
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<ArrayData>& data2) override {
+    ArrayType2 array2(data2);
+    // Ideally, we would like to calculate the exact output size by iterating 
over
+    // all strings offsets and summing each length multiplied by the 
corresponding repeat
+    // value, but this requires traversing the data twice (now and during 
transform).
+    // The upper limit is to assume that all strings are repeated the max 
number of
+    // times knowing that a resize operation is performed at end of execution.

Review comment:
       The problem is that the upper limit may end up huge if there is a single 
large repeat count in the array.
   
   It seems to me that traversing twice is actually better here (or you can bite 
the bullet and allow some resizing while building up the output, but that's not 
compatible with `StringBinaryTransformBase`).

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2357,6 +2584,79 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+template <typename Type1, typename Type2>
+struct StrRepeatTransform : public StringBinaryTransformBase {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<Scalar>& input2) override {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    return std::max(input_ncodeunits * nrepeats, int64_t(0));
+  }
+
+  int64_t MaxCodeunits(int64_t inputs, int64_t input_ncodeunits,
+                       const std::shared_ptr<ArrayData>& data2) override {
+    ArrayType2 array2(data2);
+    // Ideally, we would like to calculate the exact output size by iterating 
over
+    // all strings offsets and summing each length multiplied by the 
corresponding repeat
+    // value, but this requires traversing the data twice (now and during 
transform).
+    // The upper limit is to assume that all strings are repeated the max 
number of
+    // times knowing that a resize operation is performed at end of execution.
+    auto max_nrepeats =
+        static_cast<int64_t>(**std::max_element(array2.begin(), array2.end()));
+    return std::max(input_ncodeunits * max_nrepeats, int64_t(0));
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    const std::shared_ptr<Scalar>& input2, uint8_t* output) {
+    auto nrepeats = static_cast<int64_t>(UnboxScalar<Type2>::Unbox(*input2));
+    uint8_t* output_start = output;
+    if (nrepeats > 0) {
+      // log2(k) approach
+      std::memcpy(output, input, input_string_ncodeunits);
+      output += input_string_ncodeunits;
+      int64_t i = 1;
+      for (int64_t ilen = input_string_ncodeunits; i <= (nrepeats / 2);
+           i *= 2, ilen *= 2) {
+        std::memcpy(output, output_start, ilen);
+        output += ilen;
+      }
+
+      // Epilogue remainder
+      int64_t rem = (nrepeats ^ i) * input_string_ncodeunits;
+      std::memcpy(output, output_start, rem);
+      output += rem;
+    }
+    return output - output_start;
+  }
+};
+
+template <typename Type1, typename Type2>
+using StrRepeat =
+    StringBinaryTransformExec<Type1, Type2, StrRepeatTransform<Type1, Type2>>;
+
+template <template <typename...> class ExecFunctor>

Review comment:
       I'm curious why `ExecFunctor` is declared as a template class here, 
while below `ExecFunctor` is used without parametrization.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -557,6 +558,36 @@ TYPED_TEST(TestStringKernels, Utf8Title) {
       R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, 
World!", "Foo   Bar;Héhé0Zop", "!%$^.,;"])");
 }
 
+TYPED_TEST(TestStringKernels, StrRepeat) {
+  auto values = ArrayFromJSON(
+      this->type(),
+      R"(["aAazZæÆ&", null, "", "b", "ɑɽⱤoW", "ıI", "ⱥⱥⱥȺ", "hEllO, WoRld!", 
"$. A3", "!ɑⱤⱤow"])");
+  std::vector<std::pair<int, std::string>> repeats_and_expected{{
+      {-1, R"(["", null, "", "", "", "", "", "", "", ""])"},
+      {0, R"(["", null, "", "", "", "", "", "", "", ""])"},
+      {1,
+       R"(["aAazZæÆ&", null, "", "b", "ɑɽⱤoW", "ıI", "ⱥⱥⱥȺ", "hEllO, WoRld!", 
"$. A3", "!ɑⱤⱤow"])"},
+      {3,
+       R"(["aAazZæÆ&aAazZæÆ&aAazZæÆ&", null, "", "bbb", "ɑɽⱤoWɑɽⱤoWɑɽⱤoW", 
"ıIıIıI", "ⱥⱥⱥȺⱥⱥⱥȺⱥⱥⱥȺ", "hEllO, WoRld!hEllO, WoRld!hEllO, WoRld!", "$. A3$. 
A3$. A3", "!ɑⱤⱤow!ɑⱤⱤow!ɑⱤⱤow"])"},
+  }};
+
+  for (const auto& pair : repeats_and_expected) {
+    auto repeat = pair.first;
+    auto expected = pair.second;
+    this->CheckVarArgs("str_repeat", {values, Datum(repeat)}, this->type(), 
expected);

Review comment:
       I'm curious: are we sure `Datum(repeat)` instantiates an integer scalar?
   For the sake of clarity, I would call something like `MakeScalar(repeat, 
int64())`.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -50,13 +50,14 @@ class BaseTestStringKernels : public ::testing::Test {
     CheckScalarUnary(func_name, type(), json_input, out_ty, json_expected, 
options);
   }
 
-  void CheckBinaryScalar(std::string func_name, std::string json_left_input,
-                         std::string json_right_scalar, 
std::shared_ptr<DataType> out_ty,
-                         std::string json_expected,
-                         const FunctionOptions* options = nullptr) {
-    CheckScalarBinaryScalar(func_name, type(), json_left_input, 
json_right_scalar, out_ty,
-                            json_expected, options);
-  }
+  // void CheckBinaryScalar(std::string func_name, std::string json_left_input,
+  //                        std::string json_right_scalar, 
std::shared_ptr<DataType>
+  //                        out_ty, std::string json_expected, const 
FunctionOptions*
+  //                        options = nullptr) {
+  //   CheckScalarBinaryScalar(func_name, type(), json_left_input, 
json_right_scalar,
+  //   out_ty,
+  //                           json_expected, options);
+  // }

Review comment:
       Why is this commented out?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -557,6 +558,36 @@ TYPED_TEST(TestStringKernels, Utf8Title) {
       R"([null, "", "B", "Aaaz;Zææ&", "Ɑɽɽow", "Ii", "Ⱥ.Ⱥ.Ⱥ..Ⱥ", "Hello, 
World!", "Foo   Bar;Héhé0Zop", "!%$^.,;"])");
 }
 
+TYPED_TEST(TestStringKernels, StrRepeat) {
+  auto values = ArrayFromJSON(
+      this->type(),
+      R"(["aAazZæÆ&", null, "", "b", "ɑɽⱤoW", "ıI", "ⱥⱥⱥȺ", "hEllO, WoRld!", 
"$. A3", "!ɑⱤⱤow"])");
+  std::vector<std::pair<int, std::string>> repeats_and_expected{{
+      {-1, R"(["", null, "", "", "", "", "", "", "", ""])"},
+      {0, R"(["", null, "", "", "", "", "", "", "", ""])"},
+      {1,
+       R"(["aAazZæÆ&", null, "", "b", "ɑɽⱤoW", "ıI", "ⱥⱥⱥȺ", "hEllO, WoRld!", 
"$. A3", "!ɑⱤⱤow"])"},
+      {3,
+       R"(["aAazZæÆ&aAazZæÆ&aAazZæÆ&", null, "", "bbb", "ɑɽⱤoWɑɽⱤoWɑɽⱤoW", 
"ıIıIıI", "ⱥⱥⱥȺⱥⱥⱥȺⱥⱥⱥȺ", "hEllO, WoRld!hEllO, WoRld!hEllO, WoRld!", "$. A3$. 
A3$. A3", "!ɑⱤⱤow!ɑⱤⱤow!ɑⱤⱤow"])"},
+  }};
+
+  for (const auto& pair : repeats_and_expected) {
+    auto repeat = pair.first;
+    auto expected = pair.second;
+    this->CheckVarArgs("str_repeat", {values, Datum(repeat)}, this->type(), 
expected);
+  }
+}
+
+TYPED_TEST(TestStringKernels, StrRepeats) {
+  auto repeats = ArrayFromJSON(int64(), R"([1, 2, 4, 2, 0, 1, 3, 2, 3, -1])");
+  auto values = ArrayFromJSON(
+      this->type(),
+      R"(["aAazZæÆ&", "", "b", "ɑɽⱤoW", "ıI", "ⱥⱥⱥȺ", "hEllO, WoRld!", "$. 
A3", "!ɑⱤⱤow", "one"])");
+  std::string expected =
+      R"(["aAazZæÆ&", "", "bbbb", "ɑɽⱤoWɑɽⱤoW", "", "ⱥⱥⱥȺ", "hEllO, 
WoRld!hEllO, WoRld!hEllO, WoRld!", "$. A3$. A3", "!ɑⱤⱤow!ɑⱤⱤow!ɑⱤⱤow", ""])";
+  this->CheckVarArgs("str_repeat", {values, repeats}, this->type(), expected);

Review comment:
       Nulls should be tested too...




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #11023: ARROW-12712: [C++] String repeat kernel

Reply via email to