edponce commented on a change in pull request #11023:
URL: https://github.com/apache/arrow/pull/11023#discussion_r740351649



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -537,6 +549,358 @@ struct FixedSizeBinaryTransformExecWithState
   }
 };
 
+template <typename Type1, typename Type2>
+struct StringBinaryTransformBase {
+  using ViewType2 = typename GetViewType<Type2>::T;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  virtual ~StringBinaryTransformBase() = default;
+
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  virtual Status InvalidInputSequence() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics for different input shapes.
+  // The Status parameter should only be set if an error needs to be signaled.
+
+  // Scalar-Scalar
+  virtual int64_t MaxCodeunits(const int64_t input1_ncodeunits, const 
ViewType2,
+                               Status*) {
+    return input1_ncodeunits;
+  }
+
+  // Scalar-Array
+  virtual int64_t MaxCodeunits(const int64_t input1_ncodeunits, const 
ArrayType2&,
+                               Status*) {
+    return input1_ncodeunits;
+  }
+
+  // Array-Scalar
+  virtual int64_t MaxCodeunits(const ArrayType1& input1, const ViewType2, 
Status*) {
+    return input1.total_values_length();
+  }
+
+  // Array-Array
+  virtual int64_t MaxCodeunits(const ArrayType1& input1, const ArrayType2&, 
Status*) {
+    return input1.total_values_length();
+  }
+
+  // Not all combinations of input shapes are meaningful to string binary
+  // transforms, so these flags serve as control toggles for enabling/disabling
+  // the corresponding ones. These flags should be set in the PreExec() method.
+  //
+  // This is an example of a StringTransform that disables support for 
arguments
+  // with mixed Scalar/Array shapes.
+  //
+  // template <typename Type1, typename Type2>
+  // struct MyStringTransform : public StringBinaryTransformBase<Type1, Type2> 
{
+  //   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) 
override {
+  //     EnableScalarArray = false;
+  //     EnableArrayScalar = false;
+  //     return StringBinaryTransformBase::PreExec(ctx, batch, out);
+  //   }
+  //   ...
+  // };
+  bool EnableScalarScalar = true;
+  bool EnableScalarArray = true;
+  bool EnableArrayScalar = true;
+  bool EnableArrayArray = true;
+};
+
+/// Kernel exec generator for binary (two parameters) string transforms.
+/// The first parameter is expected to always be a Binary/StringType while the
+/// second parameter is generic. Types of template parameter StringTransform
+/// need to define a transform method with the following signature:
+///
+/// int64_t Transform(const uint8_t* input, const int64_t 
input_string_ncodeunits,
+///                   const ViewType2 value2, uint8_t* output, Status* st);
+///
+/// where
+///   * `input` - input sequence (binary or string)
+///   * `input_string_ncodeunits` - length of input sequence in codeunits
+///   * `value2` - second argument to the string transform
+///   * `output` - output sequence (binary or string)
+///   * `st` - Status code, only set if transform needs to signal an error
+///
+/// and returns the number of codeunits of the `output` sequence or a negative
+/// value if an invalid input sequence is detected.
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ViewType2 = typename GetViewType<Type2>::T;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch[0].is_scalar()) {
+      if (batch[1].is_scalar()) {
+        if (transform->EnableScalarScalar) {
+          return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                  out);
+        }
+      } else if (batch[1].is_array()) {
+        if (transform->EnableScalarArray) {
+          return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(),
+                                 out);
+        }
+      }
+    } else if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        if (transform->EnableArrayArray) {
+          return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+        }
+      } else if (batch[1].is_scalar()) {
+        if (transform->EnableArrayScalar) {
+          return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(),
+                                 out);
+        }
+      }
+    }
+    return Status::Invalid("Invalid combination of operands for binary string 
transform",
+                           " (", batch[0].ToString(), ", ", 
batch[1].ToString(),
+                           "). Only Array/Scalar kinds are supported.");

Review comment:
       Since it is not known a priori what are the valid combinations, I 
decided to remove the second statement from the error message.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -537,6 +549,358 @@ struct FixedSizeBinaryTransformExecWithState
   }
 };
 
+template <typename Type1, typename Type2>
+struct StringBinaryTransformBase {
+  using ViewType2 = typename GetViewType<Type2>::T;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  virtual ~StringBinaryTransformBase() = default;
+
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  virtual Status InvalidInputSequence() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics for different input shapes.
+  // The Status parameter should only be set if an error needs to be signaled.
+
+  // Scalar-Scalar
+  virtual int64_t MaxCodeunits(const int64_t input1_ncodeunits, const 
ViewType2,
+                               Status*) {
+    return input1_ncodeunits;
+  }
+
+  // Scalar-Array
+  virtual int64_t MaxCodeunits(const int64_t input1_ncodeunits, const 
ArrayType2&,
+                               Status*) {
+    return input1_ncodeunits;
+  }
+
+  // Array-Scalar
+  virtual int64_t MaxCodeunits(const ArrayType1& input1, const ViewType2, 
Status*) {
+    return input1.total_values_length();
+  }
+
+  // Array-Array
+  virtual int64_t MaxCodeunits(const ArrayType1& input1, const ArrayType2&, 
Status*) {
+    return input1.total_values_length();
+  }
+
+  // Not all combinations of input shapes are meaningful to string binary
+  // transforms, so these flags serve as control toggles for enabling/disabling
+  // the corresponding ones. These flags should be set in the PreExec() method.
+  //
+  // This is an example of a StringTransform that disables support for 
arguments
+  // with mixed Scalar/Array shapes.
+  //
+  // template <typename Type1, typename Type2>
+  // struct MyStringTransform : public StringBinaryTransformBase<Type1, Type2> 
{
+  //   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) 
override {
+  //     EnableScalarArray = false;
+  //     EnableArrayScalar = false;
+  //     return StringBinaryTransformBase::PreExec(ctx, batch, out);
+  //   }
+  //   ...
+  // };
+  bool EnableScalarScalar = true;
+  bool EnableScalarArray = true;
+  bool EnableArrayScalar = true;
+  bool EnableArrayArray = true;
+};
+
+/// Kernel exec generator for binary (two parameters) string transforms.
+/// The first parameter is expected to always be a Binary/StringType while the
+/// second parameter is generic. Types of template parameter StringTransform
+/// need to define a transform method with the following signature:
+///
+/// int64_t Transform(const uint8_t* input, const int64_t 
input_string_ncodeunits,
+///                   const ViewType2 value2, uint8_t* output, Status* st);
+///
+/// where
+///   * `input` - input sequence (binary or string)
+///   * `input_string_ncodeunits` - length of input sequence in codeunits
+///   * `value2` - second argument to the string transform
+///   * `output` - output sequence (binary or string)
+///   * `st` - Status code, only set if transform needs to signal an error
+///
+/// and returns the number of codeunits of the `output` sequence or a negative
+/// value if an invalid input sequence is detected.
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ViewType2 = typename GetViewType<Type2>::T;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch[0].is_scalar()) {
+      if (batch[1].is_scalar()) {
+        if (transform->EnableScalarScalar) {
+          return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                  out);
+        }
+      } else if (batch[1].is_array()) {
+        if (transform->EnableScalarArray) {
+          return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(),
+                                 out);
+        }
+      }
+    } else if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        if (transform->EnableArrayArray) {
+          return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+        }
+      } else if (batch[1].is_scalar()) {
+        if (transform->EnableArrayScalar) {
+          return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(),
+                                 out);
+        }
+      }
+    }
+    return Status::Invalid("Invalid combination of operands for binary string 
transform",
+                           " (", batch[0].ToString(), ", ", 
batch[1].ToString(),
+                           "). Only Array/Scalar kinds are supported.");

Review comment:
       Previous statement is false. We actually know the valid combinations, so 
we can print them as a helpful error comment.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -537,6 +549,358 @@ struct FixedSizeBinaryTransformExecWithState
   }
 };
 
+template <typename Type1, typename Type2>
+struct StringBinaryTransformBase {
+  using ViewType2 = typename GetViewType<Type2>::T;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  virtual ~StringBinaryTransformBase() = default;
+
+  virtual Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* 
out) {
+    return Status::OK();
+  }
+
+  virtual Status InvalidInputSequence() {
+    return Status::Invalid("Invalid UTF8 sequence in input");
+  }
+
+  // Return the maximum total size of the output in codeunits (i.e. bytes)
+  // given input characteristics for different input shapes.
+  // The Status parameter should only be set if an error needs to be signaled.
+
+  // Scalar-Scalar
+  virtual int64_t MaxCodeunits(const int64_t input1_ncodeunits, const 
ViewType2,
+                               Status*) {
+    return input1_ncodeunits;
+  }
+
+  // Scalar-Array
+  virtual int64_t MaxCodeunits(const int64_t input1_ncodeunits, const 
ArrayType2&,
+                               Status*) {
+    return input1_ncodeunits;
+  }
+
+  // Array-Scalar
+  virtual int64_t MaxCodeunits(const ArrayType1& input1, const ViewType2, 
Status*) {
+    return input1.total_values_length();
+  }
+
+  // Array-Array
+  virtual int64_t MaxCodeunits(const ArrayType1& input1, const ArrayType2&, 
Status*) {
+    return input1.total_values_length();
+  }
+
+  // Not all combinations of input shapes are meaningful to string binary
+  // transforms, so these flags serve as control toggles for enabling/disabling
+  // the corresponding ones. These flags should be set in the PreExec() method.
+  //
+  // This is an example of a StringTransform that disables support for 
arguments
+  // with mixed Scalar/Array shapes.
+  //
+  // template <typename Type1, typename Type2>
+  // struct MyStringTransform : public StringBinaryTransformBase<Type1, Type2> 
{
+  //   Status PreExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) 
override {
+  //     EnableScalarArray = false;
+  //     EnableArrayScalar = false;
+  //     return StringBinaryTransformBase::PreExec(ctx, batch, out);
+  //   }
+  //   ...
+  // };
+  bool EnableScalarScalar = true;
+  bool EnableScalarArray = true;
+  bool EnableArrayScalar = true;
+  bool EnableArrayArray = true;
+};
+
+/// Kernel exec generator for binary (two parameters) string transforms.
+/// The first parameter is expected to always be a Binary/StringType while the
+/// second parameter is generic. Types of template parameter StringTransform
+/// need to define a transform method with the following signature:
+///
+/// int64_t Transform(const uint8_t* input, const int64_t 
input_string_ncodeunits,
+///                   const ViewType2 value2, uint8_t* output, Status* st);
+///
+/// where
+///   * `input` - input sequence (binary or string)
+///   * `input_string_ncodeunits` - length of input sequence in codeunits
+///   * `value2` - second argument to the string transform
+///   * `output` - output sequence (binary or string)
+///   * `st` - Status code, only set if transform needs to signal an error
+///
+/// and returns the number of codeunits of the `output` sequence or a negative
+/// value if an invalid input sequence is detected.
+template <typename Type1, typename Type2, typename StringTransform>
+struct StringBinaryTransformExecBase {
+  using offset_type = typename Type1::offset_type;
+  using ViewType2 = typename GetViewType<Type2>::T;
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  static Status Execute(KernelContext* ctx, StringTransform* transform,
+                        const ExecBatch& batch, Datum* out) {
+    if (batch[0].is_scalar()) {
+      if (batch[1].is_scalar()) {
+        if (transform->EnableScalarScalar) {
+          return ExecScalarScalar(ctx, transform, batch[0].scalar(), 
batch[1].scalar(),
+                                  out);
+        }
+      } else if (batch[1].is_array()) {
+        if (transform->EnableScalarArray) {
+          return ExecScalarArray(ctx, transform, batch[0].scalar(), 
batch[1].array(),
+                                 out);
+        }
+      }
+    } else if (batch[0].is_array()) {
+      if (batch[1].is_array()) {
+        if (transform->EnableArrayArray) {
+          return ExecArrayArray(ctx, transform, batch[0].array(), 
batch[1].array(), out);
+        }
+      } else if (batch[1].is_scalar()) {
+        if (transform->EnableArrayScalar) {
+          return ExecArrayScalar(ctx, transform, batch[0].array(), 
batch[1].scalar(),
+                                 out);
+        }
+      }
+    }
+    return Status::Invalid("Invalid combination of operands for binary string 
transform",
+                           " (", batch[0].ToString(), ", ", 
batch[1].ToString(),
+                           "). Only Array/Scalar kinds are supported.");

Review comment:
       I added it but maybe it is overkill.

##########
File path: docs/source/cpp/compute.rst
##########
@@ -812,45 +812,47 @@ The third set of functions examines string elements on a 
byte-per-byte basis:
 String transforms
 ~~~~~~~~~~~~~~~~~
 
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| Function name           | Arity | Input types            | Output type       
     | Options class                     | Notes |
-+=========================+=======+========================+========================+===================================+=======+
-| ascii_capitalize        | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_lower             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_reverse           | Unary | String-like            | String-like       
     |                                   | \(2)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_swapcase          | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_title             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_upper             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| binary_length           | Unary | Binary- or String-like | Int32 or Int64    
     |                                   | \(3)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| binary_replace_slice    | Unary | Binary- or String-like | Binary- or 
String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| replace_substring       | Unary | Binary- or String-like | Binary- or 
String-like | :struct:`ReplaceSubstringOptions` | \(5)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| replace_substring_regex | Unary | Binary- or String-like | Binary- or 
String-like | :struct:`ReplaceSubstringOptions` | \(6)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_capitalize         | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_length             | Unary | String-like            | Int32 or Int64    
     |                                   | \(7)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_lower              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_replace_slice      | Unary | String-like            | String-like       
     | :struct:`ReplaceSliceOptions`     | \(4)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_reverse            | Unary | String-like            | String-like       
     |                                   | \(9)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_swapcase           | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_title              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_upper              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| Function name           | Arity  | Input types                             | 
Output type            | Options class                     | Notes |
++=========================+========+=========================================+========================+===================================+=======+
+| ascii_capitalize        | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_lower             | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_reverse           | Unary  | String-like                             | 
String-like            |                                   | \(2)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_swapcase          | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_title             | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_upper             | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| binary_length           | Unary  | Binary- or String-like                  | 
Int32 or Int64         |                                   | \(3)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| binary_replace_slice    | Unary  | String-like                             | 
Binary- or String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| replace_substring       | Unary  | String-like                             | 
String-like            | :struct:`ReplaceSubstringOptions` | \(5)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| replace_substring_regex | Unary  | String-like                             | 
String-like            | :struct:`ReplaceSubstringOptions` | \(6)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| string_repeat           | Binary | Binary/String (Arg 0); Integral (Arg 1) | 
Binary- or String-like |                                   | \(7)  |

Review comment:
       Well, [from a previous 
discussion](https://ursalabs.zulipchat.com/#narrow/stream/180245-dev/topic/Stringlike.20kernels.20on.20binary.20data),
 I am following the pattern that a name with `string` expects/supports both 
binary and string encoded data. While the `binary` prefix only expects binary 
non-encoded data and `ascii/utf8` are for encoding-specific functions.
   There are two solutions to be consistent with functions that have either a 
`binary` or `string` prefix:
   1. Change them all to `binary`
       * `string_repeat` --> `binary_repeat`
       * `string_is_ascii` --> `binary_is_ascii`
   2. Change them all to `string` as they seem to support both binary/string 
types
       * `binary_length` --> `string_length`
       * `binary_replace_slice` --> `string_replace_slice`
       * `binary_join` --> `string_join`
       * `binary_join_element_wise` --> `string_join_element_wise`

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2513,6 +2877,159 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+/// An ScalarFunction that promotes integer arguments to Int64.
+struct ScalarCTypeToInt64Function : public ScalarFunction {
+  using ScalarFunction::ScalarFunction;
+
+  Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const 
override {
+    RETURN_NOT_OK(CheckArity(*values));
+
+    using arrow::compute::detail::DispatchExactImpl;
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+    EnsureDictionaryDecoded(values);
+
+    for (auto& descr : *values) {
+      if (is_integer(descr.type->id())) {
+        descr.type = int64();
+      }
+    }
+
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+    return arrow::compute::detail::NoMatchingKernel(this, *values);
+  }
+};
+
+template <typename Type1, typename Type2>
+struct StringRepeatTransform : public StringBinaryTransformBase<Type1, Type2> {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const int64_t 
num_repeats,
+                       Status*) override {
+    return input1_ncodeunits * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const ArrayType2& 
input2,
+                       Status* st) override {
+    int64_t total_num_repeats = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();
+        return num_repeats;
+      }
+      total_num_repeats += num_repeats;
+    }
+    return input1_ncodeunits * total_num_repeats;
+  }
+
+  int64_t MaxCodeunits(const ArrayType1& input1, const int64_t num_repeats,
+                       Status*) override {
+    return input1.total_values_length() * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const ArrayType1& input1, const ArrayType2& input2,
+                       Status* st) override {
+    int64_t total_codeunits = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();
+        return num_repeats;
+      }
+      total_codeunits += input1.GetView(i).length() * num_repeats;
+    }
+    return total_codeunits;
+  }
+
+  std::function<int64_t(const uint8_t*, const int64_t, const int64_t, 
uint8_t*, Status*)>
+      Transform;
+
+  static int64_t TransformSimple(const uint8_t* input,
+                                 const int64_t input_string_ncodeunits,
+                                 const int64_t num_repeats, uint8_t* output, 
Status*) {
+    uint8_t* output_start = output;
+    for (int64_t i = 0; i < num_repeats; ++i) {
+      std::memcpy(output, input, input_string_ncodeunits);
+      output += input_string_ncodeunits;
+    }
+    return output - output_start;
+  }
+
+  static int64_t TransformDoubling(const uint8_t* input,
+                                   const int64_t input_string_ncodeunits,
+                                   const int64_t num_repeats, uint8_t* output, 
Status*) {
+    uint8_t* output_start = output;
+    // Repeated doubling of string
+    std::memcpy(output, input, input_string_ncodeunits);
+    output += input_string_ncodeunits;
+    int64_t i = 1;
+    for (int64_t ilen = input_string_ncodeunits; i <= (num_repeats / 2);
+         i *= 2, ilen *= 2) {
+      std::memcpy(output, output_start, ilen);
+      output += ilen;
+    }
+
+    // Epilogue remainder
+    int64_t rem = (num_repeats ^ i) * input_string_ncodeunits;
+    std::memcpy(output, output_start, rem);
+    output += rem;
+    return output - output_start;
+  }
+
+  static int64_t TransformWrapper(const uint8_t* input,
+                                  const int64_t input_string_ncodeunits,
+                                  const int64_t num_repeats, uint8_t* output,
+                                  Status* st) {
+    auto transform = (num_repeats < 4) ? TransformSimple : TransformDoubling;
+    return transform(input, input_string_ncodeunits, num_repeats, output, st);
+  }
+
+  Status PreExec(KernelContext*, const ExecBatch& batch, Datum*) override {
+    // For cases with a scalar repeat count, select the best implementation 
once
+    // before execution. Otherwise, use TransformWrapper to select 
implementation
+    // when processing each value.

Review comment:
       I did not measured this so will run benchmarks to compare.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2513,6 +2877,159 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+/// An ScalarFunction that promotes integer arguments to Int64.
+struct ScalarCTypeToInt64Function : public ScalarFunction {
+  using ScalarFunction::ScalarFunction;
+
+  Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const 
override {
+    RETURN_NOT_OK(CheckArity(*values));
+
+    using arrow::compute::detail::DispatchExactImpl;
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+    EnsureDictionaryDecoded(values);
+
+    for (auto& descr : *values) {
+      if (is_integer(descr.type->id())) {
+        descr.type = int64();
+      }
+    }
+
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+    return arrow::compute::detail::NoMatchingKernel(this, *values);
+  }
+};
+
+template <typename Type1, typename Type2>
+struct StringRepeatTransform : public StringBinaryTransformBase<Type1, Type2> {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const int64_t 
num_repeats,
+                       Status*) override {
+    return input1_ncodeunits * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const ArrayType2& 
input2,
+                       Status* st) override {
+    int64_t total_num_repeats = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();
+        return num_repeats;
+      }
+      total_num_repeats += num_repeats;
+    }
+    return input1_ncodeunits * total_num_repeats;
+  }
+
+  int64_t MaxCodeunits(const ArrayType1& input1, const int64_t num_repeats,
+                       Status*) override {
+    return input1.total_values_length() * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const ArrayType1& input1, const ArrayType2& input2,
+                       Status* st) override {
+    int64_t total_codeunits = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();
+        return num_repeats;
+      }
+      total_codeunits += input1.GetView(i).length() * num_repeats;
+    }
+    return total_codeunits;
+  }
+
+  std::function<int64_t(const uint8_t*, const int64_t, const int64_t, 
uint8_t*, Status*)>
+      Transform;

Review comment:
       Needs to be `Transform` because the string binary exec expects 
`StringTransforms` to have a `Transform` method to the given signature.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2513,6 +2877,159 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+/// An ScalarFunction that promotes integer arguments to Int64.
+struct ScalarCTypeToInt64Function : public ScalarFunction {
+  using ScalarFunction::ScalarFunction;
+
+  Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const 
override {
+    RETURN_NOT_OK(CheckArity(*values));
+
+    using arrow::compute::detail::DispatchExactImpl;
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+    EnsureDictionaryDecoded(values);
+
+    for (auto& descr : *values) {
+      if (is_integer(descr.type->id())) {
+        descr.type = int64();
+      }
+    }
+
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+    return arrow::compute::detail::NoMatchingKernel(this, *values);
+  }
+};
+
+template <typename Type1, typename Type2>
+struct StringRepeatTransform : public StringBinaryTransformBase<Type1, Type2> {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const int64_t 
num_repeats,
+                       Status*) override {
+    return input1_ncodeunits * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const ArrayType2& 
input2,
+                       Status* st) override {
+    int64_t total_num_repeats = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();

Review comment:
       I was simply following the convention used for the unary string 
transform cases, but it definitely fits the bill here. Thanks!

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -2513,6 +2877,159 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }
 
+/// An ScalarFunction that promotes integer arguments to Int64.
+struct ScalarCTypeToInt64Function : public ScalarFunction {
+  using ScalarFunction::ScalarFunction;
+
+  Result<const Kernel*> DispatchBest(std::vector<ValueDescr>* values) const 
override {
+    RETURN_NOT_OK(CheckArity(*values));
+
+    using arrow::compute::detail::DispatchExactImpl;
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+
+    EnsureDictionaryDecoded(values);
+
+    for (auto& descr : *values) {
+      if (is_integer(descr.type->id())) {
+        descr.type = int64();
+      }
+    }
+
+    if (auto kernel = DispatchExactImpl(this, *values)) return kernel;
+    return arrow::compute::detail::NoMatchingKernel(this, *values);
+  }
+};
+
+template <typename Type1, typename Type2>
+struct StringRepeatTransform : public StringBinaryTransformBase<Type1, Type2> {
+  using ArrayType1 = typename TypeTraits<Type1>::ArrayType;
+  using ArrayType2 = typename TypeTraits<Type2>::ArrayType;
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const int64_t 
num_repeats,
+                       Status*) override {
+    return input1_ncodeunits * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const int64_t input1_ncodeunits, const ArrayType2& 
input2,
+                       Status* st) override {
+    int64_t total_num_repeats = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();
+        return num_repeats;
+      }
+      total_num_repeats += num_repeats;
+    }
+    return input1_ncodeunits * total_num_repeats;
+  }
+
+  int64_t MaxCodeunits(const ArrayType1& input1, const int64_t num_repeats,
+                       Status*) override {
+    return input1.total_values_length() * num_repeats;
+  }
+
+  int64_t MaxCodeunits(const ArrayType1& input1, const ArrayType2& input2,
+                       Status* st) override {
+    int64_t total_codeunits = 0;
+    for (int64_t i = 0; i < input2.length(); ++i) {
+      auto num_repeats = input2.GetView(i);
+      if (num_repeats < 0) {
+        *st = InvalidRepeatCount();
+        return num_repeats;
+      }
+      total_codeunits += input1.GetView(i).length() * num_repeats;
+    }
+    return total_codeunits;
+  }
+
+  std::function<int64_t(const uint8_t*, const int64_t, const int64_t, 
uint8_t*, Status*)>
+      Transform;
+
+  static int64_t TransformSimple(const uint8_t* input,
+                                 const int64_t input_string_ncodeunits,
+                                 const int64_t num_repeats, uint8_t* output, 
Status*) {
+    uint8_t* output_start = output;
+    for (int64_t i = 0; i < num_repeats; ++i) {
+      std::memcpy(output, input, input_string_ncodeunits);
+      output += input_string_ncodeunits;
+    }
+    return output - output_start;
+  }
+
+  static int64_t TransformDoubling(const uint8_t* input,
+                                   const int64_t input_string_ncodeunits,
+                                   const int64_t num_repeats, uint8_t* output, 
Status*) {
+    uint8_t* output_start = output;
+    // Repeated doubling of string
+    std::memcpy(output, input, input_string_ncodeunits);
+    output += input_string_ncodeunits;
+    int64_t i = 1;
+    for (int64_t ilen = input_string_ncodeunits; i <= (num_repeats / 2);
+         i *= 2, ilen *= 2) {
+      std::memcpy(output, output_start, ilen);
+      output += ilen;
+    }
+
+    // Epilogue remainder
+    int64_t rem = (num_repeats ^ i) * input_string_ncodeunits;
+    std::memcpy(output, output_start, rem);
+    output += rem;
+    return output - output_start;
+  }
+
+  static int64_t TransformWrapper(const uint8_t* input,
+                                  const int64_t input_string_ncodeunits,
+                                  const int64_t num_repeats, uint8_t* output,
+                                  Status* st) {
+    auto transform = (num_repeats < 4) ? TransformSimple : TransformDoubling;
+    return transform(input, input_string_ncodeunits, num_repeats, output, st);
+  }
+
+  Status PreExec(KernelContext*, const ExecBatch& batch, Datum*) override {
+    // For cases with a scalar repeat count, select the best implementation 
once
+    // before execution. Otherwise, use TransformWrapper to select 
implementation
+    // when processing each value.

Review comment:
       Using `std::function` indirection resulted in 2x slower, so good 
call/intuition on this one.
   ```
   StringRepeat_mean    622822087 ns    622817699 ns           10 
bytes_per_second=25.4509M/s items_per_second=1.68498M/s
   StringRepeat_median  623393064 ns    623390528 ns           10 
bytes_per_second=25.4067M/s items_per_second=1.68205M/s
   StringRepeat_stddev   18771545 ns     18770743 ns           10 
bytes_per_second=787.511k/s items_per_second=50.9153k/s
   ```
   Checking `num_repeats < 4` at each iteration
   ```
   StringRepeat_mean    313125674 ns    313123902 ns           10 
bytes_per_second=50.601M/s items_per_second=3.35004M/s
   StringRepeat_median  312795031 ns    312794088 ns           10 
bytes_per_second=50.6405M/s items_per_second=3.35266M/s
   StringRepeat_stddev    6484104 ns      6484645 ns           10 
bytes_per_second=1068k/s items_per_second=69.0502k/s
   ```

##########
File path: docs/source/cpp/compute.rst
##########
@@ -812,45 +812,47 @@ The third set of functions examines string elements on a 
byte-per-byte basis:
 String transforms
 ~~~~~~~~~~~~~~~~~
 
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| Function name           | Arity | Input types            | Output type       
     | Options class                     | Notes |
-+=========================+=======+========================+========================+===================================+=======+
-| ascii_capitalize        | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_lower             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_reverse           | Unary | String-like            | String-like       
     |                                   | \(2)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_swapcase          | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_title             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| ascii_upper             | Unary | String-like            | String-like       
     |                                   | \(1)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| binary_length           | Unary | Binary- or String-like | Int32 or Int64    
     |                                   | \(3)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| binary_replace_slice    | Unary | Binary- or String-like | Binary- or 
String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| replace_substring       | Unary | Binary- or String-like | Binary- or 
String-like | :struct:`ReplaceSubstringOptions` | \(5)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| replace_substring_regex | Unary | Binary- or String-like | Binary- or 
String-like | :struct:`ReplaceSubstringOptions` | \(6)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_capitalize         | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_length             | Unary | String-like            | Int32 or Int64    
     |                                   | \(7)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_lower              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_replace_slice      | Unary | String-like            | String-like       
     | :struct:`ReplaceSliceOptions`     | \(4)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_reverse            | Unary | String-like            | String-like       
     |                                   | \(9)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_swapcase           | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_title              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
-| utf8_upper              | Unary | String-like            | String-like       
     |                                   | \(8)  |
-+-------------------------+-------+------------------------+------------------------+-----------------------------------+-------+
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| Function name           | Arity  | Input types                             | 
Output type            | Options class                     | Notes |
++=========================+========+=========================================+========================+===================================+=======+
+| ascii_capitalize        | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_lower             | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_reverse           | Unary  | String-like                             | 
String-like            |                                   | \(2)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_swapcase          | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_title             | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| ascii_upper             | Unary  | String-like                             | 
String-like            |                                   | \(1)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| binary_length           | Unary  | Binary- or String-like                  | 
Int32 or Int64         |                                   | \(3)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| binary_replace_slice    | Unary  | String-like                             | 
Binary- or String-like | :struct:`ReplaceSliceOptions`     | \(4)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| replace_substring       | Unary  | String-like                             | 
String-like            | :struct:`ReplaceSubstringOptions` | \(5)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| replace_substring_regex | Unary  | String-like                             | 
String-like            | :struct:`ReplaceSubstringOptions` | \(6)  |
++-------------------------+--------+-----------------------------------------+------------------------+-----------------------------------+-------+
+| string_repeat           | Binary | Binary/String (Arg 0); Integral (Arg 1) | 
Binary- or String-like |                                   | \(7)  |

Review comment:
       I think solution 2 is better.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -19,6 +19,7 @@
 #include <cctype>
 #include <iterator>
 #include <string>
+#include <typeinfo>

Review comment:
       No, I used it when trying to print the StringTransform type using 
`typeid(t).name()` but it printed more info than needed.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -19,6 +19,7 @@
 #include <cctype>
 #include <iterator>
 #include <string>
+#include <typeinfo>

Review comment:
       No, I used it when trying to print the `StringTransform` type using 
`typeid(t).name()` but it printed more info than needed.

##########
File path: cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
##########
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include <vector>

Review comment:
       Not for this PR, so I will revert. Nevertheless, I have noticed that 
there are several imports missing and probably some extra in several files. I 
think this should be its own JIRA issue.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to