This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new a2a9f5d  ARROW-10306: [C++] Add string replacement kernel
a2a9f5d is described below

commit a2a9f5d0b70d304817e65e32fbe1deef98ca7cb8
Author: Maarten A. Breddels <[email protected]>
AuthorDate: Thu Mar 25 19:17:21 2021 -0700

    ARROW-10306: [C++] Add string replacement kernel
    
    Two new kernels
     * replace_substring like Python's str.replace
     * replace_substring_re2  like Python's re.sub
    
    Closes #8468 from maartenbreddels/ARROW-10306
    
    Lead-authored-by: Maarten A. Breddels <[email protected]>
    Co-authored-by: Neal Richardson <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Neal Richardson <[email protected]>
---
 ci/scripts/PKGBUILD                                |   3 +
 cpp/src/arrow/compute/api_scalar.h                 |  13 ++
 cpp/src/arrow/compute/kernels/scalar_string.cc     | 203 +++++++++++++++++++++
 .../arrow/compute/kernels/scalar_string_test.cc    |  54 ++++++
 docs/source/cpp/compute.rst                        |  51 ++++--
 python/pyarrow/_compute.pyx                        |  20 ++
 python/pyarrow/compute.py                          |   1 +
 python/pyarrow/includes/libarrow.pxd               |   8 +
 python/pyarrow/tests/test_compute.py               |  12 ++
 r/configure.win                                    |   6 +-
 10 files changed, 351 insertions(+), 20 deletions(-)

diff --git a/ci/scripts/PKGBUILD b/ci/scripts/PKGBUILD
index 1d9e41b..c5b55ee 100644
--- a/ci/scripts/PKGBUILD
+++ b/ci/scripts/PKGBUILD
@@ -79,8 +79,10 @@ build() {
     export CPPFLAGS="${CPPFLAGS} -I${MINGW_PREFIX}/include"
     export LIBS="-L${MINGW_PREFIX}/libs"
     export ARROW_S3=OFF
+    export ARROW_WITH_RE2=OFF
   else
     export ARROW_S3=ON
+    export ARROW_WITH_RE2=ON
   fi
 
   MSYS2_ARG_CONV_EXCL="-DCMAKE_INSTALL_PREFIX=" \
@@ -105,6 +107,7 @@ build() {
     -DARROW_SNAPPY_USE_SHARED=OFF \
     -DARROW_USE_GLOG=OFF \
     -DARROW_WITH_LZ4=ON \
+    -DARROW_WITH_RE2="${ARROW_WITH_RE2}" \
     -DARROW_WITH_SNAPPY=ON \
     -DARROW_WITH_ZLIB=ON \
     -DARROW_WITH_ZSTD=ON \
diff --git a/cpp/src/arrow/compute/api_scalar.h 
b/cpp/src/arrow/compute/api_scalar.h
index 0d95092..730836b 100644
--- a/cpp/src/arrow/compute/api_scalar.h
+++ b/cpp/src/arrow/compute/api_scalar.h
@@ -68,6 +68,19 @@ struct ARROW_EXPORT SplitPatternOptions : public 
SplitOptions {
   std::string pattern;
 };
 
+struct ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+  explicit ReplaceSubstringOptions(std::string pattern, std::string 
replacement,
+                                   int64_t max_replacements = -1)
+      : pattern(pattern), replacement(replacement), 
max_replacements(max_replacements) {}
+
+  /// Pattern to match, literal, or regular expression depending on which 
kernel is used
+  std::string pattern;
+  /// String to replace the pattern with
+  std::string replacement;
+  /// Max number of substrings to replace (-1 means unbounded)
+  int64_t max_replacements;
+};
+
 /// Options for IsIn and IndexIn functions
 struct ARROW_EXPORT SetLookupOptions : public FunctionOptions {
   explicit SetLookupOptions(Datum value_set, bool skip_nulls = false)
diff --git a/cpp/src/arrow/compute/kernels/scalar_string.cc 
b/cpp/src/arrow/compute/kernels/scalar_string.cc
index 88c91a1..3986987 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string.cc
@@ -23,6 +23,10 @@
 #include <utf8proc.h>
 #endif
 
+#ifdef ARROW_WITH_RE2
+#include <re2/re2.h>
+#endif
+
 #include "arrow/array/builder_binary.h"
 #include "arrow/array/builder_nested.h"
 #include "arrow/buffer_builder.h"
@@ -1231,6 +1235,197 @@ void AddSplit(FunctionRegistry* registry) {
 }
 
 // ----------------------------------------------------------------------
+// Replace substring (plain, regex)
+
+template <typename Type, typename Replacer>
+struct ReplaceSubString {
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using offset_type = typename Type::offset_type;
+  using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
+  using OffsetBuilder = TypedBufferBuilder<offset_type>;
+  using State = OptionsWrapper<ReplaceSubstringOptions>;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // TODO Cache replacer across invocations (for regex compilation)
+    Replacer replacer{ctx, State::Get(ctx)};
+    if (!ctx->HasError()) {
+      Replace(ctx, batch, &replacer, out);
+    }
+  }
+
+  static void Replace(KernelContext* ctx, const ExecBatch& batch, Replacer* 
replacer,
+                      Datum* out) {
+    ValueDataBuilder value_data_builder(ctx->memory_pool());
+    OffsetBuilder offset_builder(ctx->memory_pool());
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      // We already know how many strings we have, so we can use 
Reserve/UnsafeAppend
+      KERNEL_RETURN_IF_ERROR(ctx, 
offset_builder.Reserve(batch[0].array()->length));
+      offset_builder.UnsafeAppend(0);  // offsets start at 0
+
+      const ArrayData& input = *batch[0].array();
+      KERNEL_RETURN_IF_ERROR(
+          ctx, VisitArrayDataInline<Type>(
+                   input,
+                   [&](util::string_view s) {
+                     RETURN_NOT_OK(replacer->ReplaceString(s, 
&value_data_builder));
+                     offset_builder.UnsafeAppend(
+                         
static_cast<offset_type>(value_data_builder.length()));
+                     return Status::OK();
+                   },
+                   [&]() {
+                     // offset for null value
+                     offset_builder.UnsafeAppend(
+                         
static_cast<offset_type>(value_data_builder.length()));
+                     return Status::OK();
+                   }));
+      ArrayData* output = out->mutable_array();
+      KERNEL_RETURN_IF_ERROR(ctx, 
value_data_builder.Finish(&output->buffers[2]));
+      KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Finish(&output->buffers[1]));
+    } else {
+      const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+      auto result = std::make_shared<ScalarType>();
+      if (input.is_valid) {
+        util::string_view s = static_cast<util::string_view>(*input.value);
+        KERNEL_RETURN_IF_ERROR(ctx, replacer->ReplaceString(s, 
&value_data_builder));
+        KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&result->value));
+        result->is_valid = true;
+      }
+      out->value = result;
+    }
+  }
+};
+
+struct PlainSubStringReplacer {
+  const ReplaceSubstringOptions& options_;
+
+  PlainSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& 
options)
+      : options_(options) {}
+
+  Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* 
builder) {
+    const char* i = s.begin();
+    const char* end = s.end();
+    int64_t max_replacements = options_.max_replacements;
+    while ((i < end) && (max_replacements != 0)) {
+      const char* pos =
+          std::search(i, end, options_.pattern.begin(), 
options_.pattern.end());
+      if (pos == end) {
+        RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+                                      static_cast<int64_t>(end - i)));
+        i = end;
+      } else {
+        // the string before the pattern
+        RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+                                      static_cast<int64_t>(pos - i)));
+        // the replacement
+        RETURN_NOT_OK(
+            builder->Append(reinterpret_cast<const 
uint8_t*>(options_.replacement.data()),
+                            options_.replacement.length()));
+        // skip pattern
+        i = pos + options_.pattern.length();
+        max_replacements--;
+      }
+    }
+    // if we exited early due to max_replacements, add the trailing part
+    RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+                                  static_cast<int64_t>(end - i)));
+    return Status::OK();
+  }
+};
+
+#ifdef ARROW_WITH_RE2
+struct RegexSubStringReplacer {
+  const ReplaceSubstringOptions& options_;
+  const RE2 regex_find_;
+  const RE2 regex_replacement_;
+
+  // Using RE2::FindAndConsume we can only find the pattern if it is a group, 
therefore
+  // we have 2 regexes, one with () around it, one without.
+  RegexSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& 
options)
+      : options_(options),
+        regex_find_("(" + options_.pattern + ")"),
+        regex_replacement_(options_.pattern) {
+    if (!(regex_find_.ok() && regex_replacement_.ok())) {
+      ctx->SetStatus(Status::Invalid("Regular expression error"));
+      return;
+    }
+  }
+
+  Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* 
builder) {
+    re2::StringPiece replacement(options_.replacement);
+    if (options_.max_replacements == -1) {
+      std::string s_copy(s.to_string());
+      re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
+      RETURN_NOT_OK(builder->Append(reinterpret_cast<const 
uint8_t*>(s_copy.data()),
+                                    s_copy.length()));
+      return Status::OK();
+    }
+
+    // Since RE2 does not have the concept of max_replacements, we have to do 
some work
+    // ourselves.
+    // We might do this faster similar to RE2::GlobalReplace using Match and 
Rewrite
+    const char* i = s.begin();
+    const char* end = s.end();
+    re2::StringPiece piece(s.data(), s.length());
+
+    int64_t max_replacements = options_.max_replacements;
+    while ((i < end) && (max_replacements != 0)) {
+      std::string found;
+      if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
+        RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+                                      static_cast<int64_t>(end - i)));
+        i = end;
+      } else {
+        // wind back to the beginning of the match
+        const char* pos = piece.begin() - found.length();
+        // the string before the pattern
+        RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+                                      static_cast<int64_t>(pos - i)));
+        // replace the pattern in what we found
+        if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
+          return Status::Invalid("Regex found, but replacement failed");
+        }
+        RETURN_NOT_OK(builder->Append(reinterpret_cast<const 
uint8_t*>(found.data()),
+                                      static_cast<int64_t>(found.length())));
+        // skip pattern
+        i = piece.begin();
+        max_replacements--;
+      }
+    }
+    // If we exited early due to max_replacements, add the trailing part
+    RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
+                                  static_cast<int64_t>(end - i)));
+    return Status::OK();
+  }
+};
+#endif
+
+template <typename Type>
+using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;
+
+const FunctionDoc replace_substring_doc(
+    "Replace non-overlapping substrings that match pattern by replacement",
+    ("For each string in `strings`, replace non-overlapping substrings that 
match\n"
+     "`pattern` by `replacement`. If `max_replacements != -1`, it determines 
the\n"
+     "maximum amount of replacements made, counting from the left. Null values 
emit\n"
+     "null."),
+    {"strings"}, "ReplaceSubstringOptions");
+
+#ifdef ARROW_WITH_RE2
+template <typename Type>
+using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;
+
+const FunctionDoc replace_substring_regex_doc(
+    "Replace non-overlapping substrings that match regex `pattern` by 
`replacement`",
+    ("For each string in `strings`, replace non-overlapping substrings that 
match the\n"
+     "regular expression `pattern` by `replacement` using the Google RE2 
library.\n"
+     "If `max_replacements != -1`, it determines the maximum amount of 
replacements\n"
+     "made, counting from the left. Note that if the pattern contains 
groups,\n"
+     "backreferencing can be used. Null values emit null."),
+    {"strings"}, "ReplaceSubstringOptions");
+#endif
+
+// ----------------------------------------------------------------------
 // strptime string parsing
 
 using StrptimeState = OptionsWrapper<StrptimeOptions>;
@@ -1904,6 +2099,14 @@ void RegisterScalarStringAscii(FunctionRegistry* 
registry) {
   AddBinaryLength(registry);
   AddUtf8Length(registry);
   AddMatchSubstring(registry);
+  MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
+      "replace_substring", registry, &replace_substring_doc,
+      MemAllocation::NO_PREALLOCATE);
+#ifdef ARROW_WITH_RE2
+  MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
+      "replace_substring_regex", registry, &replace_substring_regex_doc,
+      MemAllocation::NO_PREALLOCATE);
+#endif
   AddStrptime(registry);
 }
 
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 281fcb5..88622e8 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -48,6 +48,14 @@ class BaseTestStringKernels : public ::testing::Test {
     CheckScalarUnary(func_name, type(), json_input, out_ty, json_expected, 
options);
   }
 
+  void CheckBinaryScalar(std::string func_name, std::string json_left_input,
+                         std::string json_right_scalar, 
std::shared_ptr<DataType> out_ty,
+                         std::string json_expected,
+                         const FunctionOptions* options = nullptr) {
+    CheckScalarBinaryScalar(func_name, type(), json_left_input, 
json_right_scalar, out_ty,
+                            json_expected, options);
+  }
+
   std::shared_ptr<DataType> type() { return 
TypeTraits<TestType>::type_singleton(); }
 
   std::shared_ptr<DataType> offset_type() {
@@ -422,6 +430,52 @@ TYPED_TEST(TestStringKernels, SplitWhitespaceUTF8Reverse) {
                    &options_max);
 }
 
+TYPED_TEST(TestStringKernels, ReplaceSubstring) {
+  ReplaceSubstringOptions options{"foo", "bazz"};
+  this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", 
null])",
+                   this->type(), R"(["bazz", "this bazz that bazz", null])", 
&options);
+}
+
+TYPED_TEST(TestStringKernels, ReplaceSubstringLimited) {
+  ReplaceSubstringOptions options{"foo", "bazz", 1};
+  this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", 
null])",
+                   this->type(), R"(["bazz", "this bazz that foo", null])", 
&options);
+}
+
+TYPED_TEST(TestStringKernels, ReplaceSubstringNoOptions) {
+  Datum input = ArrayFromJSON(this->type(), "[]");
+  ASSERT_RAISES(Invalid, CallFunction("replace_substring", {input}));
+}
+
+#ifdef ARROW_WITH_RE2
+TYPED_TEST(TestStringKernels, ReplaceSubstringRegex) {
+  ReplaceSubstringOptions options_regex{"(fo+)\\s*", "\\1-bazz"};
+  this->CheckUnary("replace_substring_regex", R"(["foo ", "this foo   that 
foo", null])",
+                   this->type(), R"(["foo-bazz", "this foo-bazzthat foo-bazz", 
null])",
+                   &options_regex);
+  // make sure we match non-overlapping
+  ReplaceSubstringOptions options_regex2{"(a.a)", "aba\\1"};
+  this->CheckUnary("replace_substring_regex", R"(["aaaaaa"])", this->type(),
+                   R"(["abaaaaabaaaa"])", &options_regex2);
+}
+
+TYPED_TEST(TestStringKernels, ReplaceSubstringRegexLimited) {
+  // With a finite number of replacements
+  ReplaceSubstringOptions options1{"foo", "bazz", 1};
+  this->CheckUnary("replace_substring", R"(["foo", "this foo that foo", 
null])",
+                   this->type(), R"(["bazz", "this bazz that foo", null])", 
&options1);
+  ReplaceSubstringOptions options_regex1{"(fo+)\\s*", "\\1-bazz", 1};
+  this->CheckUnary("replace_substring_regex", R"(["foo ", "this foo   that 
foo", null])",
+                   this->type(), R"(["foo-bazz", "this foo-bazzthat foo", 
null])",
+                   &options_regex1);
+}
+
+TYPED_TEST(TestStringKernels, ReplaceSubstringRegexNoOptions) {
+  Datum input = ArrayFromJSON(this->type(), "[]");
+  ASSERT_RAISES(Invalid, CallFunction("replace_substring_regex", {input}));
+}
+#endif
+
 TYPED_TEST(TestStringKernels, Strptime) {
   std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
   std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst
index e4eaa94..065b807 100644
--- a/docs/source/cpp/compute.rst
+++ b/docs/source/cpp/compute.rst
@@ -426,21 +426,25 @@ The third set of functions examines string elements on a 
byte-per-byte basis:
 String transforms
 ~~~~~~~~~~~~~~~~~
 
-+--------------------------+------------+-------------------------+---------------------+---------+
-| Function name            | Arity      | Input types             | Output 
type         | Notes   |
-+==========================+============+=========================+=====================+=========+
-| ascii_lower              | Unary      | String-like             | 
String-like         | \(1)    |
-+--------------------------+------------+-------------------------+---------------------+---------+
-| ascii_upper              | Unary      | String-like             | 
String-like         | \(1)    |
-+--------------------------+------------+-------------------------+---------------------+---------+
-| binary_length            | Unary      | Binary- or String-like  | Int32 or 
Int64      | \(2)    |
-+--------------------------+------------+-------------------------+---------------------+---------+
-| utf8_length              | Unary      | String-like             | Int32 or 
Int64      | \(3)    |
-+--------------------------+------------+-------------------------+---------------------+---------+
-| utf8_lower               | Unary      | String-like             | 
String-like         | \(4)    |
-+--------------------------+------------+-------------------------+---------------------+---------+
-| utf8_upper               | Unary      | String-like             | 
String-like         | \(4)    |
-+--------------------------+------------+-------------------------+---------------------+---------+
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| Function name            | Arity      | Input types             | Output 
type         | Notes   | Options class                         |
++==========================+============+=========================+=====================+=========+=======================================+
+| ascii_lower              | Unary      | String-like             | 
String-like         | \(1)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| ascii_upper              | Unary      | String-like             | 
String-like         | \(1)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| binary_length            | Unary      | Binary- or String-like  | Int32 or 
Int64      | \(2)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| replace_substring        | Unary      | String-like             | 
String-like         | \(3)    | :struct:`ReplaceSubstringOptions`     |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| replace_substring_regex  | Unary      | String-like             | 
String-like         | \(4)    | :struct:`ReplaceSubstringOptions`     |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| utf8_length              | Unary      | String-like             | Int32 or 
Int64      | \(5)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| utf8_lower               | Unary      | String-like             | 
String-like         | \(6)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
+| utf8_upper               | Unary      | String-like             | 
String-like         | \(6)    |                                       |
++--------------------------+------------+-------------------------+---------------------+---------+---------------------------------------+
 
 
 * \(1) Each ASCII character in the input is converted to lowercase or
@@ -449,10 +453,23 @@ String transforms
 * \(2) Output is the physical length in bytes of each input element.  Output
   type is Int32 for Binary / String, Int64 for LargeBinary / LargeString.
 
-* \(3) Output is the number of characters (not bytes) of each input element.
+* \(3) Replace non-overlapping substrings that match
+  :member:`ReplaceSubstringOptions::pattern` by
+  :member:`ReplaceSubstringOptions::replacement`. If
+  :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
+  maximum number of replacements made, counting from the left.
+
+* \(4) Replace non-overlapping substrings that match the regular expression
+  :member:`ReplaceSubstringOptions::pattern` by
+  :member:`ReplaceSubstringOptions::replacement`, using the Google RE2 
library. If
+  :member:`ReplaceSubstringOptions::max_replacements` != -1, it determines the
+  maximum number of replacements made, counting from the left. Note that if the
+  pattern contains groups, backreferencing can be used.
+
+* \(5) Output is the number of characters (not bytes) of each input element.
   Output type is Int32 for String, Int64 for LargeString. 
 
-* \(4) Each UTF8-encoded character in the input is converted to lowercase or
+* \(6) Each UTF8-encoded character in the input is converted to lowercase or
   uppercase.
 
 
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index f3a8eb8..1515bdc 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -684,6 +684,26 @@ class TrimOptions(_TrimOptions):
         self._set_options(characters)
 
 
+cdef class _ReplaceSubstringOptions(FunctionOptions):
+    cdef:
+        unique_ptr[CReplaceSubstringOptions] replace_substring_options
+
+    cdef const CFunctionOptions* get_options(self) except NULL:
+        return self.replace_substring_options.get()
+
+    def _set_options(self, pattern, replacement, max_replacements):
+        self.replace_substring_options.reset(
+            new CReplaceSubstringOptions(tobytes(pattern),
+                                         tobytes(replacement),
+                                         max_replacements)
+        )
+
+
+class ReplaceSubstringOptions(_ReplaceSubstringOptions):
+    def __init__(self, pattern, replacement, max_replacements=-1):
+        self._set_options(pattern, replacement, max_replacements)
+
+
 cdef class _FilterOptions(FunctionOptions):
     cdef:
         unique_ptr[CFilterOptions] filter_options
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index 2cdd843..1b46a08 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -42,6 +42,7 @@ from pyarrow._compute import (  # noqa
     PartitionNthOptions,
     ProjectOptions,
     QuantileOptions,
+    ReplaceSubstringOptions,
     SetLookupOptions,
     SortOptions,
     StrptimeOptions,
diff --git a/python/pyarrow/includes/libarrow.pxd 
b/python/pyarrow/includes/libarrow.pxd
index 61deb65..ebdcd08 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1815,6 +1815,14 @@ cdef extern from "arrow/compute/api.h" namespace 
"arrow::compute" nogil:
                              c_bool reverse)
         c_string pattern
 
+    cdef cppclass CReplaceSubstringOptions \
+            "arrow::compute::ReplaceSubstringOptions"(CFunctionOptions):
+        CReplaceSubstringOptions(c_string pattern, c_string replacement,
+                                 int64_t max_replacements)
+        c_string pattern
+        c_string replacement
+        int64_t max_replacements
+
     cdef cppclass CCastOptions" arrow::compute::CastOptions"(CFunctionOptions):
         CCastOptions()
         CCastOptions(c_bool safe)
diff --git a/python/pyarrow/tests/test_compute.py 
b/python/pyarrow/tests/test_compute.py
index 112629f..160375f 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -579,6 +579,18 @@ def test_string_py_compat_boolean(function_name, variant):
             assert arrow_func(ar)[0].as_py() == getattr(c, py_name)()
 
 
+def test_replace_plain():
+    ar = pa.array(['foo', 'food', None])
+    ar = pc.replace_substring(ar, pattern='foo', replacement='bar')
+    assert ar.tolist() == ['bar', 'bard', None]
+
+
+def test_replace_regex():
+    ar = pa.array(['foo', 'mood', None])
+    ar = pc.replace_substring_regex(ar, pattern='(.)oo', replacement=r'\100')
+    assert ar.tolist() == ['f00', 'm00d', None]
+
+
 @pytest.mark.parametrize(('ty', 'values'), all_array_types)
 def test_take(ty, values):
     arr = pa.array(values, type=ty)
diff --git a/r/configure.win b/r/configure.win
index 88ac0e1..d645834 100644
--- a/r/configure.win
+++ b/r/configure.win
@@ -50,13 +50,13 @@ AWS_LIBS="-laws-cpp-sdk-config -laws-cpp-sdk-transfer 
-laws-cpp-sdk-identity-man
 # NOTE: If you make changes to the libraries below, you should also change
 # ci/scripts/r_windows_build.sh and ci/scripts/PKGBUILD
 PKG_CFLAGS="-I${RWINLIB}/include -DARROW_STATIC -DPARQUET_STATIC 
-DARROW_DS_STATIC -DARROW_R_WITH_ARROW -DARROW_R_WITH_PARQUET 
-DARROW_R_WITH_DATASET"
-PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) 
'"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow_dataset -larrow 
-larrow_bundled_dependencies -lutf8proc -lre2 -lthrift -lsnappy -lz -lzstd 
-llz4 ${MIMALLOC_LIBS} ${OPENSSL_LIBS}"
+PKG_LIBS="-L${RWINLIB}/lib"'$(subst gcc,,$(COMPILED_BY))$(R_ARCH) 
'"-L${RWINLIB}/lib"'$(R_ARCH) '"-lparquet -larrow_dataset -larrow 
-larrow_bundled_dependencies -lutf8proc -lthrift -lsnappy -lz -lzstd -llz4 
${MIMALLOC_LIBS} ${OPENSSL_LIBS}"
 
-# S3 support only for Rtools40 (i.e. R >= 4.0)
+# S3 and re2 support only for Rtools40 (i.e. R >= 4.0)
 "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'R.version$major >= 4' | grep TRUE 
>/dev/null 2>&1
 if [ $? -eq 0 ]; then
   PKG_CFLAGS="${PKG_CFLAGS} -DARROW_R_WITH_S3"
-  PKG_LIBS="${PKG_LIBS} ${AWS_LIBS}"
+  PKG_LIBS="${PKG_LIBS} -lre2 ${AWS_LIBS}"
 else
   # It seems that order matters
   PKG_LIBS="${PKG_LIBS} -lws2_32"

Reply via email to