[GitHub] [arrow] pitrou commented on a change in pull request #9000: ARROW-10557: [C++] Add scalar string slicing/substring kernel

GitBox Tue, 16 Mar 2021 07:43:34 -0700


pitrou commented on a change in pull request #9000:
URL: https://github.com/apache/arrow/pull/9000#discussion_r595224446




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -427,6 +427,189 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \
+  do {                                \
+    if (ARROW_PREDICT_FALSE(!expr)) { \
+      return false;                   \
+    }                                 \
+  } while (0)
+
+template <typename Type>
+struct SliceCodeunits : SliceBase<Type, SliceCodeunits<Type>> {
+  using Base = SliceBase<Type, SliceCodeunits<Type>>;
+  using offset_type = typename Base::offset_type;
+  using Base::Base;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* begin_sliced = begin;
+    const uint8_t* end_sliced = end;
+    SliceOptions& options = this->options;
+
+    if (options.step >= 1) {

Review comment:
       The combinatory explosion in `if` branches seems a bit gratuitous? 
Abstractly it seems you should be able to do:
   ```c++
   // Compute begin_sliced
   if (options.start >= 0) {
     PROPAGATE_FALSE(
               arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, 
options.start));
   } else {
     PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
               begin, end, &begin_sliced, -options.start));
   }
   // Compute end_sliced
   if (options.stop >= 0) {
     PROPAGATE_FALSE(
               arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, 
options.stop));
   } else {
     PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
               begin, end, &end_sliced, -options.stop));
   }
   if (options.step > 0) {
     if (end_sliced < begin_sliced) {
       // empty slice
       *output_written = 0;
       return true;
     }
     // ...
   }
   else {
     DCHECK_LT(options.step, 0);
     if (end_sliced > begin_sliced) {
       // empty slice
       *output_written = 0;
       return true;
     }
     // ...
   }
   ```
   

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -462,6 +462,107 @@ TYPED_TEST(TestStringKernels, TrimUTF8) {
 }
 #endif
 
+// produce test data with e.g.:
+// repr([k[-3:1] for k in ["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"]]).replace("'", '"')
+
+#ifdef ARROW_WITH_UTF8PROC
+TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
+                   this->type(), R"(["o", "", null, "o "])", &options);
+  SliceOptions options_2{2, 3};
+  // ensure we slice in codeunits, not graphmemes

Review comment:
       "graphemes"

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -427,6 +427,189 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \
+  do {                                \
+    if (ARROW_PREDICT_FALSE(!expr)) { \
+      return false;                   \
+    }                                 \
+  } while (0)
+
+template <typename Type>
+struct SliceCodeunits : SliceBase<Type, SliceCodeunits<Type>> {
+  using Base = SliceBase<Type, SliceCodeunits<Type>>;
+  using offset_type = typename Base::offset_type;
+  using Base::Base;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* begin_sliced = begin;
+    const uint8_t* end_sliced = end;
+    SliceOptions& options = this->options;

Review comment:
       `const&`

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -462,6 +462,107 @@ TYPED_TEST(TestStringKernels, TrimUTF8) {
 }
 #endif
 
+// produce test data with e.g.:
+// repr([k[-3:1] for k in ["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"]]).replace("'", '"')
+
+#ifdef ARROW_WITH_UTF8PROC
+TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
+                   this->type(), R"(["o", "", null, "o "])", &options);
+  SliceOptions options_2{2, 3};
+  // ensure we slice in codeunits, not graphmemes
+  // a\u0308 is ä, which is 1 graphmeme (character), but two codepoints
+  // \u0308 in utf8 encoding is \xcc\x88
+  this->CheckUnary("utf8_slice_codeunits", R"(["ää", "bä"])", this->type(),
+                   "[\"a\", \"\xcc\x88\"]", &options_2);
+  SliceOptions options_empty_pos{6, 6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["",
+  ""])",
+                   &options_empty_pos);
+  SliceOptions options_empty_neg{-6, -6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["",
+  ""])",
+                   &options_empty_neg);
+  SliceOptions options_empty_neg_to_zero{-6, 0};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["", ""])",
+                   &options_empty_neg_to_zero);
+
+  // end is beyond 0, but before start (hence empty)
+  SliceOptions options_edgecase_1{-3, 1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), 
R"([""])",
+                   &options_edgecase_1);
+
+  // this is a safeguard agains an optimization path possible, but actually a 
tricky case
+  SliceOptions options_edgecase_2{-6, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), 
R"(["𝑓öõ"])",
+                   &options_edgecase_2);
+}
+

Review comment:
       By the way, what happens if I call `utf8_slice_codeunits` without an 
options instance? Should you add a test for that?

##########
File path: docs/source/cpp/compute.rst
##########
@@ -553,6 +553,25 @@ when a positive ``max_splits`` is given.
   as separator.
 
 
+Slicing
+~~~~~~~
+
+These function transform each sequence of the array to a subsequence, 
following Python slicing semantics.
+
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| Function name            | Arity      | Input types             | Output 
type       | Options class                    | Notes   |
++==========================+============+=========================+===================+==================================+=========+
+| utf8_slice_codepoints    | Unary      | String-like             | 
String-like         | :struct:`SliceOptions`         | \(1)    |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+
+
+* \(1) Slice string into a substring defined by (`start`, `stop`, `step`) as

Review comment:
       Need double-backticks in reST:
   ```reST
   * \(1) Slice string into a substring defined by (``start``, ``stop``, 
``step``) as
   ```

##########
File path: docs/source/cpp/compute.rst
##########
@@ -553,6 +553,25 @@ when a positive ``max_splits`` is given.
   as separator.
 
 
+Slicing
+~~~~~~~
+
+These function transform each sequence of the array to a subsequence, 
following Python slicing semantics.
+
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| Function name            | Arity      | Input types             | Output 
type       | Options class                    | Notes   |
++==========================+============+=========================+===================+==================================+=========+
+| utf8_slice_codepoints    | Unary      | String-like             | 
String-like         | :struct:`SliceOptions`         | \(1)    |
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+
+
+* \(1) Slice string into a substring defined by (`start`, `stop`, `step`) as
+given by `SliceOptions` where `start` is inclusive and `stop` is exclusive and

Review comment:
       Must probably indent these lines to be part of the bullet item (you can 
click "view file" to get a reST rendering in the Github UI).

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -427,6 +427,189 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \
+  do {                                \
+    if (ARROW_PREDICT_FALSE(!expr)) { \
+      return false;                   \
+    }                                 \
+  } while (0)
+
+template <typename Type>
+struct SliceCodeunits : SliceBase<Type, SliceCodeunits<Type>> {
+  using Base = SliceBase<Type, SliceCodeunits<Type>>;
+  using offset_type = typename Base::offset_type;
+  using Base::Base;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* begin_sliced = begin;
+    const uint8_t* end_sliced = end;
+    SliceOptions& options = this->options;
+
+    if (options.step >= 1) {
+      if (options.start >= 0) {
+        // start counting from the left
+        PROPAGATE_FALSE(
+            arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, 
options.start));
+        if (options.stop > options.start) {
+          // continue counting from begin_sliced
+          int64_t length = options.stop - options.start;
+          PROPAGATE_FALSE(
+              arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, 
&end_sliced, length));
+        } else if (options.stop < 0) {
+          // or from the end (but we will never need to < begin_sliced)
+          PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+              begin_sliced, end, &end_sliced, -options.stop));
+        } else {
+          // zero length slice
+          *output_written = 0;
+          return true;
+        }
+      } else {
+        // start counting from the right
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+            begin, end, &begin_sliced, -options.start));
+        if (options.stop > 0) {
+          // continue counting from the left, we cannot start from 
begin_sliced because we
+          // don't know how many codepoints are between begin and begin_sliced
+          PROPAGATE_FALSE(
+              arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, 
options.stop));
+          // and therefore we also needs this
+          if (end_sliced <= begin_sliced) {
+            // zero length slice
+            *output_written = 0;
+            return true;
+          }
+        } else if ((options.stop < 0) && (options.stop > options.start)) {
+          // stop is negative, but larger than start, so we count again from 
the right
+          // in some cases we can optimize this, depending on the shortest 
path (from end
+          // or begin_sliced), but begin_sliced and options.start can be 'out 
of sync',
+          // for instance when start=-100, when the string length is only 10.
+          PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+              begin_sliced, end, &end_sliced, -options.stop));
+        } else {
+          // zero length slice
+          *output_written = 0;
+          return true;
+        }
+      }
+      DCHECK(begin_sliced <= end_sliced);
+      if (options.step == 1) {
+        // fast case, where we simply can finish with a memcpy
+        std::copy(begin_sliced, end_sliced, output);
+        *output_written = static_cast<offset_type>(end_sliced - begin_sliced);
+      } else {
+        uint8_t* dest = output;
+        const uint8_t* i = begin_sliced;
+
+        while (i < end_sliced) {
+          uint32_t codepoint = 0;
+          // write a single codepoint
+          PROPAGATE_FALSE(arrow::util::UTF8Decode(&i, &codepoint));
+          dest = arrow::util::UTF8Encode(dest, codepoint);
+          // and skip the remainder
+          int64_t skips = options.step - 1;
+          while ((skips--) && (i < end_sliced)) {
+            PROPAGATE_FALSE(arrow::util::UTF8Decode(&i, &codepoint));
+          }
+        }
+        *output_written = static_cast<offset_type>(dest - output);
+      }
+      return true;
+    } else {  // step < 0
+      // serious +1 -1 kung fu because now begin_slice and end_slice act like 
reverse
+      // iterators.
+
+      if (options.start >= 0) {
+        // +1 because begin_sliced acts as as the end of a reverse iterator
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepoints(begin, end, 
&begin_sliced,
+                                                           options.start + 1));
+        // and make it point at the last codeunit of the previous codeunit
+        begin_sliced--;
+      } else {
+        // -1 because start=-1 means the last codeunit, which is 0 advances
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+            begin, end, &begin_sliced, -options.start - 1));
+        // and make it point at the last codeunit of the previous codeunit
+        begin_sliced--;
+      }
+      // similar to options.start
+      if (options.stop >= 0) {
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepoints(begin, end, 
&end_sliced,
+                                                           options.stop + 1));
+        end_sliced--;
+      } else {
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(begin, end, 
&end_sliced,
+                                                                  
-options.stop - 1));
+        end_sliced--;
+      }
+
+      uint8_t* dest = output;
+      const uint8_t* i = begin_sliced;
+
+      while (i > end_sliced) {
+        uint32_t codepoint = 0;
+        // write a single codepoint
+        PROPAGATE_FALSE(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+        dest = arrow::util::UTF8Encode(dest, codepoint);
+        // and skip the remainder
+        int64_t skips = -options.step - 1;
+        while ((skips--) && (i > end_sliced)) {
+          PROPAGATE_FALSE(arrow::util::UTF8DecodeReverse(&i, &codepoint));
+        }
+      }
+      *output_written = static_cast<offset_type>(dest - output);
+      return true;
+    }
+  }
+};
+
+const FunctionDoc utf8_slice_codeunits_doc(
+    "Slice string ",
+    ("For each string in `strings`, slice into a substring defined by\n"
+     "`start`, `stop`, `step`) as given by `SliceOptions` where `start` is 
inclusive\n"
+     "and `stop` is exclusive and are measured in codeunits. If step is 
negative, the\n"
+     "string will be advanced in reversed order. A `step` of zero is 
considered an\n"
+     "error.\n"
+     "Null inputs emit null."),
+    {"strings"}, "SliceOptions");
+
+void AddSlice(FunctionRegistry* registry) {

Review comment:
       Is there a reason we don't also add a `ascii_slice` for binary and 
string?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -462,6 +462,107 @@ TYPED_TEST(TestStringKernels, TrimUTF8) {
 }
 #endif
 
+// produce test data with e.g.:
+// repr([k[-3:1] for k in ["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"]]).replace("'", '"')
+
+#ifdef ARROW_WITH_UTF8PROC
+TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
+                   this->type(), R"(["o", "", null, "o "])", &options);
+  SliceOptions options_2{2, 3};
+  // ensure we slice in codeunits, not graphmemes
+  // a\u0308 is ä, which is 1 graphmeme (character), but two codepoints
+  // \u0308 in utf8 encoding is \xcc\x88
+  this->CheckUnary("utf8_slice_codeunits", R"(["ää", "bä"])", this->type(),
+                   "[\"a\", \"\xcc\x88\"]", &options_2);
+  SliceOptions options_empty_pos{6, 6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["",
+  ""])",
+                   &options_empty_pos);
+  SliceOptions options_empty_neg{-6, -6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["",
+  ""])",
+                   &options_empty_neg);
+  SliceOptions options_empty_neg_to_zero{-6, 0};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["", ""])",
+                   &options_empty_neg_to_zero);
+
+  // end is beyond 0, but before start (hence empty)
+  SliceOptions options_edgecase_1{-3, 1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), 
R"([""])",
+                   &options_edgecase_1);
+
+  // this is a safeguard agains an optimization path possible, but actually a 
tricky case
+  SliceOptions options_edgecase_2{-6, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), 
R"(["𝑓öõ"])",
+                   &options_edgecase_2);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsPosPos) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "õ", "õḍ", "õḍ"])", &options);
+  SliceOptions options_step{1, 5, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "ö", "ö", "öḍ", "öḍ"])", 
&options_step);
+  SliceOptions options_step_neg{5, 1, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "õ", "ḍ", "šõ"])", 
&options_step_neg);
+  options_step_neg.stop = 0;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", 
"𝑓öõḍ","𝑓öõḍš"])",
+                   this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", 
&options_step_neg);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) {
+  SliceOptions options{2, -1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "", "õ", "õḍ"])", &options);
+  SliceOptions options_step{1, -1, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "f", "fö", "föo", 
"föod","foodš"])",
+                   this->type(), R"(["", "", "", "ö", "ö", "od"])", 
&options_step);
+  SliceOptions options_step_neg{3, -4, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", 
"𝑓öõḍ","𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "ḍ"])", 
&options_step_neg);
+  options_step_neg.stop = -5;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", 
"𝑓öõḍ","𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "ḍö"])",
+                   &options_step_neg);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) {
+  SliceOptions options{-2, -1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "𝑓", "ö", "õ", "ḍ"])", &options);
+  SliceOptions options_step{-4, -1, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "𝑓", "𝑓", "𝑓õ", "öḍ"])", 
&options_step);
+  SliceOptions options_step_neg{-1, -3, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ", "ḍ", "š"])", 
&options_step_neg);
+  options_step_neg.stop = -4;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "šõ"])",
+                   &options_step_neg);
+}
+
+TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {
+  SliceOptions options{-2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "𝑓ö", "öõ", "õḍ", "ḍ"])", 
&options);
+  SliceOptions options_step{-4, 4, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "𝑓", "𝑓õ", "𝑓õ", "öḍ"])", 
&options_step);
+  SliceOptions options_step_neg{-1, 1, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "õ", "ḍ", "šõ"])", 
&options_step_neg);
+  options_step_neg.stop = 0;
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", 
&options_step_neg);
+}
+

Review comment:
       Nice tests, thank you!

##########
File path: docs/source/cpp/compute.rst
##########
@@ -553,6 +553,25 @@ when a positive ``max_splits`` is given.
   as separator.
 
 
+Slicing
+~~~~~~~
+
+These function transform each sequence of the array to a subsequence, 
following Python slicing semantics.
+
++--------------------------+------------+-------------------------+-------------------+----------------------------------+---------+
+| Function name            | Arity      | Input types             | Output 
type       | Options class                    | Notes   |
++==========================+============+=========================+===================+==================================+=========+
+| utf8_slice_codepoints    | Unary      | String-like             | 
String-like         | :struct:`SliceOptions`         | \(1)    |

Review comment:
       There is a slight alignment bug in your reST markup here.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -427,6 +427,189 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \
+  do {                                \
+    if (ARROW_PREDICT_FALSE(!expr)) { \
+      return false;                   \
+    }                                 \
+  } while (0)
+
+template <typename Type>
+struct SliceCodeunits : SliceBase<Type, SliceCodeunits<Type>> {
+  using Base = SliceBase<Type, SliceCodeunits<Type>>;
+  using offset_type = typename Base::offset_type;
+  using Base::Base;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {

Review comment:
       Note that most of this function is really identical regardless of the 
offset type. So instead you could define a `Utf8SliceBase` base class:
   ```c++
   struct Utf8SliceBase {
     bool Transform(const uint8_t* input, int64_t input_string_ncodeunits,
                    uint8_t* output, int64_t* output_written) {
       // ...
     }
   };
   ```
   ... and inherit from that mixin class in concrete classes:
   ```c++
   template <typename Type>
   struct SliceCodeunits : SliceBase<Type, SliceCodeunits<Type>>, Utf8SliceBase 
{
     using Base = SliceBase<Type, SliceCodeunits<Type>>;
     using offset_type = typename Base::offset_type;
     using Base::Base;
   
     bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                    uint8_t* output, offset_type* output_written) {
       int64_t output_written_64;
       bool res = Utf8SliceBase::Transform(input, input_string_ncodeunits, 
output, &output_written_64);
       *output_written = static_cast<offset_type>(output_written_64);
       return res;
     }
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #9000: ARROW-10557: [C++] Add scalar string slicing/substring kernel

Reply via email to