[GitHub] [arrow] pitrou commented on a change in pull request #9000: ARROW-10557: [C++] Add scalar string slicing/substring kernel

GitBox Mon, 12 Apr 2021 08:26:16 -0700


pitrou commented on a change in pull request #9000:
URL: https://github.com/apache/arrow/pull/9000#discussion_r611727515




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -447,6 +447,197 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \

Review comment:
       Can you `#undef` this at the end?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -447,6 +447,197 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \
+  do {                                \
+    if (ARROW_PREDICT_FALSE(!expr)) { \
+      return false;                   \
+    }                                 \
+  } while (0)
+
+bool SliceCodeunitsTransform(const uint8_t* input, int64_t 
input_string_ncodeunits,
+                             uint8_t* output, int64_t* output_written,
+                             const SliceOptions& options) {
+  const uint8_t* begin = input;
+  const uint8_t* end = input + input_string_ncodeunits;
+  const uint8_t* begin_sliced = begin;
+  const uint8_t* end_sliced = end;
+
+  if (options.step >= 1) {
+    if (options.start >= 0) {
+      // start counting from the left
+      PROPAGATE_FALSE(
+          arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, 
options.start));
+      if (options.stop > options.start) {
+        // continue counting from begin_sliced
+        int64_t length = options.stop - options.start;
+        PROPAGATE_FALSE(
+            arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, 
length));
+      } else if (options.stop < 0) {
+        // or from the end (but we will never need to < begin_sliced)
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+            begin_sliced, end, &end_sliced, -options.stop));
+      } else {
+        // zero length slice
+        *output_written = 0;
+        return true;
+      }
+    } else {
+      // start counting from the right
+      PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(begin, end, 
&begin_sliced,
+                                                                
-options.start));
+      if (options.stop > 0) {
+        // continue counting from the left, we cannot start from begin_sliced 
because we
+        // don't know how many codepoints are between begin and begin_sliced

Review comment:
       Hmm... for `step < 0`, wouldn't it be more logical to first compute 
`end_sliced`? Presumably, you then can pass `end_sliced` as one of the 
boundaries for computing `begin_sliced`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -522,6 +522,113 @@ TYPED_TEST(TestStringKernels, TrimUTF8) {
 }
 #endif
 
+// produce test data with e.g.:
+// repr([k[-3:1] for k in ["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"]]).replace("'", '"')
+
+#ifdef ARROW_WITH_UTF8PROC
+TYPED_TEST(TestStringKernels, SliceCodeunitsBasic) {
+  SliceOptions options{2, 4};
+  this->CheckUnary("utf8_slice_codeunits", R"(["foo", "fo", null, "foo bar"])",
+                   this->type(), R"(["o", "", null, "o "])", &options);
+  SliceOptions options_2{2, 3};
+  // ensure we slice in codeunits, not graphemes
+  // a\u0308 is ä, which is 1 grapheme (character), but two codepoints
+  // \u0308 in utf8 encoding is \xcc\x88
+  this->CheckUnary("utf8_slice_codeunits", R"(["ää", "bä"])", this->type(),
+                   "[\"a\", \"\xcc\x88\"]", &options_2);
+  SliceOptions options_empty_pos{6, 6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["",
+  ""])",
+                   &options_empty_pos);
+  SliceOptions options_empty_neg{-6, -6};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["",
+  ""])",
+                   &options_empty_neg);
+  SliceOptions options_empty_neg_to_zero{-6, 0};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓öõ"])", this->type(), 
R"(["", ""])",
+                   &options_empty_neg_to_zero);
+
+  // end is beyond 0, but before start (hence empty)
+  SliceOptions options_edgecase_1{-3, 1};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), 
R"([""])",
+                   &options_edgecase_1);
+
+  // this is a safeguard agains an optimization path possible, but actually a 
tricky case
+  SliceOptions options_edgecase_2{-6, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["𝑓öõḍš"])", this->type(), 
R"(["𝑓öõ"])",
+                   &options_edgecase_2);
+
+  auto input = ArrayFromJSON(this->type(), R"(["𝑓öõḍš"])");
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      testing::HasSubstr("Attempted to initialize KernelState from null 
FunctionOptions"),
+      CallFunction("utf8_slice_codeunits", {input}));
+}

Review comment:
       Also add a test with `stop = 0`?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -447,6 +447,197 @@ void AddMatchSubstring(FunctionRegistry* registry) {
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
+// Slicing
+
+template <typename Type, typename Derived>
+struct SliceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<SliceOptions>;
+  SliceOptions options;
+
+  explicit SliceBase(SliceOptions options) : options(options) {}
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    SliceOptions options = State::Get(ctx);
+    if (options.step == 0) {
+      ctx->SetStatus(Status::Invalid("Slice step cannot be zero"));
+      return;
+    }
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+#define PROPAGATE_FALSE(expr)         \
+  do {                                \
+    if (ARROW_PREDICT_FALSE(!expr)) { \
+      return false;                   \
+    }                                 \
+  } while (0)
+
+bool SliceCodeunitsTransform(const uint8_t* input, int64_t 
input_string_ncodeunits,
+                             uint8_t* output, int64_t* output_written,
+                             const SliceOptions& options) {
+  const uint8_t* begin = input;
+  const uint8_t* end = input + input_string_ncodeunits;
+  const uint8_t* begin_sliced = begin;
+  const uint8_t* end_sliced = end;
+
+  if (options.step >= 1) {
+    if (options.start >= 0) {
+      // start counting from the left
+      PROPAGATE_FALSE(
+          arrow::util::UTF8AdvanceCodepoints(begin, end, &begin_sliced, 
options.start));
+      if (options.stop > options.start) {
+        // continue counting from begin_sliced
+        int64_t length = options.stop - options.start;
+        PROPAGATE_FALSE(
+            arrow::util::UTF8AdvanceCodepoints(begin_sliced, end, &end_sliced, 
length));
+      } else if (options.stop < 0) {
+        // or from the end (but we will never need to < begin_sliced)
+        PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(
+            begin_sliced, end, &end_sliced, -options.stop));
+      } else {
+        // zero length slice
+        *output_written = 0;
+        return true;
+      }
+    } else {
+      // start counting from the right
+      PROPAGATE_FALSE(arrow::util::UTF8AdvanceCodepointsReverse(begin, end, 
&begin_sliced,
+                                                                
-options.start));
+      if (options.stop > 0) {
+        // continue counting from the left, we cannot start from begin_sliced 
because we
+        // don't know how many codepoints are between begin and begin_sliced

Review comment:
       Just a question btw, you don't need to act on this if you think it's 
unnecessary.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] pitrou commented on a change in pull request #9000: ARROW-10557: [C++] Add scalar string slicing/substring kernel

Reply via email to