[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

GitBox Thu, 26 Nov 2020 04:46:46 -0800


pitrou commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r530997361




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1231,6 +1251,302 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, 
const std::vector<ValueDe
   return Status::Invalid("strptime does not provide default StrptimeOptions");
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimWhitespaceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+template <typename Type>
+struct UTF8TrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, true, UTF8TrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8LTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, false, UTF8LTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8RTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, false, true, UTF8RTrimWhitespace<Type>> {};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> codepoints;
+
+  explicit UTF8TrimBase(TrimOptions options) : options(options) {
+    // TODO: check return / can we raise an exception here?
+    arrow::util::UTF8ForEach(options.characters, [&](uint32_t c) {
+      codepoints.resize(std::max(c + 1, 
static_cast<uint32_t>(codepoints.size())));
+      codepoints.at(c) = true;
+    });
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    TrimOptions options = State::Get(ctx);
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [&](uint32_t c) {
+      bool contains = codepoints[c];
+      return !contains;
+    };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+template <typename Type>
+struct UTF8Trim : UTF8TrimBase<Type, true, true, UTF8Trim<Type>> {
+  using Base = UTF8TrimBase<Type, true, true, UTF8Trim<Type>>;
+  using Base::Base;
+};
+
+template <typename Type>
+struct UTF8LTrim : UTF8TrimBase<Type, true, false, UTF8LTrim<Type>> {
+  using Base = UTF8TrimBase<Type, true, false, UTF8LTrim<Type>>;
+  using Base::Base;
+};
+
+template <typename Type>
+struct UTF8RTrim : UTF8TrimBase<Type, false, true, UTF8RTrim<Type>> {
+  using Base = UTF8TrimBase<Type, false, true, UTF8RTrim<Type>>;
+  using Base::Base;
+};
+
+#endif
+
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimWhitespaceBase : StringTransform<Type, Derived> {
+  using offset_type = typename Type::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+
+    auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+    const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) 
: begin;
+    if (right & (begin_trimmed < end)) {
+      std::reverse_iterator<const uint8_t*> rbegin(end);
+      std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+      end_trimmed = std::find_if(rbegin, rend, predicate).base();
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+
+template <typename Type>
+struct AsciiTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, true, true, AsciiTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct AsciiLTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, true, false, AsciiLTrimWhitespace<Type>> 
{};
+
+template <typename Type>
+struct AsciiRTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, false, true, AsciiRTrimWhitespace<Type>> 
{};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> characters;
+
+  explicit AsciiTrimBase(TrimOptions options) : options(options), 
characters(256) {
+    std::for_each(options.characters.begin(), options.characters.end(),
+                  [&](char c) { characters[static_cast<unsigned char>(c)] = 
true; });
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    TrimOptions options = State::Get(ctx);
+    Derived(options).Execute(ctx, batch, out);

Review comment:
       You can pass `std::move(options)` here to avoid an extra copy.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1231,6 +1251,302 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, 
const std::vector<ValueDe
   return Status::Invalid("strptime does not provide default StrptimeOptions");
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimWhitespaceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+template <typename Type>
+struct UTF8TrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, true, UTF8TrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8LTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, false, UTF8LTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8RTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, false, true, UTF8RTrimWhitespace<Type>> {};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> codepoints;
+
+  explicit UTF8TrimBase(TrimOptions options) : options(options) {
+    // TODO: check return / can we raise an exception here?
+    arrow::util::UTF8ForEach(options.characters, [&](uint32_t c) {
+      codepoints.resize(std::max(c + 1, 
static_cast<uint32_t>(codepoints.size())));
+      codepoints.at(c) = true;
+    });
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    TrimOptions options = State::Get(ctx);
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [&](uint32_t c) {
+      bool contains = codepoints[c];
+      return !contains;
+    };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+template <typename Type>
+struct UTF8Trim : UTF8TrimBase<Type, true, true, UTF8Trim<Type>> {
+  using Base = UTF8TrimBase<Type, true, true, UTF8Trim<Type>>;
+  using Base::Base;
+};
+
+template <typename Type>
+struct UTF8LTrim : UTF8TrimBase<Type, true, false, UTF8LTrim<Type>> {
+  using Base = UTF8TrimBase<Type, true, false, UTF8LTrim<Type>>;
+  using Base::Base;
+};
+
+template <typename Type>
+struct UTF8RTrim : UTF8TrimBase<Type, false, true, UTF8RTrim<Type>> {
+  using Base = UTF8TrimBase<Type, false, true, UTF8RTrim<Type>>;
+  using Base::Base;
+};
+
+#endif
+
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimWhitespaceBase : StringTransform<Type, Derived> {
+  using offset_type = typename Type::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+
+    auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+    const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) 
: begin;
+    if (right & (begin_trimmed < end)) {
+      std::reverse_iterator<const uint8_t*> rbegin(end);
+      std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+      end_trimmed = std::find_if(rbegin, rend, predicate).base();
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+
+template <typename Type>
+struct AsciiTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, true, true, AsciiTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct AsciiLTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, true, false, AsciiLTrimWhitespace<Type>> 
{};
+
+template <typename Type>
+struct AsciiRTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, false, true, AsciiRTrimWhitespace<Type>> 
{};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> characters;
+
+  explicit AsciiTrimBase(TrimOptions options) : options(options), 
characters(256) {

Review comment:
       You can initialize the member with `std::move(options)` here to avoid an extra copy.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -428,6 +428,59 @@ TYPED_TEST(TestStringKernels, 
StrptimeDoesNotProvideDefaultOptions) {
   ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+TYPED_TEST(TestStringKernels, TrimWhitespaceUTF8) {
+  // \xe2\x80\x88 is punctuation space
+  this->CheckUnary("utf8_trim_whitespace",
+                   "[\" foo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",

Review comment:
       Just for the record, you can also use the "raw strings" syntax that C++ 
offers:
   ```c++
   R"([" foo", null, "bar  ", " \xe2\x80\x88 foo bar "])"
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1231,6 +1251,302 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, 
const std::vector<ValueDe
   return Status::Invalid("strptime does not provide default StrptimeOptions");
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimWhitespaceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+template <typename Type>
+struct UTF8TrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, true, UTF8TrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8LTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, false, UTF8LTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8RTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, false, true, UTF8RTrimWhitespace<Type>> {};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> codepoints;
+
+  explicit UTF8TrimBase(TrimOptions options) : options(options) {
+    // TODO: check return / can we raise an exception here?
+    arrow::util::UTF8ForEach(options.characters, [&](uint32_t c) {
+      codepoints.resize(std::max(c + 1, 
static_cast<uint32_t>(codepoints.size())));
+      codepoints.at(c) = true;
+    });
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    TrimOptions options = State::Get(ctx);
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [&](uint32_t c) {
+      bool contains = codepoints[c];
+      return !contains;
+    };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+template <typename Type>
+struct UTF8Trim : UTF8TrimBase<Type, true, true, UTF8Trim<Type>> {
+  using Base = UTF8TrimBase<Type, true, true, UTF8Trim<Type>>;
+  using Base::Base;
+};
+
+template <typename Type>
+struct UTF8LTrim : UTF8TrimBase<Type, true, false, UTF8LTrim<Type>> {
+  using Base = UTF8TrimBase<Type, true, false, UTF8LTrim<Type>>;
+  using Base::Base;
+};
+
+template <typename Type>
+struct UTF8RTrim : UTF8TrimBase<Type, false, true, UTF8RTrim<Type>> {
+  using Base = UTF8TrimBase<Type, false, true, UTF8RTrim<Type>>;
+  using Base::Base;
+};
+
+#endif
+
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimWhitespaceBase : StringTransform<Type, Derived> {
+  using offset_type = typename Type::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+
+    auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+    const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) 
: begin;
+    if (right & (begin_trimmed < end)) {
+      std::reverse_iterator<const uint8_t*> rbegin(end);
+      std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+      end_trimmed = std::find_if(rbegin, rend, predicate).base();
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+
+template <typename Type>
+struct AsciiTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, true, true, AsciiTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct AsciiLTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, true, false, AsciiLTrimWhitespace<Type>> 
{};
+
+template <typename Type>
+struct AsciiRTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, false, true, AsciiRTrimWhitespace<Type>> 
{};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> characters;

Review comment:
       Nit, but the convention is to append a trailing underscore to instance 
variable names (`options_`, `characters_`), so as to distinguish them from 
local variables.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1231,6 +1251,302 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, 
const std::vector<ValueDe
   return Status::Invalid("strptime does not provide default StrptimeOptions");
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimWhitespaceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, 
&begin_trimmed))) {
+      return false;
+    }
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+                                                             predicate, 
&end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+template <typename Type>
+struct UTF8TrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, true, UTF8TrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8LTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, true, false, UTF8LTrimWhitespace<Type>> {};
+
+template <typename Type>
+struct UTF8RTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, false, true, UTF8RTrimWhitespace<Type>> {};
+
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  std::vector<bool> codepoints;
+
+  explicit UTF8TrimBase(TrimOptions options) : options(options) {
+    // TODO: check return / can we raise an exception here?

Review comment:
       You could set the status on the KernelContext in the caller.

##########
File path: cpp/src/arrow/util/utf8_util_test.cc
##########
@@ -412,5 +412,32 @@ TEST(UTF8DecodeReverse, Basics) {
   CheckInvalid("\xF0\x80\x80");
 }
 
+TEST(UTF8FindIf, Basics) {
+  auto CheckOk = [](const std::string& s, unsigned char test, int64_t 
offset_left,
+                    int64_t offset_right) -> void {
+    const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.c_str());
+    const uint8_t* end = begin + s.length();
+    std::reverse_iterator<const uint8_t*> rbegin(end);
+    std::reverse_iterator<const uint8_t*> rend(begin);
+    const uint8_t* left;
+    const uint8_t* right;
+    auto predicate = [&](uint32_t c) { return c == test; };
+    EXPECT_TRUE(UTF8FindIf(begin, end, predicate, &left));
+    EXPECT_TRUE(UTF8FindIfReverse(begin, end, predicate, &right));
+    EXPECT_EQ(offset_left, left - begin);
+    EXPECT_EQ(offset_right, right - begin);
+    EXPECT_EQ(std::find_if(begin, end, predicate) - begin, left - begin);
+    EXPECT_EQ(std::find_if(rbegin, rend, predicate).base() - begin, right - 
begin);
+  };
+
+  CheckOk("aaaba", 'a', 0, 5);
+  CheckOk("aaaba", 'b', 3, 4);
+  CheckOk("aaababa", 'b', 3, 6);
+  CheckOk("aaababa", 'c', 7, 0);
+  CheckOk("a", 'a', 0, 1);
+  CheckOk("a", 'b', 1, 0);
+  CheckOk("", 'b', 0, 0);

Review comment:
       Hmm... this conveniently only checks ASCII characters. The 
implementation looks ok, but it would still be worth testing non-ASCII (multi-byte) characters.

##########
File path: cpp/src/arrow/util/utf8.h
##########
@@ -456,6 +456,67 @@ static inline bool UTF8Transform(const uint8_t* first, 
const uint8_t* last,
   return true;
 }
 
+template <class Predicate>
+static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
+                              Predicate&& predicate, const uint8_t** position) 
{
+  const uint8_t* i = first;
+  while (i < last) {
+    uint32_t codepoint = 0;
+    const uint8_t* current = i;
+    if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+      return false;
+    }
+    if (predicate(codepoint)) {
+      *position = current;
+      return true;
+    }
+  }
+  *position = last;
+  return true;
+}
+
+// Same semantics as std::find_if using reverse iterators, with the returned
+// position having the same semantics as std::reverse_iterator<..>.base()
+template <class Predicate>
+static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
+                                     Predicate&& predicate, const uint8_t** 
position) {
+  const uint8_t* i = last - 1;
+  while (i >= first) {
+    uint32_t codepoint = 0;
+    const uint8_t* current = i;
+    if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(&i, &codepoint))) {
+      return false;
+    }
+    if (predicate(codepoint)) {
+      *position = current + 1;
+      return true;
+    }
+  }
+  *position = first;
+  return true;
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(const uint8_t* first, const uint8_t* last,
+                               UnaryFunction&& f) {
+  const uint8_t* i = first;
+  while (i < last) {
+    uint32_t codepoint = 0;
+    if (ARROW_PREDICT_FALSE(!UTF8Decode(&i, &codepoint))) {
+      return false;
+    }
+    f(codepoint);
+  }
+  return true;
+}
+
+template <class UnaryFunction>
+static inline bool UTF8ForEach(std::string s, UnaryFunction&& f) {

Review comment:
       `const std::string& s`, since it's neither stored nor mutated?

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -428,6 +428,59 @@ TYPED_TEST(TestStringKernels, 
StrptimeDoesNotProvideDefaultOptions) {
   ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+TYPED_TEST(TestStringKernels, TrimWhitespaceUTF8) {
+  // \xe2\x80\x88 is punctuation space
+  this->CheckUnary("utf8_trim_whitespace",
+                   "[\" foo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
+                   this->type(), "[\"foo\", null, \"bar\", \"foo bar\"]");
+  this->CheckUnary("utf8_rtrim_whitespace",
+                   "[\" foo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
+                   this->type(), "[\" foo\", null, \"bar\", \" \xe2\x80\x88 
foo bar\"]");
+  this->CheckUnary("utf8_ltrim_whitespace",
+                   "[\" foo\", null, \"bar  \", \" \xe2\x80\x88 foo bar \"]",
+                   this->type(), "[\"foo\", null, \"bar  \", \"foo bar \"]");
+}
+
+TYPED_TEST(TestStringKernels, TrimUTF8) {
+  TrimOptions options{"ȺA"};
+  this->CheckUnary("utf8_trim", "[\"ȺȺfooȺAȺ\", null, \"barȺAȺ\", 
\"ȺAȺfooȺAȺbarA\"]",
+                   this->type(), "[\"foo\", null, \"bar\", \"fooȺAȺbar\"]", 
&options);
+  this->CheckUnary("utf8_ltrim", "[\"ȺȺfooȺAȺ\", null, \"barȺAȺ\", 
\"ȺAȺfooȺAȺbarA\"]",
+                   this->type(), "[\"fooȺAȺ\", null, \"barȺAȺ\", 
\"fooȺAȺbarA\"]",
+                   &options);
+  this->CheckUnary("utf8_rtrim", "[\"ȺȺfooȺAȺ\", null, \"barȺAȺ\", 
\"ȺAȺfooȺAȺbarA\"]",
+                   this->type(), "[\"ȺȺfoo\", null, \"bar\", 
\"ȺAȺfooȺAȺbar\"]",
+                   &options);
+}
+#endif
+
+TYPED_TEST(TestStringKernels, TrimWhitespaceAscii) {
+  // \xe2\x80\x88 is punctuation space

Review comment:
       Also check regular ASCII whitespace such as `\t` or `\r`?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

Reply via email to