maartenbreddels commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r521422359



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1231,6 +1251,305 @@ Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDe
   return Status::Invalid("strptime does not provide default StrptimeOptions");
 }
 
+#ifdef ARROW_WITH_UTF8PROC
+
+/// Shared implementation of the UTF-8 whitespace trimming kernels.
+///
+/// `left` / `right` choose at compile time which ends of each string are
+/// stripped of codepoints for which IsSpaceCharacterUnicode() returns true.
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimWhitespaceBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+
+  /// Copy `input` to `output` without the whitespace on the selected ends and
+  /// store the number of bytes written in `*output_written`.
+  ///
+  /// \return false if UTF8FindIf / UTF8FindIfReverse returns false
+  ///         (presumably on invalid UTF-8 -- confirm against their contract)
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* const first = input;
+    const uint8_t* const last = input + input_string_ncodeunits;
+    const uint8_t* keep_begin = first;
+    const uint8_t* keep_end = last;
+
+    auto not_space = [](uint32_t cp) { return !IsSpaceCharacterUnicode(cp); };
+
+    if (left) {
+      // Advance keep_begin to the first non-whitespace codepoint.
+      const bool ok = arrow::util::UTF8FindIf(first, last, not_space, &keep_begin);
+      if (!ARROW_PREDICT_TRUE(ok)) {
+        return false;
+      }
+    }
+    // Only scan backwards when something is left after the forward scan.
+    if (right && (keep_begin < last)) {
+      const bool ok =
+          arrow::util::UTF8FindIfReverse(keep_begin, last, not_space, &keep_end);
+      if (!ARROW_PREDICT_TRUE(ok)) {
+        return false;
+      }
+    }
+
+    std::copy(keep_begin, keep_end, output);
+    *output_written = static_cast<offset_type>(keep_end - keep_begin);
+    return true;
+  }
+
+  /// Run the base kernel after calling EnsureLookupTablesFilled().
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    // NOTE(review): presumably prepares the tables IsSpaceCharacterUnicode
+    // relies on -- confirm.
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+};
+
+/// Strips Unicode whitespace from both ends of each string.
+template <typename Type>
+struct UTF8TrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, /*left=*/true, /*right=*/true,
+                             UTF8TrimWhitespace<Type>> {};
+
+/// Strips Unicode whitespace from the start of each string only.
+template <typename Type>
+struct UTF8LTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, /*left=*/true, /*right=*/false,
+                             UTF8LTrimWhitespace<Type>> {};
+
+/// Strips Unicode whitespace from the end of each string only.
+template <typename Type>
+struct UTF8RTrimWhitespace
+    : UTF8TrimWhitespaceBase<Type, /*left=*/false, /*right=*/true,
+                             UTF8RTrimWhitespace<Type>> {};
+
+/// Shared implementation of the UTF-8 trim kernels that take TrimOptions.
+///
+/// The constructor decodes `options.characters` into a codepoint-indexed
+/// lookup table; Transform() strips codepoints present in that table from the
+/// ends of each string selected by `left` / `right`.
+template <typename Type, bool left, bool right, typename Derived>
+struct UTF8TrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  // codepoints[c] is true when codepoint c must be trimmed. The table is only
+  // as long as the largest trim codepoint + 1 (empty when there are none).
+  std::vector<bool> codepoints;
+
+  explicit UTF8TrimBase(TrimOptions options) : options(options) {
+    // TODO: check return / can we raise an exception here?
+    arrow::util::UTF8ForEach(options.characters, [&](uint32_t c) {
+      // Grow the table on demand so that index c is valid.
+      if (codepoints.size() <= c) {
+        codepoints.resize(static_cast<size_t>(c) + 1);
+      }
+      codepoints[c] = true;
+    });
+  }
+
+  /// Kernel entry point: build a Derived from the TrimOptions stored in the
+  /// kernel state and run it over the batch.
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    TrimOptions options = State::Get(ctx);
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  /// Run the base kernel after calling EnsureLookupTablesFilled().
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    EnsureLookupTablesFilled();
+    Base::Execute(ctx, batch, out);
+  }
+
+  /// Copy `input` to `output` without the trim codepoints on the selected ends
+  /// and store the number of bytes written in `*output_written`.
+  ///
+  /// \return false if UTF8FindIf / UTF8FindIfReverse returns false
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+    const uint8_t* begin_trimmed = begin;
+
+    // True for codepoints that must be kept. Input codepoints beyond the end
+    // of the table cannot be trim characters; checking the size first avoids
+    // reading the vector out of bounds.
+    auto predicate = [&](uint32_t c) {
+      const bool contains = c < codepoints.size() && codepoints[c];
+      return !contains;
+    };
+    if (left && !ARROW_PREDICT_TRUE(
+                    arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
+      return false;
+    }
+    // Only scan backwards when something is left after the forward scan.
+    if (right && (begin_trimmed < end)) {
+      if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
+                                                             predicate, &end_trimmed))) {
+        return false;
+      }
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+/// Strips the codepoints listed in TrimOptions::characters from both ends of
+/// each string.
+template <typename Type>
+struct UTF8Trim
+    : UTF8TrimBase<Type, /*left=*/true, /*right=*/true, UTF8Trim<Type>> {
+  using Base = UTF8TrimBase<Type, /*left=*/true, /*right=*/true, UTF8Trim<Type>>;
+  // Inherit the constructor taking TrimOptions.
+  using Base::Base;
+};
+
+/// Strips the codepoints listed in TrimOptions::characters from the start of
+/// each string only.
+template <typename Type>
+struct UTF8LTrim
+    : UTF8TrimBase<Type, /*left=*/true, /*right=*/false, UTF8LTrim<Type>> {
+  using Base = UTF8TrimBase<Type, /*left=*/true, /*right=*/false, UTF8LTrim<Type>>;
+  // Inherit the constructor taking TrimOptions.
+  using Base::Base;
+};
+
+/// Strips the codepoints listed in TrimOptions::characters from the end of
+/// each string only.
+template <typename Type>
+struct UTF8RTrim
+    : UTF8TrimBase<Type, /*left=*/false, /*right=*/true, UTF8RTrim<Type>> {
+  using Base = UTF8TrimBase<Type, /*left=*/false, /*right=*/true, UTF8RTrim<Type>>;
+  // Inherit the constructor taking TrimOptions.
+  using Base::Base;
+};
+
+#endif
+
+/// Shared implementation of the ASCII whitespace trimming kernels.
+///
+/// `left` / `right` choose at compile time which ends of each string are
+/// stripped of bytes for which IsSpaceCharacterAscii() returns true.
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimWhitespaceBase : StringTransform<Type, Derived> {
+  using offset_type = typename Type::offset_type;
+
+  /// Copy `input` to `output` without the whitespace on the selected ends and
+  /// store the number of bytes written in `*output_written`.
+  ///
+  /// \return always true
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+
+    // True for bytes that must be kept.
+    auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+    const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) : begin;
+    // Logical && (not bitwise &), matching the UTF-8 kernels. Only scan
+    // backwards when something is left after the forward scan.
+    if (right && (begin_trimmed < end)) {
+      std::reverse_iterator<const uint8_t*> rbegin(end);
+      std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+      // .base() converts the reverse iterator into the one-past-the-end
+      // forward pointer of the last kept byte.
+      end_trimmed = std::find_if(rbegin, rend, predicate).base();
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+
+/// Strips ASCII whitespace from both ends of each string.
+template <typename Type>
+struct AsciiTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, /*left=*/true, /*right=*/true,
+                              AsciiTrimWhitespace<Type>> {};
+
+/// Strips ASCII whitespace from the start of each string only.
+template <typename Type>
+struct AsciiLTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, /*left=*/true, /*right=*/false,
+                              AsciiLTrimWhitespace<Type>> {};
+
+/// Strips ASCII whitespace from the end of each string only.
+template <typename Type>
+struct AsciiRTrimWhitespace
+    : AsciiTrimWhitespaceBase<Type, /*left=*/false, /*right=*/true,
+                              AsciiRTrimWhitespace<Type>> {};
+
+/// Shared implementation of the ASCII trim kernels that take TrimOptions.
+///
+/// Every byte of `options.characters` is marked in a 256-entry lookup table;
+/// Transform() strips bytes present in that table from the ends of each
+/// string selected by `left` / `right`.
+template <typename Type, bool left, bool right, typename Derived>
+struct AsciiTrimBase : StringTransform<Type, Derived> {
+  using Base = StringTransform<Type, Derived>;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper<TrimOptions>;
+  TrimOptions options;
+  // Indexed by byte value; true for bytes that must be trimmed.
+  std::vector<bool> characters;
+
+  explicit AsciiTrimBase(TrimOptions options) : options(options), characters(256) {
+    // Inside the body `options` names the constructor parameter, which holds
+    // the same value as the member that was copied from it.
+    for (const char c : options.characters) {
+      // Index through unsigned char so that bytes >= 0x80 stay in [0, 255].
+      characters[static_cast<unsigned char>(c)] = true;
+    }
+  }
+
+  /// Kernel entry point: build a Derived from the TrimOptions stored in the
+  /// kernel state and run it over the batch.
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    TrimOptions options = State::Get(ctx);
+    Derived(options).Execute(ctx, batch, out);
+  }
+
+  /// Copy `input` to `output` without the trim bytes on the selected ends and
+  /// store the number of bytes written in `*output_written`.
+  ///
+  /// \return always true
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    const uint8_t* begin = input;
+    const uint8_t* end = input + input_string_ncodeunits;
+    const uint8_t* end_trimmed = end;
+
+    // True for bytes that must be kept.
+    auto predicate = [&](unsigned char c) {
+      const bool contains = characters[c];
+      return !contains;
+    };
+
+    const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) : begin;
+    // Logical && (not bitwise &), matching the UTF-8 kernels. Only scan
+    // backwards when something is left after the forward scan.
+    if (right && (begin_trimmed < end)) {
+      std::reverse_iterator<const uint8_t*> rbegin(end);
+      std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
+      // .base() converts the reverse iterator into the one-past-the-end
+      // forward pointer of the last kept byte.
+      end_trimmed = std::find_if(rbegin, rend, predicate).base();
+    }
+    std::copy(begin_trimmed, end_trimmed, output);
+    *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
+    return true;
+  }
+};
+
+/// Strips the bytes listed in TrimOptions::characters from both ends of each
+/// string.
+template <typename Type>
+struct AsciiTrim
+    : AsciiTrimBase<Type, /*left=*/true, /*right=*/true, AsciiTrim<Type>> {
+  using Base = AsciiTrimBase<Type, /*left=*/true, /*right=*/true, AsciiTrim<Type>>;
+  // Inherit the constructor taking TrimOptions.
+  using Base::Base;
+};
+
+/// Strips the bytes listed in TrimOptions::characters from the start of each
+/// string only.
+template <typename Type>
+struct AsciiLTrim
+    : AsciiTrimBase<Type, /*left=*/true, /*right=*/false, AsciiLTrim<Type>> {
+  using Base = AsciiTrimBase<Type, /*left=*/true, /*right=*/false, AsciiLTrim<Type>>;
+  // Inherit the constructor taking TrimOptions.
+  using Base::Base;
+};
+
+/// Strips the bytes listed in TrimOptions::characters from the end of each
+/// string only.
+template <typename Type>
+struct AsciiRTrim
+    : AsciiTrimBase<Type, /*left=*/false, /*right=*/true, AsciiRTrim<Type>> {
+  using Base = AsciiTrimBase<Type, /*left=*/false, /*right=*/true, AsciiRTrim<Type>>;
+  // Inherit the constructor taking TrimOptions.
+  using Base::Base;
+};
+
+const FunctionDoc utf8_trim_whitespace_doc(
+    "Trim leading and trailing whitespace characters",
+    ("For each string in `strings`, emit a string with leading and trailing 
whitespace "

Review comment:
       Thanks, I didn't notice, fixed.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to