[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

2021-01-19 Thread GitBox


pitrou commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r560227216



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -186,16 +172,51 @@ struct UTF8Transform {
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
+
+// transforms per codepoint
+template 
+struct StringTransformCodepoint : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+ uint8_t* output, offset_type* output_written) {
+uint8_t* output_start = output;
+if (ARROW_PREDICT_FALSE(
+!arrow::util::UTF8Transform(input, input + 
input_string_ncodeunits, ,
+Derived::TransformCodepoint))) {
+  return false;
+}
+*output_written = static_cast(output - output_start);
+return true;
+  }
+  static int64_t MaxCodepoints(offset_type input_ncodeunits) {
+// Section 5.18 of the Unicode spec claim that the number of codepoints 
for case
+// mapping can grow by a factor of 3. This means grow by a factor of 3 in 
bytes
+// However, since we don't support all casings (SpecialCasing.txt) the 
growth
+// is actually only at max 3/2 (as covered by the unittest).
+// Note that rounding down the 3/2 is ok, since only codepoints encoded by
+// two code units (even) can grow to 3 code units.
+
+return static_cast(input_ncodeunits) * 3 / 2;

Review comment:
   Now that I read this again, it strikes me that this function is 
estimating a number of codepoints, but we're using it to allocate a number of 
bytes (i.e. utf-8 codeunits). Is that ok?
   
   (and/or the function naming and comment are inconsistent)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

2021-01-04 Thread GitBox


pitrou commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r551437108



##
File path: cpp/src/arrow/util/utf8.h
##
@@ -456,6 +456,67 @@ static inline bool UTF8Transform(const uint8_t* first, 
const uint8_t* last,
   return true;
 }
 
+template 
+static inline bool UTF8FindIf(const uint8_t* first, const uint8_t* last,
+  Predicate&& predicate, const uint8_t** position) 
{
+  const uint8_t* i = first;
+  while (i < last) {
+uint32_t codepoint = 0;
+const uint8_t* current = i;
+if (ARROW_PREDICT_FALSE(!UTF8Decode(, ))) {
+  return false;
+}
+if (predicate(codepoint)) {
+  *position = current;
+  return true;
+}
+  }
+  *position = last;
+  return true;
+}
+
+// same semantics as std::find_if using reverse iterators when the return value
+// having the same semantics as std::reverse_iterator<..>.base()
+template 
+static inline bool UTF8FindIfReverse(const uint8_t* first, const uint8_t* last,
+ Predicate&& predicate, const uint8_t** 
position) {
+  const uint8_t* i = last - 1;
+  while (i >= first) {
+uint32_t codepoint = 0;
+const uint8_t* current = i;
+if (ARROW_PREDICT_FALSE(!UTF8DecodeReverse(, ))) {
+  return false;
+}
+if (predicate(codepoint)) {
+  *position = current + 1;

Review comment:
   This is a bit weird. Does it return the position of the next codepoint? The 
docstring should be a bit clearer about that (the current wording is cryptic 
to me).

##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -186,6 +172,40 @@ struct UTF8Transform {
   }
 };
 
+#ifdef ARROW_WITH_UTF8PROC
+
+template 
+struct UTF8Transform : StringTransform {

Review comment:
   I don't exactly understand this refactor. There's a `UTF8Transform` with 
a `Transform` method for utf8 kernels but no corresponding class with a 
`Transform` method for ascii kernels, is that right?





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

2020-11-26 Thread GitBox


pitrou commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r530997361



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -1231,6 +1251,302 @@ Result StrptimeResolve(KernelContext* ctx, 
const std::vector
+struct UTF8TrimWhitespaceBase : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+ uint8_t* output, offset_type* output_written) {
+const uint8_t* begin = input;
+const uint8_t* end = input + input_string_ncodeunits;
+const uint8_t* end_trimmed = end;
+const uint8_t* begin_trimmed = begin;
+
+auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+if (left && !ARROW_PREDICT_TRUE(
+arrow::util::UTF8FindIf(begin, end, predicate, 
_trimmed))) {
+  return false;
+}
+if (right && (begin_trimmed < end)) {
+  if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+ predicate, 
_trimmed))) {
+return false;
+  }
+}
+std::copy(begin_trimmed, end_trimmed, output);
+*output_written = static_cast(end_trimmed - begin_trimmed);
+return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+EnsureLookupTablesFilled();
+Base::Execute(ctx, batch, out);
+  }
+};
+
+template 
+struct UTF8TrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8LTrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8RTrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8TrimBase : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper;
+  TrimOptions options;
+  std::vector codepoints;
+
+  explicit UTF8TrimBase(TrimOptions options) : options(options) {
+// TODO: check return / can we raise an exception here?
+arrow::util::UTF8ForEach(options.characters, [&](uint32_t c) {
+  codepoints.resize(std::max(c + 1, 
static_cast(codepoints.size(;
+  codepoints.at(c) = true;
+});
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+TrimOptions options = State::Get(ctx);
+Derived(options).Execute(ctx, batch, out);
+  }
+
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+EnsureLookupTablesFilled();
+Base::Execute(ctx, batch, out);
+  }
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+ uint8_t* output, offset_type* output_written) {
+const uint8_t* begin = input;
+const uint8_t* end = input + input_string_ncodeunits;
+const uint8_t* end_trimmed = end;
+const uint8_t* begin_trimmed = begin;
+
+auto predicate = [&](uint32_t c) {
+  bool contains = codepoints[c];
+  return !contains;
+};
+if (left && !ARROW_PREDICT_TRUE(
+arrow::util::UTF8FindIf(begin, end, predicate, 
_trimmed))) {
+  return false;
+}
+if (right && (begin_trimmed < end)) {
+  if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+ predicate, 
_trimmed))) {
+return false;
+  }
+}
+std::copy(begin_trimmed, end_trimmed, output);
+*output_written = static_cast(end_trimmed - begin_trimmed);
+return true;
+  }
+};
+template 
+struct UTF8Trim : UTF8TrimBase> {
+  using Base = UTF8TrimBase>;
+  using Base::Base;
+};
+
+template 
+struct UTF8LTrim : UTF8TrimBase> {
+  using Base = UTF8TrimBase>;
+  using Base::Base;
+};
+
+template 
+struct UTF8RTrim : UTF8TrimBase> {
+  using Base = UTF8TrimBase>;
+  using Base::Base;
+};
+
+#endif
+
+template 
+struct AsciiTrimWhitespaceBase : StringTransform {
+  using offset_type = typename Type::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+ uint8_t* output, offset_type* output_written) {
+const uint8_t* begin = input;
+const uint8_t* end = input + input_string_ncodeunits;
+const uint8_t* end_trimmed = end;
+
+auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
+const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) 
: begin;
+if (right & (begin_trimmed < end)) {
+  std::reverse_iterator rbegin(end);
+  std::reverse_iterator rend(begin_trimmed);
+  end_trimmed = std::find_if(rbegin, rend, predicate).base();
+}
+std::copy(begin_trimmed, end_trimmed, output);
+*output_written = static_cast(end_trimmed - begin_trimmed);
+return true;
+  }
+};
+
+template 
+struct AsciiTrimWhitespace
+: AsciiTrimWhitespaceBase> {};
+
+template 
+struct AsciiLTrimWhitespace
+: AsciiTrimWhitespaceBase> 
{};
+

[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

2020-11-10 Thread GitBox


pitrou commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r520803595



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -1231,6 +1252,302 @@ Result StrptimeResolve(KernelContext* ctx, 
const std::vector
+struct UTF8TrimWhitespaceBase : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+ uint8_t* output, offset_type* output_written) {
+const uint8_t* begin = input;
+const uint8_t* end = input + input_string_ncodeunits;
+const uint8_t* end_trimmed = end;
+const uint8_t* begin_trimmed = begin;
+
+auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+if (left && !ARROW_PREDICT_TRUE(
+arrow::util::UTF8FindIf(begin, end, predicate, 
_trimmed))) {
+  return false;
+}
+if (right && (begin_trimmed < end)) {
+  if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+ predicate, 
_trimmed))) {
+return false;
+  }
+}
+std::copy(begin_trimmed, end_trimmed, output);
+*output_written = static_cast(end_trimmed - begin_trimmed);
+return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+EnsureLookupTablesFilled();
+Base::Execute(ctx, batch, out);
+  }
+};
+
+template 
+struct UTF8TrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8LTrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8RTrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8TrimBase : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper;
+  TrimOptions options;
+  std::set codepoints;

Review comment:
   Just index the `vector<bool>` by codepoint and make `true` mean 
containment.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #8621: ARROW-9128: [C++] Implement string space trimming kernels: trim, ltrim, and rtrim

2020-11-10 Thread GitBox


pitrou commented on a change in pull request #8621:
URL: https://github.com/apache/arrow/pull/8621#discussion_r520675172



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -1231,6 +1252,302 @@ Result StrptimeResolve(KernelContext* ctx, 
const std::vector
+struct UTF8TrimWhitespaceBase : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+ uint8_t* output, offset_type* output_written) {
+const uint8_t* begin = input;
+const uint8_t* end = input + input_string_ncodeunits;
+const uint8_t* end_trimmed = end;
+const uint8_t* begin_trimmed = begin;
+
+auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
+if (left && !ARROW_PREDICT_TRUE(
+arrow::util::UTF8FindIf(begin, end, predicate, 
_trimmed))) {
+  return false;
+}
+if (right && (begin_trimmed < end)) {
+  if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, 
end,
+ predicate, 
_trimmed))) {
+return false;
+  }
+}
+std::copy(begin_trimmed, end_trimmed, output);
+*output_written = static_cast(end_trimmed - begin_trimmed);
+return true;
+  }
+  void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+EnsureLookupTablesFilled();
+Base::Execute(ctx, batch, out);
+  }
+};
+
+template 
+struct UTF8TrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8LTrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8RTrimWhitespace
+: UTF8TrimWhitespaceBase> {};
+
+template 
+struct UTF8TrimBase : StringTransform {
+  using Base = StringTransform;
+  using offset_type = typename Base::offset_type;
+  using State = OptionsWrapper;
+  TrimOptions options;
+  std::set codepoints;

Review comment:
   `std::set` doesn't sound like a terrific data structure for this. I 
would expect `std::vector<bool>` (which is 
[specified](https://en.cppreference.com/w/cpp/container/vector_bool) as 
optimizing memory footprint) to give better perf, though benchmarks are 
required to confirm this.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org