[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-29 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r447171303



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -133,23 +134,23 @@ struct Utf8Transform {
 output_string_offsets[i + 1] = output_ncodeunits;
   }
 
-  // trim the codepoint buffer, since we allocated too much
+  // Trim the codepoint buffer, since we allocated too much
   KERNEL_RETURN_IF_ERROR(
-  ctx,
-  output->buffers[2]->CopySlice(0, output_ncodeunits).Value(&output->buffers[2]));
+  ctx, values_buffer->Resize(output_ncodeunits, 
/*shrink_to_fit=*/true));

Review comment:
   :-)





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-29 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r447161925



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -39,6 +158,121 @@ struct AsciiLength {
   }
 };
 
+template <typename Type, template <typename> class Derived>
+struct Utf8Transform {
+  using offset_type = typename Type::offset_type;
+  using DerivedClass = Derived<Type>;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  static offset_type Transform(const uint8_t* input, offset_type 
input_string_ncodeunits,
+   uint8_t* output) {
+uint8_t* dest = output;
+utf8_transform(input, input + input_string_ncodeunits, dest,
+   DerivedClass::TransformCodepoint);
+return (offset_type)(dest - output);
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+if (batch[0].kind() == Datum::ARRAY) {
+  std::call_once(flag_case_luts, []() {
+lut_upper_codepoint.reserve(MAX_CODEPOINT_LUT + 1);
+lut_lower_codepoint.reserve(MAX_CODEPOINT_LUT + 1);
+for (int i = 0; i <= MAX_CODEPOINT_LUT; i++) {
+  lut_upper_codepoint.push_back(utf8proc_toupper(i));
+  lut_lower_codepoint.push_back(utf8proc_tolower(i));
+}
+  });
+  const ArrayData& input = *batch[0].array();
+  ArrayType input_boxed(batch[0].array());
+  ArrayData* output = out->mutable_array();
+
+  offset_type const* input_string_offsets = input.GetValues<offset_type>(1);
+  utf8proc_uint8_t const* input_str =
+  input.buffers[2]->data() + input_boxed.value_offset(0);
+  offset_type input_ncodeunits = input_boxed.total_values_length();
+  offset_type input_nstrings = (offset_type)input.length;
+
+  // Section 5.18 of the Unicode spec claims that the number of codepoints for case
+  // mapping can grow by a factor of 3. This means grow by a factor of 3 
in bytes
+  // However, since we don't support all casings (SpecialCasing.txt) the 
growth
+  // is actually only at max 3/2 (as covered by the unittest).
+  // Note that rounding down the 3/2 is ok, since only codepoints encoded 
by
+  // two code units (even) can grow to 3 code units.
+
+  int64_t output_ncodeunits_max = ((int64_t)input_ncodeunits) * 3 / 2;
+  if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+ctx->SetStatus(Status::CapacityError(
+"Result might not fit in a 32bit utf8 array, convert to 
large_utf8"));
+return;
+  }
+
+  KERNEL_RETURN_IF_ERROR(
+  ctx, ctx->Allocate(output_ncodeunits_max).Value(&output->buffers[2]));
+  // We could reuse the buffer if it is all ascii, benchmarking showed 
this not to
+  // matter
+  // output->buffers[1] = input.buffers[1];
+  KERNEL_RETURN_IF_ERROR(ctx,
+ ctx->Allocate((input_nstrings + 1) * sizeof(offset_type))
+ .Value(&output->buffers[1]));
+  utf8proc_uint8_t* output_str = output->buffers[2]->mutable_data();
+  offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
+  offset_type output_ncodeunits = 0;
+
+  offset_type output_string_offset = 0;
+  *output_string_offsets = output_string_offset;
+  offset_type input_string_first_offset = input_string_offsets[0];
+  for (int64_t i = 0; i < input_nstrings; i++) {
+offset_type input_string_offset =
+input_string_offsets[i] - input_string_first_offset;
+offset_type input_string_end =
+input_string_offsets[i + 1] - input_string_first_offset;
+offset_type input_string_ncodeunits = input_string_end - 
input_string_offset;
+offset_type encoded_nbytes = DerivedClass::Transform(
+input_str + input_string_offset, input_string_ncodeunits,
+output_str + output_ncodeunits);
+output_ncodeunits += encoded_nbytes;
+output_string_offsets[i + 1] = output_ncodeunits;
+  }
+
+  // trim the codepoint buffer, since we allocated too much
+  KERNEL_RETURN_IF_ERROR(
+  ctx,
+  output->buffers[2]->CopySlice(0, output_ncodeunits).Value(&output->buffers[2]));

Review comment:
   Ok, I just pushed a change which resizes the buffer (hopefully 
in-place). This makes the benchmarks a bit faster.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-29 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r447161391



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -30,6 +31,124 @@ namespace internal {
 
 namespace {
 
+// lookup tables
+std::vector lut_upper_codepoint;
+std::vector lut_lower_codepoint;
+std::once_flag flag_case_luts;
+
+constexpr uint32_t REPLACEMENT_CHAR =
+'?';  // the proper replacement char would be the 0xFFFD codepoint, but 
that can
+  // increase string length by a factor of 3
+constexpr int MAX_CODEPOINT_LUT = 0xFFFF;  // up to this codepoint is in a lookup table
+
+static inline void utf8_encode(uint8_t*& str, uint32_t codepoint) {
+  if (codepoint < 0x80) {
+*str++ = codepoint;
+  } else if (codepoint < 0x800) {
+*str++ = 0xC0 + (codepoint >> 6);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x10000) {
+*str++ = 0xE0 + (codepoint >> 12);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x200000) {
+*str++ = 0xF0 + (codepoint >> 18);
+*str++ = 0x80 + ((codepoint >> 12) & 0x3F);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else {
+*str++ = codepoint;
+  }
+}
+
+static inline bool utf8_is_continuation(const uint8_t codeunit) {
+  return (codeunit & 0xC0) == 0x80;  // upper two bits should be 10
+}
+
+static inline uint32_t utf8_decode(const uint8_t*& str, int64_t& length) {
+  if (*str < 0x80) {  //
+length -= 1;
+return *str++;
+  } else if (*str < 0xC0) {  // invalid non-ascii char
+length -= 1;
+str++;
+return REPLACEMENT_CHAR;

Review comment:
   There's no reason a priori to be forgiving on invalid data. If there's a 
use case to be tolerant, we may add an option. But by default we should error 
out on invalid input, IMHO.
   
   cc @wesm  for opinions





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-29 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r447161391



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -30,6 +31,124 @@ namespace internal {
 
 namespace {
 
+// lookup tables
+std::vector lut_upper_codepoint;
+std::vector lut_lower_codepoint;
+std::once_flag flag_case_luts;
+
+constexpr uint32_t REPLACEMENT_CHAR =
+'?';  // the proper replacement char would be the 0xFFFD codepoint, but 
that can
+  // increase string length by a factor of 3
+constexpr int MAX_CODEPOINT_LUT = 0xFFFF;  // up to this codepoint is in a lookup table
+
+static inline void utf8_encode(uint8_t*& str, uint32_t codepoint) {
+  if (codepoint < 0x80) {
+*str++ = codepoint;
+  } else if (codepoint < 0x800) {
+*str++ = 0xC0 + (codepoint >> 6);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x10000) {
+*str++ = 0xE0 + (codepoint >> 12);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x200000) {
+*str++ = 0xF0 + (codepoint >> 18);
+*str++ = 0x80 + ((codepoint >> 12) & 0x3F);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else {
+*str++ = codepoint;
+  }
+}
+
+static inline bool utf8_is_continuation(const uint8_t codeunit) {
+  return (codeunit & 0xC0) == 0x80;  // upper two bits should be 10
+}
+
+static inline uint32_t utf8_decode(const uint8_t*& str, int64_t& length) {
+  if (*str < 0x80) {  //
+length -= 1;
+return *str++;
+  } else if (*str < 0xC0) {  // invalid non-ascii char
+length -= 1;
+str++;
+return REPLACEMENT_CHAR;

Review comment:
   There's no reason a priori to be forgiving on invalid data. If there's a 
use case to be tolerant, we may add an option. But by default we should error 
out on invalid input.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-29 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r447143530



##
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##
@@ -81,5 +147,40 @@ TYPED_TEST(TestStringKernels, 
StrptimeDoesNotProvideDefaultOptions) {
   ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
 }
 
+TEST(TestStringKernels, UnicodeLibraryAssumptions) {
+  uint8_t output[4];
+  for (utf8proc_int32_t codepoint = 0x100; codepoint < 0x110000; codepoint++) {
+utf8proc_ssize_t encoded_nbytes = utf8proc_encode_char(codepoint, output);
+utf8proc_int32_t codepoint_upper = utf8proc_toupper(codepoint);
+utf8proc_ssize_t encoded_nbytes_upper = 
utf8proc_encode_char(codepoint_upper, output);
+if (encoded_nbytes == 2) {
+  EXPECT_LE(encoded_nbytes_upper, 3)
+  << "Expected the upper case codepoint for a 2 byte encoded codepoint 
to be "
+ "encoded in maximum 3 bytes, not "
+  << encoded_nbytes_upper;
+}
+if (encoded_nbytes == 3) {

Review comment:
   Ok, thank you.





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-29 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r446946562



##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -30,6 +31,124 @@ namespace internal {
 
 namespace {
 
+// lookup tables
+std::vector lut_upper_codepoint;
+std::vector lut_lower_codepoint;
+std::once_flag flag_case_luts;
+
+constexpr uint32_t REPLACEMENT_CHAR =
+'?';  // the proper replacement char would be the 0xFFFD codepoint, but 
that can
+  // increase string length by a factor of 3
+constexpr int MAX_CODEPOINT_LUT = 0xFFFF;  // up to this codepoint is in a lookup table
+
+static inline void utf8_encode(uint8_t*& str, uint32_t codepoint) {
+  if (codepoint < 0x80) {
+*str++ = codepoint;
+  } else if (codepoint < 0x800) {
+*str++ = 0xC0 + (codepoint >> 6);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x10000) {
+*str++ = 0xE0 + (codepoint >> 12);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x200000) {
+*str++ = 0xF0 + (codepoint >> 18);
+*str++ = 0x80 + ((codepoint >> 12) & 0x3F);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else {
+*str++ = codepoint;
+  }
+}
+
+static inline bool utf8_is_continuation(const uint8_t codeunit) {
+  return (codeunit & 0xC0) == 0x80;  // upper two bits should be 10
+}
+
+static inline uint32_t utf8_decode(const uint8_t*& str, int64_t& length) {
+  if (*str < 0x80) {  //
+length -= 1;
+return *str++;
+  } else if (*str < 0xC0) {  // invalid non-ascii char
+length -= 1;
+str++;
+return REPLACEMENT_CHAR;

Review comment:
   Well, I'm not sure why it would make sense to ignore invalid input data 
here, unless not ignoring it has a significant cost (which sounds unlikely).





This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org




[GitHub] [arrow] pitrou commented on a change in pull request #7449: ARROW-9133: [C++] Add utf8_upper and utf8_lower

2020-06-25 Thread GitBox


pitrou commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r445417901



##
File path: cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
##
@@ -52,8 +55,18 @@ static void AsciiUpper(benchmark::State& state) {
   UnaryStringBenchmark(state, "ascii_upper");
 }
 
+static void Utf8Upper(benchmark::State& state) {
+  UnaryStringBenchmark(state, "utf8_upper", true);

Review comment:
   Note that `RandomArrayGenerator::String` generates an array of pure 
Ascii characters between `A` and `z`. 
   Perhaps we need two sets of benchmarks:
   * one with pure Ascii values
   * one with partly non-Ascii values

##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -30,6 +31,124 @@ namespace internal {
 
 namespace {
 
+// lookup tables
+std::vector lut_upper_codepoint;
+std::vector lut_lower_codepoint;
+std::once_flag flag_case_luts;
+
+constexpr uint32_t REPLACEMENT_CHAR =
+'?';  // the proper replacement char would be the 0xFFFD codepoint, but 
that can
+  // increase string length by a factor of 3
+constexpr int MAX_CODEPOINT_LUT = 0xFFFF;  // up to this codepoint is in a lookup table
+
+static inline void utf8_encode(uint8_t*& str, uint32_t codepoint) {
+  if (codepoint < 0x80) {
+*str++ = codepoint;
+  } else if (codepoint < 0x800) {
+*str++ = 0xC0 + (codepoint >> 6);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x10000) {
+*str++ = 0xE0 + (codepoint >> 12);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else if (codepoint < 0x200000) {
+*str++ = 0xF0 + (codepoint >> 18);
+*str++ = 0x80 + ((codepoint >> 12) & 0x3F);
+*str++ = 0x80 + ((codepoint >> 6) & 0x3F);
+*str++ = 0x80 + (codepoint & 0x3F);
+  } else {
+*str++ = codepoint;
+  }
+}
+
+static inline bool utf8_is_continuation(const uint8_t codeunit) {
+  return (codeunit & 0xC0) == 0x80;  // upper two bits should be 10
+}
+
+static inline uint32_t utf8_decode(const uint8_t*& str, int64_t& length) {
+  if (*str < 0x80) {  //
+length -= 1;
+return *str++;
+  } else if (*str < 0xC0) {  // invalid non-ascii char
+length -= 1;
+str++;
+return REPLACEMENT_CHAR;

Review comment:
   Hmm, I don't really agree with this... If there's some invalid input, we 
should bail out with `Status::Invalid`, IMO.

##
File path: cpp/src/arrow/compute/kernels/scalar_string_benchmark.cc
##
@@ -41,7 +42,9 @@ static void UnaryStringBenchmark(benchmark::State& state, 
const std::string& fun
 ABORT_NOT_OK(CallFunction(func_name, {values}));
   }
   state.SetItemsProcessed(state.iterations() * array_length);
-  state.SetBytesProcessed(state.iterations() * 
values->data()->buffers[2]->size());
+  state.SetBytesProcessed(state.iterations() *
+  ((touches_offsets ? 
values->data()->buffers[1]->size() : 0) +

Review comment:
   Hmm, this looks a bit pedantic and counter-productive to me. We want 
Ascii and UTF8 numbers to be directly comparable, so let's keep the original 
calculation.

##
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##
@@ -30,6 +31,124 @@ namespace internal {
 
 namespace {
 
+// lookup tables
+std::vector lut_upper_codepoint;
+std::vector lut_lower_codepoint;
+std::once_flag flag_case_luts;
+
+constexpr uint32_t REPLACEMENT_CHAR =
+'?';  // the proper replacement char would be the 0xFFFD codepoint, but 
that can
+  // increase string length by a factor of 3
+constexpr int MAX_CODEPOINT_LUT = 0x;  // up to this codepoint is in a 
lookup table
+
+static inline void utf8_encode(uint8_t*& str, uint32_t codepoint) {

Review comment:
   By the way, all those helper functions (encoding/decoding) may also go 
into `arrow/util/utf8.h`.

##
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##
@@ -68,6 +76,64 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
this->string_type(), "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TEST(TestStringKernels, Utf8Upper32bitGrowth) {

Review comment:
   I'm not convinced it's a good idea to consume more than 2GB RAM in a 
test. We have something called `LARGE_MEMORY_TEST` for such tests (you can grep 
for it), though I'm not sure they get exercised on a regular basis.

##
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##
@@ -81,5 +147,40 @@ TYPED_TEST(TestStringKernels, 
StrptimeDoesNotProvideDefaultOptions) {
   ASSERT_RAISES(Invalid, CallFunction("strptime", {input}));
 }
 
+TEST(TestStringKernels, UnicodeLibraryAssumptions) {
+  uint8_t output[4];
+  for (utf8proc_int32_t codepoint = 0x100; codepoint < 0x110000; codepoint++) {
+utf8proc_ssize_t encoded_nbytes = utf8proc_encode_char(codepoint, output);
+utf8proc_int32_t codepoint_upper = utf8proc_toupper(codepoint);
+utf8proc_ssize_t encoded_nbytes_upper =