maartenbreddels commented on a change in pull request #7449:
URL: https://github.com/apache/arrow/pull/7449#discussion_r446934815



##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -39,6 +158,121 @@ struct AsciiLength {
   }
 };
 
+template <typename Type, template <typename> class Derived>
+struct Utf8Transform {
+  using offset_type = typename Type::offset_type;
+  using DerivedClass = Derived<Type>;
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+
+  static offset_type Transform(const uint8_t* input, offset_type 
input_string_ncodeunits,
+                               uint8_t* output) {
+    uint8_t* dest = output;
+    utf8_transform(input, input + input_string_ncodeunits, dest,
+                   DerivedClass::TransformCodepoint);
+    return (offset_type)(dest - output);
+  }
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    if (batch[0].kind() == Datum::ARRAY) {
+      std::call_once(flag_case_luts, []() {
+        lut_upper_codepoint.reserve(MAX_CODEPOINT_LUT + 1);
+        lut_lower_codepoint.reserve(MAX_CODEPOINT_LUT + 1);
+        for (int i = 0; i <= MAX_CODEPOINT_LUT; i++) {
+          lut_upper_codepoint.push_back(utf8proc_toupper(i));
+          lut_lower_codepoint.push_back(utf8proc_tolower(i));
+        }
+      });
+      const ArrayData& input = *batch[0].array();
+      ArrayType input_boxed(batch[0].array());
+      ArrayData* output = out->mutable_array();
+
+      offset_type const* input_string_offsets = 
input.GetValues<offset_type>(1);
+      utf8proc_uint8_t const* input_str =
+          input.buffers[2]->data() + input_boxed.value_offset(0);
+      offset_type input_ncodeunits = input_boxed.total_values_length();
+      offset_type input_nstrings = (offset_type)input.length;
+
+      // Section 5.18 of the Unicode spec claim that the number of codepoints 
for case
+      // mapping can grow by a factor of 3. This means grow by a factor of 3 
in bytes
+      // However, since we don't support all casings (SpecialCasing.txt) the 
growth
+      // is actually only at max 3/2 (as covered by the unittest).
+      // Note that rounding down the 3/2 is ok, since only codepoints encoded 
by
+      // two code units (even) can grow to 3 code units.
+
+      int64_t output_ncodeunits_max = ((int64_t)input_ncodeunits) * 3 / 2;
+      if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
+        ctx->SetStatus(Status::CapacityError(
+            "Result might not fit in a 32bit utf8 array, convert to 
large_utf8"));
+        return;
+      }
+
+      KERNEL_RETURN_IF_ERROR(
+          ctx, 
ctx->Allocate(output_ncodeunits_max).Value(&output->buffers[2]));
+      // We could reuse the buffer if it is all ascii, benchmarking showed 
this not to
+      // matter
+      // output->buffers[1] = input.buffers[1];
+      KERNEL_RETURN_IF_ERROR(ctx,
+                             ctx->Allocate((input_nstrings + 1) * 
sizeof(offset_type))
+                                 .Value(&output->buffers[1]));
+      utf8proc_uint8_t* output_str = output->buffers[2]->mutable_data();
+      offset_type* output_string_offsets = 
output->GetMutableValues<offset_type>(1);
+      offset_type output_ncodeunits = 0;
+
+      offset_type output_string_offset = 0;
+      *output_string_offsets = output_string_offset;
+      offset_type input_string_first_offset = input_string_offsets[0];
+      for (int64_t i = 0; i < input_nstrings; i++) {
+        offset_type input_string_offset =
+            input_string_offsets[i] - input_string_first_offset;
+        offset_type input_string_end =
+            input_string_offsets[i + 1] - input_string_first_offset;
+        offset_type input_string_ncodeunits = input_string_end - 
input_string_offset;
+        offset_type encoded_nbytes = DerivedClass::Transform(
+            input_str + input_string_offset, input_string_ncodeunits,
+            output_str + output_ncodeunits);
+        output_ncodeunits += encoded_nbytes;
+        output_string_offsets[i + 1] = output_ncodeunits;
+      }
+
+      // trim the codepoint buffer, since we allocated too much
+      KERNEL_RETURN_IF_ERROR(
+          ctx,
+          output->buffers[2]->CopySlice(0, 
output_ncodeunits).Value(&output->buffers[2]));
+    } else {
+      const auto& input = checked_cast<const 
BaseBinaryScalar&>(*batch[0].scalar());
+      auto result = 
checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
+      if (input.is_valid) {
+        result->is_valid = true;
+        offset_type data_nbytes = (offset_type)input.value->size();
+        // See note above in the Array version explaining the 3 / 2
+        KERNEL_RETURN_IF_ERROR(ctx,
+                               ctx->Allocate(data_nbytes * 3 / 
2).Value(&result->value));
+        offset_type encoded_nbytes = DerivedClass::Transform(
+            input.value->data(), data_nbytes, result->value->mutable_data());
+        KERNEL_RETURN_IF_ERROR(
+            ctx, result->value->CopySlice(0, 
encoded_nbytes).Value(&result->value));
+      }
+      out->value = result;
+    }
+  }
+};
+
+template <typename Type>
+struct Utf8Upper : Utf8Transform<Type, Utf8Upper> {

Review comment:
       Good catch — that was in preparation for upcasting, which would have needed this. Since we agreed not to upcast, it can be removed.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Reply via email to