pitrou commented on a change in pull request #11298:
URL: https://github.com/apache/arrow/pull/11298#discussion_r741975298
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public
FunctionalCaseMappingTransform {
template <typename Type>
using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+ using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+ const Utf8NormalizeOptions* options;
+
+ explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+ int64_t input_ncodeunits) override {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars =
+ utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option,
NULL, NULL);
+
+ // convert to byte length
+ return n_chars * 4;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output, int64_t output_string_ncodeunits) {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars = utf8proc_decompose_custom(
+ input, input_string_ncodeunits,
reinterpret_cast<utf8proc_int32_t*>(output),
+ output_string_ncodeunits, option, NULL, NULL);
+ if (n_chars < 0) return output_string_ncodeunits;
+
+ const auto n_bytes =
+ utf8proc_reencode(reinterpret_cast<utf8proc_int32_t*>(output),
n_chars, option);
+ return n_bytes;
+ }
Review comment:
As for #1, avoiding a temporary copy (and allocation!) for each input
string sounds beneficial.
As for #2, you are right. We can use our own UTF-8 encoding instead (see
`UTF8Encode` in `arrow/util/utf8.h`).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]