pitrou commented on a change in pull request #11298:
URL: https://github.com/apache/arrow/pull/11298#discussion_r741969066
##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -395,6 +399,17 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions {
bool first_week_is_fully_in_year;
};
+struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions {
Review comment:
Actually, we shouldn't expose different ABIs depending on macro
definitions, so I think this is fine. Calling the function will just error out
at runtime, but there won't be any compile errors if, for some reason, the
installed Arrow version didn't enable utf8proc.
##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -30,6 +30,10 @@
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
+#ifdef ARROW_WITH_UTF8PROC
+#include <utf8proc.h>
Review comment:
We shouldn't include utf8proc in public headers, IMHO. This is only
necessary for the kernel implementation.
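For illustration, a minimal sketch of the split I have in mind (this is only one possible layout, not a prescription): the public header keeps Arrow's own includes, and `<utf8proc.h>` moves next to the kernel implementation.
```cpp
// cpp/src/arrow/compute/api_scalar.h -- no utf8proc include needed here,
// since Utf8NormalizeOptions only carries plain option members.
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

// cpp/src/arrow/compute/kernels/scalar_string.cc -- the only place that
// talks to utf8proc, so the include (and its build guard) lives here.
#ifdef ARROW_WITH_UTF8PROC
#include <utf8proc.h>
#endif
```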
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
template <typename Type>
using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+ using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+ const Utf8NormalizeOptions* options;
+
+ explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+ int64_t input_ncodeunits) override {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL);
+
+ // convert to byte length
+ return n_chars * 4;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output, int64_t output_string_ncodeunits) {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars = utf8proc_decompose_custom(
+        input, input_string_ncodeunits, reinterpret_cast<utf8proc_int32_t*>(output),
+ output_string_ncodeunits, option, NULL, NULL);
+ if (n_chars < 0) return output_string_ncodeunits;
+
+ const auto n_bytes =
+        utf8proc_reencode(reinterpret_cast<utf8proc_int32_t*>(output), n_chars, option);
+ return n_bytes;
+ }
Review comment:
As for #1, avoiding a temporary copy (and allocation!) for each input
string sounds beneficial.
As for #2, you are right. We can use our own UTF8 encoding instead (see
`UTF8Encode` in `arrow/util/utf8.h`).
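To make the `UTF8Encode` suggestion concrete, here is a rough sketch (not this PR's code; the exact `arrow::util::UTF8Encode` signature is an assumption on my side): decompose into the codepoint buffer with utf8proc as before, then emit the UTF-8 bytes ourselves instead of calling `utf8proc_reencode`:
```cpp
#include <cstdint>

#include <utf8proc.h>

#include "arrow/util/utf8.h"  // assumed home of arrow::util::UTF8Encode

// Re-encode already-normalized codepoints into UTF-8 with Arrow's own encoder;
// returns the number of bytes written to `output`.
int64_t EncodeCodepoints(const utf8proc_int32_t* codepoints, int64_t n_codepoints,
                         uint8_t* output) {
  uint8_t* out = output;
  for (int64_t i = 0; i < n_codepoints; ++i) {
    // UTF8Encode (assumed signature) writes one codepoint and returns the
    // advanced output pointer.
    out = ::arrow::util::UTF8Encode(out, static_cast<uint32_t>(codepoints[i]));
  }
  return static_cast<int64_t>(out - output);
}
```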
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
template <typename Type>
using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+ using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+ const Utf8NormalizeOptions* options;
+
+ explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+ int64_t input_ncodeunits) override {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL);
+
+ // convert to byte length
+ return n_chars * 4;
Review comment:
It is not obvious to me that this is sufficient. Some codepoints normalize
to extremely large strings, e.g.:
```python
>>> s = "\uFDFA"
>>> len(unicodedata.normalize('NFD', s))
1
>>> len(unicodedata.normalize('NFC', s))
1
>>> len(unicodedata.normalize('NFKD', s))
18
>>> len(unicodedata.normalize('NFKC', s))
18
>>> len(unicodedata.normalize('NFKC', s).encode())
33
>>> len(unicodedata.normalize('NFKC', s).encode()) / len(s.encode())
11.0 # output utf8 can be 11 times larger than the input utf8
```
So instead of preallocating output data at least 11 times the size of
the input data, I think this kernel should adopt another approach and simply
grow the output data as needed.
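The same expansion can be reproduced with utf8proc directly; a small standalone check (illustration only, using the `utf8proc_NFKC` convenience helper rather than whatever flags the kernel ends up with):
```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <utf8proc.h>

int main() {
  // U+FDFA encoded in UTF-8: 3 input bytes...
  const char* input = "\xEF\xB7\xBA";
  // ...which NFKC-normalizes to 18 codepoints / 33 UTF-8 bytes.
  utf8proc_uint8_t* normalized =
      utf8proc_NFKC(reinterpret_cast<const utf8proc_uint8_t*>(input));
  std::printf("input: %zu bytes, NFKC output: %zu bytes\n", std::strlen(input),
              std::strlen(reinterpret_cast<const char*>(normalized)));
  std::free(normalized);
  return 0;
}
```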
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
template <typename Type>
using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+ using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+ const Utf8NormalizeOptions* options;
+
+ explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+ int64_t input_ncodeunits) override {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL);
+
+ // convert to byte length
+ return n_chars * 4;
Review comment:
Ok, I don't know how much we want to optimize this, but we're calling
the normalization function twice: once here to compute the output size, once in
`Transform` to actually output the data. It seems wasteful (especially as I
don't think normalization is especially fast).
Also, the heuristic of multiplying by 4 to get the number of output bytes is
a bit crude.
My preference would be for this kernel to take another approach and
simply grow the output buffer as needed (you can presize it with the input data
length as a heuristic).
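A rough sketch of what I mean, outside of the `StringTransform` framework (illustration only; `NormalizeOne` and its shape are made up for this comment): presize the codepoint buffer from the input length, let utf8proc report when it was too small, and only then grow and retry:
```cpp
#include <cstdint>
#include <string>
#include <vector>

#include <utf8proc.h>

// Normalize a single UTF-8 string; `option` would be derived from
// Utf8NormalizeOptions::method. Returns an empty string on invalid input
// (a real kernel would raise an error Status instead).
std::string NormalizeOne(const uint8_t* input, int64_t input_nbytes,
                         utf8proc_option_t option) {
  // Heuristic presize: one codepoint slot per input byte, plus one spare slot
  // so that utf8proc_reencode has room for its NUL terminator.
  std::vector<utf8proc_int32_t> codepoints(static_cast<size_t>(input_nbytes) + 1);
  utf8proc_ssize_t n_chars = utf8proc_decompose_custom(
      input, input_nbytes, codepoints.data(),
      static_cast<utf8proc_ssize_t>(codepoints.size()), option, nullptr, nullptr);
  if (n_chars >= 0 && static_cast<size_t>(n_chars) + 1 > codepoints.size()) {
    // The buffer was too small: grow to the size utf8proc reported and retry.
    codepoints.resize(static_cast<size_t>(n_chars) + 1);
    n_chars = utf8proc_decompose_custom(
        input, input_nbytes, codepoints.data(),
        static_cast<utf8proc_ssize_t>(codepoints.size()), option, nullptr, nullptr);
  }
  if (n_chars < 0) return std::string();
  // utf8proc_reencode rewrites the codepoint buffer in place as UTF-8.
  const utf8proc_ssize_t n_bytes =
      utf8proc_reencode(codepoints.data(), n_chars, option);
  if (n_bytes < 0) return std::string();
  return std::string(reinterpret_cast<const char*>(codepoints.data()),
                     static_cast<size_t>(n_bytes));
}
```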
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform {
template <typename Type>
using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+ using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+ const Utf8NormalizeOptions* options;
+
+ explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+ : options{&options} {}
+
+ int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+ int64_t input_ncodeunits) override {
+ const auto option = GenerateUtf8NormalizeOption(options->method);
+ const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL);
+
+ // convert to byte length
+ return n_chars * 4;
Review comment:
Also, there is a more fundamental problem with this approach: if the
input is e.g. `["a", "\u0300"]`, normalizing the entire input data to NFC
results in one codepoint (`"à"` or `"\u00e0"`). However, normalizing each
string independently results in one codepoint for each string. So this estimate
may be too low in some cases.
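For completeness, here is the same effect observed through utf8proc's `utf8proc_NFC` helper (illustration only, comparing UTF-8 byte lengths rather than codepoint counts):
```cpp
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include <utf8proc.h>

// NFC-normalize a NUL-terminated UTF-8 string and return its byte length.
static size_t NfcByteLength(const char* s) {
  utf8proc_uint8_t* out =
      utf8proc_NFC(reinterpret_cast<const utf8proc_uint8_t*>(s));
  const size_t n = std::strlen(reinterpret_cast<const char*>(out));
  std::free(out);
  return n;
}

int main() {
  // "a" and U+0300 (combining grave accent) normalized as separate strings...
  std::printf("separate: %zu + %zu bytes\n", NfcByteLength("a"),
              NfcByteLength("\xCC\x80"));
  // ...versus the concatenated bytes, which compose into a single U+00E0.
  std::printf("concatenated: %zu bytes\n", NfcByteLength("a\xCC\x80"));
  return 0;
}
```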