bkietz commented on a change in pull request #8470:
URL: https://github.com/apache/arrow/pull/8470#discussion_r505776329
##########
File path: cpp/src/arrow/util/utf8.h
##########
@@ -98,8 +100,9 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
// XXX This is doing an unaligned access. Contemporary architectures
// (x86-64, AArch64, PPC64) support it natively and often have good
// performance nevertheless.
- memcpy(&mask, data, 8);
- if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) {
+ uint64_t mask64;
+ memcpy(&mask64, data, 8);
Review comment:
```suggestion
uint64_t mask64 = SafeLoadAs<uint64_t>(data);
```
##########
File path: cpp/src/arrow/util/utf8.h
##########
@@ -154,13 +157,50 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
return false;
}
- // Validate string tail one byte at a time
+ // Check if string tail is full ASCII (common case, fast)
+ if (size >= 4) {
+ uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
+ uint32_t head_mask = SafeLoadAs<uint32_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
+ return true;
+ }
+ } else if (size >= 2) {
+ uint16_t tail_mask = SafeLoadAs<uint32_t>(data + size - 2);
+ uint16_t head_mask = SafeLoadAs<uint32_t>(data);
Review comment:
```suggestion
uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
uint16_t head_mask = SafeLoadAs<uint16_t>(data);
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org