This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2510f4f ARROW-10313: [C++] Faster UTF8 validation for small strings
2510f4f is described below
commit 2510f4fd32cefe333ac7340f5dca9a5907b114e5
Author: Antoine Pitrou <[email protected]>
AuthorDate: Fri Oct 16 09:32:24 2020 +0200
ARROW-10313: [C++] Faster UTF8 validation for small strings
This improves CSV string conversion performance by about 30%.
Closes #8470 from pitrou/ARROW-10313-faster-utf8-validate
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/util/utf8.h | 53 ++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 46 insertions(+), 7 deletions(-)
diff --git a/cpp/src/arrow/util/utf8.h b/cpp/src/arrow/util/utf8.h
index d5875c4..c089fa7 100644
--- a/cpp/src/arrow/util/utf8.h
+++ b/cpp/src/arrow/util/utf8.h
@@ -27,6 +27,7 @@
#include "arrow/util/macros.h"
#include "arrow/util/simd.h"
#include "arrow/util/string_view.h"
+#include "arrow/util/ubsan.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -87,8 +88,9 @@ ARROW_EXPORT void InitializeUTF8();
inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
static constexpr uint64_t high_bits_64 = 0x8080808080808080ULL;
- // For some reason, defining this variable outside the loop helps clang
- uint64_t mask;
+ static constexpr uint32_t high_bits_32 = 0x80808080UL;
+ static constexpr uint16_t high_bits_16 = 0x8080U;
+ static constexpr uint8_t high_bits_8 = 0x80U;
#ifndef NDEBUG
internal::CheckUTF8Initialized();
@@ -98,8 +100,8 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t size) {
// XXX This is doing an unaligned access. Contemporary architectures
// (x86-64, AArch64, PPC64) support it natively and often have good
// performance nevertheless.
- memcpy(&mask, data, 8);
- if (ARROW_PREDICT_TRUE((mask & high_bits_64) == 0)) {
+ uint64_t mask64 = SafeLoadAs<uint64_t>(data);
+ if (ARROW_PREDICT_TRUE((mask64 & high_bits_64) == 0)) {
// 8 bytes of pure ASCII, move forward
size -= 8;
data += 8;
@@ -154,13 +156,50 @@ inline bool ValidateUTF8(const uint8_t* data, int64_t
size) {
return false;
}
- // Validate string tail one byte at a time
+ // Check if string tail is full ASCII (common case, fast)
+ if (size >= 4) {
+ uint32_t tail_mask = SafeLoadAs<uint32_t>(data + size - 4);
+ uint32_t head_mask = SafeLoadAs<uint32_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_32) == 0)) {
+ return true;
+ }
+ } else if (size >= 2) {
+ uint16_t tail_mask = SafeLoadAs<uint16_t>(data + size - 2);
+ uint16_t head_mask = SafeLoadAs<uint16_t>(data);
+ if (ARROW_PREDICT_TRUE(((head_mask | tail_mask) & high_bits_16) == 0)) {
+ return true;
+ }
+ } else if (size == 1) {
+ if (ARROW_PREDICT_TRUE((*data & high_bits_8) == 0)) {
+ return true;
+ }
+ } else {
+ /* size == 0 */
+ return true;
+ }
+
+ // Fall back to UTF8 validation of tail string.
// Note the state table is designed so that, once in the reject state,
// we remain in that state until the end. So we needn't check for
// rejection at each char (we don't gain much by short-circuiting here).
uint16_t state = internal::kUTF8ValidateAccept;
- while (size-- > 0) {
- state = internal::ValidateOneUTF8Byte(*data++, state);
+ switch (size) {
+ case 7:
+ state = internal::ValidateOneUTF8Byte(data[size - 7], state);
+ case 6:
+ state = internal::ValidateOneUTF8Byte(data[size - 6], state);
+ case 5:
+ state = internal::ValidateOneUTF8Byte(data[size - 5], state);
+ case 4:
+ state = internal::ValidateOneUTF8Byte(data[size - 4], state);
+ case 3:
+ state = internal::ValidateOneUTF8Byte(data[size - 3], state);
+ case 2:
+ state = internal::ValidateOneUTF8Byte(data[size - 2], state);
+ case 1:
+ state = internal::ValidateOneUTF8Byte(data[size - 1], state);
+ default:
+ break;
}
return ARROW_PREDICT_TRUE(state == internal::kUTF8ValidateAccept);
}