This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new af4d6b624e Fast utf8 validation when loading string view from parquet
(#6009)
af4d6b624e is described below
commit af4d6b624e073817c71264c645ec25a546938b8f
Author: Xiangpeng Hao <[email protected]>
AuthorDate: Mon Jul 8 14:42:39 2024 -0400
Fast utf8 validation when loading string view from parquet (#6009)
* fast utf8 validation
* better documentation
* Update parquet/src/arrow/array_reader/byte_view_array.rs
Co-authored-by: Andrew Lamb <[email protected]>
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
parquet/src/arrow/array_reader/byte_view_array.rs | 40 ++++++++++++++++++++++-
1 file changed, 39 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/array_reader/byte_view_array.rs
b/parquet/src/arrow/array_reader/byte_view_array.rs
index 7f0a0dd2a5..9c5caaad59 100644
--- a/parquet/src/arrow/array_reader/byte_view_array.rs
+++ b/parquet/src/arrow/array_reader/byte_view_array.rs
@@ -314,6 +314,8 @@ impl ByteViewArrayDecoderPlain {
let buf = self.buf.as_ref();
let mut read = 0;
output.views.reserve(to_read);
+
+ let mut utf8_validation_begin = self.offset;
while self.offset < self.buf.len() && read != to_read {
if self.offset + 4 > self.buf.len() {
return Err(ParquetError::EOF("eof decoding byte
array".into()));
@@ -332,7 +334,38 @@ impl ByteViewArrayDecoderPlain {
}
if self.validate_utf8 {
- check_valid_utf8(unsafe {
buf.get_unchecked(start_offset..end_offset) })?;
+ // It seems you are trying to understand what's going on here,
take a breath and be patient.
+ // Utf-8 validation is a non-trivial task, here are some
background facts:
+ // (1) Validating one 2048-byte string is much faster than
validating 128 16-byte strings.
+ // As shown in
https://github.com/apache/arrow-rs/pull/6009#issuecomment-2211174229
+ // Potentially because the SIMD operations favor longer
strings.
+ // (2) Practical strings are short, 99% of strings are smaller
than 100 bytes, as shown in paper:
+ // https://www.vldb.org/pvldb/vol17/p148-zeng.pdf, Figure
5f.
+ // (3) Parquet plain encoding makes utf-8 validation harder,
+ // because it stores the length of each string right
before the string.
+ // This means naive utf-8 validation will be slow, because
the validation needs to skip the length bytes.
+ // I.e., the validation cannot validate the buffer in one
pass, but must instead validate the strings chunk by chunk.
+ //
+ // Given the above observations, the goal is to do batch
validation as much as possible.
+ // The key idea is that if the length is smaller than 128 (99%
of the cases), then the length bytes are valid utf-8, as reasoned below:
+ // If the length is smaller than 128, its 4-byte little-endian
encoding is [len, 0, 0, 0].
+ // Each of these bytes is a valid ASCII character, so they are
valid utf-8.
+ // Since they are all smaller than 128, they won't break a
utf-8 code point (won't mess with later bytes).
+ //
+ // The implementation keeps a water mark
`utf8_validation_begin` to track the beginning of the part of the buffer that
has not been validated yet.
+ // If the length is smaller than 128, then we continue to the next
string.
+ // If the length is 128 or larger, then we validate the
buffer before the length bytes, and move the water mark to the beginning of
the next string.
+ if len < 128 {
+ // fast path, move to next string.
+ // the len bytes are valid utf8.
+ } else {
+ // unfortunately, the len bytes may not be valid utf8, so we
need to wrap up and validate everything before them.
+ check_valid_utf8(unsafe {
+ buf.get_unchecked(utf8_validation_begin..self.offset)
+ })?;
+ // move the cursor to skip the len bytes.
+ utf8_validation_begin = start_offset;
+ }
}
unsafe {
@@ -342,6 +375,11 @@ impl ByteViewArrayDecoderPlain {
read += 1;
}
+ // validate the last part of the buffer
+ if self.validate_utf8 {
+ check_valid_utf8(unsafe {
buf.get_unchecked(utf8_validation_begin..self.offset) })?;
+ }
+
self.max_remaining_values -= to_read;
Ok(to_read)
}