Re: [PR] Improve parquet BinaryView / StringView decoder performance (up to -35%) [arrow-rs]

via GitHub Thu, 22 Jan 2026 08:51:59 -0800


alamb commented on code in PR #9236:
URL: https://github.com/apache/arrow-rs/pull/9236#discussion_r2717750787



##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -373,32 +394,39 @@ impl ByteViewArrayDecoderPlain {
                 // The implementation keeps a water mark 
`utf8_validation_begin` to track the beginning of the buffer that is not 
validated.
                 // If the length is smaller than 128, then we continue to next 
string.
                 // If the length is larger than 128, then we validate the 
buffer before the length bytes, and move the water mark to the beginning of 
next string.
-                if len < 128 {
-                    // fast path, move to next string.
-                    // the len bytes are valid utf8.
-                } else {
+                if len >= 128 {
                     // unfortunately, the len bytes may not be valid utf8, we 
need to wrap up and validate everything before it.
                     check_valid_utf8(unsafe {
-                        buf.get_unchecked(utf8_validation_begin..self.offset)
+                        buf.get_unchecked(utf8_validation_begin..end_offset)
                     })?;
                     // move the cursor to skip the len bytes.
                     utf8_validation_begin = start_offset;
                 }
             }
 
+            let view = make_view(
+                unsafe { buf.get_unchecked(start_offset..end_offset) },
+                block_id,
+                start_offset as u32,
+            );
+            // Safety: views_ptr is valid for writes, and we have reserved 
enough space.
             unsafe {
-                output.append_view_unchecked(block_id, start_offset as u32, 
len);
+                views_ptr.add(i).write(view);
             }
-            self.offset = end_offset;
-            read += 1;
         }
 
-        // validate the last part of the buffer
-        if self.validate_utf8 {
-            check_valid_utf8(unsafe { 
buf.get_unchecked(utf8_validation_begin..self.offset) })?;
+        // Safety: we have written `to_read` views to `views_ptr`
+        unsafe {
+            output.views.set_len(output.views.len() + to_read);
+        }
+        if VALIDATE_UTF8 {
+            // validate the last part of the buffer
+            check_valid_utf8(unsafe { 
buf.get_unchecked(utf8_validation_begin..end_offset) })?;

Review Comment:
   Codex flagged this as an error because it includes the length as well. Here 
is a test that passes on main but fails on this PR:
   - https://github.com/apache/arrow-rs/pull/9246
   
   I stared at it a while and I think the issue is that the end_offset is the 
end of the current string, when what needs to be validated is actually up to 
immediately before the current length
   
   Here is a proposed fix (from codex that I carefully reviewed) -- basically 
it stores the offset to len as well 
   
   ```diff
   • Edited parquet/src/arrow/array_reader/byte_view_array.rs (+4 -3)
       355          for i in 0..to_read {
       356 -            let start_offset = end_offset + 4;
       356 +            let len_offset = end_offset;
       357 +            let start_offset = len_offset + 4;
       358
           ⋮
       364              let len = u32::from_le_bytes(
       364 -                unsafe { 
buf.get_unchecked(end_offset..start_offset) }
       365 +                unsafe { 
buf.get_unchecked(len_offset..start_offset) }
       366                      .try_into()
           ⋮
       400                      check_valid_utf8(unsafe {
       400 -                        
buf.get_unchecked(utf8_validation_begin..end_offset)
       401 +                        
buf.get_unchecked(utf8_validation_begin..len_offset)
       402                      })?;
   
   ```
    



##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -330,28 +342,37 @@ impl ByteViewArrayDecoderPlain {
         let to_read = len.min(self.max_remaining_values);
 
         let buf: &[u8] = self.buf.as_ref();
-        let mut read = 0;
+        let buf_len = buf.len();
+        let mut end_offset = self.offset;
+        let mut utf8_validation_begin = end_offset;
+
         output.views.reserve(to_read);
 
-        let mut utf8_validation_begin = self.offset;
-        while self.offset < self.buf.len() && read != to_read {
-            if self.offset + 4 > self.buf.len() {
+        // Safety: we reserved enough space in output.views
+        // and we will only write up to to_read views / track how many views 
we wrote.
+        // Ideally, we would use `Vec::extend` here, but this generates 
sub-optimal code.
+        let views_ptr = 
output.views.as_mut_ptr().wrapping_add(output.views.len());
+        for i in 0..to_read {
+            let start_offset = end_offset + 4;
+
+            if start_offset > buf_len {
                 return Err(ParquetError::EOF("eof decoding byte 
array".into()));
             }
-            let len_bytes: [u8; 4] = unsafe {
-                buf.get_unchecked(self.offset..self.offset + 4)
+
+            // Safety: we have checked that start_offset <= buf_len
+            let len = u32::from_le_bytes(
+                unsafe { buf.get_unchecked(end_offset..start_offset) }

Review Comment:
   this took me a moment to grok as end_offset is the end of the previous 
string value (but then gets updated below



##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -373,32 +394,39 @@ impl ByteViewArrayDecoderPlain {
                 // The implementation keeps a water mark 
`utf8_validation_begin` to track the beginning of the buffer that is not 
validated.
                 // If the length is smaller than 128, then we continue to next 
string.
                 // If the length is larger than 128, then we validate the 
buffer before the length bytes, and move the water mark to the beginning of 
next string.
-                if len < 128 {
-                    // fast path, move to next string.
-                    // the len bytes are valid utf8.
-                } else {
+                if len >= 128 {
                     // unfortunately, the len bytes may not be valid utf8, we 
need to wrap up and validate everything before it.
                     check_valid_utf8(unsafe {
-                        buf.get_unchecked(utf8_validation_begin..self.offset)
+                        buf.get_unchecked(utf8_validation_begin..end_offset)
                     })?;
                     // move the cursor to skip the len bytes.
                     utf8_validation_begin = start_offset;
                 }
             }
 
+            let view = make_view(
+                unsafe { buf.get_unchecked(start_offset..end_offset) },
+                block_id,
+                start_offset as u32,
+            );
+            // Safety: views_ptr is valid for writes, and we have reserved 
enough space.
             unsafe {
-                output.append_view_unchecked(block_id, start_offset as u32, 
len);
+                views_ptr.add(i).write(view);
             }
-            self.offset = end_offset;
-            read += 1;
         }
 
-        // validate the last part of the buffer
-        if self.validate_utf8 {
-            check_valid_utf8(unsafe { 
buf.get_unchecked(utf8_validation_begin..self.offset) })?;
+        // Safety: we have written `to_read` views to `views_ptr`
+        unsafe {
+            output.views.set_len(output.views.len() + to_read);
+        }
+        if VALIDATE_UTF8 {
+            // validate the last part of the buffer

Review Comment:
   ```suggestion
               // validate values from the previously validated location up to 
(but not including)
               // the length of this string
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Improve parquet BinaryView / StringView decoder performance (up to -35%) [arrow-rs]

Reply via email to