alamb commented on code in PR #9236:
URL: https://github.com/apache/arrow-rs/pull/9236#discussion_r2717750787
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -373,32 +394,39 @@ impl ByteViewArrayDecoderPlain {
// The implementation keeps a water mark
`utf8_validation_begin` to track the beginning of the buffer that is not
validated.
// If the length is smaller than 128, then we continue to next
string.
// If the length is larger than 128, then we validate the
buffer before the length bytes, and move the water mark to the beginning of
next string.
- if len < 128 {
- // fast path, move to next string.
- // the len bytes are valid utf8.
- } else {
+ if len >= 128 {
// unfortunately, the len bytes may not be valid utf8, we
need to wrap up and validate everything before it.
check_valid_utf8(unsafe {
- buf.get_unchecked(utf8_validation_begin..self.offset)
+ buf.get_unchecked(utf8_validation_begin..end_offset)
})?;
// move the cursor to skip the len bytes.
utf8_validation_begin = start_offset;
}
}
+ let view = make_view(
+ unsafe { buf.get_unchecked(start_offset..end_offset) },
+ block_id,
+ start_offset as u32,
+ );
+ // Safety: views_ptr is valid for writes, and we have reserved
enough space.
unsafe {
- output.append_view_unchecked(block_id, start_offset as u32,
len);
+ views_ptr.add(i).write(view);
}
- self.offset = end_offset;
- read += 1;
}
- // validate the last part of the buffer
- if self.validate_utf8 {
- check_valid_utf8(unsafe {
buf.get_unchecked(utf8_validation_begin..self.offset) })?;
+ // Safety: we have written `to_read` views to `views_ptr`
+ unsafe {
+ output.views.set_len(output.views.len() + to_read);
+ }
+ if VALIDATE_UTF8 {
+ // validate the last part of the buffer
+ check_valid_utf8(unsafe {
buf.get_unchecked(utf8_validation_begin..end_offset) })?;
Review Comment:
Codex flagged this as an error because it includes the length as well. Here
is a test that passes on main but fails on this PR:
- https://github.com/apache/arrow-rs/pull/9246
I stared at it a while and I think the issue is that the end_offset is the
end of the current string, when what needs to be validated is actually up to
immediately before the current length
Here is a proposed fix (from codex that I carefully reviewed) -- basically
it stores the offset to len as well
```diff
• Edited parquet/src/arrow/array_reader/byte_view_array.rs (+4 -3)
355 for i in 0..to_read {
356 - let start_offset = end_offset + 4;
356 + let len_offset = end_offset;
357 + let start_offset = len_offset + 4;
358
⋮
364 let len = u32::from_le_bytes(
364 - unsafe {
buf.get_unchecked(end_offset..start_offset) }
365 + unsafe {
buf.get_unchecked(len_offset..start_offset) }
366 .try_into()
⋮
400 check_valid_utf8(unsafe {
400 -
buf.get_unchecked(utf8_validation_begin..end_offset)
401 +
buf.get_unchecked(utf8_validation_begin..len_offset)
402 })?;
```
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -330,28 +342,37 @@ impl ByteViewArrayDecoderPlain {
let to_read = len.min(self.max_remaining_values);
let buf: &[u8] = self.buf.as_ref();
- let mut read = 0;
+ let buf_len = buf.len();
+ let mut end_offset = self.offset;
+ let mut utf8_validation_begin = end_offset;
+
output.views.reserve(to_read);
- let mut utf8_validation_begin = self.offset;
- while self.offset < self.buf.len() && read != to_read {
- if self.offset + 4 > self.buf.len() {
+ // Safety: we reserved enough space in output.views
+ // and we will only write up to to_read views / track how many views
we wrote.
+ // Ideally, we would use `Vec::extend` here, but this generates
sub-optimal code.
+ let views_ptr =
output.views.as_mut_ptr().wrapping_add(output.views.len());
+ for i in 0..to_read {
+ let start_offset = end_offset + 4;
+
+ if start_offset > buf_len {
return Err(ParquetError::EOF("eof decoding byte
array".into()));
}
- let len_bytes: [u8; 4] = unsafe {
- buf.get_unchecked(self.offset..self.offset + 4)
+
+ // Safety: we have checked that start_offset <= buf_len
+ let len = u32::from_le_bytes(
+ unsafe { buf.get_unchecked(end_offset..start_offset) }
Review Comment:
this took me a moment to grok as end_offset is the end of the previous
string value (but then gets updated below
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -373,32 +394,39 @@ impl ByteViewArrayDecoderPlain {
// The implementation keeps a water mark
`utf8_validation_begin` to track the beginning of the buffer that is not
validated.
// If the length is smaller than 128, then we continue to next
string.
// If the length is larger than 128, then we validate the
buffer before the length bytes, and move the water mark to the beginning of
next string.
- if len < 128 {
- // fast path, move to next string.
- // the len bytes are valid utf8.
- } else {
+ if len >= 128 {
// unfortunately, the len bytes may not be valid utf8, we
need to wrap up and validate everything before it.
check_valid_utf8(unsafe {
- buf.get_unchecked(utf8_validation_begin..self.offset)
+ buf.get_unchecked(utf8_validation_begin..end_offset)
})?;
// move the cursor to skip the len bytes.
utf8_validation_begin = start_offset;
}
}
+ let view = make_view(
+ unsafe { buf.get_unchecked(start_offset..end_offset) },
+ block_id,
+ start_offset as u32,
+ );
+ // Safety: views_ptr is valid for writes, and we have reserved
enough space.
unsafe {
- output.append_view_unchecked(block_id, start_offset as u32,
len);
+ views_ptr.add(i).write(view);
}
- self.offset = end_offset;
- read += 1;
}
- // validate the last part of the buffer
- if self.validate_utf8 {
- check_valid_utf8(unsafe {
buf.get_unchecked(utf8_validation_begin..self.offset) })?;
+ // Safety: we have written `to_read` views to `views_ptr`
+ unsafe {
+ output.views.set_len(output.views.len() + to_read);
+ }
+ if VALIDATE_UTF8 {
+ // validate the last part of the buffer
Review Comment:
```suggestion
// validate values from the previously validated location up to
(but not including)
// the length of this string
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]