alamb commented on code in PR #6021:
URL: https://github.com/apache/arrow-rs/pull/6021#discussion_r1669119163
##########
parquet/src/util/bit_util.rs:
##########
@@ -435,6 +435,10 @@ impl BitReader {
/// This function panics if
/// - `num_bits` is larger than the bit-capacity of `T`
///
+ // FIXME: soundness issue - this method can be used to write arbitrary
bytes to any
Review Comment:
can you provide an example where this leads to unsound behavior?
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -315,26 +315,30 @@ impl ByteViewArrayDecoderPlain {
let mut read = 0;
output.views.reserve(to_read);
while self.offset < self.buf.len() && read != to_read {
- if self.offset + 4 > self.buf.len() {
- return Err(ParquetError::EOF("eof decoding byte
array".into()));
- }
- let len_bytes: [u8; 4] = unsafe {
- buf.get_unchecked(self.offset..self.offset + 4)
- .try_into()
- .unwrap()
+ let len_bytes = buf.get(self.offset..self.offset + 4);
+ let len = match len_bytes {
+ None => {
+ return Err(ParquetError::EOF("eof decoding byte
array".into()));
+ }
+ Some(l) => u32::from_le_bytes(l.try_into().unwrap()),
};
- let len = u32::from_le_bytes(len_bytes);
let start_offset = self.offset + 4;
let end_offset = start_offset + len as usize;
- if end_offset > buf.len() {
+ if end_offset > buf.len() || end_offset < start_offset {
return Err(ParquetError::EOF("eof decoding byte
array".into()));
}
if self.validate_utf8 {
- check_valid_utf8(unsafe {
buf.get_unchecked(start_offset..end_offset) })?;
+ check_valid_utf8(
+ // SAFETY: bounds are checked above.
+ unsafe { buf.get_unchecked(start_offset..end_offset) },
+ )?;
}
+ // SAFETY: bounds are checked above, block_id comes from a call to
append_block, and
+ // we are dealing with byte arrays with no alignment requirements.
+ // FIXME: what about UTF8 arrays if validate_utf8 is false?
Review Comment:
validate_utf8 is false only for `ByteViewArray` (that can have arbitrary
bytes) -- for StringViewArray. `validate_utf8` is always true
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -315,26 +315,30 @@ impl ByteViewArrayDecoderPlain {
let mut read = 0;
output.views.reserve(to_read);
while self.offset < self.buf.len() && read != to_read {
- if self.offset + 4 > self.buf.len() {
- return Err(ParquetError::EOF("eof decoding byte
array".into()));
- }
- let len_bytes: [u8; 4] = unsafe {
- buf.get_unchecked(self.offset..self.offset + 4)
- .try_into()
- .unwrap()
+ let len_bytes = buf.get(self.offset..self.offset + 4);
+ let len = match len_bytes {
+ None => {
+ return Err(ParquetError::EOF("eof decoding byte
array".into()));
+ }
+ Some(l) => u32::from_le_bytes(l.try_into().unwrap()),
Review Comment:
Is this any more sound? it seems to me this is simply does the same check
via using slice
BTW I think you can write this kind of code more concisely like
```suggestion
let Some(len_bytes) = buf.get(self.offset..self.offset + 4) else
{
return Err(ParquetError::EOF("eof decoding byte
array".into()));
};
let len = u32::from_le_bytes(len_bytes.try_into().unwrap()),
```
##########
parquet/src/arrow/array_reader/byte_view_array.rs:
##########
@@ -315,26 +315,30 @@ impl ByteViewArrayDecoderPlain {
let mut read = 0;
output.views.reserve(to_read);
while self.offset < self.buf.len() && read != to_read {
- if self.offset + 4 > self.buf.len() {
- return Err(ParquetError::EOF("eof decoding byte
array".into()));
- }
- let len_bytes: [u8; 4] = unsafe {
- buf.get_unchecked(self.offset..self.offset + 4)
- .try_into()
- .unwrap()
+ let len_bytes = buf.get(self.offset..self.offset + 4);
+ let len = match len_bytes {
+ None => {
+ return Err(ParquetError::EOF("eof decoding byte
array".into()));
+ }
+ Some(l) => u32::from_le_bytes(l.try_into().unwrap()),
};
- let len = u32::from_le_bytes(len_bytes);
let start_offset = self.offset + 4;
let end_offset = start_offset + len as usize;
- if end_offset > buf.len() {
+ if end_offset > buf.len() || end_offset < start_offset {
Review Comment:
since offset is always increasing (it is formed by start offset + a `usize`
how could it ever be smaller than start offset?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]