scovich commented on code in PR #8602:
URL: https://github.com/apache/arrow-rs/pull/8602#discussion_r2430209368
##########
parquet/src/file/reader.rs:
##########
@@ -124,11 +124,26 @@ impl ChunkReader for Bytes {
fn get_read(&self, start: u64) -> Result<Self::T> {
let start = start as usize;
+ if start > self.len() {
+ return Err(eof_err!(
+ "Expected to read at offset {}, while file has length {}",
+ start,
Review Comment:
nit:
```suggestion
"Expected to read at offset {start}, while file has length
{}",
```
##########
parquet/src/encodings/decoding.rs:
##########
@@ -382,6 +382,12 @@ impl<T: DataType> Decoder<T> for DictDecoder<T> {
fn set_data(&mut self, data: Bytes, num_values: usize) -> Result<()> {
// First byte in `data` is bit width
let bit_width = data.as_ref()[0];
Review Comment:
How do we know `data.len() > 0`?
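For illustration, a guard along these lines would make that precondition explicit (a sketch only: the error text is invented, and `general_err!` is the same crate macro this PR already uses):
```rust
// Sketch: fail cleanly instead of panicking when the page data is empty.
// Indexing with `data.as_ref()[0]` panics on an empty buffer; `first()`
// makes the length check explicit.
let Some(&bit_width) = data.as_ref().first() else {
    return Err(general_err!("dictionary page is empty, expected bit width byte"));
};
```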
##########
parquet/src/schema/types.rs:
##########
@@ -1359,6 +1359,8 @@ fn schema_from_array_helper<'a>(
if !is_root_node {
builder = builder.with_repetition(rep);
}
+ } else if !is_root_node {
+ return Err(general_err!("Repetition level must be defined for
non-root types"));
}
Ok((next_index, Arc::new(builder.build().unwrap())))
Review Comment:
How do we know the `unwrap` is safe?
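If it isn't, propagating the error would both document and enforce that (a sketch, assuming `build()` returns the crate's `Result<Type>` like the other builder methods in this module):
```rust
// Sketch: surface a builder failure to the caller instead of panicking.
Ok((next_index, Arc::new(builder.build()?)))
```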
##########
parquet/src/schema/types.rs:
##########
@@ -1359,6 +1359,8 @@ fn schema_from_array_helper<'a>(
if !is_root_node {
builder = builder.with_repetition(rep);
}
+ } else if !is_root_node {
+ return Err(general_err!("Repetition level must be defined for
non-root types"));
Review Comment:
This logic seems a bit convoluted... would it make more sense to do
something like this?
```rust
// Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
// REPEATED for the root node.
//
// We only set repetition for group types that are not the top-level
// message type. According to parquet-format:
//   Root of the schema does not have a repetition_type.
//   All other types must have one.
if !is_root_node {
    let Some(rep) = repetition else {
        return Err(general_err!("Repetition level must be defined for non-root types"));
    };
    builder = builder.with_repetition(rep);
}
```
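That would also collapse the two `!is_root_node` checks into one branch, so the error path and the `with_repetition` call can't drift apart.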
##########
parquet/src/column/reader.rs:
##########
@@ -569,11 +569,15 @@ fn parse_v1_level(
match encoding {
Encoding::RLE => {
let i32_size = std::mem::size_of::<i32>();
- let data_size = read_num_bytes::<i32>(i32_size, buf.as_ref()) as usize;
- Ok((
- i32_size + data_size,
- buf.slice(i32_size..i32_size + data_size),
- ))
+ if i32_size <= buf.len() {
+ let data_size = read_num_bytes::<i32>(i32_size, buf.as_ref()) as usize;
+ let end = i32_size.checked_add(data_size).ok_or(general_err!("invalid level length"))?;
+ if end <= buf.len() {
+ return Ok((end, buf.slice(i32_size..end)));
+ }
+ }
+ Err(general_err!("not enough data to read levels"))
Review Comment:
This is _definitely_ an improvement over the existing code, but it raises a question:
Given that we're reading bytes from a byte buffer, it seems like we must
expect to hit this situation at least occasionally? And the correct response is
to fetch more bytes, not fail? Is there some mechanism for handling that higher
up in the call stack? Or is there some reason it should be impossible for this
code to run off the end of the buffer?
##########
parquet/src/encodings/rle.rs:
##########
@@ -513,7 +513,10 @@ impl RleDecoder {
self.rle_left = (indicator_value >> 1) as u32;
let value_width = bit_util::ceil(self.bit_width as usize, 8);
self.current_value = bit_reader.get_aligned::<u64>(value_width);
- assert!(self.current_value.is_some());
+ assert!(
+ self.current_value.is_some(),
+ "parquet_data_error: not enough data for RLE decoding"
+ );
Review Comment:
This is still a panic? Just with a better error message?
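For comparison, a non-panicking version could look roughly like this (a sketch only, assuming the enclosing function can return the crate's `Result`; the actual control flow may differ):
```rust
// Sketch: report truncated input as an error instead of asserting.
self.current_value = bit_reader.get_aligned::<u64>(value_width);
if self.current_value.is_none() {
    return Err(general_err!("not enough data for RLE decoding"));
}
```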
##########
parquet/src/column/reader.rs:
##########
@@ -569,11 +569,15 @@ fn parse_v1_level(
match encoding {
Encoding::RLE => {
let i32_size = std::mem::size_of::<i32>();
- let data_size = read_num_bytes::<i32>(i32_size, buf.as_ref()) as usize;
- Ok((
- i32_size + data_size,
- buf.slice(i32_size..i32_size + data_size),
- ))
+ if i32_size <= buf.len() {
+ let data_size = read_num_bytes::<i32>(i32_size, buf.as_ref()) as usize;
+ let end = i32_size.checked_add(data_size).ok_or(general_err!("invalid level length"))?;
+ if end <= buf.len() {
+ return Ok((end, buf.slice(i32_size..end)));
+ }
+ }
+ Err(general_err!("not enough data to read levels"))
Review Comment:
Also -- it seems like `read_num_bytes` should do the bounds checking internally
and return `Option<T>`, so a buffer overrun is obvious at the call site instead
of being a hidden panic footgun. The method has half a dozen other callers, and
they all need to do manual bounds checking, in various ways and with varying
degrees of safety. In particular, parquet/src/data_type.rs has two call sites
with no visible bounds checks at all.
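A checked variant might look roughly like this (a sketch, not the crate's current API; today's `read_num_bytes` trusts the caller to have validated the length):
```rust
/// Hypothetical checked counterpart to `read_num_bytes::<i32>`: returns
/// `None` instead of reading out of bounds when `src` is shorter than `size`.
fn try_read_num_bytes_i32(size: usize, src: &[u8]) -> Option<i32> {
    debug_assert!(size <= std::mem::size_of::<i32>());
    let bytes = src.get(..size)?; // the bounds check happens here
    let mut buf = [0u8; std::mem::size_of::<i32>()];
    buf[..size].copy_from_slice(bytes);
    Some(i32::from_le_bytes(buf))
}
```
`parse_v1_level` (and the data_type.rs call sites) would then have a visible `Option` to handle rather than an implicit panic.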
##########
parquet/src/file/reader.rs:
##########
@@ -124,11 +124,26 @@ impl ChunkReader for Bytes {
fn get_read(&self, start: u64) -> Result<Self::T> {
let start = start as usize;
+ if start > self.len() {
+ return Err(eof_err!(
+ "Expected to read at offset {}, while file has length {}",
+ start,
+ self.len()
+ ));
+ }
Ok(self.slice(start..).reader())
}
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
let start = start as usize;
+ if start > self.len() || start + length > self.len() {
+ return Err(eof_err!(
+ "Expected to read {} bytes at offset {}, while file has length
{}",
+ length,
+ start,
Review Comment:
nit:
```suggestion
"Expected to read {length} bytes at offset {start}, while
file has length {}",
```