alamb commented on code in PR #9361:
URL: https://github.com/apache/arrow-rs/pull/9361#discussion_r2779183358


##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+const ROWS: usize = 500;
+const VALUE_SIZE: usize = 5_068_563; // ~5MB per row → triggers >2GB total
+
+fn make_large_binary_array() -> ArrayRef {
+    let value = vec![b'a'; VALUE_SIZE];
+    let values: Vec<Vec<u8>> = std::iter::repeat(value)
+        .take(ROWS)
+        .collect();
+
+    Arc::new(BinaryArray::from(values)) as ArrayRef
+}
+
+fn write_parquet_with_encoding(
+    array: ArrayRef,
+    encoding: Encoding,
+) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("col", DataType::Binary, false),
+    ]));
+
+    let batch =
+        RecordBatch::try_new(schema.clone(), vec![array])
+            .unwrap();
+
+    let file = tempfile().unwrap();
+
+    let props = WriterProperties::builder()
+        .set_dictionary_enabled(true)
+        .set_encoding(encoding)
+        .build();
+

Review Comment:
   I think this is an important comment to resolve



##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+const ROWS: usize = 500;
+const VALUE_SIZE: usize = 5_068_563; // ~5MB per row → triggers >2GB total
+
+fn make_large_binary_array() -> ArrayRef {
+    let value = vec![b'a'; VALUE_SIZE];
+    let values: Vec<Vec<u8>> = std::iter::repeat(value).take(ROWS).collect();
+
+    Arc::new(BinaryArray::from(values)) as ArrayRef
+}
+
+fn write_parquet_with_encoding(array: ArrayRef, encoding: Encoding) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "col",
+        DataType::Binary,
+        false,
+    )]));
+
+    let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+
+    let file = tempfile().unwrap();
+
+    let props = WriterProperties::builder()
+        .set_dictionary_enabled(true)
+        .set_encoding(encoding)
+        .build();
+
+    let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), schema, Some(props)).unwrap();
+
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+
+    file
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]

Review Comment:
   Rather than ignoring the tests, can you please:
   1. Add an assertion that it should panic?
   2. Add a comment with a link to the ticket?
   
   That way we are sure the test properly covers what you want
   
   For example
   
   ```rust
   #[test]
   // Panics until https://github.com/apache/arrow-rs/issues/7973 is fixed
   #[should_panic(expected = "byte array offset overflow")]
   fn large_binary_plain_encoding_overflow() {
   ```
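   
   For reference: 500 rows × 5,068,563 bytes is roughly 2.5 GB, which exceeds the `i32` offset range of `BinaryArray`, so reading the file back is expected to panic today. A fuller sketch of the test with the assertion applied (reusing the expected message from the example above, which should be double-checked against the actual panic text) could be:
   
   ```rust
   #[test]
   // Panics until https://github.com/apache/arrow-rs/issues/7973 is fixed
   #[should_panic(expected = "byte array offset overflow")]
   fn large_binary_plain_encoding_overflow() {
       let array = make_large_binary_array();
       let file = write_parquet_with_encoding(array, Encoding::PLAIN);
   
       let reader = ParquetRecordBatchReaderBuilder::try_new(file)
           .unwrap()
           .build()
           .unwrap();
   
       // Draining the reader materializes the >2GB of binary values,
       // which is where the offset overflow panic is expected to occur.
       for batch in reader {
           batch.unwrap();
       }
   }
   ```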



##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one

Review Comment:
   In order for this test to actually be run, I think you need to add it to `mod.rs`
   
   As it is, the file is ignored and the tests are not run
   
   ```diff
   index ffc36655b3..33fc59d494 100644
   --- a/parquet/tests/arrow_reader/mod.rs
   +++ b/parquet/tests/arrow_reader/mod.rs
   @@ -48,6 +48,7 @@ mod io;
    mod predicate_cache;
    mod row_filter;
    mod statistics;
   +mod large_string_overflow;
   
    // returns a struct array with columns "int32_col", "float32_col" and "float64_col" with the specified values
    fn struct_array(input: Vec<(Option<i32>, Option<f32>, Option<f64>)>) -> ArrayRef {
   ```
   
   When I add that, this file doesn't compile
   
   I had to change how the array is built to:
   ```rust
   fn make_large_binary_array() -> ArrayRef {
       let value = vec![b'a'; VALUE_SIZE];
       let mut builder = BinaryBuilder::new();
       for _ in 0..ROWS {
           builder.append_value(&value);
       }
   
       Arc::new(builder.finish())
   }
   ```
   
   And then it seems to work well
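   
   One more small thing: the builder version needs `BinaryBuilder` in scope. Something along these lines should work (the exact import path is my assumption, please double check against the other tests in this crate):
   
   ```rust
   // Assumed import path for the builder-based construction
   use arrow_array::builder::BinaryBuilder;
   ```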



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
