Copilot commented on code in PR #9361:
URL: https://github.com/apache/arrow-rs/pull/9361#discussion_r2769569636
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
Review Comment:
This file is missing the Apache License header that is standard across all
files in this codebase. Please add the Apache Software Foundation license
header at the beginning of the file, following the same format as other test
files in this directory (see parquet/tests/arrow_reader/bad_data.rs or
checksum.rs for reference).
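For reference, the standard ASF header used at the top of Rust files in this repository is:
```rust
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
```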
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+const ROWS: usize = 500;
+const VALUE_SIZE: usize = 5_068_563; // ~5MB per row → triggers >2GB total
+
+fn make_large_binary_array() -> ArrayRef {
+    let value = vec![b'a'; VALUE_SIZE];
+    let values: Vec<Vec<u8>> = std::iter::repeat(value)
+        .take(ROWS)
+        .collect();
+
+    Arc::new(BinaryArray::from(values)) as ArrayRef
+}
+
+fn write_parquet_with_encoding(
+    array: ArrayRef,
+    encoding: Encoding,
+) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("col", DataType::Binary, false),
+    ]));
+
+    let batch =
+        RecordBatch::try_new(schema.clone(), vec![array])
+            .unwrap();
+
+    let file = tempfile().unwrap();
+
+    let props = WriterProperties::builder()
+        .set_dictionary_enabled(true)
+        .set_encoding(encoding)
+        .build();
+
Review Comment:
The `write_parquet_with_encoding` function sets both
`set_dictionary_enabled(true)` and `set_encoding(encoding)`. According to the
WriterProperties documentation, when dictionary is enabled and a specific
encoding is set, the specified encoding becomes the "fallback" encoding (see
parquet/src/file/properties.rs:1079-1081). This means these tests may not
actually be testing the specified encodings directly, but rather testing
dictionary encoding with these as fallbacks.
For more accurate testing of specific encodings, consider setting
`set_dictionary_enabled(false)` for non-dictionary encodings (PLAIN,
DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY). The dictionary test should only use
`set_dictionary_enabled(true)` without calling `set_encoding`.
```suggestion
    let builder = WriterProperties::builder();
    // For non-dictionary encodings, disable dictionary and set the encoding directly.
    // For dictionary encoding, enable dictionary and rely on the default dictionary behavior.
    let builder = match encoding {
        Encoding::RLE_DICTIONARY => builder.set_dictionary_enabled(true),
        _ => builder
            .set_dictionary_enabled(false)
            .set_encoding(encoding),
    };
    let props = builder.build();
```
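If it helps, one way to double-check what the writer actually produced is to read the column chunk metadata back after `writer.close()`. This is only a sketch (the helper name is made up, and it assumes the imports already present in this test file):
```rust
use parquet::file::reader::{FileReader, SerializedFileReader};

// Hypothetical helper: assert that the column chunk was written with the
// expected encoding by inspecting the file metadata.
fn assert_column_encoding(file: std::fs::File, expected: Encoding) {
    let reader = SerializedFileReader::new(file).unwrap();
    let encodings = reader.metadata().row_group(0).column(0).encodings();
    assert!(
        encodings.contains(&expected),
        "column written with {encodings:?}, expected {expected:?}"
    );
}
```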
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+const ROWS: usize = 500;
+const VALUE_SIZE: usize = 5_068_563; // ~5MB per row → triggers >2GB total
+
+fn make_large_binary_array() -> ArrayRef {
+    let value = vec![b'a'; VALUE_SIZE];
+    let values: Vec<Vec<u8>> = std::iter::repeat(value)
+        .take(ROWS)
+        .collect();
+
+    Arc::new(BinaryArray::from(values)) as ArrayRef
+}
+
+fn write_parquet_with_encoding(
+    array: ArrayRef,
+    encoding: Encoding,
+) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("col", DataType::Binary, false),
+    ]));
+
+    let batch =
+        RecordBatch::try_new(schema.clone(), vec![array])
+            .unwrap();
+
+    let file = tempfile().unwrap();
+
+    let props = WriterProperties::builder()
+        .set_dictionary_enabled(true)
+        .set_encoding(encoding)
+        .build();
+
+    let mut writer =
+        ArrowWriter::try_new(file.try_clone().unwrap(), schema, Some(props))
+            .unwrap();
+
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+
+    file
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_plain_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file = write_parquet_with_encoding(array, Encoding::PLAIN);
+
+    let mut reader =
+        ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+    assert!(matches!(reader.next(), Some(Ok(_))));
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_delta_length_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file = write_parquet_with_encoding(
+        array,
+        Encoding::DELTA_LENGTH_BYTE_ARRAY,
+    );
+
+    let mut reader =
+        ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+    assert!(matches!(reader.next(), Some(Ok(_))));
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_delta_byte_array_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file =
+        write_parquet_with_encoding(array, Encoding::DELTA_BYTE_ARRAY);
+
+    let mut reader =
+        ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+    assert!(matches!(reader.next(), Some(Ok(_))));
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_rle_dictionary_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file =
+        write_parquet_with_encoding(array, Encoding::RLE_DICTIONARY);
Review Comment:
This test will panic when run. The WriterProperties builder at line 40 calls
`set_encoding(encoding)` with `Encoding::RLE_DICTIONARY`, but according to the
parquet properties implementation (parquet/src/file/properties.rs:1087-1089),
calling `set_encoding` with `RLE_DICTIONARY` or `PLAIN_DICTIONARY` will panic
with "Dictionary encoding can not be used as fallback encoding".
To enable dictionary encoding, you should only use
`set_dictionary_enabled(true)` without calling
`set_encoding(Encoding::RLE_DICTIONARY)`. The dictionary encoding will be used
automatically when dictionary is enabled.
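A minimal sketch of the dictionary case described above (shown only for illustration):
```rust
// For the dictionary test, enable dictionary encoding and let the writer
// choose RLE_DICTIONARY itself; do not pass it to set_encoding.
let props = WriterProperties::builder()
    .set_dictionary_enabled(true)
    .build();
```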
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]