Copilot commented on code in PR #9361:
URL: https://github.com/apache/arrow-rs/pull/9361#discussion_r2789117140
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::sync::Arc;
+
+use arrow_array::builder::BinaryBuilder;
+use arrow_array::{ArrayRef, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+/// Number of rows written
+const ROWS: usize = 1024;
+
+/// Size of each binary value (~3MB)
+/// 1024 * 3MB ≈ 3GB total → guaranteed offset overflow for i32
+const VALUE_SIZE: usize = 3 * 1024 * 1024;
+
+fn make_large_binary_array() -> ArrayRef {
+    let mut builder = BinaryBuilder::new();
+
+    for _ in 0..ROWS {
+        let data = vec![b'a'; VALUE_SIZE];
+        builder.append_value(&data);
+    }
+
+    Arc::new(builder.finish()) as ArrayRef
+}
+
+fn write_parquet_with_encoding(array: ArrayRef, encoding: Encoding) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "col",
+        DataType::Binary,
+        false,
+    )]));
+
+    let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+
+    let file = tempfile().unwrap();
+
+    let builder = WriterProperties::builder();
+    let builder = match encoding {
+        Encoding::RLE_DICTIONARY => builder.set_dictionary_enabled(true),
+        _ => builder.set_dictionary_enabled(false).set_encoding(encoding),
+    };
+
+    let props = builder.build();
+
+    let mut writer = ArrowWriter::try_new(file.try_clone().unwrap(), schema, Some(props)).unwrap();
+
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+
+    file
+}
+
+#[test]
+// Panics until https://github.com/apache/arrow-rs/issues/7973 is fixed
+#[should_panic(expected = "byte array offset overflow")]
+fn large_binary_plain_encoding_overflow() {

Review Comment:
   These tests allocate/write ~3GB of binary data and currently lack `#[ignore]`, which will likely cause OOM/timeouts in CI. This also contradicts the PR description, which states that the tests are ignored. Consider marking these tests `#[ignore]` (or gating them behind an env flag), and making the assertions reflect the intended post-fix behavior (i.e., successful `RecordBatch` decoding) rather than using `#[should_panic]`.
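A minimal sketch of what this suggestion could look like, reusing the `make_large_binary_array` / `write_parquet_with_encoding` helpers and the `ROWS` constant from the diff above. The test name and the `#[ignore]` reason string are made up here, and the success assertion assumes the fix for issue #7973 has landed; this is not the PR author's code:

```rust
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::basic::Encoding;

#[test]
#[ignore = "allocates ~3GB of binary data; assumes arrow-rs#7973 is fixed"]
fn large_binary_plain_encoding_roundtrip() {
    // Helpers and `ROWS` come from the test file shown in the diff above.
    let array = make_large_binary_array();
    let file = write_parquet_with_encoding(array, Encoding::PLAIN);

    let reader = ParquetRecordBatchReaderBuilder::try_new(file)
        .unwrap()
        .build()
        .unwrap();

    // Post-fix expectation: every batch decodes successfully and all rows come
    // back, even though the total value bytes exceed i32::MAX.
    let mut total_rows = 0;
    for batch in reader {
        let batch = batch.expect("batch should decode without offset overflow");
        total_rows += batch.num_rows();
    }
    assert_eq!(total_rows, ROWS);
}
```

Run manually with `cargo test -- --ignored` so the ~3GB allocation never happens in a default CI run.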
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,134 @@
+#[test]
+// Panics until https://github.com/apache/arrow-rs/issues/7973 is fixed
+#[should_panic(expected = "byte array offset overflow")]
+fn large_binary_plain_encoding_overflow() {

Review Comment:
   `#[should_panic(expected = "byte array offset overflow")]` doesn't appear to match the actual overflow error text produced by the Parquet arrow reader (e.g. `"index overflow decoding byte array"` from `parquet/src/arrow/buffer/offset_buffer.rs`). As written, these tests may fail even when the overflow occurs. If you keep a panic-based test, align the expected substring with the real message, or (preferably) assert on the returned `Err` explicitly instead of relying on `unwrap()` panics.
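If a test pinning the current (pre-fix) behavior is still wanted, the error-based assertion this comment prefers might look like the sketch below. It again reuses the helpers from the diff, the test name is hypothetical, and the expected substring `"index overflow decoding byte array"` is the one quoted in the comment above; it should be re-checked against the reader's actual error message before relying on it:

```rust
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::basic::Encoding;

#[test]
#[ignore = "allocates ~3GB of binary data"]
fn large_binary_plain_encoding_reports_overflow() {
    let array = make_large_binary_array();
    let file = write_parquet_with_encoding(array, Encoding::PLAIN);

    let mut reader = ParquetRecordBatchReaderBuilder::try_new(file)
        .unwrap()
        .build()
        .unwrap();

    // Surface the first decoding error as a value instead of unwrapping, so the
    // assertion is on the returned `Err` rather than on a panic message.
    let err = reader
        .find_map(|batch| batch.err())
        .expect("decoding ~3GB of values into i32 offsets should report an overflow");

    assert!(
        err.to_string().contains("index overflow decoding byte array"),
        "unexpected error message: {err}"
    );
}
```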
##########
parquet/tests/arrow_reader/mod.rs:
##########
@@ -44,6 +44,7 @@ mod bloom_filter;
 mod checksum;
 mod int96_stats_roundtrip;
 mod io;
+mod large_string_overflow;

Review Comment:
   The test module/file name (`large_string_overflow`) is misleading: this code is exercising `DataType::Binary` / `BinaryBuilder`, not strings. Renaming to something like `large_binary_overflow` (and updating the `mod` declaration) would make the purpose clearer and align with the PR title.

   ```suggestion
   mod large_binary_overflow;
   ```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]