Copilot commented on code in PR #9361:
URL: https://github.com/apache/arrow-rs/pull/9361#discussion_r2769569636
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
Review Comment:
This file is missing the Apache License header that is standard across all
files in this codebase. Please add the Apache Software Foundation license
header at the beginning of the file, following the same format as other test
files in this directory (see parquet/tests/arrow_reader/bad_data.rs or
checksum.rs for reference).
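For reference, the standard ASF header used at the top of Rust files in this repository is:
```rust
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
```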
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+const ROWS: usize = 500;
+const VALUE_SIZE: usize = 5_068_563; // ~5MB per row → triggers >2GB total
+
+fn make_large_binary_array() -> ArrayRef {
+    let value = vec![b'a'; VALUE_SIZE];
+    let values: Vec<Vec<u8>> = std::iter::repeat(value)
+        .take(ROWS)
+        .collect();
+
+    Arc::new(BinaryArray::from(values)) as ArrayRef
+}
+
+fn write_parquet_with_encoding(
+    array: ArrayRef,
+    encoding: Encoding,
+) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("col", DataType::Binary, false),
+    ]));
+
+    let batch =
+        RecordBatch::try_new(schema.clone(), vec![array])
+            .unwrap();
+
+    let file = tempfile().unwrap();
+
+    let props = WriterProperties::builder()
+        .set_dictionary_enabled(true)
+        .set_encoding(encoding)
+        .build();
+
Review Comment:
The `write_parquet_with_encoding` function sets both
`set_dictionary_enabled(true)` and `set_encoding(encoding)`. According to the
WriterProperties documentation, when dictionary is enabled and a specific
encoding is set, the specified encoding becomes the "fallback" encoding (see
parquet/src/file/properties.rs:1079-1081). This means these tests may not
actually be testing the specified encodings directly, but rather testing
dictionary encoding with these as fallbacks.
For more accurate testing of specific encodings, consider setting
`set_dictionary_enabled(false)` for non-dictionary encodings (PLAIN,
DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY). The dictionary test should only use
`set_dictionary_enabled(true)` without calling `set_encoding`.
```suggestion
    let builder = WriterProperties::builder();
    // For non-dictionary encodings, disable dictionary and set the encoding directly.
    // For dictionary encoding, enable dictionary and rely on the default dictionary behavior.
    let builder = match encoding {
        Encoding::RLE_DICTIONARY => builder.set_dictionary_enabled(true),
        _ => builder
            .set_dictionary_enabled(false)
            .set_encoding(encoding),
    };
    let props = builder.build();
```
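If it helps, one way to double-check what the writer actually produced is to read the column chunk metadata back after `writer.close()`. This is only a sketch (the helper name is made up, and it assumes the imports already present in this test file):
```rust
use parquet::file::reader::{FileReader, SerializedFileReader};

// Hypothetical helper: assert that the column chunk was written with the
// expected encoding by inspecting the file metadata.
fn assert_column_encoding(file: std::fs::File, expected: Encoding) {
    let reader = SerializedFileReader::new(file).unwrap();
    let encodings = reader.metadata().row_group(0).column(0).encodings();
    assert!(
        encodings.contains(&expected),
        "column written with {encodings:?}, expected {expected:?}"
    );
}
```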
##########
parquet/tests/arrow_reader/large_string_overflow.rs:
##########
@@ -0,0 +1,116 @@
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::arrow_writer::ArrowWriter;
+use parquet::basic::Encoding;
+use parquet::file::properties::WriterProperties;
+
+use tempfile::tempfile;
+
+const ROWS: usize = 500;
+const VALUE_SIZE: usize = 5_068_563; // ~5MB per row → triggers >2GB total
+
+fn make_large_binary_array() -> ArrayRef {
+    let value = vec![b'a'; VALUE_SIZE];
+    let values: Vec<Vec<u8>> = std::iter::repeat(value)
+        .take(ROWS)
+        .collect();
+
+    Arc::new(BinaryArray::from(values)) as ArrayRef
+}
+
+fn write_parquet_with_encoding(
+    array: ArrayRef,
+    encoding: Encoding,
+) -> std::fs::File {
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("col", DataType::Binary, false),
+    ]));
+
+    let batch =
+        RecordBatch::try_new(schema.clone(), vec![array])
+            .unwrap();
+
+    let file = tempfile().unwrap();
+
+    let props = WriterProperties::builder()
+        .set_dictionary_enabled(true)
+        .set_encoding(encoding)
+        .build();
+
+    let mut writer =
+        ArrowWriter::try_new(file.try_clone().unwrap(), schema, Some(props))
+            .unwrap();
+
+    writer.write(&batch).unwrap();
+    writer.close().unwrap();
+
+    file
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_plain_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file = write_parquet_with_encoding(array, Encoding::PLAIN);
+
+    let mut reader =
+        ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+    assert!(matches!(reader.next(), Some(Ok(_))));
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_delta_length_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file = write_parquet_with_encoding(
+        array,
+        Encoding::DELTA_LENGTH_BYTE_ARRAY,
+    );
+
+    let mut reader =
+        ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+    assert!(matches!(reader.next(), Some(Ok(_))));
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_delta_byte_array_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file =
+        write_parquet_with_encoding(array, Encoding::DELTA_BYTE_ARRAY);
+
+    let mut reader =
+        ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+
+    assert!(matches!(reader.next(), Some(Ok(_))));
+}
+
+#[test]
+#[ignore = "regression test for >2GB binary offset overflow"]
+fn large_binary_rle_dictionary_encoding_overflow() {
+    let array = make_large_binary_array();
+    let file =
+        write_parquet_with_encoding(array, Encoding::RLE_DICTIONARY);
Review Comment:
This test will panic when run. The WriterProperties builder at line 40 calls
`set_encoding(encoding)` with `Encoding::RLE_DICTIONARY`, but according to the
parquet properties implementation (parquet/src/file/properties.rs:1087-1089),
calling `set_encoding` with `RLE_DICTIONARY` or `PLAIN_DICTIONARY` will panic
with "Dictionary encoding can not be used as fallback encoding".
To enable dictionary encoding, you should only use
`set_dictionary_enabled(true)` without calling
`set_encoding(Encoding::RLE_DICTIONARY)`. The dictionary encoding will be used
automatically when dictionary is enabled.
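A minimal sketch of the dictionary case described above (shown only for illustration):
```rust
// For the dictionary test, enable dictionary encoding and let the writer
// choose RLE_DICTIONARY itself; do not pass it to set_encoding.
let props = WriterProperties::builder()
    .set_dictionary_enabled(true)
    .build();
```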
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]