This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d3cad6e0a4 [Parquet] Do not panic when trying to skip records in delta
encoded files using non-standard block sizes (#9794)
d3cad6e0a4 is described below
commit d3cad6e0a402aeddbdeb67e9f276d9d697b69860
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 1 15:03:20 2026 -0700
[Parquet] Do not panic when trying to skip records in delta encoded files
using non-standard block sizes (#9794)
# Which issue does this PR close?
- Closes #9793.
# Rationale for this change
`DeltaBitPackDecoder::skip` uses some magic numbers when sizing the
buffer used for skipping. Files that use non-standard miniblock sizes
will cause `skip` to panic.
# What changes are included in this PR?
Check for non-standard miniblock sizes and return an error rather than a
panic.
Note: allocating a vec sized with `values_per_mini_block` resulted in a
significant performance regression.
# Are these changes tested?
Yes, test with file with non-standard sizes is added to the arrow_reader
tests.
# Are there any user-facing changes?
No.
---
parquet/src/encodings/decoding.rs | 22 +++++++++++++++++----
parquet/tests/arrow_reader/bad_data.rs | 29 +++++++++++++++++++++++++++-
parquet/tests/arrow_reader/bigdelta.parquet | Bin 0 -> 3660 bytes
3 files changed, 46 insertions(+), 5 deletions(-)
diff --git a/parquet/src/encodings/decoding.rs
b/parquet/src/encodings/decoding.rs
index da07d23eb6..0fb9604122 100644
--- a/parquet/src/encodings/decoding.rs
+++ b/parquet/src/encodings/decoding.rs
@@ -847,10 +847,21 @@ where
self.values_left -= 1;
}
- let mini_block_batch_size = match T::T::PHYSICAL_TYPE {
- Type::INT32 => 32,
- Type::INT64 => 64,
- _ => unreachable!(),
+ // See https://github.com/apache/arrow-rs/pull/9794.
+ // The parquet spec actually allows for miniblock sizes other than 32
or 64, but
+ // no current writers use anything else. Using values_per_mini_block
directly
+ // for the skip_buffer doesn't allow stack allocation and leads to a
significant
+ // drop in performance. We'll settle for erroring out here and come up
with a
+ // better fix if writers ever start getting creative with block sizes.
+ let mini_block_batch_size = match self.values_per_mini_block {
+ 32 => 32,
+ 64 => 64,
+ _ => {
+ return Err(general_err!(
+ "cannot skip miniblock of size {}",
+ self.values_per_mini_block
+ ));
+ }
};
let mut skip_buffer = vec![T::T::default(); mini_block_batch_size];
@@ -863,10 +874,13 @@ where
self.check_bit_width(bit_width)?;
let mini_block_to_skip = self.mini_block_remaining.min(to_skip -
skip);
+ // see commentary in self.get() above regarding optimizations
let min_delta = self.min_delta.as_i64()?;
if bit_width == 0 {
// All remainders are zero: every delta equals min_delta
exactly.
// Advance last_value by n * min_delta with no bit reads.
+ // When min_delta == 0 there is nothing to do: last_value is
+ // unchanged and no bytes are consumed from the bit reader.
if min_delta != 0 {
let total = min_delta.wrapping_mul(mini_block_to_skip as
i64);
let step = T::T::from_i64(total)
diff --git a/parquet/tests/arrow_reader/bad_data.rs
b/parquet/tests/arrow_reader/bad_data.rs
index 54c92976e4..d9b0d89e2c 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -18,6 +18,7 @@
//! Tests that reading invalid parquet files returns an error
use arrow::util::test_util::parquet_test_data;
+use bytes::Bytes;
use parquet::arrow::arrow_reader::ArrowReaderBuilder;
use parquet::errors::ParquetError;
use std::collections::HashSet;
@@ -147,11 +148,37 @@ fn read_file(name: &str) -> Result<usize, ParquetError> {
Ok(num_rows)
}
+#[test]
+fn non_standard_delta_blocks() {
+ let file = Bytes::from_static(include_bytes!("bigdelta.parquet"));
+ use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+
+ let selectors = vec![RowSelector::skip(1000), RowSelector::select(5)];
+
+ let selection: RowSelection = selectors.into();
+ let reader = ArrowReaderBuilder::try_new(file)
+ .unwrap()
+ .with_row_selection(selection)
+ .build()
+ .unwrap();
+
+ if let Some(maybe_batch) = reader.into_iter().next() {
+ // TODO: uncomment if we ever allow skipping miniblocks > 64 elements
+ //let batch = maybe_batch.expect("skip should succeed");
+ //assert_eq!(batch.num_rows(), 5);
+ assert!(
+ maybe_batch
+ .unwrap_err()
+ .to_string()
+ .contains("cannot skip miniblock of size 128")
+ );
+ }
+}
+
#[cfg(feature = "async")]
#[tokio::test]
#[allow(deprecated)]
async fn bad_metadata_err() {
- use bytes::Bytes;
use parquet::file::metadata::ParquetMetaDataReader;
let metadata_buffer =
Bytes::from_static(include_bytes!("bad_raw_metadata.bin"));
diff --git a/parquet/tests/arrow_reader/bigdelta.parquet
b/parquet/tests/arrow_reader/bigdelta.parquet
new file mode 100644
index 0000000000..0fb270af8f
Binary files /dev/null and b/parquet/tests/arrow_reader/bigdelta.parquet differ