This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new e814b97415 fix: error reading multiple batches of `Dict(_, FixedSizeBinary(_))` (#7585)
e814b97415 is described below
commit e814b97415d2df78c273302dab934dfe2a0a4a64
Author: albertlockett <[email protected]>
AuthorDate: Tue Jun 3 12:04:53 2025 -0400
fix: error reading multiple batches of `Dict(_, FixedSizeBinary(_))` (#7585)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/7545
# Rationale for this change
See comment here for what I think is the root cause of the issue:
https://github.com/apache/arrow-rs/issues/7545#issuecomment-2927239790
# What changes are included in this PR?
# Are there any user-facing changes?
No breaking changes. Users may notice that when they collect batches
with arrow type `Dictionary(_, FixedSizeBinary(_))` while reading their
parquet file, there will no longer be an error.
---------
Co-authored-by: Ed Seidl <[email protected]>
---
.../arrow/array_reader/byte_array_dictionary.rs | 9 +++++-
parquet/src/arrow/arrow_reader/mod.rs | 35 ++++++++++++++++++++++
2 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 757d3df8a8..0f8a21478e 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -19,7 +19,7 @@ use std::any::Any;
use std::marker::PhantomData;
use std::sync::Arc;
-use arrow_array::{Array, ArrayRef, OffsetSizeTrait};
+use arrow_array::{new_empty_array, Array, ArrayRef, OffsetSizeTrait};
use arrow_buffer::ArrowNativeType;
use arrow_schema::DataType as ArrowType;
use bytes::Bytes;
@@ -165,6 +165,13 @@ where
}
fn consume_batch(&mut self) -> Result<ArrayRef> {
+ if self.record_reader.num_values() == 0 {
+ // once the record_reader has been consumed, we've replaced its values with the default
+ // variant of DictionaryBuffer (Offset). If `consume_batch` then gets called again, we
+ // avoid using the wrong variant of the buffer by returning empty array.
+ return Ok(new_empty_array(&self.data_type));
+ }
+
let buffer = self.record_reader.consume_record_data();
let null_buffer = self.record_reader.consume_bitmap_buffer();
let array = buffer.into_array(null_buffer, &self.data_type)?;
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 2e5809304c..9127423efe 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -2158,6 +2158,41 @@ mod tests {
);
}
+ #[test]
+ fn test_read_dict_fixed_size_binary() {
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "a",
+ ArrowDataType::Dictionary(
+ Box::new(ArrowDataType::UInt8),
+ Box::new(ArrowDataType::FixedSizeBinary(8)),
+ ),
+ true,
+ )]));
+ let keys = UInt8Array::from_iter_values(vec![0, 0, 1]);
+ let values = FixedSizeBinaryArray::try_from_iter(
+ vec![
+ (0u8..8u8).collect::<Vec<u8>>(),
+ (24u8..32u8).collect::<Vec<u8>>(),
+ ]
+ .into_iter(),
+ )
+ .unwrap();
+ let arr = UInt8DictionaryArray::new(keys, Arc::new(values));
+ let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap();
+
+ let mut buffer = Vec::with_capacity(1024);
+ let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 3)
+ .unwrap()
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+
+ assert_eq!(read.len(), 1);
+ assert_eq!(&batch, &read[0])
+ }
+
/// Parameters for single_column_reader_test
#[derive(Clone)]
struct TestOptions {