This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new e814b97415 fix: error reading multiple batches of `Dict(_, FixedSizeBinary(_))` (#7585)
e814b97415 is described below
commit e814b97415d2df78c273302dab934dfe2a0a4a64
Author: albertlockett <[email protected]>
AuthorDate: Tue Jun 3 12:04:53 2025 -0400
fix: error reading multiple batches of `Dict(_, FixedSizeBinary(_))` (#7585)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/7545
# Rationale for this change
See comment here for what I think is the root cause of the issue:
https://github.com/apache/arrow-rs/issues/7545#issuecomment-2927239790
# What changes are included in this PR?
# Are there any user-facing changes?
No breaking changes. Users may notice that when they collect batches
with arrow type `Dictionary(_, FixedSizeBinary(_))` while reading their
parquet file, there will no longer be an error.
---------
Co-authored-by: Ed Seidl <[email protected]>
---
.../arrow/array_reader/byte_array_dictionary.rs | 9 +++++-
parquet/src/arrow/arrow_reader/mod.rs | 35 ++++++++++++++++++++++
2 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 757d3df8a8..0f8a21478e 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -19,7 +19,7 @@ use std::any::Any;
use std::marker::PhantomData;
use std::sync::Arc;
-use arrow_array::{Array, ArrayRef, OffsetSizeTrait};
+use arrow_array::{new_empty_array, Array, ArrayRef, OffsetSizeTrait};
use arrow_buffer::ArrowNativeType;
use arrow_schema::DataType as ArrowType;
use bytes::Bytes;
@@ -165,6 +165,13 @@ where
}
fn consume_batch(&mut self) -> Result<ArrayRef> {
+ if self.record_reader.num_values() == 0 {
+ // once the record_reader has been consumed, we've replaced its values with the default
+ // variant of DictionaryBuffer (Offset). If `consume_batch` then gets called again, we
+ // avoid using the wrong variant of the buffer by returning empty array.
+ return Ok(new_empty_array(&self.data_type));
+ }
+
let buffer = self.record_reader.consume_record_data();
let null_buffer = self.record_reader.consume_bitmap_buffer();
let array = buffer.into_array(null_buffer, &self.data_type)?;
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 2e5809304c..9127423efe 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -2158,6 +2158,41 @@ mod tests {
);
}
+ #[test]
+ fn test_read_dict_fixed_size_binary() {
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "a",
+ ArrowDataType::Dictionary(
+ Box::new(ArrowDataType::UInt8),
+ Box::new(ArrowDataType::FixedSizeBinary(8)),
+ ),
+ true,
+ )]));
+ let keys = UInt8Array::from_iter_values(vec![0, 0, 1]);
+ let values = FixedSizeBinaryArray::try_from_iter(
+ vec![
+ (0u8..8u8).collect::<Vec<u8>>(),
+ (24u8..32u8).collect::<Vec<u8>>(),
+ ]
+ .into_iter(),
+ )
+ .unwrap();
+ let arr = UInt8DictionaryArray::new(keys, Arc::new(values));
+ let batch = RecordBatch::try_new(schema, vec![Arc::new(arr)]).unwrap();
+
+ let mut buffer = Vec::with_capacity(1024);
+ let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(), None).unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 3)
+ .unwrap()
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+
+ assert_eq!(read.len(), 1);
+ assert_eq!(&batch, &read[0])
+ }
+
/// Parameters for single_column_reader_test
#[derive(Clone)]
struct TestOptions {