This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 77ca6dcbcc fix: bug when struct nullability determined from `Dict<_,
ByteArray>>` column (#8573)
77ca6dcbcc is described below
commit 77ca6dcbccf787e94d644e04a4c4102b1740d1a7
Author: albertlockett <[email protected]>
AuthorDate: Tue Oct 14 11:26:29 2025 -0300
fix: bug when struct nullability determined from `Dict<_, ByteArray>`
column (#8573)
# Which issue does this PR close?
- Closes #8404
# Rationale for this change
A regression was reported in issue #8404 which was introduced in
https://github.com/apache/arrow-rs/pull/7585. This PR resolves the
issue.
# What changes are included in this PR?
The root cause of the issue was that the behaviour of
`ByteArrayDictionaryReader` is to return a new empty length array of
values if the record reader has already been consumed. The problem was
that the repetition and definition level buffers were not being advanced
in this early return case.
https://github.com/apache/arrow-rs/blob/521f219e308613811aeae11300bf7a7b0fb5ec29/parquet/src/arrow/array_reader/byte_array_dictionary.rs#L167-L183
The `StructArrayReader` reads the repetition and definition levels from
the first child to determine the nullability of the struct array. When
we returned the empty values buffer for the child, without advancing the
repetition and definition buffers, the `StructArrayReader` detected a length
mismatch between the empty child array and the non-empty nullability
bitmask, and this produced the error.
https://github.com/apache/arrow-rs/blob/521f219e308613811aeae11300bf7a7b0fb5ec29/parquet/src/arrow/array_reader/struct_array.rs#L137-L170
The fix is simple: always have `ByteArrayDictionaryReader` advance the
repetition and definition level buffers when `consume_batch` is
called.
# Are these changes tested?
Yes, a new unit test was added
`test_read_nullable_structs_with_binary_dict_as_first_child_column`,
which before the changes introduced in this PR would replicate the
issue.
# Are there any user-facing changes?
No
---------
Co-authored-by: Ed Seidl <[email protected]>
Co-authored-by: Ed Seidl <[email protected]>
---
.../arrow/array_reader/byte_array_dictionary.rs | 7 +--
parquet/src/arrow/arrow_reader/mod.rs | 59 +++++++++++++++++++++-
2 files changed, 62 insertions(+), 4 deletions(-)
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 17c1bd0171..4afe4264cb 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -165,6 +165,10 @@ where
}
fn consume_batch(&mut self) -> Result<ArrayRef> {
+ // advance the def & rep level buffers
+ self.def_levels_buffer = self.record_reader.consume_def_levels();
+ self.rep_levels_buffer = self.record_reader.consume_rep_levels();
+
if self.record_reader.num_values() == 0 {
// once the record_reader has been consumed, we've replaced its
values with the default
// variant of DictionaryBuffer (Offset). If `consume_batch` then
gets called again, we
@@ -175,9 +179,6 @@ where
let buffer = self.record_reader.consume_record_data();
let null_buffer = self.record_reader.consume_bitmap_buffer();
let array = buffer.into_array(null_buffer, &self.data_type)?;
-
- self.def_levels_buffer = self.record_reader.consume_def_levels();
- self.rep_levels_buffer = self.record_reader.consume_rep_levels();
self.record_reader.reset();
Ok(array)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 2506aa9110..3d86800fc2 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1160,7 +1160,7 @@ mod tests {
Time64MicrosecondType,
};
use arrow_array::*;
- use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, i256};
+ use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, NullBuffer,
i256};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field, Fields, Schema,
SchemaRef, TimeUnit,
@@ -2360,6 +2360,63 @@ mod tests {
assert_eq!(&batch, &read[0])
}
+ #[test]
+ fn test_read_nullable_structs_with_binary_dict_as_first_child_column() {
+ // the `StructArrayReader` will check the definition and repetition
levels of the first
+ // child column in the struct to determine nullability for the struct.
If the first
+ column is being read by `ByteArrayDictionaryReader` we need to
ensure that the
+ nullability is interpreted correctly from the rep/def level
buffers
+ managed by this array reader.
+
+ let struct_fields = Fields::from(vec![
+ Field::new(
+ "city",
+ ArrowDataType::Dictionary(
+ Box::new(ArrowDataType::UInt8),
+ Box::new(ArrowDataType::Utf8),
+ ),
+ true,
+ ),
+ Field::new("name", ArrowDataType::Utf8, true),
+ ]);
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "items",
+ ArrowDataType::Struct(struct_fields.clone()),
+ true,
+ )]));
+
+ let items_arr = StructArray::new(
+ struct_fields,
+ vec![
+ Arc::new(DictionaryArray::new(
+ UInt8Array::from_iter_values(vec![0, 1, 1, 0, 2]),
+ Arc::new(StringArray::from_iter_values(vec![
+ "quebec",
+ "fredericton",
+ "halifax",
+ ])),
+ )),
+ Arc::new(StringArray::from_iter_values(vec![
+ "albert", "terry", "lance", "", "tim",
+ ])),
+ ],
+ Some(NullBuffer::from_iter(vec![true, true, true, false, true])),
+ );
+
+ let batch = RecordBatch::try_new(schema,
vec![Arc::new(items_arr)]).unwrap();
+ let mut buffer = Vec::with_capacity(1024);
+ let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(),
None).unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 8)
+ .unwrap()
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+
+ assert_eq!(read.len(), 1);
+ assert_eq!(&batch, &read[0])
+ }
+
/// Parameters for single_column_reader_test
#[derive(Clone)]
struct TestOptions {