This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 77ca6dcbcc fix: bug when struct nullability determined from `Dict<_,
ByteArray>>` column (#8573)
77ca6dcbcc is described below
commit 77ca6dcbccf787e94d644e04a4c4102b1740d1a7
Author: albertlockett <[email protected]>
AuthorDate: Tue Oct 14 11:26:29 2025 -0300
fix: bug when struct nullability determined from `Dict<_, ByteArray>`
column (#8573)
# Which issue does this PR close?
- Closes #8404
# Rationale for this change
A regression was reported in issue #8404 which was introduced in
https://github.com/apache/arrow-rs/pull/7585. This PR resolves the
issue.
# What changes are included in this PR?
The root cause of the issue was that the behaviour of
`ByteArrayDictionaryReader` is to return a new empty length array of
values if the record reader has already been consumed. The problem was
that the repetition and definition level buffers were not being advanced
in this early return case.
https://github.com/apache/arrow-rs/blob/521f219e308613811aeae11300bf7a7b0fb5ec29/parquet/src/arrow/array_reader/byte_array_dictionary.rs#L167-L183
The `StructArrayReader` reads the repetition and definition levels from
the first child to determine the nullability of the struct array. When
we returned the empty values buffer for the child, without advancing the
repetition and definition buffers, the `StructArrayReader` detected a length
mismatch between the empty child array and the non-empty nullability
bitmask, and this produced the error.
https://github.com/apache/arrow-rs/blob/521f219e308613811aeae11300bf7a7b0fb5ec29/parquet/src/arrow/array_reader/struct_array.rs#L137-L170
The fix is simple: always have `ByteArrayDictionaryReader` advance the
repetition and definition level buffers when `consume_batch` is
called.
# Are these changes tested?
Yes, a new unit test was added
`test_read_nullable_structs_with_binary_dict_as_first_child_column`,
which before the changes introduced in this PR would replicate the
issue.
# Are there any user-facing changes?
No
---------
Co-authored-by: Ed Seidl <[email protected]>
Co-authored-by: Ed Seidl <[email protected]>
---
.../arrow/array_reader/byte_array_dictionary.rs | 7 +--
parquet/src/arrow/arrow_reader/mod.rs | 59 +++++++++++++++++++++-
2 files changed, 62 insertions(+), 4 deletions(-)
diff --git a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
index 17c1bd0171..4afe4264cb 100644
--- a/parquet/src/arrow/array_reader/byte_array_dictionary.rs
+++ b/parquet/src/arrow/array_reader/byte_array_dictionary.rs
@@ -165,6 +165,10 @@ where
}
fn consume_batch(&mut self) -> Result<ArrayRef> {
+ // advance the def & rep level buffers
+ self.def_levels_buffer = self.record_reader.consume_def_levels();
+ self.rep_levels_buffer = self.record_reader.consume_rep_levels();
+
if self.record_reader.num_values() == 0 {
// once the record_reader has been consumed, we've replaced its
values with the default
// variant of DictionaryBuffer (Offset). If `consume_batch` then
gets called again, we
@@ -175,9 +179,6 @@ where
let buffer = self.record_reader.consume_record_data();
let null_buffer = self.record_reader.consume_bitmap_buffer();
let array = buffer.into_array(null_buffer, &self.data_type)?;
-
- self.def_levels_buffer = self.record_reader.consume_def_levels();
- self.rep_levels_buffer = self.record_reader.consume_rep_levels();
self.record_reader.reset();
Ok(array)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 2506aa9110..3d86800fc2 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1160,7 +1160,7 @@ mod tests {
Time64MicrosecondType,
};
use arrow_array::*;
- use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, i256};
+ use arrow_buffer::{ArrowNativeType, Buffer, IntervalDayTime, NullBuffer,
i256};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{
ArrowError, DataType as ArrowDataType, Field, Fields, Schema,
SchemaRef, TimeUnit,
@@ -2360,6 +2360,63 @@ mod tests {
assert_eq!(&batch, &read[0])
}
+ #[test]
+ fn test_read_nullable_structs_with_binary_dict_as_first_child_column() {
+ // the `StructArrayReader` will check the definition and repetition
levels of the first
+ // child column in the struct to determine nullability for the struct.
If the first
+ column is being read by `ByteArrayDictionaryReader` we need to
ensure that the
+ nullability is interpreted correctly from the rep/def level
buffers
+ managed by this array reader.
+
+ let struct_fields = Fields::from(vec![
+ Field::new(
+ "city",
+ ArrowDataType::Dictionary(
+ Box::new(ArrowDataType::UInt8),
+ Box::new(ArrowDataType::Utf8),
+ ),
+ true,
+ ),
+ Field::new("name", ArrowDataType::Utf8, true),
+ ]);
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "items",
+ ArrowDataType::Struct(struct_fields.clone()),
+ true,
+ )]));
+
+ let items_arr = StructArray::new(
+ struct_fields,
+ vec![
+ Arc::new(DictionaryArray::new(
+ UInt8Array::from_iter_values(vec![0, 1, 1, 0, 2]),
+ Arc::new(StringArray::from_iter_values(vec![
+ "quebec",
+ "fredericton",
+ "halifax",
+ ])),
+ )),
+ Arc::new(StringArray::from_iter_values(vec![
+ "albert", "terry", "lance", "", "tim",
+ ])),
+ ],
+ Some(NullBuffer::from_iter(vec![true, true, true, false, true])),
+ );
+
+ let batch = RecordBatch::try_new(schema,
vec![Arc::new(items_arr)]).unwrap();
+ let mut buffer = Vec::with_capacity(1024);
+ let mut writer = ArrowWriter::try_new(&mut buffer, batch.schema(),
None).unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+ let read = ParquetRecordBatchReader::try_new(Bytes::from(buffer), 8)
+ .unwrap()
+ .collect::<Result<Vec<_>, _>>()
+ .unwrap();
+
+ assert_eq!(read.len(), 1);
+ assert_eq!(&batch, &read[0])
+ }
+
/// Parameters for single_column_reader_test
#[derive(Clone)]
struct TestOptions {