This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 57b64b4978 Fix IPC roundtripping dicts nested in ListViews (#9126)
57b64b4978 is described below
commit 57b64b49788da4ba36e63415f8130b01ab8dbb86
Author: Frederic Branczyk <[email protected]>
AuthorDate: Sat Jan 10 13:03:01 2026 +0100
Fix IPC roundtripping dicts nested in ListViews (#9126)
# Which issue does this PR close?
Closes https://github.com/apache/arrow-rs/issues/9125
# Rationale for this change
Fixes the handling of dictionary building for IPC when a ListView or LargeListView is used.
# What changes are included in this PR?
Schemas are now created correctly and dictionary IDs are read correctly when a
ListView or LargeListView occurs in IPC.
# Are these changes tested?
Yes, see new unit tests.
# Are there any user-facing changes?
No, just a bug fix.
@alamb
---
arrow-ipc/src/writer.rs | 136 ++++++++++++++++++++++++++++++++++++++++++++++
arrow-schema/src/field.rs | 2 +
2 files changed, 138 insertions(+)
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index c868641ab6..8afc3c8ed2 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -296,6 +296,30 @@ impl IpcDataGenerator {
compression_context,
)?;
}
+ DataType::ListView(field) => {
+ let list = column.as_list_view::<i32>();
+ self.encode_dictionaries(
+ field,
+ list.values(),
+ encoded_dictionaries,
+ dictionary_tracker,
+ write_options,
+ dict_id,
+ compression_context,
+ )?;
+ }
+ DataType::LargeListView(field) => {
+ let list = column.as_list_view::<i64>();
+ self.encode_dictionaries(
+ field,
+ list.values(),
+ encoded_dictionaries,
+ dictionary_tracker,
+ write_options,
+ dict_id,
+ compression_context,
+ )?;
+ }
DataType::FixedSizeList(field, _) => {
let list = column
.as_any()
@@ -3402,6 +3426,118 @@ mod tests {
assert_eq!(in_batch, out_batch);
}
+ fn test_roundtrip_list_view_of_dict_impl<OffsetSize: OffsetSizeTrait, U:
ArrowNativeType>(
+ list_data_type: DataType,
+ offsets: &[U; 5],
+ sizes: &[U; 4],
+ ) {
+ let values = StringArray::from(vec![Some("alpha"), None, Some("beta"),
Some("gamma")]);
+ let keys = Int32Array::from_iter_values([0, 0, 1, 2, 3, 0, 2]);
+ let dict_array = DictionaryArray::new(keys, Arc::new(values));
+ let dict_data = dict_array.to_data();
+
+ let value_offsets = Buffer::from_slice_ref(offsets);
+ let value_sizes = Buffer::from_slice_ref(sizes);
+
+ let list_data = ArrayData::builder(list_data_type)
+ .len(4)
+ .add_buffer(value_offsets)
+ .add_buffer(value_sizes)
+ .add_child_data(dict_data)
+ .build()
+ .unwrap();
+ let list_view_array =
GenericListViewArray::<OffsetSize>::from(list_data);
+
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "f1",
+ list_view_array.data_type().clone(),
+ false,
+ )]));
+ let input_batch = RecordBatch::try_new(schema,
vec![Arc::new(list_view_array)]).unwrap();
+
+ let output_batch = deserialize_file(serialize_file(&input_batch));
+ assert_eq!(input_batch, output_batch);
+
+ let output_batch = deserialize_stream(serialize_stream(&input_batch));
+ assert_eq!(input_batch, output_batch);
+ }
+
+ #[test]
+ fn test_roundtrip_list_view_of_dict() {
+ #[allow(deprecated)]
+ let list_data_type = DataType::ListView(Arc::new(Field::new_dict(
+ "item",
+ DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8)),
+ true,
+ 1,
+ false,
+ )));
+ let offsets: &[i32; 5] = &[0, 2, 4, 4, 7];
+ let sizes: &[i32; 4] = &[2, 2, 0, 3];
+ test_roundtrip_list_view_of_dict_impl::<i32, i32>(list_data_type,
offsets, sizes);
+ }
+
+ #[test]
+ fn test_roundtrip_large_list_view_of_dict() {
+ #[allow(deprecated)]
+ let list_data_type = DataType::LargeListView(Arc::new(Field::new_dict(
+ "item",
+ DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8)),
+ true,
+ 2,
+ false,
+ )));
+ let offsets: &[i64; 5] = &[0, 2, 4, 4, 7];
+ let sizes: &[i64; 4] = &[2, 2, 0, 3];
+ test_roundtrip_list_view_of_dict_impl::<i64, i64>(list_data_type,
offsets, sizes);
+ }
+
+ #[test]
+ fn test_roundtrip_sliced_list_view_of_dict() {
+ #[allow(deprecated)]
+ let list_data_type = DataType::ListView(Arc::new(Field::new_dict(
+ "item",
+ DataType::Dictionary(Box::new(DataType::Int32),
Box::new(DataType::Utf8)),
+ true,
+ 3,
+ false,
+ )));
+
+ let values = StringArray::from(vec![Some("alpha"), None, Some("beta"),
Some("gamma")]);
+ let keys = Int32Array::from_iter_values([0, 0, 1, 2, 3, 0, 2, 1, 0, 3,
2, 1]);
+ let dict_array = DictionaryArray::new(keys, Arc::new(values));
+ let dict_data = dict_array.to_data();
+
+ let offsets: &[i32; 7] = &[0, 2, 4, 4, 7, 9, 12];
+ let sizes: &[i32; 6] = &[2, 2, 0, 3, 2, 3];
+ let value_offsets = Buffer::from_slice_ref(offsets);
+ let value_sizes = Buffer::from_slice_ref(sizes);
+
+ let list_data = ArrayData::builder(list_data_type)
+ .len(6)
+ .add_buffer(value_offsets)
+ .add_buffer(value_sizes)
+ .add_child_data(dict_data)
+ .build()
+ .unwrap();
+ let list_view_array = GenericListViewArray::<i32>::from(list_data);
+
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "f1",
+ list_view_array.data_type().clone(),
+ false,
+ )]));
+ let input_batch = RecordBatch::try_new(schema,
vec![Arc::new(list_view_array)]).unwrap();
+
+ let sliced_batch = input_batch.slice(1, 4);
+
+ let output_batch = deserialize_file(serialize_file(&sliced_batch));
+ assert_eq!(sliced_batch, output_batch);
+
+ let output_batch = deserialize_stream(serialize_stream(&sliced_batch));
+ assert_eq!(sliced_batch, output_batch);
+ }
+
#[test]
fn test_decimal128_alignment16_is_sufficient() {
const IPC_ALIGNMENT: usize = 16;
diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index 1b9a298e59..c4566e41bf 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -718,6 +718,8 @@ impl Field {
DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)|
f.fields()).collect(),
DataType::List(field)
| DataType::LargeList(field)
+ | DataType::ListView(field)
+ | DataType::LargeListView(field)
| DataType::FixedSizeList(field, _)
| DataType::Map(field, _) => field.fields(),
DataType::Dictionary(_, value_field) =>
Field::_fields(value_field.as_ref()),