This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1ef55f1dde arrow-ipc: Add tests for nested dicts for Map and Union arrays (#9146)
1ef55f1dde is described below
commit 1ef55f1ddeedd7b9d30762270ba81a57f5bf10a6
Author: Frederic Branczyk <[email protected]>
AuthorDate: Tue Jan 13 13:14:50 2026 +0100
arrow-ipc: Add tests for nested dicts for Map and Union arrays (#9146)
# Which issue does this PR close?
No known issue, but this confirms that nested dicts do indeed work in
Map and Union arrays. It was brought up here:
https://github.com/apache/arrow-rs/pull/9126#discussion_r2678500873
# Rationale for this change
Ensure that IPC roundtripping nested dicts works in Map and Union
arrays.
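For reference, the round-trips below go through the `serialize_file`/`deserialize_file` and `serialize_stream`/`deserialize_stream` helpers already defined in this test module. Conceptually, the file variant amounts to something like the following sketch (the `roundtrip_ipc_file` name and the in-memory buffer are illustrative assumptions, not the exact helper code):

```rust
use std::io::Cursor;
use std::sync::Arc;

use arrow_array::{Int32Array, RecordBatch};
use arrow_ipc::reader::FileReader;
use arrow_ipc::writer::FileWriter;
use arrow_schema::{DataType, Field, Schema};

/// Writes a batch into an in-memory IPC file and reads the first batch back.
fn roundtrip_ipc_file(batch: &RecordBatch) -> RecordBatch {
    let mut buf = Vec::new();
    let mut writer = FileWriter::try_new(&mut buf, batch.schema().as_ref()).unwrap();
    writer.write(batch).unwrap();
    writer.finish().unwrap();
    drop(writer); // release the mutable borrow on `buf`

    let mut reader = FileReader::try_new(Cursor::new(buf), None).unwrap();
    reader.next().unwrap().unwrap()
}

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int32, false)]));
    let batch =
        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))]).unwrap();
    assert_eq!(batch, roundtrip_ipc_file(&batch));
}
```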
# What changes are included in this PR?
Unit tests exercising the functionality: IPC round-trips of dictionary arrays nested inside Map and Union arrays.
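For context, the tests construct the arrays by hand (via `ArrayData` and the deprecated `Field::new_dict`) so the dictionary IDs on the nested fields are pinned explicitly. A dictionary-encoded map can also be assembled with builders; a minimal sketch of that alternative (not what the tests use) could look like:

```rust
use arrow_array::builder::{Int32Builder, MapBuilder, StringDictionaryBuilder};
use arrow_array::types::Int32Type;
use arrow_array::Array;

fn main() {
    // Map<Dictionary<Int32, Utf8>, Int32>: dictionary-encoded keys, plain values.
    let mut builder = MapBuilder::new(
        None,
        StringDictionaryBuilder::<Int32Type>::new(),
        Int32Builder::new(),
    );

    // First map entry: {"key_a": 1, "key_b": 2}.
    builder.keys().append_value("key_a");
    builder.values().append_value(1);
    builder.keys().append_value("key_b");
    builder.values().append_value(2);
    builder.append(true).unwrap();

    // Second map entry re-uses "key_a" from the dictionary.
    builder.keys().append_value("key_a");
    builder.values().append_value(3);
    builder.append(true).unwrap();

    let map = builder.finish();
    assert_eq!(map.len(), 2);
}
```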
# Are these changes tested?
The whole PR consists of tests only.
# Are there any user-facing changes?
No
@alamb @Jefffrey
---
arrow-ipc/src/writer.rs | 223 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 223 insertions(+)
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 8afc3c8ed2..86376c8e5e 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -3538,6 +3538,229 @@ mod tests {
         assert_eq!(sliced_batch, output_batch);
     }
+    #[test]
+    fn test_roundtrip_dense_union_of_dict() {
+        let values = StringArray::from(vec![Some("alpha"), None, Some("beta"), Some("gamma")]);
+        let keys = Int32Array::from_iter_values([0, 0, 1, 2, 3, 0, 2]);
+        let dict_array = DictionaryArray::new(keys, Arc::new(values));
+
+        #[allow(deprecated)]
+        let dict_field = Arc::new(Field::new_dict(
+            "dict",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+            1,
+            false,
+        ));
+        let int_field = Arc::new(Field::new("int", DataType::Int32, false));
+        let union_fields = UnionFields::try_new(vec![0, 1], vec![dict_field, int_field]).unwrap();
+
+        let types = ScalarBuffer::from(vec![0i8, 0, 1, 0, 1, 0, 0]);
+        let offsets = ScalarBuffer::from(vec![0i32, 1, 0, 2, 1, 3, 4]);
+
+        let int_array = Int32Array::from(vec![100, 200]);
+
+        let union = UnionArray::try_new(
+            union_fields.clone(),
+            types,
+            Some(offsets),
+            vec![Arc::new(dict_array), Arc::new(int_array)],
+        )
+        .unwrap();
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "union",
+            DataType::Union(union_fields, UnionMode::Dense),
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap();
+
+        let output_batch = deserialize_file(serialize_file(&input_batch));
+        assert_eq!(input_batch, output_batch);
+
+        let output_batch = deserialize_stream(serialize_stream(&input_batch));
+        assert_eq!(input_batch, output_batch);
+    }
+
+    #[test]
+    fn test_roundtrip_sparse_union_of_dict() {
+        let values = StringArray::from(vec![Some("alpha"), None, Some("beta"), Some("gamma")]);
+        let keys = Int32Array::from_iter_values([0, 0, 1, 2, 3, 0, 2]);
+        let dict_array = DictionaryArray::new(keys, Arc::new(values));
+
+        #[allow(deprecated)]
+        let dict_field = Arc::new(Field::new_dict(
+            "dict",
+            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+            true,
+            2,
+            false,
+        ));
+        let int_field = Arc::new(Field::new("int", DataType::Int32, false));
+        let union_fields = UnionFields::try_new(vec![0, 1], vec![dict_field, int_field]).unwrap();
+
+        let types = ScalarBuffer::from(vec![0i8, 0, 1, 0, 1, 0, 0]);
+
+        let int_array = Int32Array::from(vec![0, 0, 100, 0, 200, 0, 0]);
+
+        let union = UnionArray::try_new(
+            union_fields.clone(),
+            types,
+            None,
+            vec![Arc::new(dict_array), Arc::new(int_array)],
+        )
+        .unwrap();
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "union",
+            DataType::Union(union_fields, UnionMode::Sparse),
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, vec![Arc::new(union)]).unwrap();
+
+        let output_batch = deserialize_file(serialize_file(&input_batch));
+        assert_eq!(input_batch, output_batch);
+
+        let output_batch = deserialize_stream(serialize_stream(&input_batch));
+        assert_eq!(input_batch, output_batch);
+    }
+
+    #[test]
+    fn test_roundtrip_map_with_dict_keys() {
+        // Building a map array is a bit involved. We first build a struct array that has a key and
+        // value field and then use that to build the actual map array.
+        let key_values = StringArray::from(vec!["key_a", "key_b", "key_c"]);
+        let keys = Int32Array::from_iter_values([0, 1, 2, 0, 1, 0]);
+        let dict_keys = DictionaryArray::new(keys, Arc::new(key_values));
+
+        let values = Int32Array::from(vec![1, 2, 3, 4, 5, 6]);
+
+        #[allow(deprecated)]
+        let entries_field = Arc::new(Field::new(
+            "entries",
+            DataType::Struct(
+                vec![
+                    Field::new_dict(
+                        "key",
+                        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                        false,
+                        1,
+                        false,
+                    ),
+                    Field::new("value", DataType::Int32, true),
+                ]
+                .into(),
+            ),
+            false,
+        ));
+
+        let entries = StructArray::from(vec![
+            (
+                Arc::new(Field::new(
+                    "key",
+                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                    false,
+                )),
+                Arc::new(dict_keys) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new("value", DataType::Int32, true)),
+                Arc::new(values) as ArrayRef,
+            ),
+        ]);
+
+        let offsets = Buffer::from_slice_ref([0i32, 2, 4, 6]);
+
+        let map_data = ArrayData::builder(DataType::Map(entries_field, false))
+            .len(3)
+            .add_buffer(offsets)
+            .add_child_data(entries.into_data())
+            .build()
+            .unwrap();
+        let map_array = MapArray::from(map_data);
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "map",
+            map_array.data_type().clone(),
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, vec![Arc::new(map_array)]).unwrap();
+
+        let output_batch = deserialize_file(serialize_file(&input_batch));
+        assert_eq!(input_batch, output_batch);
+
+        let output_batch = deserialize_stream(serialize_stream(&input_batch));
+        assert_eq!(input_batch, output_batch);
+    }
+
+    #[test]
+    fn test_roundtrip_map_with_dict_values() {
+        // Building a map array is a bit involved. We first build a struct array that has a key and
+        // value field and then use that to build the actual map array.
+        let keys = StringArray::from(vec!["a", "b", "c", "d", "e", "f"]);
+
+        let value_values = StringArray::from(vec!["val_x", "val_y", "val_z"]);
+        let value_keys = Int32Array::from_iter_values([0, 1, 2, 0, 1, 0]);
+        let dict_values = DictionaryArray::new(value_keys, Arc::new(value_values));
+
+        #[allow(deprecated)]
+        let entries_field = Arc::new(Field::new(
+            "entries",
+            DataType::Struct(
+                vec![
+                    Field::new("key", DataType::Utf8, false),
+                    Field::new_dict(
+                        "value",
+                        DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                        true,
+                        2,
+                        false,
+                    ),
+                ]
+                .into(),
+            ),
+            false,
+        ));
+
+        let entries = StructArray::from(vec![
+            (
+                Arc::new(Field::new("key", DataType::Utf8, false)),
+                Arc::new(keys) as ArrayRef,
+            ),
+            (
+                Arc::new(Field::new(
+                    "value",
+                    DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
+                    true,
+                )),
+                Arc::new(dict_values) as ArrayRef,
+            ),
+        ]);
+
+        let offsets = Buffer::from_slice_ref([0i32, 2, 4, 6]);
+
+        let map_data = ArrayData::builder(DataType::Map(entries_field, false))
+            .len(3)
+            .add_buffer(offsets)
+            .add_child_data(entries.into_data())
+            .build()
+            .unwrap();
+        let map_array = MapArray::from(map_data);
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "map",
+            map_array.data_type().clone(),
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, vec![Arc::new(map_array)]).unwrap();
+
+        let output_batch = deserialize_file(serialize_file(&input_batch));
+        assert_eq!(input_batch, output_batch);
+
+        let output_batch = deserialize_stream(serialize_stream(&input_batch));
+        assert_eq!(input_batch, output_batch);
+    }
+
     #[test]
     fn test_decimal128_alignment16_is_sufficient() {
         const IPC_ALIGNMENT: usize = 16;