This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new b25c4417456 Change `UnionArray` constructors (#5623)
b25c4417456 is described below

commit b25c441745602c9967b1e3cc4a28bc469cfb1311
Author: Matthijs Brobbel <[email protected]>
AuthorDate: Wed May 8 14:49:40 2024 +0200

    Change `UnionArray` constructors (#5623)
    
    * Change `UnionArray` constructors
    
    * Fix a comment
    
    * Clippy and avoid using hashmaps
    
    * Additional test
    
    ---------
    
    Co-authored-by: Raphael Taylor-Davies <[email protected]>
---
 arrow-array/src/array/union_array.rs     | 372 +++++++++++++++++--------------
 arrow-array/src/builder/union_builder.rs |  78 +++----
 arrow-cast/src/pretty.rs                 |  18 +-
 arrow-flight/src/encode.rs               | 101 ++++-----
 arrow-integration-test/src/lib.rs        |  23 +-
 arrow-ipc/src/reader.rs                  |  17 +-
 arrow-ipc/src/writer.rs                  |   9 +-
 arrow-schema/src/fields.rs               |   2 -
 arrow-select/src/take.rs                 |  30 +--
 arrow/tests/array_transform.rs           |  42 ++--
 10 files changed, 360 insertions(+), 332 deletions(-)

diff --git a/arrow-array/src/array/union_array.rs 
b/arrow-array/src/array/union_array.rs
index 22d4cf90a09..ea4853cd152 100644
--- a/arrow-array/src/array/union_array.rs
+++ b/arrow-array/src/array/union_array.rs
@@ -17,13 +17,12 @@
 
 use crate::{make_array, Array, ArrayRef};
 use arrow_buffer::buffer::NullBuffer;
-use arrow_buffer::{Buffer, ScalarBuffer};
+use arrow_buffer::ScalarBuffer;
 use arrow_data::{ArrayData, ArrayDataBuilder};
-use arrow_schema::{ArrowError, DataType, Field, UnionFields, UnionMode};
+use arrow_schema::{ArrowError, DataType, UnionFields, UnionMode};
 /// Contains the `UnionArray` type.
 ///
 use std::any::Any;
-use std::collections::HashMap;
 use std::sync::Arc;
 
 /// An array of [values of varying 
types](https://arrow.apache.org/docs/format/Columnar.html#union-layout)
@@ -43,25 +42,30 @@ use std::sync::Arc;
 /// # Examples
 /// ## Create a dense UnionArray `[1, 3.2, 34]`
 /// ```
-/// use arrow_buffer::Buffer;
+/// use arrow_buffer::ScalarBuffer;
 /// use arrow_schema::*;
 /// use std::sync::Arc;
 /// use arrow_array::{Array, Int32Array, Float64Array, UnionArray};
 ///
 /// let int_array = Int32Array::from(vec![1, 34]);
 /// let float_array = Float64Array::from(vec![3.2]);
-/// let type_id_buffer = Buffer::from_slice_ref(&[0_i8, 1, 0]);
-/// let value_offsets_buffer = Buffer::from_slice_ref(&[0_i32, 0, 1]);
+/// let type_ids = [0, 1, 0].into_iter().collect::<ScalarBuffer<i8>>();
+/// let offsets = [0, 0, 1].into_iter().collect::<ScalarBuffer<i32>>();
 ///
-/// let children: Vec<(Field, Arc<dyn Array>)> = vec![
-///     (Field::new("A", DataType::Int32, false), Arc::new(int_array)),
-///     (Field::new("B", DataType::Float64, false), Arc::new(float_array)),
+/// let union_fields = [
+///     (0, Arc::new(Field::new("A", DataType::Int32, false))),
+///     (1, Arc::new(Field::new("B", DataType::Float64, false))),
+/// ].into_iter().collect::<UnionFields>();
+///
+/// let children = vec![
+///     Arc::new(int_array) as Arc<dyn Array>,
+///     Arc::new(float_array),
 /// ];
 ///
 /// let array = UnionArray::try_new(
-///     &vec![0, 1],
-///     type_id_buffer,
-///     Some(value_offsets_buffer),
+///     union_fields,
+///     type_ids,
+///     Some(offsets),
 ///     children,
 /// ).unwrap();
 ///
@@ -77,23 +81,28 @@ use std::sync::Arc;
 ///
 /// ## Create a sparse UnionArray `[1, 3.2, 34]`
 /// ```
-/// use arrow_buffer::Buffer;
+/// use arrow_buffer::ScalarBuffer;
 /// use arrow_schema::*;
 /// use std::sync::Arc;
 /// use arrow_array::{Array, Int32Array, Float64Array, UnionArray};
 ///
 /// let int_array = Int32Array::from(vec![Some(1), None, Some(34)]);
 /// let float_array = Float64Array::from(vec![None, Some(3.2), None]);
-/// let type_id_buffer = Buffer::from_slice_ref(&[0_i8, 1, 0]);
+/// let type_ids = [0_i8, 1, 0].into_iter().collect::<ScalarBuffer<i8>>();
+///
+/// let union_fields = [
+///     (0, Arc::new(Field::new("A", DataType::Int32, false))),
+///     (1, Arc::new(Field::new("B", DataType::Float64, false))),
+/// ].into_iter().collect::<UnionFields>();
 ///
-/// let children: Vec<(Field, Arc<dyn Array>)> = vec![
-///     (Field::new("A", DataType::Int32, false), Arc::new(int_array)),
-///     (Field::new("B", DataType::Float64, false), Arc::new(float_array)),
+/// let children = vec![
+///     Arc::new(int_array) as Arc<dyn Array>,
+///     Arc::new(float_array),
 /// ];
 ///
 /// let array = UnionArray::try_new(
-///     &vec![0, 1],
-///     type_id_buffer,
+///     union_fields,
+///     type_ids,
 ///     None,
 ///     children,
 /// ).unwrap();
@@ -125,102 +134,99 @@ impl UnionArray {
     ///
     /// # Safety
     ///
-    /// The `type_ids` `Buffer` should contain `i8` values.  These values 
should be greater than
-    /// zero and must be less than the number of children provided in 
`child_arrays`.  These values
-    /// are used to index into the `child_arrays`.
+    /// The `type_ids` values should be positive and must match one of the 
type ids of the fields provided in `fields`.
+    /// These values are used to index into the `children` arrays.
     ///
-    /// The `value_offsets` `Buffer` is only provided in the case of a dense 
union, sparse unions
-    /// should use `None`.  If provided the `value_offsets` `Buffer` should 
contain `i32` values.
-    /// The values in this array should be greater than zero and must be less 
than the length of the
-    /// overall array.
+    /// The `offsets` is provided in the case of a dense union, sparse unions 
should use `None`.
+    /// If provided the `offsets` values should be positive and must be less 
than the length of the
+    /// corresponding array.
     ///
     /// In both cases above we use signed integer types to maintain 
compatibility with other
     /// Arrow implementations.
-    ///
-    /// In both of the cases above we are accepting `Buffer`'s which are 
assumed to be representing
-    /// `i8` and `i32` values respectively.  `Buffer` objects are untyped and 
no attempt is made
-    /// to ensure that the data provided is valid.
     pub unsafe fn new_unchecked(
-        field_type_ids: &[i8],
-        type_ids: Buffer,
-        value_offsets: Option<Buffer>,
-        child_arrays: Vec<(Field, ArrayRef)>,
+        fields: UnionFields,
+        type_ids: ScalarBuffer<i8>,
+        offsets: Option<ScalarBuffer<i32>>,
+        children: Vec<ArrayRef>,
     ) -> Self {
-        let (fields, field_values): (Vec<_>, Vec<_>) = 
child_arrays.into_iter().unzip();
-        let len = type_ids.len();
-
-        let mode = if value_offsets.is_some() {
+        let mode = if offsets.is_some() {
             UnionMode::Dense
         } else {
             UnionMode::Sparse
         };
 
-        let builder = ArrayData::builder(DataType::Union(
-            UnionFields::new(field_type_ids.iter().copied(), fields),
-            mode,
-        ))
-        .add_buffer(type_ids)
-        .child_data(field_values.into_iter().map(|a| a.into_data()).collect())
-        .len(len);
+        let len = type_ids.len();
+        let builder = ArrayData::builder(DataType::Union(fields, mode))
+            .add_buffer(type_ids.into_inner())
+            .child_data(children.into_iter().map(Array::into_data).collect())
+            .len(len);
 
-        let data = match value_offsets {
-            Some(b) => builder.add_buffer(b).build_unchecked(),
+        let data = match offsets {
+            Some(offsets) => 
builder.add_buffer(offsets.into_inner()).build_unchecked(),
             None => builder.build_unchecked(),
         };
         Self::from(data)
     }
 
     /// Attempts to create a new `UnionArray`, validating the inputs provided.
+    ///
+    /// The order of the child arrays must match the order of the fields
     pub fn try_new(
-        field_type_ids: &[i8],
-        type_ids: Buffer,
-        value_offsets: Option<Buffer>,
-        child_arrays: Vec<(Field, ArrayRef)>,
+        fields: UnionFields,
+        type_ids: ScalarBuffer<i8>,
+        offsets: Option<ScalarBuffer<i32>>,
+        children: Vec<ArrayRef>,
     ) -> Result<Self, ArrowError> {
-        if let Some(b) = &value_offsets {
-            if ((type_ids.len()) * 4) != b.len() {
+        // There must be a child array for every field.
+        if fields.len() != children.len() {
+            return Err(ArrowError::InvalidArgumentError(
+                "Union fields length must match child arrays 
length".to_string(),
+            ));
+        }
+
+        // There must be an offset value for every type id value.
+        if let Some(offsets) = &offsets {
+            if offsets.len() != type_ids.len() {
                 return Err(ArrowError::InvalidArgumentError(
-                    "Type Ids and Offsets represent a different number of 
array slots.".to_string(),
+                    "Type Ids and Offsets lengths must match".to_string(),
                 ));
             }
         }
 
-        // Check the type_ids
-        let type_id_slice: &[i8] = type_ids.typed_data();
-        let invalid_type_ids = type_id_slice
-            .iter()
-            .filter(|i| *i < &0)
-            .collect::<Vec<&i8>>();
-        if !invalid_type_ids.is_empty() {
-            return Err(ArrowError::InvalidArgumentError(format!(
-                "Type Ids must be positive and cannot be greater than the 
number of \
-                child arrays, found:\n{invalid_type_ids:?}"
-            )));
+        // Create mapping from type id to array lengths.
+        let max_id = fields.iter().map(|(i, _)| i).max().unwrap_or_default() 
as usize;
+        let mut array_lens = vec![i32::MIN; max_id + 1];
+        for (cd, (field_id, _)) in children.iter().zip(fields.iter()) {
+            array_lens[field_id as usize] = cd.len() as i32;
         }
 
-        // Check the value offsets if provided
-        if let Some(offset_buffer) = &value_offsets {
-            let max_len = type_ids.len() as i32;
-            let offsets_slice: &[i32] = offset_buffer.typed_data();
-            let invalid_offsets = offsets_slice
-                .iter()
-                .filter(|i| *i < &0 || *i > &max_len)
-                .collect::<Vec<&i32>>();
-            if !invalid_offsets.is_empty() {
-                return Err(ArrowError::InvalidArgumentError(format!(
-                    "Offsets must be positive and within the length of the 
Array, \
-                    found:\n{invalid_offsets:?}"
-                )));
+        // Type id values must match one of the fields.
+        for id in &type_ids {
+            match array_lens.get(*id as usize) {
+                Some(x) if *x != i32::MIN => {}
+                _ => {
+                    return Err(ArrowError::InvalidArgumentError(
+                        "Type Ids values must match one of the field type 
ids".to_owned(),
+                    ))
+                }
             }
         }
 
-        // Unsafe Justification: arguments were validated above (and
-        // re-revalidated as part of data().validate() below)
-        let new_self =
-            unsafe { Self::new_unchecked(field_type_ids, type_ids, 
value_offsets, child_arrays) };
-        new_self.to_data().validate()?;
+        // Check the value offsets are in bounds.
+        if let Some(offsets) = &offsets {
+            let mut iter = type_ids.iter().zip(offsets.iter());
+            if iter.any(|(type_id, &offset)| offset < 0 || offset >= 
array_lens[*type_id as usize])
+            {
+                return Err(ArrowError::InvalidArgumentError(
+                    "Offsets must be positive and within the length of the 
Array".to_owned(),
+                ));
+            }
+        }
 
-        Ok(new_self)
+        // Safety:
+        // - Arguments validated above.
+        let union_array = unsafe { Self::new_unchecked(fields, type_ids, 
offsets, children) };
+        Ok(union_array)
     }
 
     /// Accesses the child array for `type_id`.
@@ -336,14 +342,14 @@ impl UnionArray {
     /// let union_array = builder.build()?;
     ///
     /// // Deconstruct into parts
-    /// let (type_ids, offsets, field_type_ids, fields) = 
union_array.into_parts();
+    /// let (union_fields, type_ids, offsets, children) = 
union_array.into_parts();
     ///
     /// // Reconstruct from parts
     /// let union_array = UnionArray::try_new(
-    ///     &field_type_ids,
-    ///     type_ids.into_inner(),
-    ///     offsets.map(ScalarBuffer::into_inner),
-    ///     fields,
+    ///     union_fields,
+    ///     type_ids,
+    ///     offsets,
+    ///     children,
     /// );
     /// # Ok(())
     /// # }
@@ -352,34 +358,24 @@ impl UnionArray {
     pub fn into_parts(
         self,
     ) -> (
+        UnionFields,
         ScalarBuffer<i8>,
         Option<ScalarBuffer<i32>>,
-        Vec<i8>,
-        Vec<(Field, ArrayRef)>,
+        Vec<ArrayRef>,
     ) {
         let Self {
             data_type,
             type_ids,
             offsets,
-            fields,
+            mut fields,
         } = self;
         match data_type {
             DataType::Union(union_fields, _) => {
-                let union_fields = union_fields.iter().collect::<HashMap<_, 
_>>();
-                let (field_type_ids, fields) = fields
-                    .into_iter()
-                    .enumerate()
-                    .flat_map(|(type_id, array_ref)| {
-                        array_ref.map(|array_ref| {
-                            let type_id = type_id as i8;
-                            (
-                                type_id,
-                                
((*Arc::clone(union_fields[&type_id])).clone(), array_ref),
-                            )
-                        })
-                    })
-                    .unzip();
-                (type_ids, offsets, field_type_ids, fields)
+                let children = union_fields
+                    .iter()
+                    .map(|(type_id, _)| fields[type_id as 
usize].take().unwrap())
+                    .collect();
+                (union_fields, type_ids, offsets, children)
             }
             _ => unreachable!(),
         }
@@ -569,6 +565,7 @@ impl std::fmt::Debug for UnionArray {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use std::collections::HashSet;
 
     use crate::array::Int8Type;
     use crate::builder::UnionBuilder;
@@ -576,7 +573,8 @@ mod tests {
     use crate::types::{Float32Type, Float64Type, Int32Type, Int64Type};
     use crate::RecordBatch;
     use crate::{Float64Array, Int32Array, Int64Array, StringArray};
-    use arrow_schema::Schema;
+    use arrow_buffer::Buffer;
+    use arrow_schema::{Field, Schema};
 
     #[test]
     fn test_dense_i32() {
@@ -809,30 +807,27 @@ mod tests {
         let int_array = Int32Array::from(vec![5, 6]);
         let float_array = Float64Array::from(vec![10.0]);
 
-        let type_ids = [1_i8, 0, 0, 2, 0, 1];
-        let offsets = [0_i32, 0, 1, 0, 2, 1];
-
-        let type_id_buffer = Buffer::from_slice_ref(type_ids);
-        let value_offsets_buffer = Buffer::from_slice_ref(offsets);
-
-        let children: Vec<(Field, Arc<dyn Array>)> = vec![
-            (
-                Field::new("A", DataType::Utf8, false),
-                Arc::new(string_array),
-            ),
-            (Field::new("B", DataType::Int32, false), Arc::new(int_array)),
-            (
-                Field::new("C", DataType::Float64, false),
-                Arc::new(float_array),
-            ),
-        ];
-        let array = UnionArray::try_new(
-            &[0, 1, 2],
-            type_id_buffer,
-            Some(value_offsets_buffer),
-            children,
-        )
-        .unwrap();
+        let type_ids = [1, 0, 0, 2, 0, 
1].into_iter().collect::<ScalarBuffer<i8>>();
+        let offsets = [0, 0, 1, 0, 2, 1]
+            .into_iter()
+            .collect::<ScalarBuffer<i32>>();
+
+        let fields = [
+            (0, Arc::new(Field::new("A", DataType::Utf8, false))),
+            (1, Arc::new(Field::new("B", DataType::Int32, false))),
+            (2, Arc::new(Field::new("C", DataType::Float64, false))),
+        ]
+        .into_iter()
+        .collect::<UnionFields>();
+        let children = [
+            Arc::new(string_array) as Arc<dyn Array>,
+            Arc::new(int_array),
+            Arc::new(float_array),
+        ]
+        .into_iter()
+        .collect();
+        let array =
+            UnionArray::try_new(fields, type_ids.clone(), 
Some(offsets.clone()), children).unwrap();
 
         // Check type ids
         assert_eq!(*array.type_ids(), type_ids);
@@ -1277,29 +1272,22 @@ mod tests {
         let dense_union = builder.build().unwrap();
 
         let field = [
-            Field::new("a", DataType::Int32, false),
-            Field::new("b", DataType::Int8, false),
+            &Arc::new(Field::new("a", DataType::Int32, false)),
+            &Arc::new(Field::new("b", DataType::Int8, false)),
         ];
-        let (type_ids, offsets, field_type_ids, fields) = 
dense_union.into_parts();
-        assert_eq!(field_type_ids, [0, 1]);
+        let (union_fields, type_ids, offsets, children) = 
dense_union.into_parts();
         assert_eq!(
-            field.to_vec(),
-            fields
+            union_fields
                 .iter()
-                .cloned()
-                .map(|(field, _)| field)
-                .collect::<Vec<_>>()
+                .map(|(_, field)| field)
+                .collect::<Vec<_>>(),
+            field
         );
         assert_eq!(type_ids, [0, 1, 0]);
         assert!(offsets.is_some());
         assert_eq!(offsets.as_ref().unwrap(), &[0, 0, 1]);
 
-        let result = UnionArray::try_new(
-            &field_type_ids,
-            type_ids.into_inner(),
-            offsets.map(ScalarBuffer::into_inner),
-            fields,
-        );
+        let result = UnionArray::try_new(union_fields, type_ids, offsets, 
children);
         assert!(result.is_ok());
         assert_eq!(result.unwrap().len(), 3);
 
@@ -1309,23 +1297,18 @@ mod tests {
         builder.append::<Int32Type>("a", 3).unwrap();
         let sparse_union = builder.build().unwrap();
 
-        let (type_ids, offsets, field_type_ids, fields) = 
sparse_union.into_parts();
+        let (union_fields, type_ids, offsets, children) = 
sparse_union.into_parts();
         assert_eq!(type_ids, [0, 1, 0]);
         assert!(offsets.is_none());
 
-        let result = UnionArray::try_new(
-            &field_type_ids,
-            type_ids.into_inner(),
-            offsets.map(ScalarBuffer::into_inner),
-            fields,
-        );
+        let result = UnionArray::try_new(union_fields, type_ids, offsets, 
children);
         assert!(result.is_ok());
         assert_eq!(result.unwrap().len(), 3);
     }
 
     #[test]
     fn into_parts_custom_type_ids() {
-        let mut set_field_type_ids: [i8; 3] = [8, 4, 9];
+        let set_field_type_ids: [i8; 3] = [8, 4, 9];
         let data_type = DataType::Union(
             UnionFields::new(
                 set_field_type_ids,
@@ -1354,17 +1337,80 @@ mod tests {
             .unwrap();
         let array = UnionArray::from(data);
 
-        let (type_ids, offsets, field_type_ids, fields) = array.into_parts();
-        set_field_type_ids.sort();
-        assert_eq!(field_type_ids, set_field_type_ids);
-        let result = UnionArray::try_new(
-            &field_type_ids,
-            type_ids.into_inner(),
-            offsets.map(ScalarBuffer::into_inner),
-            fields,
+        let (union_fields, type_ids, offsets, children) = array.into_parts();
+        assert_eq!(
+            type_ids.iter().collect::<HashSet<_>>(),
+            set_field_type_ids.iter().collect::<HashSet<_>>()
         );
+        let result = UnionArray::try_new(union_fields, type_ids, offsets, 
children);
         assert!(result.is_ok());
         let array = result.unwrap();
         assert_eq!(array.len(), 7);
     }
+
+    #[test]
+    fn test_invalid() {
+        let fields = UnionFields::new(
+            [3, 2],
+            [
+                Field::new("a", DataType::Utf8, false),
+                Field::new("b", DataType::Utf8, false),
+            ],
+        );
+        let children = vec![
+            Arc::new(StringArray::from_iter_values(["a", "b"])) as _,
+            Arc::new(StringArray::from_iter_values(["c", "d"])) as _,
+        ];
+
+        let type_ids = vec![3, 3, 2].into();
+        UnionArray::try_new(fields.clone(), type_ids, None, 
children.clone()).unwrap();
+
+        let type_ids = vec![1, 2].into();
+        let err =
+            UnionArray::try_new(fields.clone(), type_ids, None, 
children.clone()).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Type Ids values must match one of the 
field type ids"
+        );
+
+        let type_ids = vec![7, 2].into();
+        let err = UnionArray::try_new(fields.clone(), type_ids, None, 
children).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Type Ids values must match one of the 
field type ids"
+        );
+
+        let children = vec![
+            Arc::new(StringArray::from_iter_values(["a", "b"])) as _,
+            Arc::new(StringArray::from_iter_values(["c"])) as _,
+        ];
+        let type_ids = ScalarBuffer::from(vec![3_i8, 3, 2]);
+        let offsets = Some(vec![0, 1, 0].into());
+        UnionArray::try_new(fields.clone(), type_ids.clone(), offsets, 
children.clone()).unwrap();
+
+        let offsets = Some(vec![0, 1, 1].into());
+        let err = UnionArray::try_new(fields.clone(), type_ids.clone(), 
offsets, children.clone())
+            .unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Offsets must be positive and within the 
length of the Array"
+        );
+
+        let offsets = Some(vec![0, 1].into());
+        let err =
+            UnionArray::try_new(fields.clone(), type_ids.clone(), offsets, 
children).unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Type Ids and Offsets lengths must match"
+        );
+
+        let err = UnionArray::try_new(fields.clone(), type_ids, None, 
vec![]).unwrap_err();
+
+        assert_eq!(
+            err.to_string(),
+            "Invalid argument error: Union fields length must match child 
arrays length"
+        );
+    }
 }
diff --git a/arrow-array/src/builder/union_builder.rs 
b/arrow-array/src/builder/union_builder.rs
index 4f88c9d41b9..e6184f4ac6d 100644
--- a/arrow-array/src/builder/union_builder.rs
+++ b/arrow-array/src/builder/union_builder.rs
@@ -23,7 +23,8 @@ use arrow_buffer::{ArrowNativeType, Buffer};
 use arrow_data::ArrayDataBuilder;
 use arrow_schema::{ArrowError, DataType, Field};
 use std::any::Any;
-use std::collections::HashMap;
+use std::collections::BTreeMap;
+use std::sync::Arc;
 
 /// `FieldData` is a helper struct to track the state of the fields in the 
`UnionBuilder`.
 #[derive(Debug)]
@@ -142,7 +143,7 @@ pub struct UnionBuilder {
     /// The current number of slots in the array
     len: usize,
     /// Maps field names to `FieldData` instances which track the builders for 
that field
-    fields: HashMap<String, FieldData>,
+    fields: BTreeMap<String, FieldData>,
     /// Builder to keep track of type ids
     type_id_builder: Int8BufferBuilder,
     /// Builder to keep track of offsets (`None` for sparse unions)
@@ -165,7 +166,7 @@ impl UnionBuilder {
     pub fn with_capacity_dense(capacity: usize) -> Self {
         Self {
             len: 0,
-            fields: HashMap::default(),
+            fields: Default::default(),
             type_id_builder: Int8BufferBuilder::new(capacity),
             value_offset_builder: Some(Int32BufferBuilder::new(capacity)),
             initial_capacity: capacity,
@@ -176,7 +177,7 @@ impl UnionBuilder {
     pub fn with_capacity_sparse(capacity: usize) -> Self {
         Self {
             len: 0,
-            fields: HashMap::default(),
+            fields: Default::default(),
             type_id_builder: Int8BufferBuilder::new(capacity),
             value_offset_builder: None,
             initial_capacity: capacity,
@@ -274,40 +275,39 @@ impl UnionBuilder {
     }
 
     /// Builds this builder creating a new `UnionArray`.
-    pub fn build(mut self) -> Result<UnionArray, ArrowError> {
-        let type_id_buffer = self.type_id_builder.finish();
-        let value_offsets_buffer = self.value_offset_builder.map(|mut b| 
b.finish());
-        let mut children = Vec::new();
-        for (
-            name,
-            FieldData {
-                type_id,
-                data_type,
-                mut values_buffer,
-                slots,
-                null_buffer_builder: mut bitmap_builder,
-            },
-        ) in self.fields.into_iter()
-        {
-            let buffer = values_buffer.finish();
-            let arr_data_builder = ArrayDataBuilder::new(data_type.clone())
-                .add_buffer(buffer)
-                .len(slots)
-                .nulls(bitmap_builder.finish());
-
-            let arr_data_ref = unsafe { arr_data_builder.build_unchecked() };
-            let array_ref = make_array(arr_data_ref);
-            children.push((type_id, (Field::new(name, data_type, false), 
array_ref)))
-        }
-
-        children.sort_by(|a, b| {
-            a.0.partial_cmp(&b.0)
-                .expect("This will never be None as type ids are always i8 
values.")
-        });
-        let children: Vec<_> = children.into_iter().map(|(_, b)| b).collect();
-
-        let type_ids: Vec<i8> = (0_i8..children.len() as i8).collect();
-
-        UnionArray::try_new(&type_ids, type_id_buffer, value_offsets_buffer, 
children)
+    pub fn build(self) -> Result<UnionArray, ArrowError> {
+        let mut children = Vec::with_capacity(self.fields.len());
+        let union_fields = self
+            .fields
+            .into_iter()
+            .map(
+                |(
+                    name,
+                    FieldData {
+                        type_id,
+                        data_type,
+                        mut values_buffer,
+                        slots,
+                        mut null_buffer_builder,
+                    },
+                )| {
+                    let array_ref = make_array(unsafe {
+                        ArrayDataBuilder::new(data_type.clone())
+                            .add_buffer(values_buffer.finish())
+                            .len(slots)
+                            .nulls(null_buffer_builder.finish())
+                            .build_unchecked()
+                    });
+                    children.push(array_ref);
+                    (type_id, Arc::new(Field::new(name, data_type, false)))
+                },
+            )
+            .collect();
+        UnionArray::try_new(
+            union_fields,
+            self.type_id_builder.into(),
+            self.value_offset_builder.map(Into::into),
+            children,
+        )
     }
 }
diff --git a/arrow-cast/src/pretty.rs b/arrow-cast/src/pretty.rs
index da7c5e9bb6b..00bba928114 100644
--- a/arrow-cast/src/pretty.rs
+++ b/arrow-cast/src/pretty.rs
@@ -142,7 +142,7 @@ mod tests {
     use arrow_array::builder::*;
     use arrow_array::types::*;
     use arrow_array::*;
-    use arrow_buffer::Buffer;
+    use arrow_buffer::ScalarBuffer;
     use arrow_schema::*;
 
     use crate::display::array_value_to_string;
@@ -851,14 +851,18 @@ mod tests {
 
         // Can't use UnionBuilder with non-primitive types, so manually build 
outer UnionArray
         let a_array = Int32Array::from(vec![None, None, None, Some(1234), 
Some(23)]);
-        let type_ids = Buffer::from_slice_ref([1_i8, 1, 0, 0, 1]);
+        let type_ids = [1, 1, 0, 0, 
1].into_iter().collect::<ScalarBuffer<i8>>();
 
-        let children: Vec<(Field, Arc<dyn Array>)> = vec![
-            (Field::new("a", DataType::Int32, true), Arc::new(a_array)),
-            (inner_field.clone(), Arc::new(inner)),
-        ];
+        let children = vec![Arc::new(a_array) as Arc<dyn Array>, 
Arc::new(inner)];
+
+        let union_fields = [
+            (0, Arc::new(Field::new("a", DataType::Int32, true))),
+            (1, Arc::new(inner_field.clone())),
+        ]
+        .into_iter()
+        .collect();
 
-        let outer = UnionArray::try_new(&[0, 1], type_ids, None, 
children).unwrap();
+        let outer = UnionArray::try_new(union_fields, type_ids, None, 
children).unwrap();
 
         let schema = Schema::new(vec![Field::new_union(
             "Teamsters",
diff --git a/arrow-flight/src/encode.rs b/arrow-flight/src/encode.rs
index 7604f3cd4d6..f59c29e6817 100644
--- a/arrow-flight/src/encode.rs
+++ b/arrow-flight/src/encode.rs
@@ -597,20 +597,17 @@ fn hydrate_dictionary(array: &ArrayRef, data_type: 
&DataType) -> Result<ArrayRef
         (DataType::Union(_, UnionMode::Sparse), DataType::Union(fields, 
UnionMode::Sparse)) => {
             let union_arr = 
array.as_any().downcast_ref::<UnionArray>().unwrap();
 
-            let (type_ids, fields): (Vec<i8>, Vec<&FieldRef>) = 
fields.iter().unzip();
-
             Arc::new(UnionArray::try_new(
-                &type_ids,
-                union_arr.type_ids().inner().clone(),
+                fields.clone(),
+                union_arr.type_ids().clone(),
                 None,
                 fields
                     .iter()
-                    .enumerate()
-                    .map(|(col, field)| {
-                        Ok((
-                            field.as_ref().clone(),
-                            arrow_cast::cast(union_arr.child(col as i8), 
field.data_type())?,
-                        ))
+                    .map(|(type_id, field)| {
+                        Ok(arrow_cast::cast(
+                            union_arr.child(type_id),
+                            field.data_type(),
+                        )?)
                     })
                     .collect::<Result<Vec<_>>>()?,
             )?)
@@ -625,10 +622,10 @@ mod tests {
     use arrow_array::builder::StringDictionaryBuilder;
     use arrow_array::*;
     use arrow_array::{cast::downcast_array, types::*};
-    use arrow_buffer::Buffer;
+    use arrow_buffer::ScalarBuffer;
     use arrow_cast::pretty::pretty_format_batches;
     use arrow_ipc::MetadataVersion;
-    use arrow_schema::UnionMode;
+    use arrow_schema::{UnionFields, UnionMode};
     use std::collections::HashMap;
 
     use crate::decode::{DecodedPayload, FlightDataDecoder};
@@ -849,16 +846,23 @@ mod tests {
             true,
         )];
 
-        let type_ids = vec![0, 1, 2];
-        let union_fields = vec![
-            Field::new_list(
-                "dict_list",
-                Field::new_dictionary("item", DataType::UInt16, 
DataType::Utf8, true),
-                true,
+        let union_fields = [
+            (
+                0,
+                Arc::new(Field::new_list(
+                    "dict_list",
+                    Field::new_dictionary("item", DataType::UInt16, 
DataType::Utf8, true),
+                    true,
+                )),
             ),
-            Field::new_struct("struct", struct_fields.clone(), true),
-            Field::new("string", DataType::Utf8, true),
-        ];
+            (
+                1,
+                Arc::new(Field::new_struct("struct", struct_fields.clone(), 
true)),
+            ),
+            (2, Arc::new(Field::new("string", DataType::Utf8, true))),
+        ]
+        .into_iter()
+        .collect::<UnionFields>();
 
         let struct_fields = vec![Field::new_list(
             "dict_list",
@@ -872,21 +876,15 @@ mod tests {
 
         let arr1 = builder.finish();
 
-        let type_id_buffer = Buffer::from_slice_ref([0_i8]);
+        let type_id_buffer = [0].into_iter().collect::<ScalarBuffer<i8>>();
         let arr1 = UnionArray::try_new(
-            &type_ids,
+            union_fields.clone(),
             type_id_buffer,
             None,
             vec![
-                (union_fields[0].clone(), Arc::new(arr1)),
-                (
-                    union_fields[1].clone(),
-                    new_null_array(union_fields[1].data_type(), 1),
-                ),
-                (
-                    union_fields[2].clone(),
-                    new_null_array(union_fields[2].data_type(), 1),
-                ),
+                Arc::new(arr1) as Arc<dyn Array>,
+                
new_null_array(union_fields.iter().nth(1).unwrap().1.data_type(), 1),
+                
new_null_array(union_fields.iter().nth(2).unwrap().1.data_type(), 1),
             ],
         )
         .unwrap();
@@ -896,47 +894,36 @@ mod tests {
         let arr2 = Arc::new(builder.finish());
         let arr2 = StructArray::new(struct_fields.clone().into(), vec![arr2], 
None);
 
-        let type_id_buffer = Buffer::from_slice_ref([1_i8]);
+        let type_id_buffer = [1].into_iter().collect::<ScalarBuffer<i8>>();
         let arr2 = UnionArray::try_new(
-            &type_ids,
+            union_fields.clone(),
             type_id_buffer,
             None,
             vec![
-                (
-                    union_fields[0].clone(),
-                    new_null_array(union_fields[0].data_type(), 1),
-                ),
-                (union_fields[1].clone(), Arc::new(arr2)),
-                (
-                    union_fields[2].clone(),
-                    new_null_array(union_fields[2].data_type(), 1),
-                ),
+                
new_null_array(union_fields.iter().next().unwrap().1.data_type(), 1),
+                Arc::new(arr2),
+                
new_null_array(union_fields.iter().nth(2).unwrap().1.data_type(), 1),
             ],
         )
         .unwrap();
 
-        let type_id_buffer = Buffer::from_slice_ref([2_i8]);
+        let type_id_buffer = [2].into_iter().collect::<ScalarBuffer<i8>>();
         let arr3 = UnionArray::try_new(
-            &type_ids,
+            union_fields.clone(),
             type_id_buffer,
             None,
             vec![
-                (
-                    union_fields[0].clone(),
-                    new_null_array(union_fields[0].data_type(), 1),
-                ),
-                (
-                    union_fields[1].clone(),
-                    new_null_array(union_fields[1].data_type(), 1),
-                ),
-                (
-                    union_fields[2].clone(),
-                    Arc::new(StringArray::from(vec!["e"])),
-                ),
+                
new_null_array(union_fields.iter().next().unwrap().1.data_type(), 1),
+                
new_null_array(union_fields.iter().nth(1).unwrap().1.data_type(), 1),
+                Arc::new(StringArray::from(vec!["e"])),
             ],
         )
         .unwrap();
 
+        let (type_ids, union_fields): (Vec<_>, Vec<_>) = union_fields
+            .iter()
+            .map(|(type_id, field_ref)| (type_id, 
(*Arc::clone(field_ref)).clone()))
+            .unzip();
         let schema = Arc::new(Schema::new(vec![Field::new_union(
             "union",
             type_ids.clone(),
diff --git a/arrow-integration-test/src/lib.rs 
b/arrow-integration-test/src/lib.rs
index d6e0dda51a8..30f0ccfbe12 100644
--- a/arrow-integration-test/src/lib.rs
+++ b/arrow-integration-test/src/lib.rs
@@ -21,6 +21,7 @@
 //!
 //! This is not a canonical format, but provides a human-readable way of 
verifying language implementations
 
+use arrow_buffer::ScalarBuffer;
 use hex::decode;
 use num::BigInt;
 use num::Signed;
@@ -835,26 +836,18 @@ pub fn array_from_json(
                 ));
             };
 
-            let offset: Option<Buffer> = json_col.offset.map(|offsets| {
-                let offsets: Vec<i32> =
-                    offsets.iter().map(|v| v.as_i64().unwrap() as 
i32).collect();
-                Buffer::from(&offsets.to_byte_slice())
-            });
+            let offset: Option<ScalarBuffer<i32>> = json_col
+                .offset
+                .map(|offsets| offsets.iter().map(|v| v.as_i64().unwrap() as 
i32).collect());
 
-            let mut children: Vec<(Field, Arc<dyn Array>)> = vec![];
+            let mut children = Vec::with_capacity(fields.len());
             for ((_, field), col) in 
fields.iter().zip(json_col.children.unwrap()) {
                 let array = array_from_json(field, col, dictionaries)?;
-                children.push((field.as_ref().clone(), array));
+                children.push(array);
             }
 
-            let field_type_ids = fields.iter().map(|(id, _)| 
id).collect::<Vec<_>>();
-            let array = UnionArray::try_new(
-                &field_type_ids,
-                Buffer::from(&type_ids.to_byte_slice()),
-                offset,
-                children,
-            )
-            .unwrap();
+            let array =
+                UnionArray::try_new(fields.clone(), type_ids.into(), offset, 
children).unwrap();
             Ok(Arc::new(array))
         }
         t => Err(ArrowError::JsonError(format!(
diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs
index 8eac17e2076..3c203a7f365 100644
--- a/arrow-ipc/src/reader.rs
+++ b/arrow-ipc/src/reader.rs
@@ -31,7 +31,7 @@ use std::io::{BufReader, Read, Seek, SeekFrom};
 use std::sync::Arc;
 
 use arrow_array::*;
-use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
+use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, ScalarBuffer};
 use arrow_data::ArrayData;
 use arrow_schema::*;
 
@@ -214,26 +214,25 @@ fn create_array(
                 reader.next_buffer()?;
             }
 
-            let type_ids: Buffer = reader.next_buffer()?[..len].into();
+            let type_ids: ScalarBuffer<i8> = 
reader.next_buffer()?.slice_with_length(0, len).into();
 
             let value_offsets = match mode {
                 UnionMode::Dense => {
-                    let buffer = reader.next_buffer()?;
-                    Some(buffer[..len * 4].into())
+                    let offsets: ScalarBuffer<i32> =
+                        reader.next_buffer()?.slice_with_length(0, len * 
4).into();
+                    Some(offsets)
                 }
                 UnionMode::Sparse => None,
             };
 
             let mut children = Vec::with_capacity(fields.len());
-            let mut ids = Vec::with_capacity(fields.len());
 
-            for (id, field) in fields.iter() {
+            for (_id, field) in fields.iter() {
                 let child = create_array(reader, field, variadic_counts, 
require_alignment)?;
-                children.push((field.as_ref().clone(), child));
-                ids.push(id);
+                children.push(child);
             }
 
-            let array = UnionArray::try_new(&ids, type_ids, value_offsets, 
children)?;
+            let array = UnionArray::try_new(fields.clone(), type_ids, 
value_offsets, children)?;
             Ok(Arc::new(array))
         }
         Null => {
diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index 97136bd97c2..ef08a6130e3 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -1526,6 +1526,7 @@ mod tests {
     use arrow_array::builder::UnionBuilder;
     use arrow_array::builder::{PrimitiveRunBuilder, UInt32Builder};
     use arrow_array::types::*;
+    use arrow_buffer::ScalarBuffer;
 
     use crate::convert::fb_to_schema;
     use crate::reader::*;
@@ -1800,12 +1801,12 @@ mod tests {
 
         // Dict field with id 2
         let dctfield = Field::new_dict("dict", array.data_type().clone(), 
false, 2, false);
+        let union_fields = [(0, Arc::new(dctfield))].into_iter().collect();
 
-        let types = Buffer::from_slice_ref([0_i8, 0, 0]);
-        let offsets = Buffer::from_slice_ref([0_i32, 1, 2]);
+        let types = [0, 0, 0].into_iter().collect::<ScalarBuffer<i8>>();
+        let offsets = [0, 1, 2].into_iter().collect::<ScalarBuffer<i32>>();
 
-        let union =
-            UnionArray::try_new(&[0], types, Some(offsets), vec![(dctfield, 
array)]).unwrap();
+        let union = UnionArray::try_new(union_fields, types, Some(offsets), 
vec![array]).unwrap();
 
         let schema = Arc::new(Schema::new(vec![Field::new(
             "union",
diff --git a/arrow-schema/src/fields.rs b/arrow-schema/src/fields.rs
index 5a1a6c84c25..63aef18ddf9 100644
--- a/arrow-schema/src/fields.rs
+++ b/arrow-schema/src/fields.rs
@@ -420,8 +420,6 @@ impl UnionFields {
     }
 
     /// Returns an iterator over the fields and type ids in this 
[`UnionFields`]
-    ///
-    /// Note: the iteration order is not guaranteed
     pub fn iter(&self) -> impl Iterator<Item = (i8, &FieldRef)> + '_ {
         self.0.iter().map(|(id, f)| (*id, f))
     }
diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs
index 8939d3f719f..a4dd2470ab6 100644
--- a/arrow-select/src/take.rs
+++ b/arrow-select/src/take.rs
@@ -229,18 +229,15 @@ fn take_impl<IndexType: ArrowPrimitiveType>(
             }
         }
         DataType::Union(fields, UnionMode::Sparse) => {
-            let mut field_type_ids = Vec::with_capacity(fields.len());
             let mut children = Vec::with_capacity(fields.len());
             let values = values.as_any().downcast_ref::<UnionArray>().unwrap();
-            let type_ids = take_native(values.type_ids(), 
indices).into_inner();
-            for (type_id, field) in fields.iter() {
+            let type_ids = take_native(values.type_ids(), indices);
+            for (type_id, _field) in fields.iter() {
                 let values = values.child(type_id);
                 let values = take_impl(values, indices)?;
-                let field = (**field).clone();
-                children.push((field, values));
-                field_type_ids.push(type_id);
+                children.push(values);
             }
-            let array = UnionArray::try_new(field_type_ids.as_slice(), 
type_ids, None, children)?;
+            let array = UnionArray::try_new(fields.clone(), type_ids, None, 
children)?;
             Ok(Arc::new(array))
         }
         t => unimplemented!("Take not supported for data type {:?}", t)
@@ -2151,19 +2148,22 @@ mod tests {
             None,
         ]);
         let strings = StringArray::from(vec![Some("a"), None, Some("c"), None, 
Some("d")]);
-        let type_ids = Buffer::from_slice_ref(vec![1i8; 5]);
+        let type_ids = [1; 5].into_iter().collect::<ScalarBuffer<i8>>();
 
-        let children: Vec<(Field, Arc<dyn Array>)> = vec![
+        let union_fields = [
             (
-                Field::new("f1", structs.data_type().clone(), true),
-                Arc::new(structs),
+                0,
+                Arc::new(Field::new("f1", structs.data_type().clone(), true)),
             ),
             (
-                Field::new("f2", strings.data_type().clone(), true),
-                Arc::new(strings),
+                1,
+                Arc::new(Field::new("f2", strings.data_type().clone(), true)),
             ),
-        ];
-        let array = UnionArray::try_new(&[0, 1], type_ids, None, 
children).unwrap();
+        ]
+        .into_iter()
+        .collect();
+        let children = vec![Arc::new(structs) as Arc<dyn Array>, 
Arc::new(strings)];
+        let array = UnionArray::try_new(union_fields, type_ids, None, 
children).unwrap();
 
         let indices = vec![0, 3, 1, 0, 2, 4];
         let index = UInt32Array::from(indices.clone());
diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs
index 83d3003a058..42e4da7c4b4 100644
--- a/arrow/tests/array_transform.rs
+++ b/arrow/tests/array_transform.rs
@@ -23,10 +23,10 @@ use arrow::array::{
 };
 use arrow::datatypes::Int16Type;
 use arrow_array::StringViewArray;
-use arrow_buffer::Buffer;
+use arrow_buffer::{Buffer, ScalarBuffer};
 use arrow_data::transform::MutableArrayData;
 use arrow_data::ArrayData;
-use arrow_schema::{DataType, Field, Fields};
+use arrow_schema::{DataType, Field, Fields, UnionFields};
 use std::sync::Arc;
 
 #[allow(unused)]
@@ -482,17 +482,25 @@ fn test_union_dense() {
         Some(4),
         Some(5),
     ]));
-    let offsets = Buffer::from_slice_ref([0, 0, 1, 1, 2, 2, 3, 4i32]);
-    let type_ids = Buffer::from_slice_ref([42, 84, 42, 84, 84, 42, 84, 84i8]);
+    let offsets = [0, 0, 1, 1, 2, 2, 3, 4]
+        .into_iter()
+        .collect::<ScalarBuffer<i32>>();
+    let type_ids = [42, 84, 42, 84, 84, 42, 84, 84]
+        .into_iter()
+        .collect::<ScalarBuffer<i8>>();
+
+    let union_fields = [
+        (84, Arc::new(Field::new("int", DataType::Int32, false))),
+        (42, Arc::new(Field::new("string", DataType::Utf8, false))),
+    ]
+    .into_iter()
+    .collect::<UnionFields>();
 
     let array = UnionArray::try_new(
-        &[84, 42],
+        union_fields.clone(),
         type_ids,
         Some(offsets),
-        vec![
-            (Field::new("int", DataType::Int32, false), ints),
-            (Field::new("string", DataType::Utf8, false), strings),
-        ],
+        vec![ints, strings],
     )
     .unwrap()
     .into_data();
@@ -507,19 +515,11 @@ fn test_union_dense() {
     // Expected data
     let strings: ArrayRef = Arc::new(StringArray::from(vec![Some("doe")]));
     let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(3), Some(4)]));
-    let offsets = Buffer::from_slice_ref([0, 0, 1i32]);
-    let type_ids = Buffer::from_slice_ref([84, 42, 84i8]);
+    let offsets = [0, 0, 1].into_iter().collect::<ScalarBuffer<i32>>();
+    let type_ids = [84, 42, 84].into_iter().collect::<ScalarBuffer<i8>>();
 
-    let expected = UnionArray::try_new(
-        &[84, 42],
-        type_ids,
-        Some(offsets),
-        vec![
-            (Field::new("int", DataType::Int32, false), ints),
-            (Field::new("string", DataType::Utf8, false), strings),
-        ],
-    )
-    .unwrap();
+    let expected =
+        UnionArray::try_new(union_fields, type_ids, Some(offsets), vec![ints, 
strings]).unwrap();
 
     assert_eq!(array.to_data(), expected.to_data());
 }


Reply via email to