This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 7cd29d735 Add ArrayData::new_null and DataType::primitive_width (#3676)
7cd29d735 is described below

commit 7cd29d7353369589c18377de4300c44f91a54462
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Feb 9 13:24:52 2023 +0000

    Add ArrayData::new_null and DataType::primitive_width (#3676)
    
    * Add ArrayData::new_null and DataType::primitive_width
    
    * Add FixedSizeBinary test
    
    * Update arrow-data/src/data.rs
    
    Co-authored-by: askoa <[email protected]>
    
    * Only generate nulls for first UnionArray child
    
    ---------
    
    Co-authored-by: askoa <[email protected]>
---
 arrow-array/src/array/mod.rs | 279 +++++++++-------------------------
 arrow-data/src/data.rs       | 354 +++++++++++++++++++------------------------
 arrow-schema/src/datatype.rs |  33 ++++
 arrow/src/ffi.rs             |  45 ++----
 4 files changed, 277 insertions(+), 434 deletions(-)

diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index e953781e5..b293d797e 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -20,7 +20,6 @@
 mod binary_array;
 
 use crate::types::*;
-use arrow_buffer::{Buffer, MutableBuffer, ToByteSlice};
 use arrow_data::ArrayData;
 use arrow_schema::{DataType, IntervalUnit, TimeUnit};
 use std::any::Any;
@@ -634,207 +633,7 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef {
 /// assert_eq!(&array, &null_array);
 /// ```
 pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
-    // context: https://github.com/apache/arrow/pull/9469#discussion_r574761687
-    match data_type {
-        DataType::Null => Arc::new(NullArray::new(length)),
-        DataType::Boolean => {
-            let null_buf: Buffer = MutableBuffer::new_null(length).into();
-            make_array(unsafe {
-                ArrayData::new_unchecked(
-                    data_type.clone(),
-                    length,
-                    Some(length),
-                    Some(null_buf.clone()),
-                    0,
-                    vec![null_buf],
-                    vec![],
-                )
-            })
-        }
-        DataType::Int8 => new_null_sized_array::<Int8Type>(data_type, length),
-        DataType::UInt8 => new_null_sized_array::<UInt8Type>(data_type, 
length),
-        DataType::Int16 => new_null_sized_array::<Int16Type>(data_type, 
length),
-        DataType::UInt16 => new_null_sized_array::<UInt16Type>(data_type, 
length),
-        DataType::Float16 => new_null_sized_array::<Float16Type>(data_type, 
length),
-        DataType::Int32 => new_null_sized_array::<Int32Type>(data_type, 
length),
-        DataType::UInt32 => new_null_sized_array::<UInt32Type>(data_type, 
length),
-        DataType::Float32 => new_null_sized_array::<Float32Type>(data_type, 
length),
-        DataType::Date32 => new_null_sized_array::<Date32Type>(data_type, 
length),
-        // expanding this into Date23{unit}Type results in needless branching
-        DataType::Time32(_) => new_null_sized_array::<Int32Type>(data_type, 
length),
-        DataType::Int64 => new_null_sized_array::<Int64Type>(data_type, 
length),
-        DataType::UInt64 => new_null_sized_array::<UInt64Type>(data_type, 
length),
-        DataType::Float64 => new_null_sized_array::<Float64Type>(data_type, 
length),
-        DataType::Date64 => new_null_sized_array::<Date64Type>(data_type, 
length),
-        // expanding this into Timestamp{unit}Type results in needless 
branching
-        DataType::Timestamp(_, _) => 
new_null_sized_array::<Int64Type>(data_type, length),
-        DataType::Time64(_) => new_null_sized_array::<Int64Type>(data_type, 
length),
-        DataType::Duration(_) => new_null_sized_array::<Int64Type>(data_type, 
length),
-        DataType::Interval(unit) => match unit {
-            IntervalUnit::YearMonth => {
-                new_null_sized_array::<IntervalYearMonthType>(data_type, 
length)
-            }
-            IntervalUnit::DayTime => {
-                new_null_sized_array::<IntervalDayTimeType>(data_type, length)
-            }
-            IntervalUnit::MonthDayNano => {
-                new_null_sized_array::<IntervalMonthDayNanoType>(data_type, 
length)
-            }
-        },
-        DataType::FixedSizeBinary(value_len) => make_array(unsafe {
-            ArrayData::new_unchecked(
-                data_type.clone(),
-                length,
-                Some(length),
-                Some(MutableBuffer::new_null(length).into()),
-                0,
-                vec![Buffer::from(vec![0u8; *value_len as usize * length])],
-                vec![],
-            )
-        }),
-        DataType::Binary | DataType::Utf8 => {
-            new_null_binary_array::<i32>(data_type, length)
-        }
-        DataType::LargeBinary | DataType::LargeUtf8 => {
-            new_null_binary_array::<i64>(data_type, length)
-        }
-        DataType::List(field) => {
-            new_null_list_array::<i32>(data_type, field.data_type(), length)
-        }
-        DataType::LargeList(field) => {
-            new_null_list_array::<i64>(data_type, field.data_type(), length)
-        }
-        DataType::FixedSizeList(field, value_len) => make_array(unsafe {
-            ArrayData::new_unchecked(
-                data_type.clone(),
-                length,
-                Some(length),
-                Some(MutableBuffer::new_null(length).into()),
-                0,
-                vec![],
-                vec![
-                    new_null_array(field.data_type(), *value_len as usize * 
length)
-                        .data()
-                        .clone(),
-                ],
-            )
-        }),
-        DataType::Struct(fields) => {
-            let fields: Vec<_> = fields
-                .iter()
-                .map(|field| (field.clone(), new_null_array(field.data_type(), 
length)))
-                .collect();
-
-            let null_buffer = MutableBuffer::new_null(length);
-            Arc::new(StructArray::from((fields, null_buffer.into())))
-        }
-        DataType::Map(field, _keys_sorted) => {
-            new_null_list_array::<i32>(data_type, field.data_type(), length)
-        }
-        DataType::Union(_, _, _) => {
-            unimplemented!("Creating null Union array not yet supported")
-        }
-        DataType::Dictionary(key, value) => {
-            let keys = new_null_array(key, length);
-            let keys = keys.data();
-
-            make_array(unsafe {
-                ArrayData::new_unchecked(
-                    data_type.clone(),
-                    length,
-                    Some(length),
-                    keys.null_buffer().cloned(),
-                    0,
-                    keys.buffers().into(),
-                    vec![new_empty_array(value.as_ref()).into_data()],
-                )
-            })
-        }
-        DataType::Decimal128(_, _) => {
-            new_null_sized_decimal(data_type, length, 
std::mem::size_of::<i128>())
-        }
-        DataType::Decimal256(_, _) => new_null_sized_decimal(data_type, 
length, 32),
-        DataType::RunEndEncoded(_, _) => todo!(),
-    }
-}
-
-#[inline]
-fn new_null_list_array<OffsetSize: OffsetSizeTrait>(
-    data_type: &DataType,
-    child_data_type: &DataType,
-    length: usize,
-) -> ArrayRef {
-    make_array(unsafe {
-        ArrayData::new_unchecked(
-            data_type.clone(),
-            length,
-            Some(length),
-            Some(MutableBuffer::new_null(length).into()),
-            0,
-            vec![Buffer::from(
-                vec![OffsetSize::zero(); length + 1].to_byte_slice(),
-            )],
-            vec![ArrayData::new_empty(child_data_type)],
-        )
-    })
-}
-
-#[inline]
-fn new_null_binary_array<OffsetSize: OffsetSizeTrait>(
-    data_type: &DataType,
-    length: usize,
-) -> ArrayRef {
-    make_array(unsafe {
-        ArrayData::new_unchecked(
-            data_type.clone(),
-            length,
-            Some(length),
-            Some(MutableBuffer::new_null(length).into()),
-            0,
-            vec![
-                Buffer::from(vec![OffsetSize::zero(); length + 
1].to_byte_slice()),
-                MutableBuffer::new(0).into(),
-            ],
-            vec![],
-        )
-    })
-}
-
-#[inline]
-fn new_null_sized_array<T: ArrowPrimitiveType>(
-    data_type: &DataType,
-    length: usize,
-) -> ArrayRef {
-    make_array(unsafe {
-        ArrayData::new_unchecked(
-            data_type.clone(),
-            length,
-            Some(length),
-            Some(MutableBuffer::new_null(length).into()),
-            0,
-            vec![Buffer::from(vec![0u8; length * T::get_byte_width()])],
-            vec![],
-        )
-    })
-}
-
-#[inline]
-fn new_null_sized_decimal(
-    data_type: &DataType,
-    length: usize,
-    byte_width: usize,
-) -> ArrayRef {
-    make_array(unsafe {
-        ArrayData::new_unchecked(
-            data_type.clone(),
-            length,
-            Some(length),
-            Some(MutableBuffer::new_null(length).into()),
-            0,
-            vec![Buffer::from(vec![0u8; length * byte_width])],
-            vec![],
-        )
-    })
+    make_array(ArrayData::new_null(data_type, length))
 }
 
 // Helper function for printing potentially long arrays.
@@ -881,8 +680,10 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::cast::downcast_array;
-    use arrow_schema::Field;
+    use crate::cast::{as_union_array, downcast_array};
+    use crate::downcast_run_array;
+    use arrow_buffer::{Buffer, MutableBuffer};
+    use arrow_schema::{Field, UnionMode};
 
     #[test]
     fn test_empty_primitive() {
@@ -1012,6 +813,76 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_null_union() {
+        for mode in [UnionMode::Sparse, UnionMode::Dense] {
+            let data_type = DataType::Union(
+                vec![
+                    Field::new("foo", DataType::Int32, true),
+                    Field::new("bar", DataType::Int64, true),
+                ],
+                vec![2, 1],
+                mode,
+            );
+            let array = new_null_array(&data_type, 4);
+
+            let array = as_union_array(array.as_ref());
+            assert_eq!(array.len(), 4);
+            assert_eq!(array.null_count(), 0);
+
+            for i in 0..4 {
+                let a = array.value(i);
+                assert_eq!(a.len(), 1);
+                assert_eq!(a.null_count(), 1);
+                assert!(a.is_null(0))
+            }
+        }
+    }
+
+    #[test]
+    #[allow(unused_parens)]
+    fn test_null_runs() {
+        for r in [DataType::Int16, DataType::Int32, DataType::Int64] {
+            let data_type = DataType::RunEndEncoded(
+                Box::new(Field::new("run_ends", r, false)),
+                Box::new(Field::new("values", DataType::Utf8, true)),
+            );
+
+            let array = new_null_array(&data_type, 4);
+            let array = array.as_ref();
+
+            downcast_run_array! {
+                array => {
+                    assert_eq!(array.len(), 4);
+                    assert_eq!(array.null_count(), 0);
+                    assert_eq!(array.values().len(), 1);
+                    assert_eq!(array.values().null_count(), 1);
+                    assert_eq!(array.run_ends().values(), &[4]);
+
+                    let idx = array.get_physical_indices(&[0, 1, 2, 
3]).unwrap();
+                    assert_eq!(idx, &[0,0,0,0]);
+                }
+                d => unreachable!("{d}")
+            }
+        }
+    }
+
+    #[test]
+    fn test_null_fixed_size_binary() {
+        for size in [1, 2, 7] {
+            let array = new_null_array(&DataType::FixedSizeBinary(size), 6);
+            let array = array
+                .as_ref()
+                .as_any()
+                .downcast_ref::<FixedSizeBinaryArray>()
+                .unwrap();
+
+            assert_eq!(array.len(), 6);
+            assert_eq!(array.null_count(), 6);
+            array.iter().for_each(|x| assert!(x.is_none()));
+        }
+    }
+
     #[test]
     fn test_memory_size_null() {
         let null_arr = NullArray::new(32);
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 709262e83..8b727ec95 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -21,8 +21,7 @@
 use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap};
 use arrow_buffer::bit_chunk_iterator::BitChunks;
 use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer};
-use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
-use half::f16;
+use arrow_schema::{ArrowError, DataType, UnionMode};
 use std::convert::TryInto;
 use std::mem;
 use std::ops::Range;
@@ -69,71 +68,25 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
             let buffer = MutableBuffer::new(bytes);
             [buffer, empty_buffer]
         }
-        DataType::UInt8 => [
-            MutableBuffer::new(capacity * mem::size_of::<u8>()),
-            empty_buffer,
-        ],
-        DataType::UInt16 => [
-            MutableBuffer::new(capacity * mem::size_of::<u16>()),
-            empty_buffer,
-        ],
-        DataType::UInt32 => [
-            MutableBuffer::new(capacity * mem::size_of::<u32>()),
-            empty_buffer,
-        ],
-        DataType::UInt64 => [
-            MutableBuffer::new(capacity * mem::size_of::<u64>()),
-            empty_buffer,
-        ],
-        DataType::Int8 => [
-            MutableBuffer::new(capacity * mem::size_of::<i8>()),
-            empty_buffer,
-        ],
-        DataType::Int16 => [
-            MutableBuffer::new(capacity * mem::size_of::<i16>()),
-            empty_buffer,
-        ],
-        DataType::Int32 => [
-            MutableBuffer::new(capacity * mem::size_of::<i32>()),
-            empty_buffer,
-        ],
-        DataType::Int64 => [
-            MutableBuffer::new(capacity * mem::size_of::<i64>()),
-            empty_buffer,
-        ],
-        DataType::Float16 => [
-            MutableBuffer::new(capacity * mem::size_of::<f16>()),
-            empty_buffer,
-        ],
-        DataType::Float32 => [
-            MutableBuffer::new(capacity * mem::size_of::<f32>()),
-            empty_buffer,
-        ],
-        DataType::Float64 => [
-            MutableBuffer::new(capacity * mem::size_of::<f64>()),
-            empty_buffer,
-        ],
-        DataType::Date32 | DataType::Time32(_) => [
-            MutableBuffer::new(capacity * mem::size_of::<i32>()),
-            empty_buffer,
-        ],
-        DataType::Date64
+        DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64
+        | DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::Float16
+        | DataType::Float32
+        | DataType::Float64
+        | DataType::Date32
+        | DataType::Time32(_)
+        | DataType::Date64
         | DataType::Time64(_)
         | DataType::Duration(_)
-        | DataType::Timestamp(_, _) => [
-            MutableBuffer::new(capacity * mem::size_of::<i64>()),
-            empty_buffer,
-        ],
-        DataType::Interval(IntervalUnit::YearMonth) => [
-            MutableBuffer::new(capacity * mem::size_of::<i32>()),
-            empty_buffer,
-        ],
-        DataType::Interval(IntervalUnit::DayTime) => [
-            MutableBuffer::new(capacity * mem::size_of::<i64>()),
-            empty_buffer,
-        ],
-        DataType::Interval(IntervalUnit::MonthDayNano) => [
-            MutableBuffer::new(capacity * mem::size_of::<i128>()),
+        | DataType::Timestamp(_, _)
+        | DataType::Interval(_) => [
+            MutableBuffer::new(capacity * 
data_type.primitive_width().unwrap()),
             empty_buffer,
         ],
         DataType::Utf8 | DataType::Binary => {
@@ -163,41 +116,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuff
         DataType::FixedSizeBinary(size) => {
             [MutableBuffer::new(capacity * *size as usize), empty_buffer]
         }
-        DataType::Dictionary(child_data_type, _) => match 
child_data_type.as_ref() {
-            DataType::UInt8 => [
-                MutableBuffer::new(capacity * mem::size_of::<u8>()),
-                empty_buffer,
-            ],
-            DataType::UInt16 => [
-                MutableBuffer::new(capacity * mem::size_of::<u16>()),
-                empty_buffer,
-            ],
-            DataType::UInt32 => [
-                MutableBuffer::new(capacity * mem::size_of::<u32>()),
-                empty_buffer,
-            ],
-            DataType::UInt64 => [
-                MutableBuffer::new(capacity * mem::size_of::<u64>()),
-                empty_buffer,
-            ],
-            DataType::Int8 => [
-                MutableBuffer::new(capacity * mem::size_of::<i8>()),
-                empty_buffer,
-            ],
-            DataType::Int16 => [
-                MutableBuffer::new(capacity * mem::size_of::<i16>()),
-                empty_buffer,
-            ],
-            DataType::Int32 => [
-                MutableBuffer::new(capacity * mem::size_of::<i32>()),
-                empty_buffer,
-            ],
-            DataType::Int64 => [
-                MutableBuffer::new(capacity * mem::size_of::<i64>()),
-                empty_buffer,
-            ],
-            _ => unreachable!(),
-        },
+        DataType::Dictionary(k, _) => [
+            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
+            empty_buffer,
+        ],
         DataType::FixedSizeList(_, _)
         | DataType::Struct(_)
         | DataType::RunEndEncoded(_, _) => [empty_buffer, 
MutableBuffer::new(0)],
@@ -667,83 +589,125 @@ impl ArrayData {
         &values.1[self.offset..]
     }
 
-    /// Returns a new empty [ArrayData] valid for `data_type`.
-    pub fn new_empty(data_type: &DataType) -> Self {
-        let buffers = new_buffers(data_type, 0);
-        let [buffer1, buffer2] = buffers;
-        let buffers = into_buffers(data_type, buffer1, buffer2);
-
-        let child_data = match data_type {
-            DataType::Null
-            | DataType::Boolean
-            | DataType::UInt8
-            | DataType::UInt16
-            | DataType::UInt32
-            | DataType::UInt64
-            | DataType::Int8
-            | DataType::Int16
-            | DataType::Int32
-            | DataType::Int64
-            | DataType::Float16
-            | DataType::Float32
-            | DataType::Float64
-            | DataType::Date32
-            | DataType::Date64
-            | DataType::Time32(_)
-            | DataType::Time64(_)
-            | DataType::Duration(_)
-            | DataType::Timestamp(_, _)
-            | DataType::Utf8
-            | DataType::Binary
-            | DataType::LargeUtf8
-            | DataType::LargeBinary
-            | DataType::Interval(_)
-            | DataType::FixedSizeBinary(_)
-            | DataType::Decimal128(_, _)
-            | DataType::Decimal256(_, _) => vec![],
-            DataType::List(field) => {
-                vec![Self::new_empty(field.data_type())]
-            }
-            DataType::FixedSizeList(field, _) => {
-                vec![Self::new_empty(field.data_type())]
-            }
-            DataType::LargeList(field) => {
-                vec![Self::new_empty(field.data_type())]
-            }
-            DataType::Struct(fields) => fields
-                .iter()
-                .map(|field| Self::new_empty(field.data_type()))
-                .collect(),
-            DataType::Map(field, _) => {
-                vec![Self::new_empty(field.data_type())]
-            }
-            DataType::Union(fields, _, _) => fields
-                .iter()
-                .map(|field| Self::new_empty(field.data_type()))
-                .collect(),
-            DataType::Dictionary(_, data_type) => {
-                vec![Self::new_empty(data_type)]
-            }
-            DataType::RunEndEncoded(run_ends, values) => {
-                vec![
-                    Self::new_empty(run_ends.data_type()),
-                    Self::new_empty(values.data_type()),
-                ]
-            }
+    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
+    pub fn new_null(data_type: &DataType, len: usize) -> Self {
+        let bit_len = bit_util::ceil(len, 8);
+        let zeroed = |len: usize| 
Buffer::from(MutableBuffer::from_len_zeroed(len));
+
+        let (buffers, child_data, has_nulls) = match 
data_type.primitive_width() {
+            Some(width) => (vec![zeroed(width * len)], vec![], true),
+            None => match data_type {
+                DataType::Null => (vec![], vec![], false),
+                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
+                DataType::Binary | DataType::Utf8 => {
+                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
+                }
+                DataType::LargeBinary | DataType::LargeUtf8 => {
+                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
+                }
+                DataType::FixedSizeBinary(i) => {
+                    (vec![zeroed(*i as usize * len)], vec![], true)
+                }
+                DataType::List(f) | DataType::Map(f, _) => (
+                    vec![zeroed((len + 1) * 4)],
+                    vec![ArrayData::new_empty(f.data_type())],
+                    true,
+                ),
+                DataType::LargeList(f) => (
+                    vec![zeroed((len + 1) * 8)],
+                    vec![ArrayData::new_empty(f.data_type())],
+                    true,
+                ),
+                DataType::FixedSizeList(f, list_len) => (
+                    vec![],
+                    vec![ArrayData::new_null(f.data_type(), *list_len as usize 
* len)],
+                    true,
+                ),
+                DataType::Struct(fields) => (
+                    vec![],
+                    fields
+                        .iter()
+                        .map(|f| Self::new_null(f.data_type(), len))
+                        .collect(),
+                    true,
+                ),
+                DataType::Dictionary(k, v) => (
+                    vec![zeroed(k.primitive_width().unwrap() * len)],
+                    vec![ArrayData::new_empty(v.as_ref())],
+                    true,
+                ),
+                DataType::Union(f, i, mode) => {
+                    let ids = 
Buffer::from_iter(std::iter::repeat(i[0]).take(len));
+                    let buffers = match mode {
+                        UnionMode::Sparse => vec![ids],
+                        UnionMode::Dense => {
+                            let end_offset = i32::from_usize(len).unwrap();
+                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
+                        }
+                    };
+
+                    let children = f
+                        .iter()
+                        .enumerate()
+                        .map(|(idx, f)| match idx {
+                            0 => Self::new_null(f.data_type(), len),
+                            _ => Self::new_empty(f.data_type()),
+                        })
+                        .collect();
+
+                    (buffers, children, false)
+                }
+                DataType::RunEndEncoded(r, v) => {
+                    let runs = match r.data_type() {
+                        DataType::Int16 => {
+                            let i = i16::from_usize(len).expect("run 
overflow");
+                            Buffer::from_slice_ref([i])
+                        }
+                        DataType::Int32 => {
+                            let i = i32::from_usize(len).expect("run 
overflow");
+                            Buffer::from_slice_ref([i])
+                        }
+                        DataType::Int64 => {
+                            let i = i64::from_usize(len).expect("run 
overflow");
+                            Buffer::from_slice_ref([i])
+                        }
+                        dt => unreachable!("Invalid run ends data type {dt}"),
+                    };
+
+                    let builder = ArrayData::builder(r.data_type().clone())
+                        .len(1)
+                        .buffers(vec![runs]);
+
+                    // SAFETY:
+                    // Valid by construction
+                    let runs = unsafe { builder.build_unchecked() };
+                    (
+                        vec![],
+                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
+                        false,
+                    )
+                }
+                d => unreachable!("{d}"),
+            },
         };
 
-        // Data was constructed correctly above
-        unsafe {
-            Self::new_unchecked(
-                data_type.clone(),
-                0,
-                Some(0),
-                None,
-                0,
-                buffers,
-                child_data,
-            )
+        let mut builder = ArrayDataBuilder::new(data_type.clone())
+            .len(len)
+            .buffers(buffers)
+            .child_data(child_data);
+
+        if has_nulls {
+            builder = 
builder.null_count(len).null_bit_buffer(Some(zeroed(len)))
         }
+
+        // SAFETY:
+        // Data valid by construction
+        unsafe { builder.build_unchecked() }
+    }
+
+    /// Returns a new empty [ArrayData] valid for `data_type`.
+    pub fn new_empty(data_type: &DataType) -> Self {
+        Self::new_null(data_type, 0)
     }
 
     /// "cheap" validation of an `ArrayData`. Ensures buffers are
@@ -1578,30 +1542,24 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
             buffers: vec![BufferSpec::BitMap],
             can_contain_null_mask: true,
         },
-        DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::<i8>()),
-        DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::<i16>()),
-        DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
-        DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
-        DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::<u8>()),
-        DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::<u16>()),
-        DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::<u32>()),
-        DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::<u64>()),
-        DataType::Float16 => DataTypeLayout::new_fixed_width(size_of::<f16>()),
-        DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::<f32>()),
-        DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::<f64>()),
-        DataType::Timestamp(_, _) => 
DataTypeLayout::new_fixed_width(size_of::<i64>()),
-        DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
-        DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
-        DataType::Time32(_) => 
DataTypeLayout::new_fixed_width(size_of::<i32>()),
-        DataType::Time64(_) => 
DataTypeLayout::new_fixed_width(size_of::<i64>()),
-        DataType::Interval(IntervalUnit::YearMonth) => {
-            DataTypeLayout::new_fixed_width(size_of::<i32>())
-        }
-        DataType::Interval(IntervalUnit::DayTime) => {
-            DataTypeLayout::new_fixed_width(size_of::<i64>())
-        }
-        DataType::Interval(IntervalUnit::MonthDayNano) => {
-            DataTypeLayout::new_fixed_width(size_of::<i128>())
+        DataType::Int8
+        | DataType::Int16
+        | DataType::Int32
+        | DataType::Int64
+        | DataType::UInt8
+        | DataType::UInt16
+        | DataType::UInt32
+        | DataType::UInt64
+        | DataType::Float16
+        | DataType::Float32
+        | DataType::Float64
+        | DataType::Timestamp(_, _)
+        | DataType::Date32
+        | DataType::Date64
+        | DataType::Time32(_)
+        | DataType::Time64(_)
+        | DataType::Interval(_) => {
+            
DataTypeLayout::new_fixed_width(data_type.primitive_width().unwrap())
         }
         DataType::Duration(_) => 
DataTypeLayout::new_fixed_width(size_of::<i64>()),
         DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()),
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index 9476535fa..56eb6e8ce 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -410,6 +410,39 @@ impl DataType {
         }
     }
 
+    /// Returns the byte width of this type if it is a primitive type
+    ///
+    /// Returns `None` if not a primitive type
+    #[inline]
+    pub fn primitive_width(&self) -> Option<usize> {
+        match self {
+            DataType::Null => None,
+            DataType::Boolean => None,
+            DataType::Int8 | DataType::UInt8 => Some(1),
+            DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
+            DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
+            DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
+            DataType::Timestamp(_, _) => Some(8),
+            DataType::Date32 | DataType::Time32(_) => Some(4),
+            DataType::Date64 | DataType::Time64(_) => Some(8),
+            DataType::Duration(_) => Some(8),
+            DataType::Interval(IntervalUnit::YearMonth) => Some(4),
+            DataType::Interval(IntervalUnit::DayTime) => Some(8),
+            DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
+            DataType::Decimal128(_, _) => Some(16),
+            DataType::Decimal256(_, _) => Some(32),
+            DataType::Utf8 | DataType::LargeUtf8 => None,
+            DataType::Binary | DataType::LargeBinary => None,
+            DataType::FixedSizeBinary(_) => None,
+            DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _) 
=> None,
+            DataType::FixedSizeList(_, _) => None,
+            DataType::Struct(_) => None,
+            DataType::Union(_, _, _) => None,
+            DataType::Dictionary(_, _) => None,
+            DataType::RunEndEncoded(_, _) => None,
+        }
+    }
+
     /// Return size of this instance in bytes.
     ///
     /// Includes the size of `Self`.
diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs
index dc234c859..78dd1ef45 100644
--- a/arrow/src/ffi.rs
+++ b/arrow/src/ffi.rs
@@ -120,7 +120,6 @@ use std::{
     sync::Arc,
 };
 
-use arrow_buffer::i256;
 use arrow_schema::UnionMode;
 use bitflags::bitflags;
 
@@ -311,39 +310,21 @@ impl Drop for FFI_ArrowSchema {
 // This is set by the Arrow specification
 #[allow(clippy::manual_bits)]
 fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
+    if let Some(primitive) = data_type.primitive_width() {
+        return match i {
+            0 => Err(ArrowError::CDataInterface(format!(
+                "The datatype \"{data_type:?}\" doesn't expect buffer at index 
0. Please verify that the C data interface is correctly implemented."
+            ))),
+            1 => Ok(primitive * 8),
+            i => Err(ArrowError::CDataInterface(format!(
+                "The datatype \"{data_type:?}\" expects 2 buffers, but 
requested {i}. Please verify that the C data interface is correctly 
implemented."
+            ))),
+        };
+    }
+
     Ok(match (data_type, i) {
-        // primitive types first buffer's size is given by the native types
         (DataType::Boolean, 1) => 1,
-        (DataType::UInt8, 1) => size_of::<u8>() * 8,
-        (DataType::UInt16, 1) => size_of::<u16>() * 8,
-        (DataType::UInt32, 1) => size_of::<u32>() * 8,
-        (DataType::UInt64, 1) => size_of::<u64>() * 8,
-        (DataType::Int8, 1) => size_of::<i8>() * 8,
-        (DataType::Int16, 1) => size_of::<i16>() * 8,
-        (DataType::Int32, 1) | (DataType::Date32, 1) | (DataType::Time32(_), 
1) => size_of::<i32>() * 8,
-        (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_), 
1) => size_of::<i64>() * 8,
-        (DataType::Float32, 1) => size_of::<f32>() * 8,
-        (DataType::Float64, 1) => size_of::<f64>() * 8,
-        (DataType::Decimal128(..), 1) => size_of::<i128>() * 8,
-        (DataType::Decimal256(..), 1) => size_of::<i256>() * 8,
-        (DataType::Timestamp(..), 1) => size_of::<i64>() * 8,
-        (DataType::Duration(..), 1) => size_of::<i64>() * 8,
-        // primitive types have a single buffer
-        (DataType::Boolean, _) |
-        (DataType::UInt8, _) |
-        (DataType::UInt16, _) |
-        (DataType::UInt32, _) |
-        (DataType::UInt64, _) |
-        (DataType::Int8, _) |
-        (DataType::Int16, _) |
-        (DataType::Int32, _) | (DataType::Date32, _) | (DataType::Time32(_), 
_) |
-        (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_), 
_) |
-        (DataType::Float32, _) |
-        (DataType::Float64, _) |
-        (DataType::Decimal128(..), _) |
-        (DataType::Decimal256(..), _) |
-        (DataType::Timestamp(..), _) |
-        (DataType::Duration(..), _) => {
+        (DataType::Boolean, _) => {
             return Err(ArrowError::CDataInterface(format!(
                 "The datatype \"{data_type:?}\" expects 2 buffers, but 
requested {i}. Please verify that the C data interface is correctly 
implemented."
             )))

Reply via email to