This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7cd29d735 Add ArrayData::new_null and DataType::primitive_width (#3676)
7cd29d735 is described below
commit 7cd29d7353369589c18377de4300c44f91a54462
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Feb 9 13:24:52 2023 +0000
Add ArrayData::new_null and DataType::primitive_width (#3676)
* Add ArrayData::new_null and DataType::primitive_width
* Add FixedSizeBinary test
* Update arrow-data/src/data.rs
Co-authored-by: askoa <[email protected]>
* Only generate nulls for first UnionArray child
---------
Co-authored-by: askoa <[email protected]>
---
arrow-array/src/array/mod.rs | 279 +++++++++-------------------------
arrow-data/src/data.rs | 354 +++++++++++++++++++------------------------
arrow-schema/src/datatype.rs | 33 ++++
arrow/src/ffi.rs | 45 ++----
4 files changed, 277 insertions(+), 434 deletions(-)
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index e953781e5..b293d797e 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -20,7 +20,6 @@
mod binary_array;
use crate::types::*;
-use arrow_buffer::{Buffer, MutableBuffer, ToByteSlice};
use arrow_data::ArrayData;
use arrow_schema::{DataType, IntervalUnit, TimeUnit};
use std::any::Any;
@@ -634,207 +633,7 @@ pub fn new_empty_array(data_type: &DataType) -> ArrayRef {
/// assert_eq!(&array, &null_array);
/// ```
pub fn new_null_array(data_type: &DataType, length: usize) -> ArrayRef {
- // context: https://github.com/apache/arrow/pull/9469#discussion_r574761687
- match data_type {
- DataType::Null => Arc::new(NullArray::new(length)),
- DataType::Boolean => {
- let null_buf: Buffer = MutableBuffer::new_null(length).into();
- make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(null_buf.clone()),
- 0,
- vec![null_buf],
- vec![],
- )
- })
- }
- DataType::Int8 => new_null_sized_array::<Int8Type>(data_type, length),
- DataType::UInt8 => new_null_sized_array::<UInt8Type>(data_type,
length),
- DataType::Int16 => new_null_sized_array::<Int16Type>(data_type,
length),
- DataType::UInt16 => new_null_sized_array::<UInt16Type>(data_type,
length),
- DataType::Float16 => new_null_sized_array::<Float16Type>(data_type,
length),
- DataType::Int32 => new_null_sized_array::<Int32Type>(data_type,
length),
- DataType::UInt32 => new_null_sized_array::<UInt32Type>(data_type,
length),
- DataType::Float32 => new_null_sized_array::<Float32Type>(data_type,
length),
- DataType::Date32 => new_null_sized_array::<Date32Type>(data_type,
length),
- // expanding this into Date23{unit}Type results in needless branching
- DataType::Time32(_) => new_null_sized_array::<Int32Type>(data_type,
length),
- DataType::Int64 => new_null_sized_array::<Int64Type>(data_type,
length),
- DataType::UInt64 => new_null_sized_array::<UInt64Type>(data_type,
length),
- DataType::Float64 => new_null_sized_array::<Float64Type>(data_type,
length),
- DataType::Date64 => new_null_sized_array::<Date64Type>(data_type,
length),
- // expanding this into Timestamp{unit}Type results in needless
branching
- DataType::Timestamp(_, _) =>
new_null_sized_array::<Int64Type>(data_type, length),
- DataType::Time64(_) => new_null_sized_array::<Int64Type>(data_type,
length),
- DataType::Duration(_) => new_null_sized_array::<Int64Type>(data_type,
length),
- DataType::Interval(unit) => match unit {
- IntervalUnit::YearMonth => {
- new_null_sized_array::<IntervalYearMonthType>(data_type,
length)
- }
- IntervalUnit::DayTime => {
- new_null_sized_array::<IntervalDayTimeType>(data_type, length)
- }
- IntervalUnit::MonthDayNano => {
- new_null_sized_array::<IntervalMonthDayNanoType>(data_type,
length)
- }
- },
- DataType::FixedSizeBinary(value_len) => make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(MutableBuffer::new_null(length).into()),
- 0,
- vec![Buffer::from(vec![0u8; *value_len as usize * length])],
- vec![],
- )
- }),
- DataType::Binary | DataType::Utf8 => {
- new_null_binary_array::<i32>(data_type, length)
- }
- DataType::LargeBinary | DataType::LargeUtf8 => {
- new_null_binary_array::<i64>(data_type, length)
- }
- DataType::List(field) => {
- new_null_list_array::<i32>(data_type, field.data_type(), length)
- }
- DataType::LargeList(field) => {
- new_null_list_array::<i64>(data_type, field.data_type(), length)
- }
- DataType::FixedSizeList(field, value_len) => make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(MutableBuffer::new_null(length).into()),
- 0,
- vec![],
- vec![
- new_null_array(field.data_type(), *value_len as usize *
length)
- .data()
- .clone(),
- ],
- )
- }),
- DataType::Struct(fields) => {
- let fields: Vec<_> = fields
- .iter()
- .map(|field| (field.clone(), new_null_array(field.data_type(),
length)))
- .collect();
-
- let null_buffer = MutableBuffer::new_null(length);
- Arc::new(StructArray::from((fields, null_buffer.into())))
- }
- DataType::Map(field, _keys_sorted) => {
- new_null_list_array::<i32>(data_type, field.data_type(), length)
- }
- DataType::Union(_, _, _) => {
- unimplemented!("Creating null Union array not yet supported")
- }
- DataType::Dictionary(key, value) => {
- let keys = new_null_array(key, length);
- let keys = keys.data();
-
- make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- keys.null_buffer().cloned(),
- 0,
- keys.buffers().into(),
- vec![new_empty_array(value.as_ref()).into_data()],
- )
- })
- }
- DataType::Decimal128(_, _) => {
- new_null_sized_decimal(data_type, length,
std::mem::size_of::<i128>())
- }
- DataType::Decimal256(_, _) => new_null_sized_decimal(data_type,
length, 32),
- DataType::RunEndEncoded(_, _) => todo!(),
- }
-}
-
-#[inline]
-fn new_null_list_array<OffsetSize: OffsetSizeTrait>(
- data_type: &DataType,
- child_data_type: &DataType,
- length: usize,
-) -> ArrayRef {
- make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(MutableBuffer::new_null(length).into()),
- 0,
- vec![Buffer::from(
- vec![OffsetSize::zero(); length + 1].to_byte_slice(),
- )],
- vec![ArrayData::new_empty(child_data_type)],
- )
- })
-}
-
-#[inline]
-fn new_null_binary_array<OffsetSize: OffsetSizeTrait>(
- data_type: &DataType,
- length: usize,
-) -> ArrayRef {
- make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(MutableBuffer::new_null(length).into()),
- 0,
- vec![
- Buffer::from(vec![OffsetSize::zero(); length +
1].to_byte_slice()),
- MutableBuffer::new(0).into(),
- ],
- vec![],
- )
- })
-}
-
-#[inline]
-fn new_null_sized_array<T: ArrowPrimitiveType>(
- data_type: &DataType,
- length: usize,
-) -> ArrayRef {
- make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(MutableBuffer::new_null(length).into()),
- 0,
- vec![Buffer::from(vec![0u8; length * T::get_byte_width()])],
- vec![],
- )
- })
-}
-
-#[inline]
-fn new_null_sized_decimal(
- data_type: &DataType,
- length: usize,
- byte_width: usize,
-) -> ArrayRef {
- make_array(unsafe {
- ArrayData::new_unchecked(
- data_type.clone(),
- length,
- Some(length),
- Some(MutableBuffer::new_null(length).into()),
- 0,
- vec![Buffer::from(vec![0u8; length * byte_width])],
- vec![],
- )
- })
+ make_array(ArrayData::new_null(data_type, length))
}
// Helper function for printing potentially long arrays.
@@ -881,8 +680,10 @@ where
#[cfg(test)]
mod tests {
use super::*;
- use crate::cast::downcast_array;
- use arrow_schema::Field;
+ use crate::cast::{as_union_array, downcast_array};
+ use crate::downcast_run_array;
+ use arrow_buffer::{Buffer, MutableBuffer};
+ use arrow_schema::{Field, UnionMode};
#[test]
fn test_empty_primitive() {
@@ -1012,6 +813,76 @@ mod tests {
);
}
+ #[test]
+ fn test_null_union() {
+ for mode in [UnionMode::Sparse, UnionMode::Dense] {
+ let data_type = DataType::Union(
+ vec![
+ Field::new("foo", DataType::Int32, true),
+ Field::new("bar", DataType::Int64, true),
+ ],
+ vec![2, 1],
+ mode,
+ );
+ let array = new_null_array(&data_type, 4);
+
+ let array = as_union_array(array.as_ref());
+ assert_eq!(array.len(), 4);
+ assert_eq!(array.null_count(), 0);
+
+ for i in 0..4 {
+ let a = array.value(i);
+ assert_eq!(a.len(), 1);
+ assert_eq!(a.null_count(), 1);
+ assert!(a.is_null(0))
+ }
+ }
+ }
+
+ #[test]
+ #[allow(unused_parens)]
+ fn test_null_runs() {
+ for r in [DataType::Int16, DataType::Int32, DataType::Int64] {
+ let data_type = DataType::RunEndEncoded(
+ Box::new(Field::new("run_ends", r, false)),
+ Box::new(Field::new("values", DataType::Utf8, true)),
+ );
+
+ let array = new_null_array(&data_type, 4);
+ let array = array.as_ref();
+
+ downcast_run_array! {
+ array => {
+ assert_eq!(array.len(), 4);
+ assert_eq!(array.null_count(), 0);
+ assert_eq!(array.values().len(), 1);
+ assert_eq!(array.values().null_count(), 1);
+ assert_eq!(array.run_ends().values(), &[4]);
+
+ let idx = array.get_physical_indices(&[0, 1, 2,
3]).unwrap();
+ assert_eq!(idx, &[0,0,0,0]);
+ }
+ d => unreachable!("{d}")
+ }
+ }
+ }
+
+ #[test]
+ fn test_null_fixed_size_binary() {
+ for size in [1, 2, 7] {
+ let array = new_null_array(&DataType::FixedSizeBinary(size), 6);
+ let array = array
+ .as_ref()
+ .as_any()
+ .downcast_ref::<FixedSizeBinaryArray>()
+ .unwrap();
+
+ assert_eq!(array.len(), 6);
+ assert_eq!(array.null_count(), 6);
+ array.iter().for_each(|x| assert!(x.is_none()));
+ }
+ }
+
#[test]
fn test_memory_size_null() {
let null_arr = NullArray::new(32);
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 709262e83..8b727ec95 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -21,8 +21,7 @@
use crate::{bit_iterator::BitSliceIterator, bitmap::Bitmap};
use arrow_buffer::bit_chunk_iterator::BitChunks;
use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer};
-use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
-use half::f16;
+use arrow_schema::{ArrowError, DataType, UnionMode};
use std::convert::TryInto;
use std::mem;
use std::ops::Range;
@@ -69,71 +68,25 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity:
usize) -> [MutableBuff
let buffer = MutableBuffer::new(bytes);
[buffer, empty_buffer]
}
- DataType::UInt8 => [
- MutableBuffer::new(capacity * mem::size_of::<u8>()),
- empty_buffer,
- ],
- DataType::UInt16 => [
- MutableBuffer::new(capacity * mem::size_of::<u16>()),
- empty_buffer,
- ],
- DataType::UInt32 => [
- MutableBuffer::new(capacity * mem::size_of::<u32>()),
- empty_buffer,
- ],
- DataType::UInt64 => [
- MutableBuffer::new(capacity * mem::size_of::<u64>()),
- empty_buffer,
- ],
- DataType::Int8 => [
- MutableBuffer::new(capacity * mem::size_of::<i8>()),
- empty_buffer,
- ],
- DataType::Int16 => [
- MutableBuffer::new(capacity * mem::size_of::<i16>()),
- empty_buffer,
- ],
- DataType::Int32 => [
- MutableBuffer::new(capacity * mem::size_of::<i32>()),
- empty_buffer,
- ],
- DataType::Int64 => [
- MutableBuffer::new(capacity * mem::size_of::<i64>()),
- empty_buffer,
- ],
- DataType::Float16 => [
- MutableBuffer::new(capacity * mem::size_of::<f16>()),
- empty_buffer,
- ],
- DataType::Float32 => [
- MutableBuffer::new(capacity * mem::size_of::<f32>()),
- empty_buffer,
- ],
- DataType::Float64 => [
- MutableBuffer::new(capacity * mem::size_of::<f64>()),
- empty_buffer,
- ],
- DataType::Date32 | DataType::Time32(_) => [
- MutableBuffer::new(capacity * mem::size_of::<i32>()),
- empty_buffer,
- ],
- DataType::Date64
+ DataType::UInt8
+ | DataType::UInt16
+ | DataType::UInt32
+ | DataType::UInt64
+ | DataType::Int8
+ | DataType::Int16
+ | DataType::Int32
+ | DataType::Int64
+ | DataType::Float16
+ | DataType::Float32
+ | DataType::Float64
+ | DataType::Date32
+ | DataType::Time32(_)
+ | DataType::Date64
| DataType::Time64(_)
| DataType::Duration(_)
- | DataType::Timestamp(_, _) => [
- MutableBuffer::new(capacity * mem::size_of::<i64>()),
- empty_buffer,
- ],
- DataType::Interval(IntervalUnit::YearMonth) => [
- MutableBuffer::new(capacity * mem::size_of::<i32>()),
- empty_buffer,
- ],
- DataType::Interval(IntervalUnit::DayTime) => [
- MutableBuffer::new(capacity * mem::size_of::<i64>()),
- empty_buffer,
- ],
- DataType::Interval(IntervalUnit::MonthDayNano) => [
- MutableBuffer::new(capacity * mem::size_of::<i128>()),
+ | DataType::Timestamp(_, _)
+ | DataType::Interval(_) => [
+ MutableBuffer::new(capacity *
data_type.primitive_width().unwrap()),
empty_buffer,
],
DataType::Utf8 | DataType::Binary => {
@@ -163,41 +116,10 @@ pub(crate) fn new_buffers(data_type: &DataType, capacity:
usize) -> [MutableBuff
DataType::FixedSizeBinary(size) => {
[MutableBuffer::new(capacity * *size as usize), empty_buffer]
}
- DataType::Dictionary(child_data_type, _) => match
child_data_type.as_ref() {
- DataType::UInt8 => [
- MutableBuffer::new(capacity * mem::size_of::<u8>()),
- empty_buffer,
- ],
- DataType::UInt16 => [
- MutableBuffer::new(capacity * mem::size_of::<u16>()),
- empty_buffer,
- ],
- DataType::UInt32 => [
- MutableBuffer::new(capacity * mem::size_of::<u32>()),
- empty_buffer,
- ],
- DataType::UInt64 => [
- MutableBuffer::new(capacity * mem::size_of::<u64>()),
- empty_buffer,
- ],
- DataType::Int8 => [
- MutableBuffer::new(capacity * mem::size_of::<i8>()),
- empty_buffer,
- ],
- DataType::Int16 => [
- MutableBuffer::new(capacity * mem::size_of::<i16>()),
- empty_buffer,
- ],
- DataType::Int32 => [
- MutableBuffer::new(capacity * mem::size_of::<i32>()),
- empty_buffer,
- ],
- DataType::Int64 => [
- MutableBuffer::new(capacity * mem::size_of::<i64>()),
- empty_buffer,
- ],
- _ => unreachable!(),
- },
+ DataType::Dictionary(k, _) => [
+ MutableBuffer::new(capacity * k.primitive_width().unwrap()),
+ empty_buffer,
+ ],
DataType::FixedSizeList(_, _)
| DataType::Struct(_)
| DataType::RunEndEncoded(_, _) => [empty_buffer,
MutableBuffer::new(0)],
@@ -667,83 +589,125 @@ impl ArrayData {
&values.1[self.offset..]
}
- /// Returns a new empty [ArrayData] valid for `data_type`.
- pub fn new_empty(data_type: &DataType) -> Self {
- let buffers = new_buffers(data_type, 0);
- let [buffer1, buffer2] = buffers;
- let buffers = into_buffers(data_type, buffer1, buffer2);
-
- let child_data = match data_type {
- DataType::Null
- | DataType::Boolean
- | DataType::UInt8
- | DataType::UInt16
- | DataType::UInt32
- | DataType::UInt64
- | DataType::Int8
- | DataType::Int16
- | DataType::Int32
- | DataType::Int64
- | DataType::Float16
- | DataType::Float32
- | DataType::Float64
- | DataType::Date32
- | DataType::Date64
- | DataType::Time32(_)
- | DataType::Time64(_)
- | DataType::Duration(_)
- | DataType::Timestamp(_, _)
- | DataType::Utf8
- | DataType::Binary
- | DataType::LargeUtf8
- | DataType::LargeBinary
- | DataType::Interval(_)
- | DataType::FixedSizeBinary(_)
- | DataType::Decimal128(_, _)
- | DataType::Decimal256(_, _) => vec![],
- DataType::List(field) => {
- vec![Self::new_empty(field.data_type())]
- }
- DataType::FixedSizeList(field, _) => {
- vec![Self::new_empty(field.data_type())]
- }
- DataType::LargeList(field) => {
- vec![Self::new_empty(field.data_type())]
- }
- DataType::Struct(fields) => fields
- .iter()
- .map(|field| Self::new_empty(field.data_type()))
- .collect(),
- DataType::Map(field, _) => {
- vec![Self::new_empty(field.data_type())]
- }
- DataType::Union(fields, _, _) => fields
- .iter()
- .map(|field| Self::new_empty(field.data_type()))
- .collect(),
- DataType::Dictionary(_, data_type) => {
- vec![Self::new_empty(data_type)]
- }
- DataType::RunEndEncoded(run_ends, values) => {
- vec![
- Self::new_empty(run_ends.data_type()),
- Self::new_empty(values.data_type()),
- ]
- }
+ /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
+ ///
+ /// Value and offset buffers are zero-filled. [`DataType::Null`], [`DataType::Union`]
+ /// and [`DataType::RunEndEncoded`] are built without a validity bitmap and with a
+ /// null count of zero; for the latter two the nulls are placed in the child data.
+ ///
+ /// # Panics
+ ///
+ /// Panics if a union has no type ids, if `len` does not fit in `i32` for a dense
+ /// union, or if `len` does not fit in the run-end type of a run-end encoded array.
+ pub fn new_null(data_type: &DataType, len: usize) -> Self {
+ // Number of bytes needed to hold one bit per element
+ let bit_len = bit_util::ceil(len, 8);
+ let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
+
+ let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
+ Some(width) => (vec![zeroed(width * len)], vec![], true),
+ None => match data_type {
+ DataType::Null => (vec![], vec![], false),
+ DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
+ DataType::Binary | DataType::Utf8 => {
+ (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
+ }
+ DataType::LargeBinary | DataType::LargeUtf8 => {
+ (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
+ }
+ DataType::FixedSizeBinary(i) => {
+ (vec![zeroed(*i as usize * len)], vec![], true)
+ }
+ DataType::List(f) | DataType::Map(f, _) => (
+ vec![zeroed((len + 1) * 4)],
+ vec![ArrayData::new_empty(f.data_type())],
+ true,
+ ),
+ DataType::LargeList(f) => (
+ vec![zeroed((len + 1) * 8)],
+ vec![ArrayData::new_empty(f.data_type())],
+ true,
+ ),
+ DataType::FixedSizeList(f, list_len) => (
+ vec![],
+ vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
+ true,
+ ),
+ DataType::Struct(fields) => (
+ vec![],
+ fields
+ .iter()
+ .map(|f| Self::new_null(f.data_type(), len))
+ .collect(),
+ true,
+ ),
+ DataType::Dictionary(k, v) => (
+ vec![zeroed(k.primitive_width().unwrap() * len)],
+ vec![ArrayData::new_empty(v.as_ref())],
+ true,
+ ),
+ DataType::Union(f, i, mode) => {
+ // Every slot refers to the first type id
+ let ids = Buffer::from_iter(std::iter::repeat(i[0]).take(len));
+ let buffers = match mode {
+ UnionMode::Sparse => vec![ids],
+ UnionMode::Dense => {
+ let end_offset = i32::from_usize(len).unwrap();
+ vec![ids, Buffer::from_iter(0_i32..end_offset)]
+ }
+ };
+
+ let children = f
+ .iter()
+ .enumerate()
+ .map(|(idx, f)| {
+ // A sparse union requires every child to have the length of the
+ // union itself; a dense union only needs slots in the child that
+ // the offsets point into, which is always the first one here
+ if idx == 0 || matches!(mode, UnionMode::Sparse) {
+ Self::new_null(f.data_type(), len)
+ } else {
+ Self::new_empty(f.data_type())
+ }
+ })
+ .collect();
+
+ (buffers, children, false)
+ }
+ DataType::RunEndEncoded(r, v) => {
+ // A single run ending at `len`, paired with a single null value
+ let runs = match r.data_type() {
+ DataType::Int16 => {
+ let i = i16::from_usize(len).expect("run overflow");
+ Buffer::from_slice_ref([i])
+ }
+ DataType::Int32 => {
+ let i = i32::from_usize(len).expect("run overflow");
+ Buffer::from_slice_ref([i])
+ }
+ DataType::Int64 => {
+ let i = i64::from_usize(len).expect("run overflow");
+ Buffer::from_slice_ref([i])
+ }
+ dt => unreachable!("Invalid run ends data type {dt}"),
+ };
+
+ let builder = ArrayData::builder(r.data_type().clone())
+ .len(1)
+ .buffers(vec![runs]);
+
+ // SAFETY:
+ // Valid by construction
+ let runs = unsafe { builder.build_unchecked() };
+ (
+ vec![],
+ vec![runs, ArrayData::new_null(v.data_type(), 1)],
+ false,
+ )
+ }
+ d => unreachable!("{d}"),
+ },
};
- // Data was constructed correctly above
- unsafe {
- Self::new_unchecked(
- data_type.clone(),
- 0,
- Some(0),
- None,
- 0,
- buffers,
- child_data,
- )
+ let mut builder = ArrayDataBuilder::new(data_type.clone())
+ .len(len)
+ .buffers(buffers)
+ .child_data(child_data);
+
+ if has_nulls {
+ // The validity bitmap holds one bit per element, so `bit_len` bytes suffice
+ builder = builder.null_count(len).null_bit_buffer(Some(zeroed(bit_len)))
}
+
+ // SAFETY:
+ // Data valid by construction
+ unsafe { builder.build_unchecked() }
+ }
+
+ /// Returns a new empty [ArrayData] valid for `data_type`.
+ pub fn new_empty(data_type: &DataType) -> Self {
+ Self::new_null(data_type, 0)
}
/// "cheap" validation of an `ArrayData`. Ensures buffers are
@@ -1578,30 +1542,24 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout {
buffers: vec![BufferSpec::BitMap],
can_contain_null_mask: true,
},
- DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::<i8>()),
- DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::<i16>()),
- DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
- DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
- DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::<u8>()),
- DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::<u16>()),
- DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::<u32>()),
- DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::<u64>()),
- DataType::Float16 => DataTypeLayout::new_fixed_width(size_of::<f16>()),
- DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::<f32>()),
- DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::<f64>()),
- DataType::Timestamp(_, _) =>
DataTypeLayout::new_fixed_width(size_of::<i64>()),
- DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::<i32>()),
- DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::<i64>()),
- DataType::Time32(_) =>
DataTypeLayout::new_fixed_width(size_of::<i32>()),
- DataType::Time64(_) =>
DataTypeLayout::new_fixed_width(size_of::<i64>()),
- DataType::Interval(IntervalUnit::YearMonth) => {
- DataTypeLayout::new_fixed_width(size_of::<i32>())
- }
- DataType::Interval(IntervalUnit::DayTime) => {
- DataTypeLayout::new_fixed_width(size_of::<i64>())
- }
- DataType::Interval(IntervalUnit::MonthDayNano) => {
- DataTypeLayout::new_fixed_width(size_of::<i128>())
+ DataType::Int8
+ | DataType::Int16
+ | DataType::Int32
+ | DataType::Int64
+ | DataType::UInt8
+ | DataType::UInt16
+ | DataType::UInt32
+ | DataType::UInt64
+ | DataType::Float16
+ | DataType::Float32
+ | DataType::Float64
+ | DataType::Timestamp(_, _)
+ | DataType::Date32
+ | DataType::Date64
+ | DataType::Time32(_)
+ | DataType::Time64(_)
+ | DataType::Interval(_) => {
+
DataTypeLayout::new_fixed_width(data_type.primitive_width().unwrap())
}
DataType::Duration(_) =>
DataTypeLayout::new_fixed_width(size_of::<i64>()),
DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()),
diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs
index 9476535fa..56eb6e8ce 100644
--- a/arrow-schema/src/datatype.rs
+++ b/arrow-schema/src/datatype.rs
@@ -410,6 +410,39 @@ impl DataType {
}
}
+ /// Returns the byte width of this type if it is a primitive type
+ ///
+ /// Returns `None` if not a primitive type
+ #[inline]
+ pub fn primitive_width(&self) -> Option<usize> {
+ match self {
+ DataType::Null => None,
+ DataType::Boolean => None,
+ DataType::Int8 | DataType::UInt8 => Some(1),
+ DataType::Int16 | DataType::UInt16 | DataType::Float16 => Some(2),
+ DataType::Int32 | DataType::UInt32 | DataType::Float32 => Some(4),
+ DataType::Int64 | DataType::UInt64 | DataType::Float64 => Some(8),
+ DataType::Timestamp(_, _) => Some(8),
+ DataType::Date32 | DataType::Time32(_) => Some(4),
+ DataType::Date64 | DataType::Time64(_) => Some(8),
+ DataType::Duration(_) => Some(8),
+ DataType::Interval(IntervalUnit::YearMonth) => Some(4),
+ DataType::Interval(IntervalUnit::DayTime) => Some(8),
+ DataType::Interval(IntervalUnit::MonthDayNano) => Some(16),
+ DataType::Decimal128(_, _) => Some(16),
+ DataType::Decimal256(_, _) => Some(32),
+ DataType::Utf8 | DataType::LargeUtf8 => None,
+ DataType::Binary | DataType::LargeBinary => None,
+ DataType::FixedSizeBinary(_) => None,
+ DataType::List(_) | DataType::LargeList(_) | DataType::Map(_, _)
=> None,
+ DataType::FixedSizeList(_, _) => None,
+ DataType::Struct(_) => None,
+ DataType::Union(_, _, _) => None,
+ DataType::Dictionary(_, _) => None,
+ DataType::RunEndEncoded(_, _) => None,
+ }
+ }
+
/// Return size of this instance in bytes.
///
/// Includes the size of `Self`.
diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs
index dc234c859..78dd1ef45 100644
--- a/arrow/src/ffi.rs
+++ b/arrow/src/ffi.rs
@@ -120,7 +120,6 @@ use std::{
sync::Arc,
};
-use arrow_buffer::i256;
use arrow_schema::UnionMode;
use bitflags::bitflags;
@@ -311,39 +310,21 @@ impl Drop for FFI_ArrowSchema {
// This is set by the Arrow specification
#[allow(clippy::manual_bits)]
fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
+ if let Some(primitive) = data_type.primitive_width() {
+ return match i {
+ 0 => Err(ArrowError::CDataInterface(format!(
+ "The datatype \"{data_type:?}\" doesn't expect buffer at index
0. Please verify that the C data interface is correctly implemented."
+ ))),
+ 1 => Ok(primitive * 8),
+ i => Err(ArrowError::CDataInterface(format!(
+ "The datatype \"{data_type:?}\" expects 2 buffers, but
requested {i}. Please verify that the C data interface is correctly
implemented."
+ ))),
+ };
+ }
+
Ok(match (data_type, i) {
- // primitive types first buffer's size is given by the native types
(DataType::Boolean, 1) => 1,
- (DataType::UInt8, 1) => size_of::<u8>() * 8,
- (DataType::UInt16, 1) => size_of::<u16>() * 8,
- (DataType::UInt32, 1) => size_of::<u32>() * 8,
- (DataType::UInt64, 1) => size_of::<u64>() * 8,
- (DataType::Int8, 1) => size_of::<i8>() * 8,
- (DataType::Int16, 1) => size_of::<i16>() * 8,
- (DataType::Int32, 1) | (DataType::Date32, 1) | (DataType::Time32(_),
1) => size_of::<i32>() * 8,
- (DataType::Int64, 1) | (DataType::Date64, 1) | (DataType::Time64(_),
1) => size_of::<i64>() * 8,
- (DataType::Float32, 1) => size_of::<f32>() * 8,
- (DataType::Float64, 1) => size_of::<f64>() * 8,
- (DataType::Decimal128(..), 1) => size_of::<i128>() * 8,
- (DataType::Decimal256(..), 1) => size_of::<i256>() * 8,
- (DataType::Timestamp(..), 1) => size_of::<i64>() * 8,
- (DataType::Duration(..), 1) => size_of::<i64>() * 8,
- // primitive types have a single buffer
- (DataType::Boolean, _) |
- (DataType::UInt8, _) |
- (DataType::UInt16, _) |
- (DataType::UInt32, _) |
- (DataType::UInt64, _) |
- (DataType::Int8, _) |
- (DataType::Int16, _) |
- (DataType::Int32, _) | (DataType::Date32, _) | (DataType::Time32(_),
_) |
- (DataType::Int64, _) | (DataType::Date64, _) | (DataType::Time64(_),
_) |
- (DataType::Float32, _) |
- (DataType::Float64, _) |
- (DataType::Decimal128(..), _) |
- (DataType::Decimal256(..), _) |
- (DataType::Timestamp(..), _) |
- (DataType::Duration(..), _) => {
+ (DataType::Boolean, _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{data_type:?}\" expects 2 buffers, but
requested {i}. Please verify that the C data interface is correctly
implemented."
)))