This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7852e763f ArrayData Enumeration for Remaining Layouts (#3769)
7852e763f is described below
commit 7852e763fea66b33a2766b6d6421cafcf6a58c29
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Mar 1 11:54:52 2023 +0000
ArrayData Enumeration for Remaining Layouts (#3769)
* Add StructArrayData
* Add ListArrayData
* Add DictionaryArrayData
* Format
* Add FixedSizeBinaryArrayData
* Add UnionArrayData
* Docs
* Add FixedSizeListArrayData
* Derive Debug and Clone
* Add RunArrayData
* Review feedback
---
arrow-data/src/data/bytes.rs | 80 ++++++++++++-
arrow-data/src/data/dictionary.rs | 174 +++++++++++++++++++++++++++
arrow-data/src/data/list.rs | 241 ++++++++++++++++++++++++++++++++++++++
arrow-data/src/data/mod.rs | 10 ++
arrow-data/src/data/primitive.rs | 22 ++--
arrow-data/src/data/run.rs | 149 +++++++++++++++++++++++
arrow-data/src/data/struct.rs | 81 +++++++++++++
arrow-data/src/data/types.rs | 3 +-
arrow-data/src/data/union.rs | 77 ++++++++++++
9 files changed, 819 insertions(+), 18 deletions(-)
diff --git a/arrow-data/src/data/bytes.rs b/arrow-data/src/data/bytes.rs
index 86839c671..521c1959a 100644
--- a/arrow-data/src/data/bytes.rs
+++ b/arrow-data/src/data/bytes.rs
@@ -73,7 +73,7 @@ mod private {
}
/// Types backed by a variable length slice of bytes
-pub trait Bytes: private::BytesSealed {
+pub trait Bytes: private::BytesSealed + std::fmt::Debug {
const TYPE: BytesType;
}
@@ -195,6 +195,7 @@ impl private::BytesOffsetSealed for i64 {
}
/// An enumeration of the types of [`ArrayDataBytesOffset`]
+#[derive(Debug, Clone)]
pub enum ArrayDataBytes {
Binary(ArrayDataBytesOffset<[u8]>),
Utf8(ArrayDataBytesOffset<str>),
@@ -217,18 +218,29 @@ impl ArrayDataBytes {
}
/// An enumeration of the types of [`BytesArrayData`]
+#[derive(Debug)]
pub enum ArrayDataBytesOffset<B: Bytes + ?Sized> {
Small(BytesArrayData<i32, B>),
Large(BytesArrayData<i64, B>),
}
+impl<B: Bytes + ?Sized> Clone for ArrayDataBytesOffset<B> {
+ fn clone(&self) -> Self {
+ match self {
+ Self::Small(v) => Self::Small(v.clone()),
+ Self::Large(v) => Self::Large(v.clone()),
+ }
+ }
+}
+
impl<O: BytesOffset, B: Bytes + ?Sized> From<BytesArrayData<O, B>> for
ArrayDataBytes {
fn from(value: BytesArrayData<O, B>) -> Self {
B::upcast(O::upcast(value))
}
}
-/// ArrayData for arrays of [`Bytes`]
+/// ArrayData for [variable-sized
arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout)
of [`Bytes`]
+#[derive(Debug)]
pub struct BytesArrayData<O: BytesOffset, B: Bytes + ?Sized> {
data_type: DataType,
nulls: Option<NullBuffer>,
@@ -237,13 +249,25 @@ pub struct BytesArrayData<O: BytesOffset, B: Bytes +
?Sized> {
phantom: PhantomData<B>,
}
-impl<O: BytesOffset, B: Bytes> BytesArrayData<O, B> {
+impl<O: BytesOffset, B: Bytes + ?Sized> Clone for BytesArrayData<O, B> {
+ fn clone(&self) -> Self {
+ Self {
+ data_type: self.data_type.clone(),
+ nulls: self.nulls.clone(),
+ offsets: self.offsets.clone(),
+ values: self.values.clone(),
+ phantom: Default::default(),
+ }
+ }
+}
+
+impl<O: BytesOffset, B: Bytes + ?Sized> BytesArrayData<O, B> {
/// Creates a new [`BytesArrayData`]
///
/// # Safety
///
/// - Each consecutive window of `offsets` must identify a valid slice of
`values`
- /// - `nulls.len() == offsets.len() + 1`
+ /// - `nulls.len() == offsets.len() - 1`
/// - `data_type` must be valid for this layout
pub unsafe fn new_unchecked(
data_type: DataType,
@@ -270,7 +294,7 @@ impl<O: BytesOffset, B: Bytes> BytesArrayData<O, B> {
/// Returns the offsets
#[inline]
- pub fn value_offsets(&self) -> &[O] {
+ pub fn offsets(&self) -> &[O] {
&self.offsets
}
@@ -286,3 +310,49 @@ impl<O: BytesOffset, B: Bytes> BytesArrayData<O, B> {
&self.data_type
}
}
+
+/// ArrayData for [fixed-size
arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout)
of bytes
+#[derive(Debug, Clone)]
+pub struct FixedSizeBinaryArrayData {
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
+ values: Buffer,
+}
+
+impl FixedSizeBinaryArrayData {
+ /// Creates a new [`FixedSizeBinaryArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - `data_type` must be valid for this layout
+ /// - `nulls.len() == values.len() / element_size`
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ values: Buffer,
+ nulls: Option<NullBuffer>,
+ ) -> Self {
+ Self {
+ data_type,
+ nulls,
+ values,
+ }
+ }
+
+ /// Returns the raw byte data
+ #[inline]
+ pub fn values(&self) -> &[u8] {
+ &self.values
+ }
+
+ /// Returns the null buffer if any
+ #[inline]
+ pub fn null_buffer(&self) -> Option<&NullBuffer> {
+ self.nulls.as_ref()
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+}
diff --git a/arrow-data/src/data/dictionary.rs
b/arrow-data/src/data/dictionary.rs
new file mode 100644
index 000000000..2ec4ee005
--- /dev/null
+++ b/arrow-data/src/data/dictionary.rs
@@ -0,0 +1,174 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::types::DictionaryKeyType;
+use crate::ArrayData;
+use arrow_buffer::buffer::{NullBuffer, ScalarBuffer};
+use arrow_buffer::ArrowNativeType;
+use arrow_schema::DataType;
+
+mod private {
+ use super::*;
+
+ pub trait DictionaryKeySealed {
+ /// Downcast [`ArrayDataDictionary`] to [`DictionaryArrayData`]
+ fn downcast_ref(data: &ArrayDataDictionary) ->
Option<&DictionaryArrayData<Self>>
+ where
+ Self: DictionaryKey;
+
+ /// Downcast [`ArrayDataDictionary`] to [`DictionaryArrayData`]
+ fn downcast(data: ArrayDataDictionary) ->
Option<DictionaryArrayData<Self>>
+ where
+ Self: DictionaryKey;
+
+ /// Cast [`DictionaryArrayData`] to [`ArrayDataDictionary`]
+ fn upcast(v: DictionaryArrayData<Self>) -> ArrayDataDictionary
+ where
+ Self: DictionaryKey;
+ }
+}
+
+/// Types of dictionary key used by dictionary arrays
+pub trait DictionaryKey: private::DictionaryKeySealed + ArrowNativeType {
+ const TYPE: DictionaryKeyType;
+}
+
+macro_rules! dictionary {
+ ($t:ty,$v:ident) => {
+ impl DictionaryKey for $t {
+ const TYPE: DictionaryKeyType = DictionaryKeyType::$v;
+ }
+ impl private::DictionaryKeySealed for $t {
+ fn downcast_ref(
+ data: &ArrayDataDictionary,
+ ) -> Option<&DictionaryArrayData<Self>> {
+ match data {
+ ArrayDataDictionary::$v(v) => Some(v),
+ _ => None,
+ }
+ }
+
+ fn downcast(data: ArrayDataDictionary) ->
Option<DictionaryArrayData<Self>> {
+ match data {
+ ArrayDataDictionary::$v(v) => Some(v),
+ _ => None,
+ }
+ }
+
+ fn upcast(v: DictionaryArrayData<Self>) -> ArrayDataDictionary {
+ ArrayDataDictionary::$v(v)
+ }
+ }
+ };
+}
+
+dictionary!(i8, Int8);
+dictionary!(i16, Int16);
+dictionary!(i32, Int32);
+dictionary!(i64, Int64);
+dictionary!(u8, UInt8);
+dictionary!(u16, UInt16);
+dictionary!(u32, UInt32);
+dictionary!(u64, UInt64);
+
+/// An enumeration of the types of [`DictionaryArrayData`]
+#[derive(Debug, Clone)]
+pub enum ArrayDataDictionary {
+ Int8(DictionaryArrayData<i8>),
+ Int16(DictionaryArrayData<i16>),
+ Int32(DictionaryArrayData<i32>),
+ Int64(DictionaryArrayData<i64>),
+ UInt8(DictionaryArrayData<u8>),
+ UInt16(DictionaryArrayData<u16>),
+ UInt32(DictionaryArrayData<u32>),
+ UInt64(DictionaryArrayData<u64>),
+}
+
+impl ArrayDataDictionary {
+ /// Downcast this [`ArrayDataDictionary`] to the corresponding
[`DictionaryArrayData`]
+ pub fn downcast_ref<K: DictionaryKey>(&self) ->
Option<&DictionaryArrayData<K>> {
+ K::downcast_ref(self)
+ }
+
+ /// Downcast this [`ArrayDataDictionary`] to the corresponding
[`DictionaryArrayData`]
+ pub fn downcast<K: DictionaryKey>(self) -> Option<DictionaryArrayData<K>> {
+ K::downcast(self)
+ }
+}
+
+impl<K: DictionaryKey> From<DictionaryArrayData<K>> for ArrayDataDictionary {
+ fn from(value: DictionaryArrayData<K>) -> Self {
+ K::upcast(value)
+ }
+}
+
+/// ArrayData for [dictionary
arrays](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout)
+#[derive(Debug, Clone)]
+pub struct DictionaryArrayData<K: DictionaryKey> {
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
+ keys: ScalarBuffer<K>,
+ child: Box<ArrayData>,
+}
+
+impl<K: DictionaryKey> DictionaryArrayData<K> {
+ /// Create a new [`DictionaryArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - `data_type` must be valid for this layout
+ /// - child must have a type matching `data_type`
+ /// - all values in `keys` must be `0 <= v < child.len()` or be a null
according to `nulls`
+ /// - `nulls` must have the same length as `keys`
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ keys: ScalarBuffer<K>,
+ nulls: Option<NullBuffer>,
+ child: ArrayData,
+ ) -> Self {
+ Self {
+ data_type,
+ nulls,
+ keys,
+ child: Box::new(child),
+ }
+ }
+
+ /// Returns the null buffer if any
+ #[inline]
+ pub fn nulls(&self) -> Option<&NullBuffer> {
+ self.nulls.as_ref()
+ }
+
+ /// Returns the keys
+ #[inline]
+ pub fn keys(&self) -> &[K] {
+ &self.keys
+ }
+
+ /// Returns the child data
+ #[inline]
+ pub fn child(&self) -> &ArrayData {
+ self.child.as_ref()
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+}
diff --git a/arrow-data/src/data/list.rs b/arrow-data/src/data/list.rs
new file mode 100644
index 000000000..59909289e
--- /dev/null
+++ b/arrow-data/src/data/list.rs
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::types::OffsetType;
+use crate::ArrayData;
+use arrow_buffer::buffer::{NullBuffer, ScalarBuffer};
+use arrow_buffer::{ArrowNativeType, Buffer};
+use arrow_schema::DataType;
+
+mod private {
+ use super::*;
+
+ pub trait ListOffsetSealed {
+ /// Downcast [`ArrayDataList`] to [`ListArrayData`]
+ fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData<Self>>
+ where
+ Self: ListOffset;
+
+ /// Downcast [`ArrayDataList`] to [`ListArrayData`]
+ fn downcast(data: ArrayDataList) -> Option<ListArrayData<Self>>
+ where
+ Self: ListOffset;
+
+ /// Cast [`ListArrayData`] to [`ArrayDataList`]
+ fn upcast(v: ListArrayData<Self>) -> ArrayDataList
+ where
+ Self: ListOffset;
+ }
+}
+
+/// Types of offset used by variable length list arrays
+pub trait ListOffset: private::ListOffsetSealed + ArrowNativeType {
+ const TYPE: OffsetType;
+}
+
+impl ListOffset for i32 {
+ const TYPE: OffsetType = OffsetType::Int32;
+}
+
+impl private::ListOffsetSealed for i32 {
+ fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData<Self>>
+ where
+ Self: ListOffset,
+ {
+ match data {
+ ArrayDataList::Small(v) => Some(v),
+ ArrayDataList::Large(_) => None,
+ }
+ }
+
+ fn downcast(data: ArrayDataList) -> Option<ListArrayData<Self>>
+ where
+ Self: ListOffset,
+ {
+ match data {
+ ArrayDataList::Small(v) => Some(v),
+ ArrayDataList::Large(_) => None,
+ }
+ }
+
+ fn upcast(v: ListArrayData<Self>) -> ArrayDataList
+ where
+ Self: ListOffset,
+ {
+ ArrayDataList::Small(v)
+ }
+}
+
+impl ListOffset for i64 {
+ const TYPE: OffsetType = OffsetType::Int64;
+}
+
+impl private::ListOffsetSealed for i64 {
+ fn downcast_ref(data: &ArrayDataList) -> Option<&ListArrayData<Self>>
+ where
+ Self: ListOffset,
+ {
+ match data {
+ ArrayDataList::Small(_) => None,
+ ArrayDataList::Large(v) => Some(v),
+ }
+ }
+
+ fn downcast(data: ArrayDataList) -> Option<ListArrayData<Self>>
+ where
+ Self: ListOffset,
+ {
+ match data {
+ ArrayDataList::Small(_) => None,
+ ArrayDataList::Large(v) => Some(v),
+ }
+ }
+
+ fn upcast(v: ListArrayData<Self>) -> ArrayDataList
+ where
+ Self: ListOffset,
+ {
+ ArrayDataList::Large(v)
+ }
+}
+
+/// An enumeration of the types of [`ListArrayData`]
+#[derive(Debug, Clone)]
+pub enum ArrayDataList {
+ Small(ListArrayData<i32>),
+ Large(ListArrayData<i64>),
+}
+
+impl ArrayDataList {
+ /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`]
+ pub fn downcast_ref<O: ListOffset>(&self) -> Option<&ListArrayData<O>> {
+ O::downcast_ref(self)
+ }
+
+ /// Downcast this [`ArrayDataList`] to the corresponding [`ListArrayData`]
+ pub fn downcast<O: ListOffset>(self) -> Option<ListArrayData<O>> {
+ O::downcast(self)
+ }
+}
+
+impl<O: ListOffset> From<ListArrayData<O>> for ArrayDataList {
+ fn from(value: ListArrayData<O>) -> Self {
+ O::upcast(value)
+ }
+}
+
+/// ArrayData for [variable-size list
arrays](https://arrow.apache.org/docs/format/Columnar.html#variable-size-list-layout)
+#[derive(Debug, Clone)]
+pub struct ListArrayData<O: ListOffset> {
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
+ offsets: ScalarBuffer<O>,
+ child: Box<ArrayData>,
+}
+
+impl<O: ListOffset> ListArrayData<O> {
+ /// Create a new [`ListArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - Each consecutive window of `offsets` must identify a valid slice of
`child`
+ /// - `nulls.len() == offsets.len() - 1`
+ /// - `data_type` must be valid for this layout
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ offsets: ScalarBuffer<O>,
+ nulls: Option<NullBuffer>,
+ child: ArrayData,
+ ) -> Self {
+ Self {
+ data_type,
+ nulls,
+ offsets,
+ child: Box::new(child),
+ }
+ }
+
+ /// Returns the null buffer if any
+ #[inline]
+ pub fn nulls(&self) -> Option<&NullBuffer> {
+ self.nulls.as_ref()
+ }
+
+ /// Returns the offsets
+ #[inline]
+ pub fn offsets(&self) -> &[O] {
+ &self.offsets
+ }
+
+ /// Returns the child data
+ #[inline]
+ pub fn child(&self) -> &ArrayData {
+ self.child.as_ref()
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+}
+
+/// ArrayData for [fixed-size list
arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-list-layout)
+#[derive(Debug, Clone)]
+pub struct FixedSizeListArrayData {
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
+ child: Box<ArrayData>,
+}
+
+impl FixedSizeListArrayData {
+ /// Create a new [`FixedSizeListArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - `data_type` must be valid for this layout
+ /// - `nulls.len() == child.len() / element_size`
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ nulls: Option<NullBuffer>,
+ child: ArrayData,
+ ) -> Self {
+ Self {
+ data_type,
+ nulls,
+ child: Box::new(child),
+ }
+ }
+
+ /// Returns the null buffer if any
+ #[inline]
+ pub fn nulls(&self) -> Option<&NullBuffer> {
+ self.nulls.as_ref()
+ }
+
+ /// Returns the child data
+ #[inline]
+ pub fn child(&self) -> &ArrayData {
+ self.child.as_ref()
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+}
diff --git a/arrow-data/src/data/mod.rs b/arrow-data/src/data/mod.rs
index eb1fe2bcf..2f9e142b1 100644
--- a/arrow-data/src/data/mod.rs
+++ b/arrow-data/src/data/mod.rs
@@ -32,9 +32,19 @@ use crate::equal;
#[allow(unused)] // Private until ready (#1176)
mod bytes;
#[allow(unused)] // Private until ready (#1176)
+mod dictionary;
+#[allow(unused)] // Private until ready (#1176)
+mod list;
+#[allow(unused)] // Private until ready (#1176)
mod primitive;
#[allow(unused)] // Private until ready (#1176)
+mod run;
+#[allow(unused)] // Private until ready (#1176)
+mod r#struct;
+#[allow(unused)] // Private until ready (#1176)
mod types;
+#[allow(unused)] // Private until ready (#1176)
+mod union;
#[inline]
pub(crate) fn contains_nulls(
diff --git a/arrow-data/src/data/primitive.rs b/arrow-data/src/data/primitive.rs
index d34ef42db..058b3e822 100644
--- a/arrow-data/src/data/primitive.rs
+++ b/arrow-data/src/data/primitive.rs
@@ -43,13 +43,13 @@ mod private {
}
pub trait Primitive: private::PrimitiveSealed + ArrowNativeType {
- const VARIANT: PrimitiveType;
+ const TYPE: PrimitiveType;
}
macro_rules! primitive {
($t:ty,$v:ident) => {
impl Primitive for $t {
- const VARIANT: PrimitiveType = PrimitiveType::$v;
+ const TYPE: PrimitiveType = PrimitiveType::$v;
}
impl private::PrimitiveSealed for $t {
fn downcast_ref(
@@ -118,7 +118,13 @@ impl ArrayDataPrimitive {
}
}
-/// ArrayData for arrays of [`Primitive`]
+impl<P: Primitive> From<PrimitiveArrayData<P>> for ArrayDataPrimitive {
+ fn from(value: PrimitiveArrayData<P>) -> Self {
+ P::upcast(value)
+ }
+}
+
+/// ArrayData for [fixed-size
arrays](https://arrow.apache.org/docs/format/Columnar.html#fixed-size-primitive-layout)
of [`Primitive`]
#[derive(Debug, Clone)]
pub struct PrimitiveArrayData<T: Primitive> {
data_type: DataType,
@@ -126,12 +132,6 @@ pub struct PrimitiveArrayData<T: Primitive> {
values: ScalarBuffer<T>,
}
-impl<P: Primitive> From<PrimitiveArrayData<P>> for ArrayDataPrimitive {
- fn from(value: PrimitiveArrayData<P>) -> Self {
- P::upcast(value)
- }
-}
-
impl<T: Primitive> PrimitiveArrayData<T> {
/// Create a new [`PrimitiveArrayData`]
///
@@ -147,10 +147,10 @@ impl<T: Primitive> PrimitiveArrayData<T> {
) -> Self {
let physical = PhysicalType::from(&data_type);
assert!(
- matches!(physical, PhysicalType::Primitive(p) if p == T::VARIANT),
+ matches!(physical, PhysicalType::Primitive(p) if p == T::TYPE),
"Illegal physical type for PrimitiveArrayData of datatype {:?},
expected {:?} got {:?}",
data_type,
- T::VARIANT,
+ T::TYPE,
physical
);
diff --git a/arrow-data/src/data/run.rs b/arrow-data/src/data/run.rs
new file mode 100644
index 000000000..cd993de1b
--- /dev/null
+++ b/arrow-data/src/data/run.rs
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::types::RunEndType;
+use crate::ArrayData;
+use arrow_buffer::buffer::ScalarBuffer;
+use arrow_buffer::ArrowNativeType;
+use arrow_schema::DataType;
+use std::marker::PhantomData;
+
+mod private {
+ use super::*;
+
+ pub trait RunEndSealed {
+ /// Downcast [`ArrayDataRun`] to [`RunArrayData`]
+ fn downcast_ref(data: &ArrayDataRun) -> Option<&RunArrayData<Self>>
+ where
+ Self: RunEnd;
+
+ /// Downcast [`ArrayDataRun`] to [`RunArrayData`]
+ fn downcast(data: ArrayDataRun) -> Option<RunArrayData<Self>>
+ where
+ Self: RunEnd;
+
+ /// Cast [`RunArrayData`] to [`ArrayDataRun`]
+ fn upcast(v: RunArrayData<Self>) -> ArrayDataRun
+ where
+ Self: RunEnd;
+ }
+}
+
+pub trait RunEnd: private::RunEndSealed + ArrowNativeType {
+ const TYPE: RunEndType;
+}
+
+macro_rules! run_end {
+ ($t:ty,$v:ident) => {
+ impl RunEnd for $t {
+ const TYPE: RunEndType = RunEndType::$v;
+ }
+ impl private::RunEndSealed for $t {
+ fn downcast_ref(data: &ArrayDataRun) ->
Option<&RunArrayData<Self>> {
+ match data {
+ ArrayDataRun::$v(v) => Some(v),
+ _ => None,
+ }
+ }
+
+ fn downcast(data: ArrayDataRun) -> Option<RunArrayData<Self>> {
+ match data {
+ ArrayDataRun::$v(v) => Some(v),
+ _ => None,
+ }
+ }
+
+ fn upcast(v: RunArrayData<Self>) -> ArrayDataRun {
+ ArrayDataRun::$v(v)
+ }
+ }
+ };
+}
+
+run_end!(i16, Int16);
+run_end!(i32, Int32);
+run_end!(i64, Int64);
+
+/// An enumeration of the types of [`RunArrayData`]
+pub enum ArrayDataRun {
+ Int16(RunArrayData<i16>),
+ Int32(RunArrayData<i32>),
+ Int64(RunArrayData<i64>),
+}
+
+impl ArrayDataRun {
+ /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`]
+ pub fn downcast_ref<E: RunEnd>(&self) -> Option<&RunArrayData<E>> {
+ E::downcast_ref(self)
+ }
+
+ /// Downcast this [`ArrayDataRun`] to the corresponding [`RunArrayData`]
+ pub fn downcast<E: RunEnd>(self) -> Option<RunArrayData<E>> {
+ E::downcast(self)
+ }
+}
+
+impl<E: RunEnd> From<RunArrayData<E>> for ArrayDataRun {
+ fn from(value: RunArrayData<E>) -> Self {
+ E::upcast(value)
+ }
+}
+
+/// ArrayData for [run-end encoded
arrays](https://arrow.apache.org/docs/format/Columnar.html#run-end-encoded-layout)
+pub struct RunArrayData<E: RunEnd> {
+ data_type: DataType,
+ run_ends: ScalarBuffer<E>,
+ child: Box<ArrayData>,
+}
+
+impl<E: RunEnd> RunArrayData<E> {
+ /// Create a new [`RunArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - `data_type` must be valid for this layout
+ /// - `run_ends` must contain strictly increasing, positive values and
`run_ends.len() == child.len()`
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ run_ends: ScalarBuffer<E>,
+ child: ArrayData,
+ ) -> Self {
+ Self {
+ data_type,
+ run_ends,
+ child: Box::new(child),
+ }
+ }
+
+ /// Returns the run ends
+ #[inline]
+ pub fn run_ends(&self) -> &[E] {
+ &self.run_ends
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+
+ /// Returns the child data
+ #[inline]
+ pub fn child(&self) -> &ArrayData {
+ self.child.as_ref()
+ }
+}
diff --git a/arrow-data/src/data/struct.rs b/arrow-data/src/data/struct.rs
new file mode 100644
index 000000000..d99992619
--- /dev/null
+++ b/arrow-data/src/data/struct.rs
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ArrayData;
+use arrow_buffer::buffer::NullBuffer;
+use arrow_schema::DataType;
+
+/// ArrayData for [struct
arrays](https://arrow.apache.org/docs/format/Columnar.html#struct-layout)
+#[derive(Debug, Clone)]
+pub struct StructArrayData {
+ data_type: DataType,
+ len: usize,
+ nulls: Option<NullBuffer>,
+ children: Vec<ArrayData>,
+}
+
+impl StructArrayData {
+ /// Create a new [`StructArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - `data_type` must be a struct type with fields matching `children`
+ /// - all child data and nulls must have length matching `len`
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ len: usize,
+ nulls: Option<NullBuffer>,
+ children: Vec<ArrayData>,
+ ) -> Self {
+ Self {
+ data_type,
+ len,
+ nulls,
+ children,
+ }
+ }
+
+ /// Returns the length of this [`StructArrayData`]
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.len
+ }
+
+ /// Returns `true` if this [`StructArrayData`] has zero length
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.len == 0
+ }
+
+ /// Returns the null buffer if any
+ #[inline]
+ pub fn nulls(&self) -> Option<&NullBuffer> {
+ self.nulls.as_ref()
+ }
+
+ /// Returns the child data
+ #[inline]
+ pub fn children(&self) -> &[ArrayData] {
+ &self.children
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+}
diff --git a/arrow-data/src/data/types.rs b/arrow-data/src/data/types.rs
index 09e169f6a..3414e481c 100644
--- a/arrow-data/src/data/types.rs
+++ b/arrow-data/src/data/types.rs
@@ -80,7 +80,6 @@ pub enum PhysicalType {
Bytes(OffsetType, BytesType),
FixedSizeList,
List(OffsetType),
- Map,
Struct,
Union,
Dictionary(DictionaryKeyType),
@@ -141,7 +140,7 @@ impl From<&DataType> for PhysicalType {
DataType::UInt64 =>
Self::Dictionary(DictionaryKeyType::UInt64),
d => panic!("illegal dictionary key data type {d}"),
},
- DataType::Map(_, _) => Self::Map,
+ DataType::Map(_, _) => Self::List(OffsetType::Int32),
DataType::RunEndEncoded(f, _) => match f.data_type() {
DataType::Int16 => Self::Run(RunEndType::Int16),
DataType::Int32 => Self::Run(RunEndType::Int32),
diff --git a/arrow-data/src/data/union.rs b/arrow-data/src/data/union.rs
new file mode 100644
index 000000000..7861bd154
--- /dev/null
+++ b/arrow-data/src/data/union.rs
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::ArrayData;
+use arrow_buffer::buffer::ScalarBuffer;
+use arrow_schema::DataType;
+
+/// ArrayData for [union
arrays](https://arrow.apache.org/docs/format/Columnar.html#union-layout)
+#[derive(Debug, Clone)]
+pub struct UnionArrayData {
+ data_type: DataType,
+ type_ids: ScalarBuffer<i8>,
+ offsets: Option<ScalarBuffer<i32>>,
+ children: Vec<ArrayData>,
+}
+
+impl UnionArrayData {
+ /// Creates a new [`UnionArrayData`]
+ ///
+ /// # Safety
+ ///
+ /// - `data_type` must be valid for this layout
+ /// - `type_ids` must only contain values corresponding to a field in
`data_type`
+ /// - `children` must match the field definitions in `data_type`
+ /// - For each type id in `type_ids`, the corresponding offset must be in
bounds for the matching child
+ pub unsafe fn new_unchecked(
+ data_type: DataType,
+ type_ids: ScalarBuffer<i8>,
+ offsets: Option<ScalarBuffer<i32>>,
+ children: Vec<ArrayData>,
+ ) -> Self {
+ Self {
+ data_type,
+ type_ids,
+ offsets,
+ children,
+ }
+ }
+
+ /// Returns the type ids for this array
+ #[inline]
+ pub fn type_ids(&self) -> &[i8] {
+ &self.type_ids
+ }
+
+ /// Returns the offsets for this array if this is a dense union
+ #[inline]
+ pub fn offsets(&self) -> Option<&[i32]> {
+ self.offsets.as_deref()
+ }
+
+ /// Returns the children of this array
+ #[inline]
+ pub fn children(&self) -> &[ArrayData] {
+ &self.children
+ }
+
+ /// Returns the data type of this array
+ #[inline]
+ pub fn data_type(&self) -> &DataType {
+ &self.data_type
+ }
+}