alamb commented on code in PR #2769: URL: https://github.com/apache/arrow-rs/pull/2769#discussion_r979230577
########## arrow-array/src/record_batch.rs: ########## @@ -469,29 +446,14 @@ impl From<RecordBatch> for StructArray { } } -/// Trait for types that can read `RecordBatch`'s. -pub trait RecordBatchReader: Iterator<Item = Result<RecordBatch>> { - /// Returns the schema of this `RecordBatchReader`. - /// - /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this - /// reader should have the same schema as returned from this method. - fn schema(&self) -> SchemaRef; - - /// Reads the next `RecordBatch`. - #[deprecated( Review Comment: 🎉 This is another breaking API change (nice cleanup) ########## arrow-pyarrow-integration-testing/src/lib.rs: ########## @@ -51,7 +51,7 @@ fn double(array: &PyAny, py: Python) -> PyResult<PyObject> { let array = kernels::arithmetic::add(array, array).map_err(to_py_err)?; // export - array.to_pyarrow(py) + array.data().to_pyarrow(py) Review Comment: this seems a very reasonable change (but is it also an API change?) ########## arrow-array/src/array/dictionary_array.rs: ########## @@ -15,20 +15,130 @@ // specific language governing permissions and limitations // under the License. -use crate::array::{ArrayAccessor, ArrayIter}; +use crate::builder::StringDictionaryBuilder; +use crate::iterator::ArrayIter; +use crate::types::*; +use crate::{ + make_array, Array, ArrayAccessor, ArrayRef, ArrowPrimitiveType, PrimitiveArray, + StringArray, +}; +use arrow_buffer::ArrowNativeType; +use arrow_data::ArrayData; +use arrow_schema::{ArrowError, DataType}; use std::any::Any; -use std::fmt; -use std::iter::IntoIterator; -use std::{convert::From, iter::FromIterator}; -use super::{ - make_array, Array, ArrayData, ArrayRef, PrimitiveArray, StringArray, - StringDictionaryBuilder, -}; -use crate::datatypes::{ - ArrowDictionaryKeyType, ArrowNativeType, ArrowPrimitiveType, DataType, -}; -use crate::error::Result; +/// +/// A dictionary array where each element is a single value indexed by an integer key. 
+/// +/// # Example: Using `collect` +/// ``` +/// # use arrow_array::{Array, Int8DictionaryArray, Int8Array, StringArray}; +/// # use std::sync::Arc; +/// +/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect(); +/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"])); +/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); +/// assert_eq!(array.values(), &values); +/// ``` +pub type Int8DictionaryArray = DictionaryArray<Int8Type>; Review Comment: these `pub type`s are new, right? ########## arrow-array/src/array/primitive_array.rs: ########## @@ -15,34 +15,194 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; -use std::convert::From; -use std::fmt; -use std::iter::{FromIterator, IntoIterator}; -use std::mem; - -use chrono::{prelude::*, Duration}; - -use super::array::print_long_array; -use super::raw_pointer::RawPtrBox; -use super::*; -use crate::temporal_conversions; -use crate::util::bit_util; -use crate::{ - buffer::{Buffer, MutableBuffer}, - util::trusted_len_unzip, -}; - -use crate::array::array::ArrayAccessor; +use crate::builder::{BooleanBufferBuilder, PrimitiveBuilder}; +use crate::iterator::PrimitiveIter; +use crate::raw_pointer::RawPtrBox; +use crate::temporal_conversions::{as_date, as_datetime, as_duration, as_time}; +use crate::trusted_len::trusted_len_unzip; +use crate::types::*; +use crate::{print_long_array, Array, ArrayAccessor}; +use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_data::ArrayData; +use arrow_schema::DataType; +use chrono::{Duration, FixedOffset, NaiveDate, NaiveDateTime, NaiveTime}; use half::f16; +use std::any::Any; + +/// +/// # Example: Using `collect` Review Comment: 💯 for adding basic doc examples to these typedefs ########## arrow-array/src/lib.rs: ########## @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The central type in Apache Arrow are arrays, which are a known-length sequence of values +//! all having the same type. This module provides concrete implementations of each type, as +//! well as an [`Array`] trait that can be used for type-erasure. +//! +//! # Downcasting an Array +//! +//! Arrays are often passed around as a dynamically typed [`&dyn Array`] or [`ArrayRef`]. +//! For example, [`RecordBatch`](`crate::RecordBatch`) stores columns as [`ArrayRef`]. +//! +//! Whilst these arrays can be passed directly to the [`compute`], [`csv`], [`json`], etc... APIs, +//! it is often the case that you wish to interact with the data directly. +//! +//! This requires downcasting to the concrete type of the array: +//! +//! ``` +//! # use arrow_array::{Array, Float32Array, Int32Array}; +//! +//! fn sum_int32(array: &dyn Array) -> i32 { +//! let integers: &Int32Array = array.as_any().downcast_ref().unwrap(); +//! integers.iter().map(|val| val.unwrap_or_default()).sum() +//! } +//! +//! // Note: the values for positions corresponding to nulls will be arbitrary +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! array.as_any().downcast_ref::<Float32Array>().unwrap().values() +//! } +//! ``` +//! +//! 
Additionally, there are convenient functions to do this casting +//! such as [`cast::as_primitive_array<T>`] and [`cast::as_string_array`]: +//! +//! ``` +//! # use arrow_array::Array; +//! # use arrow_array::cast::as_primitive_array; +//! # use arrow_array::types::Float32Type; +//! +//! fn as_f32_slice(array: &dyn Array) -> &[f32] { +//! // use as_primtive_array +//! as_primitive_array::<Float32Type>(array).values() +//! } +//! ``` + +//! # Building an Array +//! +//! Most [`Array`] implementations can be constructed directly from iterators or [`Vec`] +//! +//! ``` +//! # use arrow_array::{Int32Array, ListArray, StringArray}; +//! # use arrow_array::types::Int32Type; +//! +//! Int32Array::from(vec![1, 2]); +//! Int32Array::from(vec![Some(1), None]); +//! Int32Array::from_iter([1, 2, 3, 4]); +//! Int32Array::from_iter([Some(1), Some(2), None, Some(4)]); +//! +//! StringArray::from(vec!["foo", "bar"]); +//! StringArray::from(vec![Some("foo"), None]); +//! StringArray::from_iter([Some("foo"), None]); +//! StringArray::from_iter_values(["foo", "bar"]); +//! +//! ListArray::from_iter_primitive::<Int32Type, _, _>([ +//! Some(vec![Some(1), None, Some(3)]), +//! None, +//! Some(vec![]) +//! ]); +//! ``` +//! +//! Additionally [`ArrayBuilder`](builder::ArrayBuilder) implementations can be +//! used to construct arrays with a push-based interface +//! +//! ``` +//! # use arrow_array::Int16Array; +//! # +//! // Create a new builder with a capacity of 100 +//! let mut builder = Int16Array::builder(100); +//! +//! // Append a single primitive value +//! builder.append_value(1); +//! +//! // Append a null value +//! builder.append_null(); +//! +//! // Append a slice of primitive values +//! builder.append_slice(&[2, 3, 4]); +//! +//! // Build the array +//! let array = builder.finish(); +//! +//! assert_eq!( +//! 5, +//! array.len(), +//! "The array has 5 values, counting the null value" +//! ); +//! +//! assert_eq!(2, array.value(2), "Get the value with index 2"); +//! +//! 
assert_eq!( +//! &array.values()[3..5], +//! &[3, 4], +//! "Get slice of len 2 starting at idx 3" +//! ) +//! ``` +//! +//! # Zero-Copy Slicing +//! +//! Given an [`Array`] of arbitrary length, it is possible to create an owned slice of this +//! data. Internally this just increments some ref-counts, and so is incredibly cheap +//! +//! ```rust +//! # use std::sync::Arc; +//! # use arrow_array::{ArrayRef, Int32Array}; +//! let array = Arc::new(Int32Array::from_iter([1, 2, 3])) as ArrayRef; +//! +//! // Slice with offset 1 and length 2 +//! let sliced = array.slice(1, 2); +//! let ints = sliced.as_any().downcast_ref::<Int32Array>().unwrap(); +//! assert_eq!(ints.values(), &[2, 3]); +//! ``` +//! +//! # Internal Representation +//! +//! Internally, arrays are represented by one or several [`Buffer`], the number and meaning of +//! which depend on the array’s data type, as documented in the [Arrow specification]. +//! +//! For example, the type [`Int16Array`] represents an array of 16-bit integers and consists of: +//! +//! * An optional [`Bitmap`] identifying any null values +//! * A contiguous [`Buffer`] of 16-bit integers +//! +//! Similarly, the type [`StringArray`] represents an array of UTF-8 strings and consists of: +//! +//! * An optional [`Bitmap`] identifying any null values +//! * An offsets [`Buffer`] of 32-bit integers identifying valid UTF-8 sequences within the values buffer +//! * A values [`Buffer`] of UTF-8 encoded string data +//! +//! [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html +//! [`&dyn Array`]: Array +//! [`Bitmap`]: arrow_data::Bitmap +//! [`Buffer`]: arrow_buffer::Buffer +//! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html +//! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html +//! 
[`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html + +pub mod array; +pub use array::*; + +mod record_batch; +pub use record_batch::{RecordBatch, RecordBatchOptions}; + +pub mod builder; +pub mod cast; +pub mod decimal; +mod delta; +pub mod iterator; +mod raw_pointer; +pub mod temporal_conversions; +mod trusted_len; +pub mod types; + +#[cfg(test)] +mod tests { + use crate::builder::*; + + #[test] + fn test_buffer_builder_availability() { Review Comment: I wonder if this is the kind of thing that should be in a `tests` type integration test to ensure that the types are `pub` and not `pub(crate)` for example ########## arrow/src/lib.rs: ########## @@ -270,8 +272,32 @@ pub mod ipc; pub mod json; #[cfg(feature = "pyarrow")] pub mod pyarrow; -pub mod record_batch; + +pub mod record_batch { + pub use arrow_array::{RecordBatch, RecordBatchOptions}; + use arrow_schema::{ArrowError, SchemaRef}; + + /// Trait for types that can read `RecordBatch`'s. + pub trait RecordBatchReader: + Iterator<Item = Result<RecordBatch, ArrowError>> + { + /// Returns the schema of this `RecordBatchReader`. + /// + /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this + /// reader should have the same schema as returned from this method. + fn schema(&self) -> SchemaRef; + + /// Reads the next `RecordBatch`. + #[deprecated( Review Comment: Oh, whoops -- maybe we should remove this deprecated API (perhaps as a follow on PR) ########## arrow-array/src/cast.rs: ########## @@ -0,0 +1,767 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines helper functions for downcasting [`dyn Array`](Array) to concrete types + +use crate::array::*; +use crate::types::*; + +/// Downcast an [`Array`] to a [`PrimitiveArray`] based on its [`DataType`] +/// accepts a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, downcast_primitive_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_primitive(array: &dyn Array) { +/// downcast_primitive_array!( +/// array => { +/// for v in array { +/// println!("{:?}", v); +/// } +/// } +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! 
downcast_primitive_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + downcast_primitive_array!($values => {$e} $($p => $fallback)*) + }; + + ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match $values.data_type() { + arrow_schema::DataType::Int8 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int8Type, + >($values); + $e + } + arrow_schema::DataType::Int16 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int16Type, + >($values); + $e + } + arrow_schema::DataType::Int32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int32Type, + >($values); + $e + } + arrow_schema::DataType::Int64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Int64Type, + >($values); + $e + } + arrow_schema::DataType::UInt8 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt8Type, + >($values); + $e + } + arrow_schema::DataType::UInt16 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt16Type, + >($values); + $e + } + arrow_schema::DataType::UInt32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt32Type, + >($values); + $e + } + arrow_schema::DataType::UInt64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::UInt64Type, + >($values); + $e + } + arrow_schema::DataType::Float16 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Float16Type, + >($values); + $e + } + arrow_schema::DataType::Float32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Float32Type, + >($values); + $e + } + arrow_schema::DataType::Float64 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Float64Type, + >($values); + $e + } + arrow_schema::DataType::Date32 => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Date32Type, + >($values); + $e + } + arrow_schema::DataType::Date64 
=> { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Date64Type, + >($values); + $e + } + arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time32SecondType, + >($values); + $e + } + arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time32MillisecondType, + >($values); + $e + } + arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time64MicrosecondType, + >($values); + $e + } + arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::Time64NanosecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampSecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampMillisecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampMicrosecondType, + >($values); + $e + } + arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::TimestampNanosecondType, + >($values); + $e + } + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::IntervalYearMonthType, + >($values); + $e + } + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime) => { + let $values = $crate::cast::as_primitive_array::< + 
$crate::types::IntervalDayTimeType, + >($values); + $e + } + arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::IntervalMonthDayNanoType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationSecondType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationMillisecondType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationMicrosecondType, + >($values); + $e + } + arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond) => { + let $values = $crate::cast::as_primitive_array::< + $crate::types::DurationNanosecondType, + >($values); + $e + } + $($p => $fallback,)* + } + }; + + (($values1:ident, $values2:ident) => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match ($values1.data_type(), $values2.data_type()) { + (arrow_schema::DataType::Int8, arrow_schema::DataType::Int8) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int8Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int8Type, + >($values2); + $e + } + (arrow_schema::DataType::Int16, arrow_schema::DataType::Int16) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int16Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int16Type, + >($values2); + $e + } + (arrow_schema::DataType::Int32, arrow_schema::DataType::Int32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int32Type, + 
>($values2); + $e + } + (arrow_schema::DataType::Int64, arrow_schema::DataType::Int64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Int64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Int64Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt8, arrow_schema::DataType::UInt8) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt8Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt8Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt16, arrow_schema::DataType::UInt16) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt16Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt16Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt32, arrow_schema::DataType::UInt32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt32Type, + >($values2); + $e + } + (arrow_schema::DataType::UInt64, arrow_schema::DataType::UInt64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::UInt64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::UInt64Type, + >($values2); + $e + } + (arrow_schema::DataType::Float32, arrow_schema::DataType::Float32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Float32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Float32Type, + >($values2); + $e + } + (arrow_schema::DataType::Float64, arrow_schema::DataType::Float64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Float64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Float64Type, + >($values2); + $e + } + (arrow_schema::DataType::Date32, 
arrow_schema::DataType::Date32) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Date32Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Date32Type, + >($values2); + $e + } + (arrow_schema::DataType::Date64, arrow_schema::DataType::Date64) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Date64Type, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Date64Type, + >($values2); + $e + } + (arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second), arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Second)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time32SecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time32SecondType, + >($values2); + $e + } + (arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond), arrow_schema::DataType::Time32(arrow_schema::TimeUnit::Millisecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time32MillisecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time32MillisecondType, + >($values2); + $e + } + (arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond), arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Microsecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time64MicrosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time64MicrosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond), arrow_schema::DataType::Time64(arrow_schema::TimeUnit::Nanosecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::Time64NanosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::Time64NanosecondType, + >($values2); + $e + } + 
(arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Second, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampSecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampSecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMillisecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMillisecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMicrosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampMicrosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _), arrow_schema::DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::TimestampNanosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::TimestampNanosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::YearMonth)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::IntervalYearMonthType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::IntervalYearMonthType, + >($values2); + $e + } + (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime), 
arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::DayTime)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::IntervalDayTimeType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::IntervalDayTimeType, + >($values2); + $e + } + (arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano), arrow_schema::DataType::Interval(arrow_schema::IntervalUnit::MonthDayNano)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::IntervalMonthDayNanoType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::IntervalMonthDayNanoType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Second)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationSecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationSecondType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Millisecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationMillisecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationMillisecondType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Microsecond)) => { + let $values1 = $crate::cast::as_primitive_array::< + $crate::types::DurationMicrosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationMicrosecondType, + >($values2); + $e + } + (arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond), arrow_schema::DataType::Duration(arrow_schema::TimeUnit::Nanosecond)) => { + let $values1 = 
$crate::cast::as_primitive_array::< + $crate::types::DurationNanosecondType, + >($values1); + let $values2 = $crate::cast::as_primitive_array::< + $crate::types::DurationNanosecondType, + >($values2); + $e + } + $($p => $fallback,)* + } + }; +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`], to +/// [`PrimitiveArray<T>`], panic'ing on failure. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, Int32Array}; +/// # use arrow_array::cast::as_primitive_array; +/// # use arrow_array::types::Int32Type; +/// +/// let arr: ArrayRef = Arc::new(Int32Array::from(vec![Some(1)])); +/// +/// // Downcast an `ArrayRef` to Int32Array / PrimiveArray<Int32>: +/// let primitive_array: &Int32Array = as_primitive_array(&arr); +/// +/// // Equivalently: +/// let primitive_array = as_primitive_array::<Int32Type>(&arr); +/// +/// // This is the equivalent of: +/// let primitive_array = arr +/// .as_any() +/// .downcast_ref::<Int32Array>() +/// .unwrap(); +/// ``` + +pub fn as_primitive_array<T>(arr: &dyn Array) -> &PrimitiveArray<T> +where + T: ArrowPrimitiveType, +{ + arr.as_any() + .downcast_ref::<PrimitiveArray<T>>() + .expect("Unable to downcast to primitive array") +} + +/// Downcast an [`Array`] to a [`DictionaryArray`] based on its [`DataType`], accepts +/// a number of subsequent patterns to match the data type +/// +/// ``` +/// # use arrow_array::{Array, StringArray, downcast_dictionary_array, cast::as_string_array}; +/// # use arrow_schema::DataType; +/// +/// fn print_strings(array: &dyn Array) { +/// downcast_dictionary_array!( +/// array => match array.values().data_type() { +/// DataType::Utf8 => { +/// for v in array.downcast_dict::<StringArray>().unwrap() { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported dictionary value type {}", t), +/// }, +/// DataType::Utf8 => { +/// for v in as_string_array(array) { +/// println!("{:?}", v); +/// } +/// } +/// t => println!("Unsupported datatype {}", 
t) +/// ) +/// } +/// ``` +/// +/// [`DataType`]: arrow_schema::DataType +#[macro_export] +macro_rules! downcast_dictionary_array { + ($values:ident => $e:expr, $($p:pat => $fallback:expr $(,)*)*) => { + downcast_dictionary_array!($values => {$e} $($p => $fallback)*) + }; + + ($values:ident => $e:block $($p:pat => $fallback:expr $(,)*)*) => { + match $values.data_type() { + arrow_schema::DataType::Dictionary(k, _) => match k.as_ref() { + arrow_schema::DataType::Int8 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int8Type, + >($values); + $e + }, + arrow_schema::DataType::Int16 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int16Type, + >($values); + $e + }, + arrow_schema::DataType::Int32 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int32Type, + >($values); + $e + }, + arrow_schema::DataType::Int64 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::Int64Type, + >($values); + $e + }, + arrow_schema::DataType::UInt8 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt8Type, + >($values); + $e + }, + arrow_schema::DataType::UInt16 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt16Type, + >($values); + $e + }, + arrow_schema::DataType::UInt32 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt32Type, + >($values); + $e + }, + arrow_schema::DataType::UInt64 => { + let $values = $crate::cast::as_dictionary_array::< + $crate::types::UInt64Type, + >($values); + $e + }, + k => unreachable!("unsupported dictionary key type: {}", k) + } + $($p => $fallback,)* + } + } +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`DictionaryArray<T>`], panic'ing on failure. 
+/// +/// # Example +/// +/// ``` +/// # use arrow_array::{ArrayRef, DictionaryArray}; +/// # use arrow_array::cast::as_dictionary_array; +/// # use arrow_array::types::Int32Type; +/// +/// let arr: DictionaryArray<Int32Type> = vec![Some("foo")].into_iter().collect(); +/// let arr: ArrayRef = std::sync::Arc::new(arr); +/// let dict_array: &DictionaryArray<Int32Type> = as_dictionary_array::<Int32Type>(&arr); +/// ``` +pub fn as_dictionary_array<T>(arr: &dyn Array) -> &DictionaryArray<T> +where + T: ArrowDictionaryKeyType, +{ + arr.as_any() + .downcast_ref::<DictionaryArray<T>>() + .expect("Unable to downcast to dictionary array") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`GenericListArray<T>`], panic'ing on failure. +pub fn as_generic_list_array<S: OffsetSizeTrait>( + arr: &dyn Array, +) -> &GenericListArray<S> { + arr.as_any() + .downcast_ref::<GenericListArray<S>>() + .expect("Unable to downcast to list array") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`ListArray`], panic'ing on failure. +#[inline] +pub fn as_list_array(arr: &dyn Array) -> &ListArray { + as_generic_list_array::<i32>(arr) +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`LargeListArray`], panic'ing on failure. +#[inline] +pub fn as_large_list_array(arr: &dyn Array) -> &LargeListArray { + as_generic_list_array::<i64>(arr) +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`GenericBinaryArray<S>`], panic'ing on failure. +#[inline] +pub fn as_generic_binary_array<S: OffsetSizeTrait>( + arr: &dyn Array, +) -> &GenericBinaryArray<S> { + arr.as_any() + .downcast_ref::<GenericBinaryArray<S>>() + .expect("Unable to downcast to binary array") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`StringArray`], panic'ing on failure. 
+/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::cast::as_string_array; +/// # use arrow_array::{ArrayRef, StringArray}; +/// +/// let arr: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("foo")])); +/// let string_array = as_string_array(&arr); +/// ``` +pub fn as_string_array(arr: &dyn Array) -> &StringArray { + arr.as_any() + .downcast_ref::<StringArray>() + .expect("Unable to downcast to StringArray") +} + +/// Force downcast of an [`Array`], such as an [`ArrayRef`] to +/// [`BooleanArray`], panic'ing on failure. +/// +/// # Example +/// +/// ``` +/// # use std::sync::Arc; +/// # use arrow_array::{ArrayRef, BooleanArray}; +/// # use arrow_array::cast::as_boolean_array; +/// +/// let arr: ArrayRef = Arc::new(BooleanArray::from_iter(vec![Some(true)])); +/// let boolean_array = as_boolean_array(&arr); +/// ``` +pub fn as_boolean_array(arr: &dyn Array) -> &BooleanArray { + arr.as_any() + .downcast_ref::<BooleanArray>() + .expect("Unable to downcast to BooleanArray") +} + +macro_rules! 
array_downcast_fn { + ($name: ident, $arrty: ty, $arrty_str:expr) => { + #[doc = "Force downcast of an [`Array`], such as an [`ArrayRef`] to "] + #[doc = $arrty_str] + pub fn $name(arr: &dyn Array) -> &$arrty { + arr.as_any().downcast_ref::<$arrty>().expect(concat!( + "Unable to downcast to typed array through ", + stringify!($name) + )) + } + }; + + // use recursive macro to generate dynamic doc string for a given array type + ($name: ident, $arrty: ty) => { + array_downcast_fn!( + $name, + $arrty, + concat!("[`", stringify!($arrty), "`], panic'ing on failure.") + ); + }; +} + +array_downcast_fn!(as_largestring_array, LargeStringArray); +array_downcast_fn!(as_null_array, NullArray); +array_downcast_fn!(as_struct_array, StructArray); +array_downcast_fn!(as_union_array, UnionArray); +array_downcast_fn!(as_map_array, MapArray); +array_downcast_fn!(as_decimal_array, Decimal128Array); + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + + #[test] + fn test_as_decimal_array_ref() { + let array: Decimal128Array = vec![Some(123), None, Some(1111)] + .into_iter() + .collect::<Decimal128Array>() + .with_precision_and_scale(10, 2) + .unwrap(); + assert!(!as_decimal_array(&array).is_empty()); + let result_decimal = as_decimal_array(&array); + assert_eq!(result_decimal, &array); + } Review Comment: Nice -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org