This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new bf1a9ec7fa Add additional documentation and examples to ArrayAccessor
(#6141)
bf1a9ec7fa is described below
commit bf1a9ec7faa1e271681317572098c4d83297c3a9
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Aug 1 12:31:14 2024 -0400
Add additional documentation and examples to ArrayAccessor (#6141)
---
arrow-array/src/array/mod.rs | 79 +++++++++++++++++++++++++++++++++++++++++---
arrow/src/lib.rs | 26 ++++++++++-----
2 files changed, 93 insertions(+), 12 deletions(-)
diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs
index b115ff9c14..50c5699bac 100644
--- a/arrow-array/src/array/mod.rs
+++ b/arrow-array/src/array/mod.rs
@@ -437,13 +437,84 @@ impl<'a, T: Array> Array for &'a T {
/// A generic trait for accessing the values of an [`Array`]
///
+/// This trait helps write specialized implementations of algorithms for
+/// different array types. Specialized implementations allow the compiler
+/// to optimize the code for the specific array type, which can lead to
+/// significant performance improvements.
+///
+/// # Example
+/// For example, to write three different implementations of a string length
+/// function for [`StringArray`], [`LargeStringArray`], and [`StringViewArray`],
+/// you can write:
+///
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_array::{ArrayAccessor, ArrayRef, ArrowPrimitiveType,
OffsetSizeTrait, PrimitiveArray};
+/// # use arrow_buffer::ArrowNativeType;
+/// # use arrow_array::cast::AsArray;
+/// # use arrow_array::iterator::ArrayIter;
+/// # use arrow_array::types::{Int32Type, Int64Type};
+/// # use arrow_schema::{ArrowError, DataType};
+/// /// This function takes a dynamically typed `ArrayRef` and calls
+/// /// one of three specialized implementations
+/// fn character_length(arg: ArrayRef) -> Result<ArrayRef, ArrowError> {
+/// match arg.data_type() {
+/// DataType::Utf8 => {
+/// // downcast the ArrayRef to a StringArray and call the
specialized implementation
+/// let string_array = arg.as_string::<i32>();
+/// character_length_general::<Int32Type, _>(string_array)
+/// }
+/// DataType::LargeUtf8 => {
+/// character_length_general::<Int64Type,
_>(arg.as_string::<i64>())
+/// }
+/// DataType::Utf8View => {
+/// character_length_general::<Int32Type, _>(arg.as_string_view())
+/// }
+/// _ => Err(ArrowError::InvalidArgumentError("Unsupported data
type".to_string())),
+/// }
+/// }
+///
+/// /// A generic implementation of the character_length function
+/// /// This function uses the `ArrayAccessor` trait to access the values of
the array
+/// /// so the compiler can generate specialized implementations for
different array types
+/// ///
+/// /// Returns a new array with the length of each string in the input array
+/// /// * Int32Array for Utf8 and Utf8View arrays (lengths are 32-bit integers)
+/// /// * Int64Array for LargeUtf8 arrays (lengths are 64-bit integers)
+/// ///
+/// /// This is generic on the type of the primitive array (different string
arrays have
+/// /// different lengths) and the type of the array accessor (different
string arrays
+/// /// have different ways to access the values)
+/// fn character_length_general<'a, T: ArrowPrimitiveType, V:
ArrayAccessor<Item = &'a str>>(
+/// array: V,
+/// ) -> Result<ArrayRef, ArrowError>
+/// where
+/// T::Native: OffsetSizeTrait,
+/// {
+/// let iter = ArrayIter::new(array);
+/// // Create an Int32Array / Int64Array with the length of each string
+/// let result = iter
+/// .map(|string| {
+/// string.map(|string: &str| {
+/// T::Native::from_usize(string.chars().count())
+/// .expect("should not fail as string.chars will always
return integer")
+/// })
+/// })
+/// .collect::<PrimitiveArray<T>>();
+///
+/// // Return the result as a new ArrayRef (dynamically typed)
+/// Ok(Arc::new(result) as ArrayRef)
+/// }
+/// ```
+///
/// # Validity
///
-/// An [`ArrayAccessor`] must always return a well-defined value for an index
that is
-/// within the bounds `0..Array::len`, including for null indexes where
[`Array::is_null`] is true.
+/// An [`ArrayAccessor`] must always return a well-defined value for an index
+/// that is within the bounds `0..Array::len`, including for null indexes where
+/// [`Array::is_null`] is true.
///
-/// The value at null indexes is unspecified, and implementations must not
rely on a specific
-/// value such as [`Default::default`] being returned, however, it must not be
undefined
+/// The value at null indexes is unspecified, and implementations must not rely
+/// on a specific value such as [`Default::default`] being returned; however, it
+/// must not be undefined.
pub trait ArrayAccessor: Array {
/// The Arrow type of the element being accessed.
type Item: Send + Sync;
diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs
index ea8dfb36b2..8796caf43e 100644
--- a/arrow/src/lib.rs
+++ b/arrow/src/lib.rs
@@ -40,8 +40,10 @@
//! assert_eq!(array.values(), &[1, 0, 3])
//! ```
//!
-//! It is also possible to write generic code. For example, the following is
generic over
-//! all primitively typed arrays
+//! It is also possible to write generic code for different concrete types.
+//! For example, since the following function is generic over all primitively
+//! typed arrays, the Rust compiler will generate a specialized implementation
+//! with optimized code for each concrete type it is invoked with.
//!
//! ```rust
//! # use std::iter::Sum;
@@ -60,7 +62,10 @@
//! assert_eq!(sum(&TimestampNanosecondArray::from(vec![1, 2, 3])), 6);
//! ```
//!
-//! And the following is generic over all arrays with comparable values
+//! And the following uses [`ArrayAccessor`] to implement a generic function
+//! over all arrays with comparable values.
+//!
+//! [`ArrayAccessor`]: array::ArrayAccessor
//!
//! ```rust
//! # use arrow::array::{ArrayAccessor, ArrayIter, Int32Array, StringArray};
@@ -81,10 +86,11 @@
//!
//! # Type Erasure / Trait Objects
//!
-//! It is often the case that code wishes to handle any type of array, without
necessarily knowing
-//! its concrete type. This use-case is catered for by a combination of
[`Array`]
-//! and [`DataType`](datatypes::DataType), with the former providing a
type-erased container for
-//! the array, and the latter identifying the concrete type of array.
+//! It is common to write code that handles any type of array, without
necessarily
+//! knowing its concrete type. This is done using the [`Array`] trait and using
+//! [`DataType`] to determine the appropriate `downcast_ref`.
+//!
+//! [`DataType`]: datatypes::DataType
//!
//! ```rust
//! # use arrow::array::{Array, Float32Array};
@@ -96,14 +102,18 @@
//!
//! fn impl_dyn(array: &dyn Array) {
//! match array.data_type() {
+//! // downcast `dyn Array` to concrete `StringArray`
//! DataType::Utf8 =>
impl_string(array.as_any().downcast_ref().unwrap()),
+//! // downcast `dyn Array` to concrete `Float32Array`
//! DataType::Float32 =>
impl_f32(array.as_any().downcast_ref().unwrap()),
//! _ => unimplemented!()
//! }
//! }
//! ```
//!
-//! To facilitate downcasting, the [`AsArray`](crate::array::AsArray)
extension trait can be used
+//! You can use the [`AsArray`] extension trait to facilitate downcasting:
+//!
+//! [`AsArray`]: crate::array::AsArray
//!
//! ```rust
//! # use arrow::array::{Array, Float32Array, AsArray};