This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 44b6ded134 Adds documentation and example recommending Vec<ArrayRef> 
over ChunkedArray (#6527)
44b6ded134 is described below

commit 44b6ded1340d8f3c1bfece18e871ed3be4c394d6
Author: Eric Fredine <[email protected]>
AuthorDate: Thu Oct 10 13:54:46 2024 -0700

    Adds documentation and example recommending Vec<ArrayRef> over ChunkedArray 
(#6527)
    
    * Adds documentation and example recommending Vec<ArrayRef> as an 
alternative to a ChunkedArray abstraction.
    
    * Remove link to example.
    
    * Reduce width of doc example
    
    * Move documentation to arrow-array. Simplify doc example. Remove top-level 
example.
    
    * Update arrow-array/src/lib.rs
    
    ---------
    
    Co-authored-by: Eric Fredine <[email protected]>
    Co-authored-by: Raphael Taylor-Davies 
<[email protected]>
---
 arrow-array/src/lib.rs   | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 arrow/examples/README.md |  2 +-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs
index 90bc5e3120..0fc9d30ab6 100644
--- a/arrow-array/src/lib.rs
+++ b/arrow-array/src/lib.rs
@@ -161,7 +161,52 @@
 //!     array.as_primitive::<Float32Type>().values()
 //! }
 //! ```
+//! # Alternatives to ChunkedArray Support
 //!
+//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python
+//! and C++ Arrow implementations. The recommended alternative is to use one of the following:
+//! - `Vec<ArrayRef>`: a simple, eager version of a `ChunkedArray`
+//! - `impl Iterator<Item=ArrayRef>`: a lazy version of a `ChunkedArray`
+//! - `impl Stream<Item=ArrayRef>`: a lazy async version of a `ChunkedArray`
+//!
+//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes
+//! extensive use of [RecordBatchStream].
+//!
+//! This approach integrates well into the Rust ecosystem, simplifies the implementation and
+//! encourages the use of performant lazy and async patterns.
+//! ```rust
+//! use std::sync::Arc;
+//! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray};
+//! use arrow_array::cast::AsArray;
+//! use arrow_array::types::Float32Type;
+//! use arrow_schema::DataType;
+//!
+//! let batches = [
+//!    RecordBatch::try_from_iter(vec![
+//!         ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef),
+//!         ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef),
+//!    ]).unwrap(),
+//!    RecordBatch::try_from_iter(vec![
+//!         ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef),
+//!         ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef),
+//!    ]).unwrap(),
+//! ];
+//!
+//! let labels: Vec<&str> = batches
+//!    .iter()
+//!    .flat_map(|batch| batch.column(0).as_string::<i32>())
+//!    .map(Option::unwrap)
+//!    .collect();
+//!
+//! let values: Vec<f32> = batches
+//!    .iter()
+//!    .flat_map(|batch| batch.column(1).as_primitive::<Float32Type>().values())
+//!    .copied()
+//!    .collect();
+//!
+//! assert_eq!(labels, ["A", "B", "C", "D", "E"]);
+//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]);
+//! ```
 //! [`ScalarBuffer<T>`]: arrow_buffer::ScalarBuffer
 //! [`ScalarBuffer<i16>`]: arrow_buffer::ScalarBuffer
 //! [`OffsetBuffer<i32>`]: arrow_buffer::OffsetBuffer
@@ -173,6 +218,8 @@
 //! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
 //! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html
 //! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html
+//! [DataFusion]: https://github.com/apache/datafusion
+//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html
 
 #![deny(rustdoc::broken_intra_doc_links)]
 #![warn(missing_docs)]
diff --git a/arrow/examples/README.md b/arrow/examples/README.md
index 5c57ec00cd..87aa6ee047 100644
--- a/arrow/examples/README.md
+++ b/arrow/examples/README.md
@@ -21,7 +21,7 @@
 
 - [`builders.rs`](builders.rs): Using the Builder API
 - [`collect.rs`](collect.rs): Using the `FromIter` API
-- [`dynamic_types.rs`](dynamic_types.rs):
+- [`dynamic_types.rs`](dynamic_types.rs): Dealing with mixed types dynamically at runtime
 - [`read_csv.rs`](read_csv.rs): Reading CSV files with explicit schema, pretty printing Arrays
 - [`read_csv_infer_schema.rs`](read_csv_infer_schema.rs): Reading CSV files, pretty printing Arrays
 - [`tensor_builder.rs`](tensor_builder.rs): Using tensor builder

Reply via email to