This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b3b814c ARROW-4304: [Rust] Enhance documentation for arrow
b3b814c is described below
commit b3b814c141ee7cb88601c91ae5f75659a5a6b7fc
Author: Andreas Zimmerer <[email protected]>
AuthorDate: Mon Apr 6 11:35:42 2020 -0600
ARROW-4304: [Rust] Enhance documentation for arrow
Hi all,
I hope I don't bother you too much with this.
I recently started using the Rust implementation of Apache Arrow and it's
going well so far! However, I noticed that the documentation is a bit sparse.
I then dug a little bit around in Jira and noticed there are a bunch of
issues targeting enhancement of documentation, most notably
[ARROW-4304](https://issues.apache.org/jira/browse/ARROW-4304),
[ARROW-4683](https://issues.apache.org/jira/browse/ARROW-4683) and
[ARROW-4927](https://issues.apache.org/jira/browse/ARROW-4927).
I then started documenting a few things and adding working doctests to it.
I also updated some external links that were broken.
It's far from complete but I guess it's a decent start for a wonderful
documentation of this crate :blush:
I am happy to discuss what I've written :innocent:
- [x] tests passing (including doctests)
- [x] code formatting passes
- [x] link check (only for external links)
Closes #6828 from Jibbow/arrow-doc
Authored-by: Andreas Zimmerer <[email protected]>
Signed-off-by: Andy Grove <[email protected]>
---
rust/arrow/src/array/array.rs | 152 +++++++++++++++++++++++++++++-----
rust/arrow/src/array/builder.rs | 176 +++++++++++++++++++++++++++++++++++-----
rust/arrow/src/array/mod.rs | 33 +++++++-
rust/arrow/src/datatypes.rs | 88 ++++++++++++++++----
rust/arrow/src/record_batch.rs | 113 ++++++++++++++++++++++----
5 files changed, 489 insertions(+), 73 deletions(-)
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index 05620e3..9c04b16 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -45,57 +45,166 @@ const MICROSECONDS: i64 = 1_000_000;
const NANOSECONDS: i64 = 1_000_000_000;
/// Trait for dealing with different types of array at runtime when the type
of the
-/// array is not known in advance
+/// array is not known in advance.
pub trait Array: fmt::Debug + Send + Sync + ArrayEqual + JsonEqual {
- /// Returns the array as `Any` so that it can be downcast to a specific
implementation
+ /// Returns the array as [`Any`](std::any::Any) so that it can be
+ /// downcast to a specific implementation.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use arrow::array::Int32Array;
+ /// use arrow::datatypes::{Schema, Field, DataType};
+ /// use arrow::record_batch::RecordBatch;
+ ///
+ /// # fn main() -> arrow::error::Result<()> {
+ /// let id = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ /// let batch = RecordBatch::try_new(
+ /// Arc::new(Schema::new(vec![Field::new("id", DataType::Int32,
false)])),
+ /// vec![Arc::new(id)]
+ /// )?;
+ ///
+ /// let int32array = batch
+ /// .column(0)
+ /// .as_any()
+ /// .downcast_ref::<Int32Array>()
+ /// .expect("Failed to downcast");
+ /// # Ok(())
+ /// # }
+ /// ```
fn as_any(&self) -> &Any;
- /// Returns a reference-counted pointer to the data of this array
+ /// Returns a reference-counted pointer to the underlying data of this
array.
fn data(&self) -> ArrayDataRef;
- /// Returns a borrowed & reference-counted pointer to the data of this
array
+ /// Returns a borrowed & reference-counted pointer to the underlying data
of this array.
fn data_ref(&self) -> &ArrayDataRef;
- /// Returns a reference to the data type of this array
+ /// Returns a reference to the [`DataType`](crate::datatypes::DataType) of
this array.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::datatypes::DataType;
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ ///
+ /// assert_eq!(*array.data_type(), DataType::Int32);
+ /// ```
fn data_type(&self) -> &DataType {
self.data_ref().data_type()
}
/// Returns a zero-copy slice of this array with the indicated offset and
length.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ /// // Make slice over the values [2, 3, 4]
+ /// let array_slice = array.slice(1, 3);
+ ///
+ /// assert!(array_slice.equals(&Int32Array::from(vec![2, 3, 4])));
+ /// ```
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
make_array(slice_data(self.data(), offset, length))
}
- /// Returns the length (i.e., number of elements) of this array
+ /// Returns the length (i.e., number of elements) of this array.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ ///
+ /// assert_eq!(array.len(), 5);
+ /// ```
fn len(&self) -> usize {
self.data().len()
}
- /// Returns the offset of this array
+ /// Returns the offset into the underlying data used by this array(-slice).
+ /// Note that the underlying data can be shared by many arrays.
+ /// This defaults to `0`.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// let array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ /// // Make slice over the values [2, 3, 4]
+ /// let array_slice = array.slice(1, 3);
+ ///
+ /// assert_eq!(array.offset(), 0);
+ /// assert_eq!(array_slice.offset(), 1);
+ /// ```
fn offset(&self) -> usize {
self.data().offset()
}
- /// Returns whether the element at index `i` is null
- fn is_null(&self, i: usize) -> bool {
- self.data().is_null(self.data().offset() + i)
+ /// Returns whether the element at `index` is null.
+ /// When using this function on a slice, the index is relative to the
slice.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// let array = Int32Array::from(vec![Some(1), None]);
+ ///
+ /// assert_eq!(array.is_null(0), false);
+ /// assert_eq!(array.is_null(1), true);
+ /// ```
+ fn is_null(&self, index: usize) -> bool {
+ self.data().is_null(self.data().offset() + index)
}
- /// Returns whether the element at index `i` is not null
- fn is_valid(&self, i: usize) -> bool {
- self.data().is_valid(self.data().offset() + i)
+ /// Returns whether the element at `index` is not null.
+ /// When using this function on a slice, the index is relative to the
slice.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// let array = Int32Array::from(vec![Some(1), None]);
+ ///
+ /// assert_eq!(array.is_valid(0), true);
+ /// assert_eq!(array.is_valid(1), false);
+ /// ```
+ fn is_valid(&self, index: usize) -> bool {
+ self.data().is_valid(self.data().offset() + index)
}
- /// Returns the total number of nulls in this array
+ /// Returns the total number of null values in this array.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{Array, Int32Array};
+ ///
+ /// // Construct an array with values [1, NULL, NULL]
+ /// let array = Int32Array::from(vec![Some(1), None, None]);
+ ///
+ /// assert_eq!(array.null_count(), 2);
+ /// ```
fn null_count(&self) -> usize {
self.data().null_count()
}
}
+/// A reference-counted reference to a generic `Array`.
pub type ArrayRef = Arc<Array>;
-/// Constructs an array using the input `data`. Returns a reference-counted
`Array`
-/// instance.
+/// Constructs an array using the input `data`.
+/// Returns a reference-counted `Array` instance.
pub fn make_array(data: ArrayDataRef) -> ArrayRef {
match data.data_type() {
DataType::Boolean => Arc::new(BooleanArray::from(data)) as ArrayRef,
@@ -197,6 +306,11 @@ pub fn make_array(data: ArrayDataRef) -> ArrayRef {
}
}
+/// Creates a zero-copy slice of the array's data.
+///
+/// # Panics
+///
+/// Panics if `offset + length > data.len()`.
fn slice_data(data: ArrayDataRef, mut offset: usize, length: usize) ->
ArrayDataRef {
assert!((offset + length) <= data.len());
@@ -316,14 +430,14 @@ impl<T: ArrowNumericType> PrimitiveArray<T> {
PrimitiveArray::from(array_data)
}
- /// Returns a `Buffer` holds all the values of this array.
+ /// Returns a `Buffer` holding all the values of this array.
///
- /// Note this doesn't take account into the offset of this array.
+ /// Note this doesn't take the offset of this array into account.
pub fn values(&self) -> Buffer {
self.data.buffers()[0].clone()
}
- /// Returns the length of this array
+ /// Returns the length of this array.
pub fn len(&self) -> usize {
self.data.len()
}
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index bd73dbd..00d9433 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -15,8 +15,10 @@
// specific language governing permissions and limitations
// under the License.
-//! Defines a `BufferBuilder` capable of creating a `Buffer` which can be used
as an
-//! internal buffer in an `ArrayData` object.
+//! Defines a [`BufferBuilder`](crate::array::BufferBuilder) capable
+//! of creating a [`Buffer`](crate::buffer::Buffer) which can be used
+//! as an internal buffer in an [`ArrayData`](crate::array::ArrayData)
+//! object.
use std::any::Any;
use std::collections::HashMap;
@@ -31,29 +33,172 @@ use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::util::bit_util;
-/// Buffer builder with zero-copy build method
+/// Builder for creating a [`Buffer`](crate::buffer::Buffer) object.
+///
+/// This builder is implemented for primitive types and creates a
+/// buffer with a zero-copy `build()` method.
+///
+/// See trait [`BufferBuilderTrait`](crate::array::BufferBuilderTrait)
+/// for further documentation and examples.
+///
+/// A [`Buffer`](crate::buffer::Buffer) is the underlying data
+/// structure of Arrow's [`Arrays`](crate::array::Array).
+///
+/// For all supported types, there are type definitions for the
+/// generic version of `BufferBuilder<T>`, e.g. `UInt8BufferBuilder`.
+///
+/// # Example:
+///
+/// ```
+/// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+///
+/// # fn main() -> arrow::error::Result<()> {
+/// let mut builder = UInt8BufferBuilder::new(100);
+/// builder.append_slice(&[42, 43, 44]);
+/// builder.append(45);
+/// let buffer = builder.finish();
+///
+/// assert_eq!(unsafe { buffer.typed_data::<u8>() }, &[42, 43, 44, 45]);
+/// # Ok(())
+/// # }
+/// ```
pub struct BufferBuilder<T: ArrowPrimitiveType> {
buffer: MutableBuffer,
len: usize,
_marker: PhantomData<T>,
}
-// Trait for buffer builder. This is used mainly to offer separate
implementations for
-// numeric types and boolean types, while still be able to call methods on
buffer builder
-// with generic primitive type.
+/// Trait for simplifying the construction of
[`Buffers`](crate::buffer::Buffer).
+///
+/// This trait is used mainly to offer separate implementations for
+/// numeric types and boolean types, while still being able to call methods on
a buffer builder
+/// with a generic primitive type.
+/// Separate implementations of this trait allow adding implementation details,
+/// e.g. the implementation for boolean types uses bit-packing.
pub trait BufferBuilderTrait<T: ArrowPrimitiveType> {
+ /// Creates a new builder with initial capacity for _at least_ `capacity`
+ /// elements of type `T`.
+ ///
+ /// The capacity can later be manually adjusted with the
+ /// [`reserve()`](BufferBuilderTrait::reserve) method.
+ /// Also the
+ /// [`append()`](BufferBuilderTrait::append),
+ /// [`append_slice()`](BufferBuilderTrait::append_slice) and
+ /// [`advance()`](BufferBuilderTrait::advance)
+ /// methods automatically increase the capacity if needed.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ ///
+ /// assert!(builder.capacity() >= 10);
+ /// ```
fn new(capacity: usize) -> Self;
+
+ /// Returns the current number of array elements in the internal buffer.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ /// builder.append(42);
+ ///
+ /// assert_eq!(builder.len(), 1);
+ /// ```
fn len(&self) -> usize;
+
+ /// Returns the actual capacity (number of elements) of the internal
buffer.
+ ///
+ /// Note: the internal capacity returned by this method might be larger
than
+ /// what you'd expect after setting the capacity in the `new()` or
`reserve()`
+ /// functions.
fn capacity(&self) -> usize;
- fn advance(&mut self, i: usize) -> Result<()>;
+
+ /// Increases the number of elements in the internal buffer by `n`
+ /// and resizes the buffer as needed.
+ ///
+ /// The values of the newly added elements are undefined.
+ /// This method is usually used when appending `NULL` values to the buffer
+ /// as they still require physical memory space.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ /// builder.advance(2);
+ ///
+ /// assert_eq!(builder.len(), 2);
+ /// ```
+ fn advance(&mut self, n: usize) -> Result<()>;
+
+ /// Reserves memory for _at least_ `n` more elements of type `T`.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ /// builder.reserve(10);
+ ///
+ /// assert!(builder.capacity() >= 20);
+ /// ```
fn reserve(&mut self, n: usize) -> Result<()>;
- fn append(&mut self, v: T::Native) -> Result<()>;
+
+ /// Appends a value of type `T` into the builder,
+ /// growing the internal buffer as needed.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ /// builder.append(42);
+ ///
+ /// assert_eq!(builder.len(), 1);
+ /// ```
+ fn append(&mut self, value: T::Native) -> Result<()>;
+
+ /// Appends a slice of type `T`, growing the internal buffer as needed.
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ /// builder.append_slice(&[42, 44, 46]);
+ ///
+ /// assert_eq!(builder.len(), 3);
+ /// ```
fn append_slice(&mut self, slice: &[T::Native]) -> Result<()>;
+
+ /// Resets this builder and returns an immutable
[`Buffer`](crate::buffer::Buffer).
+ ///
+ /// # Example:
+ ///
+ /// ```
+ /// use arrow::array::{UInt8BufferBuilder, BufferBuilderTrait};
+ ///
+ /// let mut builder = UInt8BufferBuilder::new(10);
+ /// builder.append_slice(&[42, 44, 46]);
+ ///
+ /// let buffer = builder.finish();
+ ///
+ /// assert_eq!(unsafe { buffer.typed_data::<u8>() }, &[42, 44, 46]);
+ /// ```
fn finish(&mut self) -> Buffer;
}
impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
- /// Creates a builder with a fixed initial capacity
default fn new(capacity: usize) -> Self {
let buffer = MutableBuffer::new(capacity *
mem::size_of::<T::Native>());
Self {
@@ -63,18 +208,15 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for
BufferBuilder<T> {
}
}
- /// Returns the number of array elements (slots) in the builder
fn len(&self) -> usize {
self.len
}
- /// Returns the current capacity of the builder (number of elements)
fn capacity(&self) -> usize {
let bit_capacity = self.buffer.capacity() * 8;
(bit_capacity / T::get_bit_width())
}
- // Advances the `len` of the underlying `Buffer` by `i` slots of type T
default fn advance(&mut self, i: usize) -> Result<()> {
let new_buffer_len = (self.len + i) * mem::size_of::<T::Native>();
self.buffer.resize(new_buffer_len)?;
@@ -82,7 +224,6 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for
BufferBuilder<T> {
Ok(())
}
- /// Reserves memory for `n` elements of type `T`.
default fn reserve(&mut self, n: usize) -> Result<()> {
let new_capacity = self.len + n;
let byte_capacity = mem::size_of::<T::Native>() * new_capacity;
@@ -90,20 +231,17 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for
BufferBuilder<T> {
Ok(())
}
- /// Appends a value into the builder, growing the internal buffer as
needed.
default fn append(&mut self, v: T::Native) -> Result<()> {
self.reserve(1)?;
self.write_bytes(v.to_byte_slice(), 1)
}
- /// Appends a slice of type `T`, growing the internal buffer as needed.
default fn append_slice(&mut self, slice: &[T::Native]) -> Result<()> {
let array_slots = slice.len();
self.reserve(array_slots)?;
self.write_bytes(slice.to_byte_slice(), array_slots)
}
- /// Reset this builder and returns an immutable `Buffer`.
default fn finish(&mut self) -> Buffer {
let buf = std::mem::replace(&mut self.buffer, MutableBuffer::new(0));
self.len = 0;
@@ -131,7 +269,6 @@ impl<T: ArrowPrimitiveType> BufferBuilder<T> {
}
impl BufferBuilderTrait<BooleanType> for BufferBuilder<BooleanType> {
- /// Creates a builder with a fixed initial capacity.
fn new(capacity: usize) -> Self {
let byte_capacity = bit_util::ceil(capacity, 8);
let actual_capacity =
bit_util::round_upto_multiple_of_64(byte_capacity);
@@ -144,7 +281,6 @@ impl BufferBuilderTrait<BooleanType> for
BufferBuilder<BooleanType> {
}
}
- // Advances the `len` of the underlying `Buffer` by `i` slots of type T
fn advance(&mut self, i: usize) -> Result<()> {
let new_buffer_len = bit_util::ceil(self.len + i, 8);
self.buffer.resize(new_buffer_len)?;
@@ -152,7 +288,6 @@ impl BufferBuilderTrait<BooleanType> for
BufferBuilder<BooleanType> {
Ok(())
}
- /// Appends a value into the builder, growing the internal buffer as
needed.
fn append(&mut self, v: bool) -> Result<()> {
self.reserve(1)?;
if v {
@@ -166,7 +301,6 @@ impl BufferBuilderTrait<BooleanType> for
BufferBuilder<BooleanType> {
Ok(())
}
- /// Appends a slice of type `T`, growing the internal buffer as needed.
fn append_slice(&mut self, slice: &[bool]) -> Result<()> {
self.reserve(slice.len())?;
for v in slice {
@@ -183,7 +317,6 @@ impl BufferBuilderTrait<BooleanType> for
BufferBuilder<BooleanType> {
Ok(())
}
- /// Reserves memory for `n` elements of type `T`.
fn reserve(&mut self, n: usize) -> Result<()> {
let new_capacity = self.len + n;
if new_capacity > self.capacity() {
@@ -196,7 +329,6 @@ impl BufferBuilderTrait<BooleanType> for
BufferBuilder<BooleanType> {
Ok(())
}
- /// Reset this builder and returns an immutable `Buffer`.
fn finish(&mut self) -> Buffer {
// `append` does not update the buffer's `len` so do it before
`freeze` is called.
let new_buffer_len = bit_util::ceil(self.len, 8);
diff --git a/rust/arrow/src/array/mod.rs b/rust/arrow/src/array/mod.rs
index fbe485a..bd3c2e3 100644
--- a/rust/arrow/src/array/mod.rs
+++ b/rust/arrow/src/array/mod.rs
@@ -15,10 +15,37 @@
// specific language governing permissions and limitations
// under the License.
-//! Defines public types representing Apache Arrow arrays. Arrow's
specification defines
-//! an array as "a sequence of values with known length all having the same
type." For
-//! example, the type `Int16Array` represents an Apache Arrow array of 16-bit
integers.
+//! The central types in Apache Arrow are arrays, represented
+//! by the [`Array` trait](crate::array::Array).
+//! An array represents a known-length sequence of values all
+//! having the same type.
//!
+//! Internally, those values are represented by one or several
+//! [buffers](crate::buffer::Buffer), the number and meaning
+//! of which depend on the array’s data type, as documented in
+//! [the Arrow data layout
specification](https://arrow.apache.org/docs/format/Columnar.html).
+//! For example, the type `Int16Array` represents an Apache
+//! Arrow array of 16-bit integers.
+//!
+//! Those buffers consist of the value data itself and an
+//! optional [bitmap buffer](crate::bitmap::Bitmap) that
+//! indicates which array entries are null values.
+//! The bitmap buffer can be entirely omitted if the array is
+//! known to have zero null values.
+//!
+//! There are concrete implementations of this trait for each
+//! data type, that help you access individual values of the
+//! array.
+//!
+//! # Building an Array
+//!
+//! Arrow's `Arrays` are immutable, but there is the trait
+//! [`ArrayBuilder`](crate::array::ArrayBuilder)
+//! that helps you with constructing new `Arrays`. As with the
+//! `Array` trait, there are builder implementations for all
+//! concrete array types.
+//!
+//! # Example
//! ```
//! extern crate arrow;
//!
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 0ee8cce..03d9dca 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -15,11 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-//! Defines the data-types of Arrow arrays.
+//! Defines the logical data types of Arrow arrays.
//!
-//! For an overview of the terminology used within the arrow project and more
general
-//! information regarding data-types and memory layouts see
-//! [here](https://arrow.apache.org/docs/memory_layout.html).
+//! The most important things you might be looking for are:
+//! * [`Schema`](crate::datatypes::Schema) to describe a schema.
+//! * [`Field`](crate::datatypes::Field) to describe one field within a
schema.
+//! * [`DataType`](crate::datatypes::DataType) to describe the type of a
field.
use std::collections::HashMap;
use std::fmt;
@@ -39,7 +40,11 @@ use serde_json::{
use crate::error::{ArrowError, Result};
-/// The possible relative types that are supported.
+/// The set of datatypes that are supported by this implementation of Apache
Arrow.
+///
+/// The Arrow specification on data types includes some more types.
+/// See also
[`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs)
+/// for Arrow's specification.
///
/// The variants of this enum include primitive fixed size types as well as
parametric or
/// nested types.
@@ -49,55 +54,108 @@ use crate::error::{ArrowError, Result};
///
/// Nested types can themselves be nested within other arrays.
/// For more information on these types please see
-/// [here](https://arrow.apache.org/docs/memory_layout.html).
+/// [the physical memory layout of Apache
Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout).
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash,
PartialOrd, Ord)]
pub enum DataType {
+ /// A boolean datatype representing the values `true` and `false`.
Boolean,
+ /// A signed 8-bit integer.
Int8,
+ /// A signed 16-bit integer.
Int16,
+ /// A signed 32-bit integer.
Int32,
+ /// A signed 64-bit integer.
Int64,
+ /// An unsigned 8-bit integer.
UInt8,
+ /// An unsigned 16-bit integer.
UInt16,
+ /// An unsigned 32-bit integer.
UInt32,
+ /// An unsigned 64-bit integer.
UInt64,
+ /// A 16-bit floating point number.
Float16,
+ /// A 32-bit floating point number.
Float32,
+ /// A 64-bit floating point number.
Float64,
- /// A timestamp with an optional timezone
+ /// A timestamp with an optional timezone.
+ ///
+ /// Time is measured relative to the Unix epoch, counting the elapsed units
+ /// (as given by `TimeUnit`) since 00:00:00.000 on 1 January 1970, excluding
+ /// leap seconds, as a 64-bit integer.
+ ///
+ /// The time zone is a string indicating the name of a time zone, one of:
+ ///
+ /// * As used in the Olson time zone database (the "tz database" or
+ /// "tzdata"), such as "America/New_York"
+ /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as
+07:30
Timestamp(TimeUnit, Option<Arc<String>>),
+ /// A 32-bit date representing the elapsed time since UNIX epoch
(1970-01-01)
+ /// in days (32 bits).
Date32(DateUnit),
+ /// A 64-bit date representing the elapsed time since UNIX epoch
(1970-01-01)
+ /// in milliseconds (64 bits).
Date64(DateUnit),
+ /// A 32-bit time representing the elapsed time since midnight in the unit
of `TimeUnit`.
Time32(TimeUnit),
+ /// A 64-bit time representing the elapsed time since midnight in the unit
of `TimeUnit`.
Time64(TimeUnit),
+ /// Measure of elapsed time in either seconds, milliseconds, microseconds
or nanoseconds.
Duration(TimeUnit),
+ /// A "calendar" interval which models types that don't necessarily
+ /// have a precise duration without the context of a base timestamp (e.g.
+ /// days can differ in length during daylight saving time transitions).
Interval(IntervalUnit),
+ /// Opaque binary data of variable length.
Binary,
+ /// Opaque binary data of fixed size.
+ /// Enum parameter specifies the number of bytes per value.
FixedSizeBinary(i32),
+ /// A variable-length string in Unicode with UTF-8 encoding.
Utf8,
+ /// A list of some logical data type with variable length.
List(Box<DataType>),
+ /// A list of some logical data type with fixed length.
FixedSizeList(Box<DataType>, i32),
+ /// A nested datatype that contains a number of sub-fields.
Struct(Vec<Field>),
Dictionary(Box<DataType>, Box<DataType>),
}
+/// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX
+/// epoch (1970-01-01) in days or milliseconds.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash,
PartialOrd, Ord)]
pub enum DateUnit {
+ /// Days since the UNIX epoch.
Day,
+ /// Milliseconds indicating UNIX time elapsed since the epoch (no
+ /// leap seconds), where the values are evenly divisible by 86400000.
Millisecond,
}
+/// An absolute length of time in seconds, milliseconds, microseconds or
nanoseconds.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash,
PartialOrd, Ord)]
pub enum TimeUnit {
+ /// Time in seconds.
Second,
+ /// Time in milliseconds.
Millisecond,
+ /// Time in microseconds.
Microsecond,
+ /// Time in nanoseconds.
Nanosecond,
}
+/// YEAR_MONTH or DAY_TIME interval in SQL style.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash,
PartialOrd, Ord)]
pub enum IntervalUnit {
+ /// Indicates the number of elapsed whole months, stored as 4-byte
integers.
YearMonth,
+ /// Indicates the number of elapsed days and milliseconds,
+ /// stored as 2 contiguous 32-bit integers (8-bytes in total).
DayTime,
}
@@ -477,22 +535,22 @@ where
op: F,
) -> Self::Simd;
- // SIMD version of equal
+ /// SIMD version of equal
fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
- // SIMD version of not equal
+ /// SIMD version of not equal
fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
- // SIMD version of less than
+ /// SIMD version of less than
fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
- // SIMD version of less than or equal to
+ /// SIMD version of less than or equal to
fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
- // SIMD version of greater than
+ /// SIMD version of greater than
fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
- // SIMD version of greater than or equal to
+ /// SIMD version of greater than or equal to
fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
/// Writes a SIMD result back to a slice
@@ -638,8 +696,9 @@ impl ArrowTemporalType for Time64NanosecondType {}
// impl ArrowTemporalType for IntervalYearMonthType {}
// impl ArrowTemporalType for IntervalDayTimeType {}
-/// A timestamp type allows us to create array builders that take a timestamp
+/// A timestamp type allows us to create array builders that take a timestamp.
pub trait ArrowTimestampType: ArrowTemporalType {
+ /// Returns the `TimeUnit` of this timestamp.
fn get_time_unit() -> TimeUnit;
}
@@ -1322,6 +1381,7 @@ impl fmt::Display for Schema {
}
}
+/// A reference-counted reference to a [`Schema`](crate::datatypes::Schema).
pub type SchemaRef = Arc<Schema>;
#[cfg(test)]
diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs
index 8cfa225..9ae6505 100644
--- a/rust/arrow/src/record_batch.rs
+++ b/rust/arrow/src/record_batch.rs
@@ -15,11 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-//! According to the [Arrow Metadata
Specification](https://arrow.apache.org/docs/metadata.html):
-//!
-//! > A record batch is a collection of top-level named, equal length Arrow
arrays
-//! > (or vectors). If one of the arrays contains nested data, its child
arrays are not
-//! > required to be the same length as the top-level arrays.
+//! A two-dimensional batch of column-oriented data with a defined
+//! [schema](crate::datatypes::Schema).
use std::sync::Arc;
@@ -27,7 +24,18 @@ use crate::array::*;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
-/// A batch of column-oriented data
+/// A two-dimensional batch of column-oriented data with a defined
+/// [schema](crate::datatypes::Schema).
+///
+/// A `RecordBatch` is a two-dimensional dataset of a number of
+/// contiguous arrays, each the same length.
+/// A record batch has a schema which must match its arrays’
+/// datatypes.
+///
+/// Record batches are a convenient unit of work for various
+/// serialization and computation functions, possibly incremental.
+/// See also [CSV reader](crate::csv::Reader) and
+/// [JSON reader](crate::json::Reader).
#[derive(Clone)]
pub struct RecordBatch {
schema: Arc<Schema>,
@@ -35,12 +43,37 @@ pub struct RecordBatch {
}
impl RecordBatch {
- /// Creates a `RecordBatch` from a schema and columns
+ /// Creates a `RecordBatch` from a schema and columns.
///
/// Expects the following:
/// * the vec of columns to not be empty
- /// * the schema and column data types to have equal lengths and match
+ /// * the schema and column data types to have equal lengths
+ /// and match
/// * each array in columns to have the same length
+ ///
+ /// If the conditions are not met, an error is returned.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use arrow::array::Int32Array;
+ /// use arrow::datatypes::{Schema, Field, DataType};
+ /// use arrow::record_batch::RecordBatch;
+ ///
+ /// # fn main() -> arrow::error::Result<()> {
+ /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ /// let schema = Schema::new(vec![
+ /// Field::new("id", DataType::Int32, false)
+ /// ]);
+ ///
+ /// let batch = RecordBatch::try_new(
+ /// Arc::new(schema),
+ /// vec![Arc::new(id_array)]
+ /// )?;
+ /// # Ok(())
+ /// # }
+ /// ```
pub fn try_new(schema: Arc<Schema>, columns: Vec<ArrayRef>) ->
Result<Self> {
// check that there are some columns
if columns.is_empty() {
@@ -74,27 +107,77 @@ impl RecordBatch {
Ok(RecordBatch { schema, columns })
}
- /// Returns the schema of the record batch
+ /// Returns the [`Schema`](crate::datatypes::Schema) of the record batch.
pub fn schema(&self) -> &Arc<Schema> {
&self.schema
}
- /// Number of columns in the record batch
+ /// Returns the number of columns in the record batch.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use arrow::array::Int32Array;
+ /// use arrow::datatypes::{Schema, Field, DataType};
+ /// use arrow::record_batch::RecordBatch;
+ ///
+ /// # fn main() -> arrow::error::Result<()> {
+ /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ /// let schema = Schema::new(vec![
+ /// Field::new("id", DataType::Int32, false)
+ /// ]);
+ ///
+ /// let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(id_array)])?;
+ ///
+ /// assert_eq!(batch.num_columns(), 1);
+ /// # Ok(())
+ /// # }
+ /// ```
pub fn num_columns(&self) -> usize {
self.columns.len()
}
- /// Number of rows in each column
+ /// Returns the number of rows in each column.
+ ///
+ /// # Panics
+ ///
+ /// Panics if the `RecordBatch` contains no columns.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use std::sync::Arc;
+ /// use arrow::array::Int32Array;
+ /// use arrow::datatypes::{Schema, Field, DataType};
+ /// use arrow::record_batch::RecordBatch;
+ ///
+ /// # fn main() -> arrow::error::Result<()> {
+ /// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+ /// let schema = Schema::new(vec![
+ /// Field::new("id", DataType::Int32, false)
+ /// ]);
+ ///
+ /// let batch = RecordBatch::try_new(Arc::new(schema),
vec![Arc::new(id_array)])?;
+ ///
+ /// assert_eq!(batch.num_rows(), 5);
+ /// # Ok(())
+ /// # }
+ /// ```
pub fn num_rows(&self) -> usize {
self.columns[0].data().len()
}
- /// Get a reference to a column's array by index
- pub fn column(&self, i: usize) -> &ArrayRef {
- &self.columns[i]
+ /// Get a reference to a column's array by index.
+ ///
+ /// # Panics
+ ///
+ /// Panics if `index` is outside of `0..num_columns`.
+ pub fn column(&self, index: usize) -> &ArrayRef {
+ &self.columns[index]
}
- /// Get a reference to all columns
+ /// Get a reference to all columns in the record batch.
pub fn columns(&self) -> &[ArrayRef] {
&self.columns[..]
}