This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 51513c1 ARROW-12411: [Rust] Create RecordBatches from Iterators (#7)
51513c1 is described below
commit 51513c165b8f39760960f2eeac9262f0b6e8c9f1
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Apr 27 06:44:22 2021 -0400
ARROW-12411: [Rust] Create RecordBatches from Iterators (#7)
---
arrow/src/record_batch.rs | 132 +++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 131 insertions(+), 1 deletion(-)
diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs
index 93abb90..39ef227 100644
--- a/arrow/src/record_batch.rs
+++ b/arrow/src/record_batch.rs
@@ -243,6 +243,90 @@ impl RecordBatch {
pub fn columns(&self) -> &[ArrayRef] {
&self.columns[..]
}
+
+ /// Create a `RecordBatch` from an iterable list of pairs of the
+ /// form `(field_name, array)`, with the same requirements on
+ /// fields and arrays as [`RecordBatch::try_new`]. This method is
+ /// often used to create a single `RecordBatch` from arrays,
+ /// e.g. for testing.
+ ///
+ /// The resulting schema is marked as nullable for each column if
+ /// the array for that column is has any nulls. To explicitly
+ /// specify nullibility, use [`RecordBatch::try_from_iter_with_nullable`]
+ ///
+ /// Example:
+ /// ```
+ /// use std::sync::Arc;
+ /// use arrow::array::{ArrayRef, Int32Array, StringArray};
+ /// use arrow::datatypes::{Schema, Field, DataType};
+ /// use arrow::record_batch::RecordBatch;
+ ///
+ /// let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+ /// let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b"]));
+ ///
+ /// let record_batch = RecordBatch::try_from_iter(vec![
+ /// ("a", a),
+ /// ("b", b),
+ /// ]);
+ ///
+ pub fn try_from_iter<I, F>(value: I) -> Result<Self>
+ where
+ I: IntoIterator<Item = (F, ArrayRef)>,
+ F: AsRef<str>,
+ {
+ // TODO: implement `TryFrom` trait, once
+ // https://github.com/rust-lang/rust/issues/50133 is no longer an
+ // issue
+ let iter = value.into_iter().map(|(field_name, array)| {
+ let nullable = array.null_count() > 0;
+ (field_name, array, nullable)
+ });
+
+ Self::try_from_iter_with_nullable(iter)
+ }
+
+ /// Create a `RecordBatch` from an iterable list of tuples of the
+ /// form `(field_name, array, nullable)`, with the same requirements on
+ /// fields and arrays as [`RecordBatch::try_new`]. This method is often
+ /// used to create a single `RecordBatch` from arrays, e.g. for
+ /// testing.
+ ///
+ /// Example:
+ /// ```
+ /// use std::sync::Arc;
+ /// use arrow::array::{ArrayRef, Int32Array, StringArray};
+ /// use arrow::datatypes::{Schema, Field, DataType};
+ /// use arrow::record_batch::RecordBatch;
+ ///
+ /// let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2]));
+ /// let b: ArrayRef = Arc::new(StringArray::from(vec![Some("a"),
Some("b")]));
+ ///
+ /// // Note neither `a` nor `b` has any actual nulls, but we mark
+ /// // b an nullable
+ /// let record_batch = RecordBatch::try_from_iter_with_nullable(vec![
+ /// ("a", a, false),
+ /// ("b", b, true),
+ /// ]);
+ pub fn try_from_iter_with_nullable<I, F>(value: I) -> Result<Self>
+ where
+ I: IntoIterator<Item = (F, ArrayRef, bool)>,
+ F: AsRef<str>,
+ {
+ // TODO: implement `TryFrom` trait, once
+ // https://github.com/rust-lang/rust/issues/50133 is no longer an
+ // issue
+ let (fields, columns) = value
+ .into_iter()
+ .map(|(field_name, array, nullable)| {
+ let field_name = field_name.as_ref();
+ let field = Field::new(field_name, array.data_type().clone(),
nullable);
+ (field, array)
+ })
+ .unzip();
+
+ let schema = Arc::new(Schema::new(fields));
+ RecordBatch::try_new(schema, columns)
+ }
}
/// Options that control the behaviour used when creating a [`RecordBatch`].
@@ -261,7 +345,8 @@ impl Default for RecordBatchOptions {
}
impl From<&StructArray> for RecordBatch {
- /// Create a record batch from struct array.
+ /// Create a record batch from struct array, where each field of
+ /// the `StructArray` becomes a `Field` in the schema.
///
/// This currently does not flatten and nested struct types
fn from(struct_array: &StructArray) -> Self {
@@ -328,7 +413,10 @@ mod tests {
let record_batch =
RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a),
Arc::new(b)])
.unwrap();
+ check_batch(record_batch)
+ }
+ fn check_batch(record_batch: RecordBatch) {
assert_eq!(5, record_batch.num_rows());
assert_eq!(2, record_batch.num_columns());
assert_eq!(&DataType::Int32,
record_batch.schema().field(0).data_type());
@@ -338,6 +426,48 @@ mod tests {
}
#[test]
+ fn create_record_batch_try_from_iter() {
+ let a: ArrayRef = Arc::new(Int32Array::from(vec![
+ Some(1),
+ Some(2),
+ None,
+ Some(4),
+ Some(5),
+ ]));
+ let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d",
"e"]));
+
+ let record_batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])
+ .expect("valid conversion");
+
+ let expected_schema = Schema::new(vec![
+ Field::new("a", DataType::Int32, true),
+ Field::new("b", DataType::Utf8, false),
+ ]);
+ assert_eq!(record_batch.schema().as_ref(), &expected_schema);
+ check_batch(record_batch);
+ }
+
+ #[test]
+ fn create_record_batch_try_from_iter_with_nullable() {
+ let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5]));
+ let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c", "d",
"e"]));
+
+ // Note there are no nulls in a or b, but we specify that b is nullable
+ let record_batch = RecordBatch::try_from_iter_with_nullable(vec![
+ ("a", a, false),
+ ("b", b, true),
+ ])
+ .expect("valid conversion");
+
+ let expected_schema = Schema::new(vec![
+ Field::new("a", DataType::Int32, false),
+ Field::new("b", DataType::Utf8, true),
+ ]);
+ assert_eq!(record_batch.schema().as_ref(), &expected_schema);
+ check_batch(record_batch);
+ }
+
+ #[test]
fn create_record_batch_schema_mismatch() {
let schema = Schema::new(vec![Field::new("a", DataType::Int32,
false)]);