This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8d166a144 Add StructArray Constructors (#3879) (#4064)
8d166a144 is described below
commit 8d166a14467ac8e59a47174de676971f9f896e78
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Tue Apr 25 14:04:27 2023 -0400
Add StructArray Constructors (#3879) (#4064)
* Add StructArray Constructors (#3879)
* Fix doc
* Add try_new
* Update other constructors
---
arrow-array/src/array/struct_array.rs | 302 ++++++++++++++++++----------------
arrow-array/src/record_batch.rs | 11 +-
arrow-buffer/src/buffer/null.rs | 7 +
arrow-json/src/reader/struct_array.rs | 16 +-
4 files changed, 173 insertions(+), 163 deletions(-)
diff --git a/arrow-array/src/array/struct_array.rs
b/arrow-array/src/array/struct_array.rs
index fa43062b7..a18f38c08 100644
--- a/arrow-array/src/array/struct_array.rs
+++ b/arrow-array/src/array/struct_array.rs
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use crate::{make_array, Array, ArrayRef, RecordBatch};
-use arrow_buffer::{buffer_bin_or, Buffer, NullBuffer};
+use crate::{make_array, new_null_array, Array, ArrayRef, RecordBatch};
+use arrow_buffer::{BooleanBuffer, Buffer, NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
use arrow_schema::{ArrowError, DataType, Field, Fields, SchemaBuilder};
use std::sync::Arc;
@@ -77,10 +77,136 @@ pub struct StructArray {
len: usize,
data_type: DataType,
nulls: Option<NullBuffer>,
- pub(crate) fields: Vec<ArrayRef>,
+ fields: Vec<ArrayRef>,
}
impl StructArray {
+ /// Create a new [`StructArray`] from the provided parts, panicking on
failure
+ ///
+ /// # Panics
+ ///
+ /// Panics if [`Self::try_new`] returns an error
+ pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls:
Option<NullBuffer>) -> Self {
+ Self::try_new(fields, arrays, nulls).unwrap()
+ }
+
+ /// Create a new [`StructArray`] from the provided parts, returning an
error on failure
+ ///
+ /// # Errors
+ ///
+ /// Errors if
+ ///
+ /// * `fields.len() != arrays.len()`
+ /// * `fields[i].data_type() != arrays[i].data_type()`
+ /// * `arrays[i].len() != arrays[j].len()`
+ /// * `arrays[i].len() != nulls.len()`
+ /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
+ pub fn try_new(
+ fields: Fields,
+ arrays: Vec<ArrayRef>,
+ nulls: Option<NullBuffer>,
+ ) -> Result<Self, ArrowError> {
+ if fields.len() != arrays.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect number of arrays for StructArray fields, expected
{} got {}",
+ fields.len(),
+ arrays.len()
+ )));
+ }
+ let len = arrays.first().map(|x| x.len()).unwrap_or_default();
+
+ if let Some(n) = nulls.as_ref() {
+ if n.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect number of nulls for StructArray, expected {len}
got {}",
+ n.len(),
+ )));
+ }
+ }
+
+ for (f, a) in fields.iter().zip(&arrays) {
+ if f.data_type() != a.data_type() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect datatype for StructArray field {:?}, expected
{} got {}",
+ f.name(),
+ f.data_type(),
+ a.data_type()
+ )));
+ }
+
+ if a.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect array length for StructArray field {:?},
expected {} got {}",
+ f.name(),
+ len,
+ a.len()
+ )));
+ }
+
+ if let Some(a) = a.nulls() {
+ let nulls_valid = f.is_nullable()
+ || nulls.as_ref().map(|n|
n.contains(a)).unwrap_or_default();
+
+ if !nulls_valid {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Found unmasked nulls for non-nullable StructArray
field {:?}",
+ f.name()
+ )));
+ }
+ }
+ }
+
+ Ok(Self {
+ len,
+ data_type: DataType::Struct(fields),
+ nulls: nulls.filter(|n| n.null_count() > 0),
+ fields: arrays,
+ })
+ }
+
+ /// Create a new [`StructArray`] of length `len` where all values are null
+ pub fn new_null(fields: Fields, len: usize) -> Self {
+ let arrays = fields
+ .iter()
+ .map(|f| new_null_array(f.data_type(), len))
+ .collect();
+
+ Self {
+ len,
+ data_type: DataType::Struct(fields),
+ nulls: Some(NullBuffer::new_null(len)),
+ fields: arrays,
+ }
+ }
+
+ /// Create a new [`StructArray`] from the provided parts without validation
+ ///
+ /// # Safety
+ ///
+ /// Safe if [`Self::new`] would not panic with the given arguments
+ pub unsafe fn new_unchecked(
+ fields: Fields,
+ arrays: Vec<ArrayRef>,
+ nulls: Option<NullBuffer>,
+ ) -> Self {
+ let len = arrays.first().map(|x| x.len()).unwrap_or_default();
+ Self {
+ len,
+ data_type: DataType::Struct(fields),
+ nulls,
+ fields: arrays,
+ }
+ }
+
+ /// Deconstruct this array into its constituent parts
+ pub fn into_parts(self) -> (Fields, Vec<ArrayRef>, Option<NullBuffer>) {
+ let f = match self.data_type {
+ DataType::Struct(f) => f,
+ _ => unreachable!(),
+ };
+ (f, self.fields, self.nulls)
+ }
+
/// Returns the field at `pos`.
pub fn column(&self, pos: usize) -> &ArrayRef {
&self.fields[pos]
@@ -183,66 +309,18 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
type Error = ArrowError;
/// builds a StructArray from a vector of names and arrays.
- /// This errors if the values have a different length.
- /// An entry is set to Null when all values are null.
fn try_from(values: Vec<(&str, ArrayRef)>) -> Result<Self, ArrowError> {
- let values_len = values.len();
-
- // these will be populated
- let mut fields = Vec::with_capacity(values_len);
- let mut child_data = Vec::with_capacity(values_len);
-
- // len: the size of the arrays.
- let mut len: Option<usize> = None;
- // null: the null mask of the arrays.
- let mut null: Option<Buffer> = None;
- for (field_name, array) in values {
- let child_datum = array.to_data();
- let child_datum_len = child_datum.len();
- if let Some(len) = len {
- if len != child_datum_len {
- return Err(ArrowError::InvalidArgumentError(
- format!("Array of field \"{field_name}\" has length
{child_datum_len}, but previous elements have length {len}.
- All arrays in every entry in a struct array must have
the same length.")
- ));
- }
- } else {
- len = Some(child_datum_len)
- }
- fields.push(Arc::new(Field::new(
- field_name,
- array.data_type().clone(),
- child_datum.nulls().is_some(),
- )));
-
- if let Some(child_nulls) = child_datum.nulls() {
- null = Some(if let Some(null_buffer) = &null {
- buffer_bin_or(
- null_buffer,
- 0,
- child_nulls.buffer(),
- child_nulls.offset(),
- child_datum_len,
- )
- } else {
- child_nulls.inner().sliced()
- });
- } else if null.is_some() {
- // when one of the fields has no nulls, then there is no null
in the array
- null = None;
- }
- child_data.push(child_datum);
- }
- let len = len.unwrap();
-
- let builder = ArrayData::builder(DataType::Struct(fields.into()))
- .len(len)
- .null_bit_buffer(null)
- .child_data(child_data);
-
- let array_data = unsafe { builder.build_unchecked() };
-
- Ok(StructArray::from(array_data))
+ let (schema, arrays): (SchemaBuilder, _) = values
+ .into_iter()
+ .map(|(name, array)| {
+ (
+ Field::new(name, array.data_type().clone(),
array.nulls().is_some()),
+ array,
+ )
+ })
+ .unzip();
+
+ StructArray::try_new(schema.finish().fields, arrays, None)
}
}
@@ -303,38 +381,8 @@ impl Array for StructArray {
impl From<Vec<(Field, ArrayRef)>> for StructArray {
fn from(v: Vec<(Field, ArrayRef)>) -> Self {
- let iter = v.into_iter();
- let capacity = iter.size_hint().0;
-
- let mut len = None;
- let mut schema = SchemaBuilder::with_capacity(capacity);
- let mut child_data = Vec::with_capacity(capacity);
- for (field, array) in iter {
- // Check the length of the child arrays
- assert_eq!(
- *len.get_or_insert(array.len()),
- array.len(),
- "all child arrays of a StructArray must have the same length"
- );
- // Check data types of child arrays
- assert_eq!(
- field.data_type(),
- array.data_type(),
- "the field data types must match the array data in a
StructArray"
- );
- schema.push(field);
- child_data.push(array.to_data());
- }
- let field_types = schema.finish().fields;
- let array_data = ArrayData::builder(DataType::Struct(field_types))
- .child_data(child_data)
- .len(len.unwrap_or_default());
- let array_data = unsafe { array_data.build_unchecked() };
-
- // We must validate nullability
- array_data.validate_nulls().unwrap();
-
- Self::from(array_data)
+ let (schema, arrays): (SchemaBuilder, _) = v.into_iter().unzip();
+ StructArray::new(schema.finish().fields, arrays, None)
}
}
@@ -359,37 +407,10 @@ impl std::fmt::Debug for StructArray {
impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray {
fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self {
- let capacity = pair.0.len();
- let mut len = None;
- let mut schema = SchemaBuilder::with_capacity(capacity);
- let mut child_data = Vec::with_capacity(capacity);
- for (field, array) in pair.0 {
- // Check the length of the child arrays
- assert_eq!(
- *len.get_or_insert(array.len()),
- array.len(),
- "all child arrays of a StructArray must have the same length"
- );
- // Check data types of child arrays
- assert_eq!(
- field.data_type(),
- array.data_type(),
- "the field data types must match the array data in a
StructArray"
- );
- schema.push(field);
- child_data.push(array.to_data());
- }
- let field_types = schema.finish().fields;
- let array_data = ArrayData::builder(DataType::Struct(field_types))
- .null_bit_buffer(Some(pair.1))
- .child_data(child_data)
- .len(len.unwrap_or_default());
- let array_data = unsafe { array_data.build_unchecked() };
-
- // We must validate nullability
- array_data.validate_nulls().unwrap();
-
- Self::from(array_data)
+ let len = pair.0.first().map(|x| x.1.len()).unwrap_or_default();
+ let (fields, arrays): (SchemaBuilder, Vec<_>) =
pair.0.into_iter().unzip();
+ let nulls = NullBuffer::new(BooleanBuffer::new(pair.1, 0, len));
+ Self::new(fields.finish().fields, arrays, Some(nulls))
}
}
@@ -512,12 +533,7 @@ mod tests {
let struct_data = arr.into_data();
assert_eq!(4, struct_data.len());
- assert_eq!(1, struct_data.null_count());
- assert_eq!(
- // 00001011
- &[11_u8],
- struct_data.nulls().unwrap().validity()
- );
+ assert_eq!(0, struct_data.null_count());
let expected_string_data = ArrayData::builder(DataType::Utf8)
.len(4)
@@ -549,20 +565,20 @@ mod tests {
let ints: ArrayRef =
Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));
- let arr =
- StructArray::try_from(vec![("f1", strings.clone()), ("f2",
ints.clone())]);
+ let err =
+ StructArray::try_from(vec![("f1", strings.clone()), ("f2",
ints.clone())])
+ .unwrap_err()
+ .to_string();
- match arr {
- Err(ArrowError::InvalidArgumentError(e)) => {
- assert!(e.starts_with("Array of field \"f2\" has length 4, but
previous elements have length 3."));
- }
- _ => panic!("This test got an unexpected error type"),
- };
+ assert_eq!(
+ err,
+ "Invalid argument error: Incorrect array length for StructArray
field \"f2\", expected 3 got 4"
+ )
}
#[test]
#[should_panic(
- expected = "the field data types must match the array data in a
StructArray"
+ expected = "Incorrect datatype for StructArray field \\\"b\\\",
expected Int16 got Boolean"
)]
fn test_struct_array_from_mismatched_types_single() {
drop(StructArray::from(vec![(
@@ -574,7 +590,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "the field data types must match the array data in a
StructArray"
+ expected = "Incorrect datatype for StructArray field \\\"b\\\",
expected Int16 got Boolean"
)]
fn test_struct_array_from_mismatched_types_multiple() {
drop(StructArray::from(vec![
@@ -679,7 +695,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "all child arrays of a StructArray must have the same
length"
+ expected = "Incorrect array length for StructArray field \\\"c\\\",
expected 1 got 2"
)]
fn test_invalid_struct_child_array_lengths() {
drop(StructArray::from(vec![
@@ -702,7 +718,7 @@ mod tests {
#[test]
#[should_panic(
- expected = "non-nullable child of type Int32 contains nulls not
present in parent Struct"
+ expected = "Found unmasked nulls for non-nullable StructArray field
\\\"c\\\""
)]
fn test_struct_array_from_mismatched_nullability() {
drop(StructArray::from(vec![(
diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs
index ee61d2da6..8fb08111c 100644
--- a/arrow-array/src/record_batch.rs
+++ b/arrow-array/src/record_batch.rs
@@ -467,17 +467,12 @@ impl Default for RecordBatchOptions {
}
impl From<StructArray> for RecordBatch {
fn from(value: StructArray) -> Self {
- assert_eq!(
- value.null_count(),
- 0,
- "Cannot convert nullable StructArray to RecordBatch, see
StructArray documentation"
- );
let row_count = value.len();
- let schema = Arc::new(Schema::new(value.fields().clone()));
- let columns = value.fields;
+ let (fields, columns, nulls) = value.into_parts();
+ assert_eq!(nulls.map(|n| n.null_count()).unwrap_or_default(), 0,
"Cannot convert nullable StructArray to RecordBatch, see StructArray
documentation");
RecordBatch {
- schema,
+ schema: Arc::new(Schema::new(fields)),
row_count,
columns,
}
diff --git a/arrow-buffer/src/buffer/null.rs b/arrow-buffer/src/buffer/null.rs
index f088e7fa6..cdb0c2aeb 100644
--- a/arrow-buffer/src/buffer/null.rs
+++ b/arrow-buffer/src/buffer/null.rs
@@ -67,6 +67,13 @@ impl NullBuffer {
}
}
+ /// Returns true if all nulls in `other` also exist in self
+ pub fn contains(&self, other: &NullBuffer) -> bool {
+ let lhs = self.inner().bit_chunks().iter_padded();
+ let rhs = other.inner().bit_chunks().iter_padded();
+ lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
+ }
+
/// Returns the length of this [`NullBuffer`]
#[inline]
pub fn len(&self) -> usize {
diff --git a/arrow-json/src/reader/struct_array.rs
b/arrow-json/src/reader/struct_array.rs
index 6c6f1457b..707b56d50 100644
--- a/arrow-json/src/reader/struct_array.rs
+++ b/arrow-json/src/reader/struct_array.rs
@@ -120,19 +120,11 @@ impl ArrayDecoder for StructArrayDecoder {
for (c, f) in child_data.iter().zip(fields) {
// Sanity check
assert_eq!(c.len(), pos.len());
+ if let Some(a) = c.nulls() {
+ let nulls_valid = f.is_nullable()
+ || nulls.as_ref().map(|n|
n.contains(a)).unwrap_or_default();
- if !f.is_nullable() && c.null_count() != 0 {
- // Need to verify nulls
- let valid = match nulls.as_ref() {
- Some(nulls) => {
- let lhs = nulls.inner().bit_chunks().iter_padded();
- let rhs =
c.nulls().unwrap().inner().bit_chunks().iter_padded();
- lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
- }
- None => false,
- };
-
- if !valid {
+ if !nulls_valid {
return Err(ArrowError::JsonError(format!("Encountered
unmasked nulls in non-nullable StructArray child: {f}")));
}
}