klion26 commented on code in PR #4064:
URL: https://github.com/apache/arrow-rs/pull/4064#discussion_r2004742125
##########
arrow-array/src/array/struct_array.rs:
##########
@@ -77,10 +77,136 @@ pub struct StructArray {
len: usize,
data_type: DataType,
nulls: Option<NullBuffer>,
- pub(crate) fields: Vec<ArrayRef>,
+ fields: Vec<ArrayRef>,
}
impl StructArray {
+ /// Create a new [`StructArray`] from the provided parts, panicking on
failure
+ ///
+ /// # Panics
+ ///
+ /// Panics if [`Self::try_new`] returns an error
+ pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls:
Option<NullBuffer>) -> Self {
+ Self::try_new(fields, arrays, nulls).unwrap()
+ }
+
+ /// Create a new [`StructArray`] from the provided parts, returning an
error on failure
+ ///
+ /// # Errors
+ ///
+ /// Errors if
+ ///
+ /// * `fields.len() != arrays.len()`
+ /// * `fields[i].data_type() != arrays[i].data_type()`
+ /// * `arrays[i].len() != arrays[j].len()`
+ /// * `arrays[i].len() != nulls.len()`
+ /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
+ pub fn try_new(
+ fields: Fields,
+ arrays: Vec<ArrayRef>,
+ nulls: Option<NullBuffer>,
+ ) -> Result<Self, ArrowError> {
+ if fields.len() != arrays.len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect number of arrays for StructArray fields, expected
{} got {}",
+ fields.len(),
+ arrays.len()
+ )));
+ }
+ let len = arrays.first().map(|x| x.len()).unwrap_or_default();
+
+ if let Some(n) = nulls.as_ref() {
+ if n.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect number of nulls for StructArray, expected {len}
got {}",
+ n.len(),
+ )));
+ }
+ }
+
+ for (f, a) in fields.iter().zip(&arrays) {
+ if f.data_type() != a.data_type() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect datatype for StructArray field {:?}, expected
{} got {}",
+ f.name(),
+ f.data_type(),
+ a.data_type()
+ )));
+ }
+
+ if a.len() != len {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Incorrect array length for StructArray field {:?},
expected {} got {}",
Review Comment:
@tustvold, could you please share why we added a check to validate that all
the fields of the `record` type have the same length here? Thanks.
I'm asking because reading [an Avro
file](https://github.com/user-attachments/files/19124467/56f216f3-7e24-40b0-a76a-87a63a5bc254-m0.avro.txt)
with arrow-rs panics here. I added a debug statement,
`println!("Field:{f}, len:{:?} array:{:?}/{:?}", len, a.data_type(),
a.len());`, before line 137, and it printed the output shown below. Note the
`partition` field: its type is the empty struct `Struct([])` and its array
length is 0, while `len` and every other array's length is 1.
```
Field:Field { name: "content", data_type: Int32, nullable: false, dict_id:
0, dict_is_ordered: false, metadata: {} }, len:1 array:Int32/1
Field:Field { name: "file_path", data_type: Utf8, nullable: false, dict_id:
0, dict_is_ordered: false, metadata: {} }, len:1 array:Utf8/1
Field:Field { name: "file_format", data_type: Utf8, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Utf8/1
Field:Field { name: "partition", data_type: Struct([]), nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Struct([])/0
Field:Field { name: "record_count", data_type: Int64, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int64/1
Field:Field { name: "file_size_in_bytes", data_type: Int64, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int64/1
Field:Field { name: "column_sizes", data_type: List(Field { name: "item",
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value",
data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key",
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "value_counts", data_type: List(Field { name: "item",
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value",
data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key",
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "null_value_counts", data_type: List(Field { name:
"item", data_type: Struct([Field { name: "key", data_type: Int32, nullable:
false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name:
"value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key",
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "nan_value_counts", data_type: List(Field { name:
"item", data_type: Struct([Field { name: "key", data_type: Int32, nullable:
false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name:
"value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key",
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "lower_bounds", data_type: List(Field { name: "item",
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value",
data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key",
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }, Field { name: "value", data_type: Binary, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "upper_bounds", data_type: List(Field { name: "item",
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value",
data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} },
len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key",
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false,
metadata: {} }, Field { name: "value", data_type: Binary, nullable: false,
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "key_metadata", data_type: Binary, nullable: true,
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Binary/1
Field:Field { name: "split_offsets", data_type: List(Field { name: "item",
data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{"element-id": "133"} }, len:1 array:List(Field { name: "item", data_type:
Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "equality_ids", data_type: List(Field { name: "item",
data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata:
{"element-id": "136"} }, len:1 array:List(Field { name: "item", data_type:
Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1
Field:Field { name: "sort_order_id", data_type: Int32, nullable: true,
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int32/1
```
After reading the
[spec](https://avro.apache.org/docs/++version++/specification/#schema-record),
I did not find any requirement that all the fields of a record have the same
length. If this logic needs improvement, I would be happy to help. Thanks!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]