klion26 commented on code in PR #4064: URL: https://github.com/apache/arrow-rs/pull/4064#discussion_r2004742125
########## arrow-array/src/array/struct_array.rs: ########## @@ -77,10 +77,136 @@ pub struct StructArray { len: usize, data_type: DataType, nulls: Option<NullBuffer>, - pub(crate) fields: Vec<ArrayRef>, + fields: Vec<ArrayRef>, } impl StructArray { + /// Create a new [`StructArray`] from the provided parts, panicking on failure + /// + /// # Panics + /// + /// Panics if [`Self::try_new`] returns an error + pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls: Option<NullBuffer>) -> Self { + Self::try_new(fields, arrays, nulls).unwrap() + } + + /// Create a new [`StructArray`] from the provided parts, returning an error on failure + /// + /// # Errors + /// + /// Errors if + /// + /// * `fields.len() != arrays.len()` + /// * `fields[i].data_type() != arrays[i].data_type()` + /// * `arrays[i].len() != arrays[j].len()` + /// * `arrays[i].len() != nulls.len()` + /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())` + pub fn try_new( + fields: Fields, + arrays: Vec<ArrayRef>, + nulls: Option<NullBuffer>, + ) -> Result<Self, ArrowError> { + if fields.len() != arrays.len() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect number of arrays for StructArray fields, expected {} got {}", + fields.len(), + arrays.len() + ))); + } + let len = arrays.first().map(|x| x.len()).unwrap_or_default(); + + if let Some(n) = nulls.as_ref() { + if n.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect number of nulls for StructArray, expected {len} got {}", + n.len(), + ))); + } + } + + for (f, a) in fields.iter().zip(&arrays) { + if f.data_type() != a.data_type() { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect datatype for StructArray field {:?}, expected {} got {}", + f.name(), + f.data_type(), + a.data_type() + ))); + } + + if a.len() != len { + return Err(ArrowError::InvalidArgumentError(format!( + "Incorrect array length for StructArray field {:?}, expected {} got {}", Review Comment: @tustvold, could you please share why we added a check to validate that all the fields of the `record` type have the same length here? Thanks. I'm asking this because when playing with [an Avro file](https://github.com/user-attachments/files/19124467/56f216f3-7e24-40b0-a76a-87a63a5bc254-m0.avro.txt) with arrow-rs, it will panic there, after adding some debug message with the code `println!("Field:{f}, len:{:?} array:{:?}/{:?}", len, a.data_type(), a.len());` before line 137, the result showed as below ``` Field:Field { name: "content", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int32/1 Field:Field { name: "file_path", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Utf8/1 Field:Field { name: "file_format", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Utf8/1 Field:Field { name: "partition", data_type: Struct([]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Struct([])/0 Field:Field { name: "record_count", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int64/1 Field:Field { name: "file_size_in_bytes", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int64/1 Field:Field { name: "column_sizes", data_type: List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "value_counts", data_type: List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "null_value_counts", data_type: List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "nan_value_counts", data_type: List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "lower_bounds", data_type: List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "upper_bounds", data_type: List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:List(Field { name: "item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "key_metadata", data_type: Binary, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Binary/1 Field:Field { name: "split_offsets", data_type: List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"element-id": "133"} }, len:1 array:List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "equality_ids", data_type: List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"element-id": "136"} }, len:1 array:List(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1 Field:Field { name: "sort_order_id", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1 array:Int32/1 ``` after reading the [spec](https://avro.apache.org/docs/++version++/specification/#schema-record) did not find that all the fields have the same length If this logic here needs some improvements, maybe I can help here, thanks -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org