klion26 commented on code in PR #4064:
URL: https://github.com/apache/arrow-rs/pull/4064#discussion_r2004742125


##########
arrow-array/src/array/struct_array.rs:
##########
@@ -77,10 +77,136 @@ pub struct StructArray {
     len: usize,
     data_type: DataType,
     nulls: Option<NullBuffer>,
-    pub(crate) fields: Vec<ArrayRef>,
+    fields: Vec<ArrayRef>,
 }
 
 impl StructArray {
+    /// Create a new [`StructArray`] from the provided parts, panicking on 
failure
+    ///
+    /// # Panics
+    ///
+    /// Panics if [`Self::try_new`] returns an error
+    pub fn new(fields: Fields, arrays: Vec<ArrayRef>, nulls: 
Option<NullBuffer>) -> Self {
+        Self::try_new(fields, arrays, nulls).unwrap()
+    }
+
+    /// Create a new [`StructArray`] from the provided parts, returning an 
error on failure
+    ///
+    /// # Errors
+    ///
+    /// Errors if
+    ///
+    /// * `fields.len() != arrays.len()`
+    /// * `fields[i].data_type() != arrays[i].data_type()`
+    /// * `arrays[i].len() != arrays[j].len()`
+    /// * `arrays[i].len() != nulls.len()`
+    /// * `!fields[i].is_nullable() && !nulls.contains(arrays[i].nulls())`
+    pub fn try_new(
+        fields: Fields,
+        arrays: Vec<ArrayRef>,
+        nulls: Option<NullBuffer>,
+    ) -> Result<Self, ArrowError> {
+        if fields.len() != arrays.len() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Incorrect number of arrays for StructArray fields, expected 
{} got {}",
+                fields.len(),
+                arrays.len()
+            )));
+        }
+        let len = arrays.first().map(|x| x.len()).unwrap_or_default();
+
+        if let Some(n) = nulls.as_ref() {
+            if n.len() != len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect number of nulls for StructArray, expected {len} 
got {}",
+                    n.len(),
+                )));
+            }
+        }
+
+        for (f, a) in fields.iter().zip(&arrays) {
+            if f.data_type() != a.data_type() {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect datatype for StructArray field {:?}, expected 
{} got {}",
+                    f.name(),
+                    f.data_type(),
+                    a.data_type()
+                )));
+            }
+
+            if a.len() != len {
+                return Err(ArrowError::InvalidArgumentError(format!(
+                    "Incorrect array length for StructArray field {:?}, 
expected {} got {}",

Review Comment:
   @tustvold, could you please share why we added a check to validate that all 
the fields of the `record` type have the same length here? Thanks.
   
   I'm asking because reading [an Avro 
file](https://github.com/user-attachments/files/19124467/56f216f3-7e24-40b0-a76a-87a63a5bc254-m0.avro.txt)
 with arrow-rs panics at this check. After adding the debug statement 
`println!("Field:{f}, len:{:?}  array:{:?}/{:?}", len, a.data_type(), 
a.len());` before line 137, I got the output below. Note that the `partition` 
field (an empty struct) reports length 0, while every other field has length 1:
   ```
   Field:Field { name: "content", data_type: Int32, nullable: false, dict_id: 
0, dict_is_ordered: false, metadata: {} }, len:1  array:Int32/1
   Field:Field { name: "file_path", data_type: Utf8, nullable: false, dict_id: 
0, dict_is_ordered: false, metadata: {} }, len:1  array:Utf8/1
   Field:Field { name: "file_format", data_type: Utf8, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1  array:Utf8/1
   Field:Field { name: "partition", data_type: Struct([]), nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1  array:Struct([])/0
   Field:Field { name: "record_count", data_type: Int64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1  array:Int64/1
   Field:Field { name: "file_size_in_bytes", data_type: Int64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1  array:Int64/1
   Field:Field { name: "column_sizes", data_type: List(Field { name: "item", 
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", 
data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
len:1  array:List(Field { name: "item", data_type: Struct([Field { name: "key", 
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 
0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "value_counts", data_type: List(Field { name: "item", 
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", 
data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
len:1  array:List(Field { name: "item", data_type: Struct([Field { name: "key", 
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 
0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "null_value_counts", data_type: List(Field { name: 
"item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: 
false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: 
"value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
len:1  array:List(Field { name: "item", data_type: Struct([Field { name: "key", 
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 
0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "nan_value_counts", data_type: List(Field { name: 
"item", data_type: Struct([Field { name: "key", data_type: Int32, nullable: 
false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: 
"value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
len:1  array:List(Field { name: "item", data_type: Struct([Field { name: "key", 
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 
0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "lower_bounds", data_type: List(Field { name: "item", 
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", 
data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
len:1  array:List(Field { name: "item", data_type: Struct([Field { name: "key", 
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Binary, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 
0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "upper_bounds", data_type: List(Field { name: "item", 
data_type: Struct([Field { name: "key", data_type: Int32, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", 
data_type: Binary, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }]), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 
len:1  array:List(Field { name: "item", data_type: Struct([Field { name: "key", 
data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, 
metadata: {} }, Field { name: "value", data_type: Binary, nullable: false, 
dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: true, dict_id: 
0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "key_metadata", data_type: Binary, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1  array:Binary/1
   Field:Field { name: "split_offsets", data_type: List(Field { name: "item", 
data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{"element-id": "133"} }, len:1  array:List(Field { name: "item", data_type: 
Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "equality_ids", data_type: List(Field { name: "item", 
data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{} }), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: 
{"element-id": "136"} }, len:1  array:List(Field { name: "item", data_type: 
Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} })/1
   Field:Field { name: "sort_order_id", data_type: Int32, nullable: true, 
dict_id: 0, dict_is_ordered: false, metadata: {} }, len:1  array:Int32/1
   ```
   
   After reading the 
[spec](https://avro.apache.org/docs/++version++/specification/#schema-record), 
I did not find any requirement that all the fields have the same length.
   
   If the logic here needs improvement, I would be happy to help. Thanks!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to