scovich commented on code in PR #9598:
URL: https://github.com/apache/arrow-rs/pull/9598#discussion_r2969786655


##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -216,22 +223,63 @@ fn shredded_get_path(
     // Structs are special. Recurse into each field separately, hoping to 
follow the shredding even
     // further, and build up the final struct from those individually shredded 
results.
     if let DataType::Struct(fields) = as_field.data_type() {
-        let children = fields
+        let mut updated_fields = Vec::with_capacity(fields.len());
+        let children: Result<Vec<_>> = fields
             .iter()
             .map(|field| {
-                shredded_get_path(
+                // If the field has VariantType extension metadata, extract it 
as a
+                // VariantArray instead of casting to the declared data type. 
This allows
+                // callers to request structs where some fields remain as 
variants.
+                // See test_struct_extraction_with_variant_fields for usage 
example.
+                let is_variant_field = 
field.try_extension_type::<VariantType>().is_ok();
+                let field_as_type: Option<&Field> = if is_variant_field {
+                    None
+                } else {
+                    Some(field.as_ref())
+                };
+                let child = shredded_get_path(
                     &target,
                     &[VariantPathElement::from(field.name().as_str())],
-                    Some(field),
+                    field_as_type,
                     cast_options,
-                )
+                )?;
+
+                // Update field type if it was a Variant marker (extracted as 
VariantArray).
+                // The actual data type will be the internal structure of 
VariantArray.
+                // Preserve VariantType extension metadata so downstream 
consumers
+                // can recognize this field as a Variant column.
+                //
+                // When the field is entirely absent in the data, 
shredded_get_path
+                // returns a NullArray (DataType::Null). VariantType only 
supports
+                // Struct storage, so we must skip the extension in that case.
+                let updated_field =
+                    if is_variant_field && matches!(child.data_type(), 
DataType::Struct(_)) {
+                        field
+                            .as_ref()
+                            .clone()
+                            .with_data_type(child.data_type().clone())

Review Comment:
   Why needed? Doesn't it already have that type?



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -216,22 +223,63 @@ fn shredded_get_path(
     // Structs are special. Recurse into each field separately, hoping to 
follow the shredding even
     // further, and build up the final struct from those individually shredded 
results.
     if let DataType::Struct(fields) = as_field.data_type() {
-        let children = fields
+        let mut updated_fields = Vec::with_capacity(fields.len());
+        let children: Result<Vec<_>> = fields
             .iter()
             .map(|field| {
-                shredded_get_path(
+                // If the field has VariantType extension metadata, extract it 
as a
+                // VariantArray instead of casting to the declared data type. 
This allows
+                // callers to request structs where some fields remain as 
variants.
+                // See test_struct_extraction_with_variant_fields for usage 
example.
+                let is_variant_field = 
field.try_extension_type::<VariantType>().is_ok();
+                let field_as_type: Option<&Field> = if is_variant_field {
+                    None
+                } else {
+                    Some(field.as_ref())
+                };
+                let child = shredded_get_path(
                     &target,
                     &[VariantPathElement::from(field.name().as_str())],
-                    Some(field),
+                    field_as_type,
                     cast_options,
-                )
+                )?;
+
+                // Update field type if it was a Variant marker (extracted as 
VariantArray).
+                // The actual data type will be the internal structure of 
VariantArray.
+                // Preserve VariantType extension metadata so downstream 
consumers
+                // can recognize this field as a Variant column.
+                //
+                // When the field is entirely absent in the data, 
shredded_get_path
+                // returns a NullArray (DataType::Null). VariantType only 
supports
+                // Struct storage, so we must skip the extension in that case.
+                let updated_field =
+                    if is_variant_field && matches!(child.data_type(), 
DataType::Struct(_)) {
+                        field
+                            .as_ref()
+                            .clone()
+                            .with_data_type(child.data_type().clone())
+                            .with_extension_type(VariantType)
+                    } else if is_variant_field {
+                        // Field was requested as Variant but data is all-null;
+                        // preserve the data type from the child without 
extension metadata.
+                        field
+                            .as_ref()
+                            .clone()
+                            .with_data_type(child.data_type().clone())
+                    } else {
+                        field.as_ref().clone()
+                    };
+                updated_fields.push(updated_field);
+
+                Ok(child)
             })
-            .collect::<Result<Vec<_>>>()?;
+            .collect();
+        let children = children?;

Review Comment:
   This bit seems like a gratuitous change?
   ```rust
   let children = fields...collect::<Result<Vec_>>>()?;
   ```
   becomes
   ```rust
   let children: Result<Vec<_>> = fields...collect();
   let children = children?;
   ```



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -86,15 +87,21 @@ pub(crate) fn follow_shredded_path_element<'a>(
                 return Ok(missing_path_step());
             };
 
-            let struct_array = field.as_struct_opt().ok_or_else(|| {
-                // TODO: Should we blow up? Or just end the traversal and let 
the normal
-                // variant pathing code sort out the mess that it must anyway 
be
-                // prepared to handle?
-                ArrowError::InvalidArgumentError(format!(
-                    "Expected Struct array while following path, got {}",
-                    field.data_type(),
-                ))
-            })?;
+            // The field might be a VariantArray (StructArray) if shredded,
+            // or it might be a primitive array. Only proceed if it's a 
StructArray.
+            let Some(struct_array) = field.as_struct_opt() else {
+                // Field exists but is not a StructArray (VariantArray),
+                // which means it's not shredded further.
+                if !cast_options.safe {
+                    return Err(ArrowError::CastError(format!(

Review Comment:
   * Probably clashes with https://github.com/apache/arrow-rs/issues/9606
   
   (but maybe we want to fix that everywhere in a separate dedicated PR)



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -216,22 +223,63 @@ fn shredded_get_path(
     // Structs are special. Recurse into each field separately, hoping to 
follow the shredding even
     // further, and build up the final struct from those individually shredded 
results.
     if let DataType::Struct(fields) = as_field.data_type() {
-        let children = fields
+        let mut updated_fields = Vec::with_capacity(fields.len());
+        let children: Result<Vec<_>> = fields
             .iter()
             .map(|field| {
-                shredded_get_path(
+                // If the field has VariantType extension metadata, extract it 
as a
+                // VariantArray instead of casting to the declared data type. 
This allows
+                // callers to request structs where some fields remain as 
variants.
+                // See test_struct_extraction_with_variant_fields for usage 
example.
+                let is_variant_field = 
field.try_extension_type::<VariantType>().is_ok();
+                let field_as_type: Option<&Field> = if is_variant_field {
+                    None
+                } else {
+                    Some(field.as_ref())
+                };
+                let child = shredded_get_path(
                     &target,
                     &[VariantPathElement::from(field.name().as_str())],
-                    Some(field),
+                    field_as_type,
                     cast_options,
-                )
+                )?;
+
+                // Update field type if it was a Variant marker (extracted as 
VariantArray).
+                // The actual data type will be the internal structure of 
VariantArray.
+                // Preserve VariantType extension metadata so downstream 
consumers
+                // can recognize this field as a Variant column.
+                //
+                // When the field is entirely absent in the data, 
shredded_get_path
+                // returns a NullArray (DataType::Null). VariantType only 
supports
+                // Struct storage, so we must skip the extension in that case.
+                let updated_field =
+                    if is_variant_field && matches!(child.data_type(), 
DataType::Struct(_)) {
+                        field
+                            .as_ref()
+                            .clone()
+                            .with_data_type(child.data_type().clone())
+                            .with_extension_type(VariantType)
+                    } else if is_variant_field {
+                        // Field was requested as Variant but data is all-null;
+                        // preserve the data type from the child without 
extension metadata.

Review Comment:
   Is this intentional, to strip away e.g. JSON or other extensions that might 
be present?
   
   Also, why is it correct to change the data type merely because the data are 
all-null? 
   Shouldn't it just come back as an all-null `StructArray` with variant 
annotations?  



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -216,22 +223,63 @@ fn shredded_get_path(
     // Structs are special. Recurse into each field separately, hoping to 
follow the shredding even
     // further, and build up the final struct from those individually shredded 
results.
     if let DataType::Struct(fields) = as_field.data_type() {
-        let children = fields
+        let mut updated_fields = Vec::with_capacity(fields.len());
+        let children: Result<Vec<_>> = fields
             .iter()
             .map(|field| {
-                shredded_get_path(
+                // If the field has VariantType extension metadata, extract it 
as a
+                // VariantArray instead of casting to the declared data type. 
This allows
+                // callers to request structs where some fields remain as 
variants.
+                // See test_struct_extraction_with_variant_fields for usage 
example.
+                let is_variant_field = 
field.try_extension_type::<VariantType>().is_ok();
+                let field_as_type: Option<&Field> = if is_variant_field {
+                    None
+                } else {
+                    Some(field.as_ref())
+                };

Review Comment:
   This is a bit of a double negative... trying to decide if it could be 
clearer with inverted logic?
   ```rust
   let is_strongly_typed = field.try_extension_type::<VariantType>().is_err();
   let field_as_type = is_strongly_typed.then(|| field.as_ref());
   ```



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -216,22 +223,63 @@ fn shredded_get_path(
     // Structs are special. Recurse into each field separately, hoping to 
follow the shredding even
     // further, and build up the final struct from those individually shredded 
results.
     if let DataType::Struct(fields) = as_field.data_type() {
-        let children = fields
+        let mut updated_fields = Vec::with_capacity(fields.len());
+        let children: Result<Vec<_>> = fields
             .iter()
             .map(|field| {
-                shredded_get_path(
+                // If the field has VariantType extension metadata, extract it 
as a
+                // VariantArray instead of casting to the declared data type. 
This allows
+                // callers to request structs where some fields remain as 
variants.
+                // See test_struct_extraction_with_variant_fields for usage 
example.
+                let is_variant_field = 
field.try_extension_type::<VariantType>().is_ok();
+                let field_as_type: Option<&Field> = if is_variant_field {
+                    None
+                } else {
+                    Some(field.as_ref())
+                };
+                let child = shredded_get_path(
                     &target,
                     &[VariantPathElement::from(field.name().as_str())],
-                    Some(field),
+                    field_as_type,
                     cast_options,
-                )
+                )?;
+
+                // Update field type if it was a Variant marker (extracted as 
VariantArray).
+                // The actual data type will be the internal structure of 
VariantArray.
+                // Preserve VariantType extension metadata so downstream 
consumers
+                // can recognize this field as a Variant column.
+                //
+                // When the field is entirely absent in the data, 
shredded_get_path
+                // returns a NullArray (DataType::Null). VariantType only 
supports
+                // Struct storage, so we must skip the extension in that case.
+                let updated_field =
+                    if is_variant_field && matches!(child.data_type(), 
DataType::Struct(_)) {
+                        field
+                            .as_ref()
+                            .clone()

Review Comment:
   This part is the same in all cases. Can we simplify by hoisting out a `let 
mut new_field` and then:
   ```rust
   let updated_field = if is_strongly_typed {
       new_field
   } else {
       // handle a variant field that may or may not exist in the data (see PR 
comment below)
   };
   ```    



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to