klion26 commented on code in PR #9598:
URL: https://github.com/apache/arrow-rs/pull/9598#discussion_r2986907761
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -4237,4 +4291,143 @@ mod test {
);
}
}
+
+ /// Test extracting a struct with mixed typed and variant fields.
+ /// Fields with VariantType extension metadata should be extracted as
VariantArrays.
+ #[test]
+ fn test_struct_extraction_with_variant_fields() {
+ // Create test data: [{"id": 1, "name": "Alice", "data": {"score":
95}},
+ // {"id": 2, "name": "Bob", "data": null}]
+ let json_strings = vec![
+ r#"{"id": 1, "name": "Alice", "data": {"score": 95}}"#,
+ r#"{"id": 2, "name": "Bob", "data": null}"#,
+ r#"{"id": 3, "name": null, "data": {"level": 5}}"#,
+ ];
+ let string_array: Arc<dyn Array> =
Arc::new(StringArray::from(json_strings));
+ let variant_array = json_to_variant(&string_array).unwrap();
+
+ // Request struct where:
+ // - "id" is extracted as Int32
+ // - "name" is extracted as String (Utf8)
+ // - "data" is extracted as Variant (using VariantType extension
metadata)
+ let struct_fields = Fields::from(vec![
+ Field::new("id", DataType::Int32, true),
+ Field::new("name", DataType::Utf8, true),
+ // Use VariantType extension metadata to request extraction as
VariantArray.
+ // The data type must be Struct to satisfy
VariantType::supports_data_type.
+ Field::new("data", DataType::Struct(Fields::empty()), true)
+ .with_extension_type(VariantType),
+ ]);
+ let struct_type = DataType::Struct(struct_fields);
+
+ let options = GetOptions {
+ path: VariantPath::default(),
+ as_type: Some(Arc::new(Field::new("result", struct_type, true))),
+ cast_options: CastOptions::default(),
+ };
+
+ let variant_array_ref = ArrayRef::from(variant_array);
+ let result = variant_get(&variant_array_ref, options).unwrap();
+
+ // Verify the result is a StructArray with 3 fields
+ let struct_result =
result.as_any().downcast_ref::<StructArray>().unwrap();
+ assert_eq!(struct_result.len(), 3);
+ assert_eq!(struct_result.num_columns(), 3);
+
+ // Verify "id" field (Int32)
+ let id_field = struct_result
+ .column(0)
+ .as_any()
+ .downcast_ref::<Int32Array>()
+ .unwrap();
+ assert_eq!(id_field.value(0), 1);
+ assert_eq!(id_field.value(1), 2);
+ assert_eq!(id_field.value(2), 3);
+
+ // Verify "name" field (String/Utf8)
+ let name_field = struct_result
+ .column(1)
+ .as_any()
+ .downcast_ref::<StringArray>()
+ .unwrap();
+ assert_eq!(name_field.value(0), "Alice");
+ assert_eq!(name_field.value(1), "Bob");
+ assert!(name_field.is_null(2)); // null name in row 2
+
+ // Verify "data" field schema has VariantType extension metadata
+ let data_schema_field = struct_result
+ .fields()
+ .iter()
+ .find(|f| f.name() == "data")
+ .unwrap();
+ assert!(
+ data_schema_field
+ .try_extension_type::<VariantType>()
+ .is_ok(),
+ "data field should have VariantType extension metadata"
+ );
+
+ // Verify "data" field (VariantArray)
+ let data_field = struct_result.column(2);
+ // The data field should be a StructArray representing VariantArray's
internal structure
+ // It has columns: metadata, value (optional), typed_value (optional)
+ let data_as_struct = data_field.as_any().downcast_ref::<StructArray>();
+ assert!(
+ data_as_struct.is_some(),
+ "data field should be a VariantArray (represented as StructArray)"
+ );
+
+ // Verify we can access the variant values
+ let data_variant_array = VariantArray::try_new(data_field).unwrap();
+ assert_eq!(data_variant_array.len(), 3);
+
+ // Row 0: data = {"score": 95}
+ let data0 = data_variant_array.value(0);
+ assert!(matches!(data0, Variant::Object(_)));
Review Comment:
Do we need to assert the value of the variant
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -86,15 +87,21 @@ pub(crate) fn follow_shredded_path_element<'a>(
return Ok(missing_path_step());
};
- let struct_array = field.as_struct_opt().ok_or_else(|| {
- // TODO: Should we blow up? Or just end the traversal and let
the normal
- // variant pathing code sort out the mess that it must anyway
be
- // prepared to handle?
- ArrowError::InvalidArgumentError(format!(
- "Expected Struct array while following path, got {}",
- field.data_type(),
- ))
- })?;
+ // The field might be a VariantArray (StructArray) if shredded,
+ // or it might be a primitive array. Only proceed if it's a
StructArray.
+ let Some(struct_array) = field.as_struct_opt() else {
+ // Field exists but is not a StructArray (VariantArray),
+ // which means it's not shredded further.
+ if !cast_options.safe {
+ return Err(ArrowError::CastError(format!(
+ "Expected Struct array while following path, got {}",
+ field.data_type(),
+ )));
+ }
+ // In safe mode, return NotShredded to let the caller
+ // handle it via value column.
+ return Ok(ShreddedPathStep::NotShredded);
Review Comment:
Why does this isn't `Missing`? Correct me if I'm wrong: the code path
requests a `VariantPathElement::Field`, which means it's in a struct, and the
current field isn't a struct, so the requested `name` is missing?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]