scovich commented on code in PR #8446:
URL: https://github.com/apache/arrow-rs/pull/8446#discussion_r2380012537
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -972,10 +1059,82 @@ fn typed_value_to_variant<'a>(
"Unsupported typed_value type: {}",
typed_value.data_type()
);
- Variant::Null
+ builder.append_value(Variant::Null);
}
- };
- value.into()
+ }
+
+ // Only a partially shredded struct is allowed to have values for both
columns
+ if value.is_some_and(|v| v.is_valid(index)) {
+ return Err(ArrowError::InvalidArgumentError(
+ "Invalid variant, conflicting value and typed_value".to_string(),
+ ));
+ }
+
+ Ok(())
+}
+
+/// Handles reconstruction of variant objects from shredded struct data
+fn struct_typed_value_to_variant(
+ typed_value: &ArrayRef,
+ value: Option<&BinaryViewArray>,
+ index: usize,
+ metadata: &VariantMetadata,
+ builder: &mut impl VariantBuilderExt,
+) -> Result<(), ArrowError> {
+ let struct_array = typed_value.as_struct();
+ let mut obj_builder = builder.try_new_object()?;
+
+ // Track all shredded field names -- we must ignore them when processing
value fields below.
+ let mut shredded_field_names = std::collections::HashSet::new();
+ for (field_name, field_array) in struct_array
+ .column_names()
+ .iter()
+ .zip(struct_array.columns())
+ {
+ shredded_field_names.insert(*field_name);
+ let field_shredded_array =
ShreddedVariantFieldArray::try_new(field_array.as_ref())?;
+ let shredding_state = field_shredded_array.shredding_state();
+ let value_field = shredding_state.value_field();
+ let typed_value_field = shredding_state.typed_value_field();
+
+ match (typed_value_field, value_field) {
+ (Some(typed_value), value) if typed_value.is_valid(index) => {
+ // Handle typed value with optional value column
+ let mut field_builder = ObjectFieldBuilder::new(field_name,
&mut obj_builder);
+ typed_value_to_variant(typed_value, value, index, metadata,
&mut field_builder)?;
+ }
+ (_, Some(value)) if value.is_valid(index) => {
+ // Handle unshredded value only
+ // TODO: Add raw byte append capability to VariantBuilderExt
to avoid bytes -> variant -> bytes conversion
+ let variant_bytes = value.value(index);
+ let field_variant =
Variant::new_with_metadata(metadata.clone(), variant_bytes);
+ obj_builder.insert_bytes(field_name, field_variant);
+ }
+ // Skip missing or invalid fields
+ _ => {}
+ }
+ }
+
+ // Add fields from the value column if present (with collision detection)
+ if let Some(value_array) = value {
+ if value_array.is_valid(index) {
+ let variant_bytes = value_array.value(index);
+ let field_variant = Variant::new_with_metadata(metadata.clone(),
variant_bytes);
+ let Variant::Object(obj) = field_variant else {
+ return Err(ArrowError::InvalidArgumentError(
+ "Invalid variant, non-object value with shredded
fields".to_string(),
+ ));
+ };
+ for (obj_field_name, obj_field_value) in obj.iter() {
+ if !shredded_field_names.contains(obj_field_name) {
Review Comment:
The [shredding
spec](https://github.com/apache/parquet-format/blob/master/VariantShredding.md#objects)
says:
> The value column of a partially shredded object must never contain fields
represented by the Parquet columns in typed_value (shredded fields). Readers
may always assume that data is written correctly and that shredded fields in
typed_value are not present in value. As a result, reads when a field is
defined in both value and a typed_value shredded field may be inconsistent.
So in theory we don't need this check. But variant integration test cases 43
(testPartiallyShreddedObjectMissingFieldConflict) and 125
(testPartiallyShreddedObjectFieldConflict) both have conflicting value fields
they expect readers to ignore.
Is this actually a bug in those tests that we should file an issue for? Or
do we keep the current code?
CC @alamb
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]