scovich commented on code in PR #8166: URL: https://github.com/apache/arrow-rs/pull/8166#discussion_r2298323189
########## parquet-variant-compute/src/variant_array.rs: ########## @@ -102,31 +105,57 @@ impl VariantArray { ))); }; - // Find the value field, if present - let value = inner - .column_by_name("value") - .map(|v| { - v.as_binary_view_opt().ok_or_else(|| { - ArrowError::NotYetImplemented(format!( - "VariantArray 'value' field must be BinaryView, got {}", - v.data_type() - )) - }) - }) - .transpose()?; - - // Find the typed_value field, if present - let typed_value = inner.column_by_name("typed_value"); + // Extract value and typed_value fields + let value = if let Some(value_col) = inner.column_by_name("value") { + if let Some(binary_view) = value_col.as_binary_view_opt() { + Some(binary_view.clone()) + } else { + return Err(ArrowError::NotYetImplemented(format!( + "VariantArray 'value' field must be BinaryView, got {}", + value_col.data_type() + ))); + } + } else { + None + }; Review Comment: tiny nit? ```suggestion let value = match inner.column_by_name("value") { Some(value_col) => match value_col.as_binary_view_opt() { Some(binary_view) => Some(binary_view.clone()), None => return Err(ArrowError::NotYetImplemented(format!( "VariantArray 'value' field must be BinaryView, got {}", value_col.data_type() ))), }, None => None, }; ``` ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -166,28 +195,28 @@ impl VariantArray { /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { - ShreddingState::Unshredded { metadata, value } => { - Variant::new(metadata.value(index), value.value(index)) + ShreddingState::Unshredded { value, .. } => { + // Unshredded case + Variant::new(self.metadata.value(index), value.value(index)) } ShreddingState::Typed { typed_value, .. 
} => { + // Typed case (formerly PerfectlyShredded) if typed_value.is_null(index) { Variant::Null } else { typed_value_to_variant(typed_value, index) } } - ShreddingState::PartiallyShredded { - metadata, - value, - typed_value, - } => { + ShreddingState::PartiallyShredded { value, typed_value, .. }, => { + // PartiallyShredded case (formerly ImperfectlyShredded) Review Comment: This is incorrect naming. The term "partially shredded" is specifically defined in the [variant shredding spec](https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding), and that definition does not agree with how the current code uses the name: > An object is _partially shredded_ when the `value` is an object and the `typed_value` is a shredded object. Partial shredding is just one specific kind of imperfect shredding -- _any_ type can shred imperfectly, producing both `value` and `typed_value` columns. ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -213,6 +242,166 @@ impl VariantArray { } } +/// One shredded field of a partially or perfectly shredded variant. For example, suppose the +/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is +/// itself a struct with the single field `b` of type INT. Then the physical layout of the column +/// is: +/// +/// ```text +/// v: VARIANT { +/// metadata: BINARY, +/// value: BINARY, +/// typed_value: STRUCT { +/// a: SHREDDED_VARIANT_FIELD { +/// value: BINARY, +/// typed_value: STRUCT { +/// a: SHREDDED_VARIANT_FIELD { +/// value: BINARY, +/// typed_value: INT, +/// }, +/// }, +/// }, +/// }, +/// } +/// ``` +/// +/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an +/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected +/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the +/// single expected field `a`). 
+/// +/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed +/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected +/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`). +/// +/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a +/// variant value (which could be `Variant::Null`). +#[derive(Debug)] +pub struct ShreddedVariantFieldArray { + /// Reference to the underlying StructArray + inner: StructArray, + shredding_state: ShreddingState, +} + +#[allow(unused)] +impl ShreddedVariantFieldArray { + /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`]. + /// + /// # Arguments + /// - `inner` - The underlying [`StructArray`] that contains the variant data. + /// + /// # Returns + /// - A new instance of `ShreddedVariantFieldArray`. + /// + /// # Errors: + /// - If the `StructArray` does not contain the required fields + /// + /// # Requirements of the `StructArray` + /// + /// 1. An optional field named `value` that is binary, large_binary, or + /// binary_view + /// + /// 2. An optional field named `typed_value` which can be any primitive type + /// or be a list, large_list, list_view or struct + /// + /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. 
+ pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> { + let Some(inner_struct) = inner.as_struct_opt() else { + return Err(ArrowError::InvalidArgumentError( + "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(), + )); + }; + + // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) + let value = inner_struct.column_by_name("value").and_then(|col| col.as_binary_view_opt().cloned()); + let typed_value = inner_struct.column_by_name("typed_value").cloned(); + + // Use a dummy metadata for the constructor (ShreddedVariantFieldArray doesn't have metadata) + let dummy_metadata = arrow::array::BinaryViewArray::new_null(inner_struct.len()); Review Comment: ```suggestion let dummy_metadata = BinaryViewArray::new_null(inner_struct.len()); ``` ########## parquet-variant-compute/src/variant_get/output/mod.rs: ########## @@ -15,80 +15,4 @@ // specific language governing permissions and limitations // under the License. -mod primitive; -mod variant; - -use crate::variant_get::output::primitive::PrimitiveOutputBuilder; -use crate::variant_get::output::variant::VariantOutputBuilder; -use crate::variant_get::GetOptions; -use crate::VariantArray; -use arrow::array::{ArrayRef, BinaryViewArray}; -use arrow::datatypes::Int32Type; -use arrow::error::Result; -use arrow_schema::{ArrowError, DataType}; - -/// This trait represents something that gets the output of the variant_get kernel. -/// -/// For example, there are specializations for writing the output as a VariantArray, -/// or as a specific type (e.g. Int32Array). -/// -/// See [`instantiate_output_builder`] to create an instance of this trait. 
-pub(crate) trait OutputBuilder { - /// create output for a shredded variant array - fn partially_shredded( - &self, - variant_array: &VariantArray, - metadata: &BinaryViewArray, - value_field: &BinaryViewArray, - typed_value: &ArrayRef, - ) -> Result<ArrayRef>; - - /// output for a perfectly shredded variant array - fn typed( - &self, - variant_array: &VariantArray, - metadata: &BinaryViewArray, - typed_value: &ArrayRef, - ) -> Result<ArrayRef>; - - /// write out an unshredded variant array - fn unshredded( - &self, - variant_array: &VariantArray, - metadata: &BinaryViewArray, - value_field: &BinaryViewArray, - ) -> Result<ArrayRef>; - - /// write out an all-null variant array - fn all_null( - &self, - variant_array: &VariantArray, - metadata: &BinaryViewArray, - ) -> Result<ArrayRef>; -} - -pub(crate) fn instantiate_output_builder<'a>( - options: GetOptions<'a>, -) -> Result<Box<dyn OutputBuilder + 'a>> { - let GetOptions { - as_type, - path, - cast_options, - } = options; - - let Some(as_type) = as_type else { - return Ok(Box::new(VariantOutputBuilder::new(path))); - }; - - // handle typed output - match as_type.data_type() { - DataType::Int32 => Ok(Box::new(PrimitiveOutputBuilder::<Int32Type>::new( - path, - as_type, - cast_options, - ))), - dt => Err(ArrowError::NotYetImplemented(format!( - "variant_get with as_type={dt} is not implemented yet", - ))), - } -} +pub(crate) mod row_builder; Review Comment: Shouldn't cargo fmt have fixed this? ########## parquet-variant/src/path.rs: ########## @@ -95,10 +95,10 @@ impl<'a> From<Vec<VariantPathElement<'a>>> for VariantPath<'a> { } } -/// Create from &str +/// Create from &str with support for dot notation impl<'a> From<&'a str> for VariantPath<'a> { Review Comment: Rescuing https://github.com/apache/arrow-rs/pull/8166#discussion_r2286247849 from github oblivion: > > This is dangerous without some well-defined way to escape values. 
Otherwise, a column name that contains a dot is perfectly legal but would produce an incorrect path. > > I agree with the long-form more, but I just added this just as a proof of concept, to see what opinion everyone has of this, can fallback to the original approach, maybe @alamb can also give his opinion? (impl From makes it part of the public API, and I'm nervous about giving people a massive footgun) ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -166,28 +195,28 @@ impl VariantArray { /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { - ShreddingState::Unshredded { metadata, value } => { - Variant::new(metadata.value(index), value.value(index)) + ShreddingState::Unshredded { value, .. } => { + // Unshredded case + Variant::new(self.metadata.value(index), value.value(index)) } ShreddingState::Typed { typed_value, .. } => { + // Typed case (formerly PerfectlyShredded) Review Comment: Aside: * https://github.com/apache/parquet-format/pull/520 ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -166,28 +195,28 @@ impl VariantArray { /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { - ShreddingState::Unshredded { metadata, value } => { - Variant::new(metadata.value(index), value.value(index)) + ShreddingState::Unshredded { value, .. } => { + // Unshredded case + Variant::new(self.metadata.value(index), value.value(index)) } ShreddingState::Typed { typed_value, .. } => { + // Typed case (formerly PerfectlyShredded) if typed_value.is_null(index) { Variant::Null } else { typed_value_to_variant(typed_value, index) } } - ShreddingState::PartiallyShredded { - metadata, - value, - typed_value, - } => { + ShreddingState::PartiallyShredded { value, typed_value, .. 
} => { + // PartiallyShredded case (formerly ImperfectlyShredded) if typed_value.is_null(index) { - Variant::new(metadata.value(index), value.value(index)) + Variant::new(self.metadata.value(index), value.value(index)) } else { typed_value_to_variant(typed_value, index) } } ShreddingState::AllNull { .. } => { Review Comment: Another naming nit: This should probably be called `ShreddingState::Missing`, to match terminology of the [shredding spec](https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding)? <img width="668" height="90" alt="image" src="https://github.com/user-attachments/assets/9faf4d67-b170-4e75-9ea9-d96538d7a7f6" /> ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -166,28 +195,28 @@ impl VariantArray { /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { - ShreddingState::Unshredded { metadata, value } => { - Variant::new(metadata.value(index), value.value(index)) + ShreddingState::Unshredded { value, .. } => { + // Unshredded case + Variant::new(self.metadata.value(index), value.value(index)) } ShreddingState::Typed { typed_value, .. } => { + // Typed case (formerly PerfectlyShredded) Review Comment: Not a fan of `Typed` as the name here. It doesn't match any language in the variant shredding spec, and IMO it doesn't really self-describe either. At a minimum we should consider `StronglyTyped`? (in contrast to variant normally being weakly typed)? But I'm curious why "perfectly shredded" and "imperfectly shredded" are unclear or otherwise unwelcome? (see comment below) ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -213,6 +242,166 @@ impl VariantArray { } } +/// One shredded field of a partially or perfectly shredded variant. 
For example, suppose the +/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is +/// itself a struct with the single field `b` of type INT. Then the physical layout of the column +/// is: +/// +/// ```text +/// v: VARIANT { +/// metadata: BINARY, +/// value: BINARY, +/// typed_value: STRUCT { +/// a: SHREDDED_VARIANT_FIELD { +/// value: BINARY, +/// typed_value: STRUCT { +/// a: SHREDDED_VARIANT_FIELD { Review Comment: ```suggestion /// b: SHREDDED_VARIANT_FIELD { ``` ########## parquet-variant-compute/src/variant_get/output/row_builder.rs: ########## @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use arrow::array::ArrayRef; +use arrow::datatypes; +use arrow::datatypes::ArrowPrimitiveType; +use arrow::error::{ArrowError, Result}; +use parquet_variant::{Variant, VariantPath}; + +use crate::VariantArrayBuilder; + +use std::sync::Arc; + +pub(crate) fn make_shredding_row_builder<'a>( + //metadata: &BinaryViewArray, + path: VariantPath<'a>, + data_type: Option<&'a datatypes::DataType>, +) -> Result<Box<dyn VariantShreddingRowBuilder + 'a>> { + use arrow::array::PrimitiveBuilder; + use datatypes::Int32Type; + + // support non-empty paths (field access) and some empty path cases + if path.is_empty() { + return match data_type { + Some(datatypes::DataType::Int32) => { + // Return PrimitiveInt32Builder for type conversion + let builder = PrimitiveVariantShreddingRowBuilder { + builder: PrimitiveBuilder::<Int32Type>::new(), + }; + Ok(Box::new(builder)) + } + None => { + // Return VariantArrayBuilder for VariantArray output + let builder = VariantArrayShreddingRowBuilder::new(16); + Ok(Box::new(builder)) + } + _ => { + // only Int32 supported for empty paths + Err(ArrowError::NotYetImplemented(format!( + "variant_get with empty path and data_type={:?} not yet implemented", + data_type + ))) + } + }; + } + + // Non-empty paths: field access functionality + // Helper macro to reduce duplication when wrapping builders with path functionality + macro_rules! wrap_with_path { Review Comment: Rescuing https://github.com/apache/arrow-rs/pull/8166#discussion_r2286407569 from github oblivion: If the goal is to avoid duplication, can we move the branch inside the macro, instead of duplicating the entire match statement? We're not manipulating individual rows here, so the branching should have no measurable impact on performance -- if constructing your builders is the bottleneck, something else is very wrong. 
########## parquet-variant-compute/src/variant_get/output/row_builder.rs: ########## @@ -0,0 +1,211 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::ArrayRef; +use arrow::datatypes; +use arrow::datatypes::ArrowPrimitiveType; +use arrow::error::{ArrowError, Result}; +use parquet_variant::{Variant, VariantPath}; + +use crate::VariantArrayBuilder; + +use std::sync::Arc; + +pub(crate) fn make_shredding_row_builder<'a>( + //metadata: &BinaryViewArray, + path: VariantPath<'a>, + data_type: Option<&'a datatypes::DataType>, +) -> Result<Box<dyn VariantShreddingRowBuilder + 'a>> { + use arrow::array::PrimitiveBuilder; + use datatypes::Int32Type; + + // support non-empty paths (field access) and some empty path cases + if path.is_empty() { + return match data_type { + Some(datatypes::DataType::Int32) => { + // Return PrimitiveInt32Builder for type conversion + let builder = PrimitiveVariantShreddingRowBuilder { + builder: PrimitiveBuilder::<Int32Type>::new(), + }; + Ok(Box::new(builder)) + } + None => { + // Return VariantArrayBuilder for VariantArray output + let builder = VariantArrayShreddingRowBuilder::new(16); + Ok(Box::new(builder)) + } + _ => { + // only Int32 supported for empty paths + 
Err(ArrowError::NotYetImplemented(format!( + "variant_get with empty path and data_type={:?} not yet implemented", + data_type + ))) + } + }; + } + + // Non-empty paths: field access functionality + // Helper macro to reduce duplication when wrapping builders with path functionality + macro_rules! wrap_with_path { + ($inner_builder:expr) => { + Ok(Box::new(VariantPathRowBuilder { + builder: $inner_builder, + path, + }) as Box<dyn VariantShreddingRowBuilder + 'a>) Review Comment: Is the cast actually needed? The function's return value _should_ constrain it already? ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -166,28 +195,28 @@ impl VariantArray { /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { - ShreddingState::Unshredded { metadata, value } => { - Variant::new(metadata.value(index), value.value(index)) + ShreddingState::Unshredded { value, .. } => { + // Unshredded case + Variant::new(self.metadata.value(index), value.value(index)) } ShreddingState::Typed { typed_value, .. } => { + // Typed case (formerly PerfectlyShredded) Review Comment: Ah! I think I figured out the confusion. The spec says this: > Both `value` and `typed_value` are optional fields used together to encode a single value. > Values in the two fields must be interpreted according to the following table: > > | `value` | `typed_value` | Meaning | > |----------|---------------|-------------------------------------------------------------| > | null | null | The value is missing; only valid for shredded object fields | > | non-null | null | The value is present and may be any type, including null | > | null | non-null | The value is present and is the shredded type | > | non-null | non-null | The value is present and is a partially shredded object | > > An object is _partially shredded_ when the `value` is an object and the `typed_value` is a shredded object. 
Writers must not produce data where both `value` and `typed_value` are non-null, unless the Variant value is an object. ... but those are row-wise concepts -- assuming that both columns are physically present -- and this code is dealing with column-wise concepts where one or both columns could be physically missing. If we produced _that_ table, it might look something like this: > The `value` and `typed_value` columns are optional columns that together encode columns of variant values. Either or both columns may be physically missing, which can be interpreted according to the following table: > > | `value` | `typed_value` | Meaning | > |----------|---------------|-------------------------------------------------------------| > | missing | missing | All values are missing; only valid for shredded object fields | > | present | missing | All values are unshredded, and can be any type, including null | > | missing | present | All values are present and the shredded type | > | present | present | At least some values are present and can be any type, including null | > > A shredded object field is perfectly shredded when the `typed_value` column is present and the `value` column is either all-null or missing. ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -213,6 +242,166 @@ impl VariantArray { } } +/// One shredded field of a partially or perfectly shredded variant. For example, suppose the +/// shredding schema for variant `v` treats it as an object with a single field `a`, where `a` is +/// itself a struct with the single field `b` of type INT. 
Then the physical layout of the column +/// is: +/// +/// ```text +/// v: VARIANT { +/// metadata: BINARY, +/// value: BINARY, +/// typed_value: STRUCT { +/// a: SHREDDED_VARIANT_FIELD { +/// value: BINARY, +/// typed_value: STRUCT { +/// a: SHREDDED_VARIANT_FIELD { +/// value: BINARY, +/// typed_value: INT, +/// }, +/// }, +/// }, +/// }, +/// } +/// ``` +/// +/// In the above, each row of `v.value` is either a variant value (shredding failed, `v` was not an +/// object at all) or a variant object (partial shredding, `v` was an object but included unexpected +/// fields other than `a`), or is NULL (perfect shredding, `v` was an object containing only the +/// single expected field `a`). +/// +/// A similar story unfolds for each `v.typed_value.a.value` -- a variant value if shredding failed +/// (`v:a` was not an object at all), or a variant object (`v:a` was an object with unexpected +/// additional fields), or NULL (`v:a` was an object containing only the single expected field `b`). +/// +/// Finally, `v.typed_value.a.typed_value.b.value` is either NULL (`v:a.b` was an integer) or else a +/// variant value (which could be `Variant::Null`). +#[derive(Debug)] +pub struct ShreddedVariantFieldArray { + /// Reference to the underlying StructArray + inner: StructArray, + shredding_state: ShreddingState, +} + +#[allow(unused)] +impl ShreddedVariantFieldArray { + /// Creates a new `ShreddedVariantFieldArray` from a [`StructArray`]. + /// + /// # Arguments + /// - `inner` - The underlying [`StructArray`] that contains the variant data. + /// + /// # Returns + /// - A new instance of `ShreddedVariantFieldArray`. + /// + /// # Errors: + /// - If the `StructArray` does not contain the required fields + /// + /// # Requirements of the `StructArray` + /// + /// 1. An optional field named `value` that is binary, large_binary, or + /// binary_view + /// + /// 2. 
An optional field named `typed_value` which can be any primitive type + /// or be a list, large_list, list_view or struct + /// + /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. + pub fn try_new(inner: ArrayRef) -> Result<Self, ArrowError> { + let Some(inner_struct) = inner.as_struct_opt() else { + return Err(ArrowError::InvalidArgumentError( + "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(), + )); + }; + + // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) + let value = inner_struct.column_by_name("value").and_then(|col| col.as_binary_view_opt().cloned()); + let typed_value = inner_struct.column_by_name("typed_value").cloned(); + + // Use a dummy metadata for the constructor (ShreddedVariantFieldArray doesn't have metadata) + let dummy_metadata = arrow::array::BinaryViewArray::new_null(inner_struct.len()); + + // Note this clone is cheap, it just bumps the ref count + let inner = inner_struct.clone(); + Ok(Self { + inner: inner.clone(), + shredding_state: ShreddingState::try_new(dummy_metadata, value, typed_value)?, + }) + } + + /// Return the shredding state of this `VariantArray` + pub fn shredding_state(&self) -> &ShreddingState { + &self.shredding_state + } + + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> Option<&BinaryViewArray> { + self.shredding_state.value_field() + } + + /// Return a reference to the typed_value field of the `StructArray`, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + self.shredding_state.typed_value_field() + } + + /// Returns a reference to the underlying [`StructArray`]. 
+ pub fn inner(&self) -> &StructArray { + &self.inner + } +} + +impl Array for ShreddedVariantFieldArray { + fn as_any(&self) -> &dyn Any { + self + } + + fn to_data(&self) -> ArrayData { + self.inner.to_data() + } + + fn into_data(self) -> ArrayData { + self.inner.into_data() + } + + fn data_type(&self) -> &DataType { + self.inner.data_type() + } + + fn slice(&self, offset: usize, length: usize) -> ArrayRef { + let inner = self.inner.slice(offset, length); + let shredding_state = self.shredding_state.slice(offset, length); + Arc::new(Self { + inner, + shredding_state, + }) + } + + fn len(&self) -> usize { + self.inner.len() + } + + fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + fn offset(&self) -> usize { + self.inner.offset() + } + + fn nulls(&self) -> Option<&NullBuffer> { + // According to the shredding spec, ShreddedVariantFieldArray should be + // physically non-nullable - SQL NULL is inferred by both value and + // typed_value being physically NULL + None + } Review Comment: Rescuing from github oblivion: * https://github.com/apache/arrow-rs/pull/8166#discussion_r2286276624 > We may need to override the [logical_nulls](https://docs.rs/arrow/latest/arrow/array/trait.Array.html#method.logical_nulls) method. If `nulls` is hard-wired to return `None` then we _definitely_ need to provide `logical_nulls`? Otherwise people have to manually dig into the guts of the array? ########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -500,4 +668,1013 @@ mod test { VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), ) } + /// This test manually constructs a shredded variant array representing objects + /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field + /// as VariantArray using variant_get. 
+ #[test] + fn test_shredded_object_field_access() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field as VariantArray first + let options = GetOptions::new_with_path(VariantPath::from("x")); + let result = variant_get(&array, options).unwrap(); + + let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result_variant.len(), 2); + + // Row 0: expect x=1 + assert_eq!(result_variant.value(0), Variant::Int32(1)); + // Row 1: expect x=42 + assert_eq!(result_variant.value(1), Variant::Int32(42)); + } + + /// Test extracting shredded object field with type conversion + #[test] + fn test_shredded_object_field_as_int32() { + let array = shredded_object_with_x_field_variant_array(); + + // Test: Extract the "x" field as Int32Array (type conversion) + let field = Field::new("x", DataType::Int32, false); + let options = GetOptions::new_with_path(VariantPath::from("x")) + .with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + // Should get Int32Array + let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(42)])); + assert_eq!(&result, &expected); + } + + /// Helper function to create a shredded variant array representing objects + /// + /// This creates an array that represents: + /// Row 0: {"x": 1, "y": "foo"} (x is shredded, y is in value field) + /// Row 1: {"x": 42} (x is shredded, perfect shredding) + /// + /// The physical layout follows the shredding spec where: + /// - metadata: contains object metadata + /// - typed_value: StructArray with field "x" (ShreddedVariantFieldArray) + /// - value: contains fallback for unshredded fields like {"y": "foo"} + /// - The "x" field has typed_value=Int32Array and value=NULL (perfect shredding) + fn shredded_object_with_x_field_variant_array() -> ArrayRef { + // Create the base metadata for objects + let (metadata, y_field_value) = { + let mut builder = 
parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("x", Variant::Int32(42)); + obj.insert("y", Variant::from("foo")); Review Comment: It's illegal for a partially shredded object to mention the same field in both the `value` and `typed_value` columns. To add "x" to the dictionary, manually just use the `VariantBuilder::with_field_names` method? ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -166,28 +195,28 @@ impl VariantArray { /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { match &self.shredding_state { - ShreddingState::Unshredded { metadata, value } => { - Variant::new(metadata.value(index), value.value(index)) + ShreddingState::Unshredded { value, .. } => { + // Unshredded case + Variant::new(self.metadata.value(index), value.value(index)) } ShreddingState::Typed { typed_value, .. } => { + // Typed case (formerly PerfectlyShredded) if typed_value.is_null(index) { Variant::Null } else { typed_value_to_variant(typed_value, index) } } - ShreddingState::PartiallyShredded { - metadata, - value, - typed_value, - } => { + ShreddingState::PartiallyShredded { value, typed_value, .. } => { + // PartiallyShredded case (formerly ImperfectlyShredded) Review Comment: I realize this PR didn't make the naming decisions, but the wording in the PR is not consistent with these namings. I would personally prefer to fix the naming here to be consistent with how the PR uses it, but we could also try to adjust the PR wording to match these enum variant names. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org