alamb commented on code in PR #8021: URL: https://github.com/apache/arrow-rs/pull/8021#discussion_r2261124392
########## parquet-variant-compute/src/variant_get/output/primitive.rs: ########## @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::VariantArray; +use arrow::error::Result; + +use arrow::array::{ + Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, + PrimitiveArray, +}; +use arrow::compute::{cast_with_options, CastOptions}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::{Variant, VariantPath}; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Trait for Arrow primitive types that can be used in the output builder +/// +/// This just exists to add a generic way to convert from Variant to the primitive type +pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType { + /// Try to extract the primitive value from a Variant, returning None if it + /// cannot be converted + /// + /// TODO: figure out how to handle coercion/casting + fn from_variant(variant: &Variant) -> Option<Self::Native>; +} + +/// Outputs Primitive arrays +pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> { + /// What path to extract + path: VariantPath<'a>, + /// Returned output type + as_type: FieldRef, + /// Controls the casting behavior (e.g. error vs substituting null on cast error). + cast_options: CastOptions<'a>, + /// Phantom data for the primitive type + _phantom: PhantomData<T>, +} + +impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { + pub(super) fn new( + path: VariantPath<'a>, + as_type: FieldRef, + cast_options: CastOptions<'a>, + ) -> Self { + Self { + path, + as_type, + cast_options, + _phantom: PhantomData, + } + } +} + +impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, T> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // build up the output array element by element + let mut nulls = NullBufferBuilder::new(variant_array.len()); + let mut values = Vec::with_capacity(variant_array.len()); + let typed_value = + cast_with_options(typed_value, self.as_type.data_type(), &self.cast_options)?; + // downcast to the primitive array (e.g. Int32Array, Float64Array, etc) + let typed_value = typed_value.as_primitive::<T>(); + + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } + + // if the typed value is null, decode the variant and extract the value + if typed_value.is_null(i) { + // todo follow path + let variant = variant_array.value(i); + let Some(value) = T::from_variant(&variant) else { + if self.cast_options.safe { + // safe mode: append null if we can't convert + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } else { + return Err(ArrowError::CastError(format!( + "Failed to extract primitive of type {} from variant {:?} at path {:?}", + self.as_type.data_type(), + variant, + self.path + ))); + } + }; + + nulls.append_non_null(); + values.push(value) + } else { + // otherwise we have a typed value, so we can use it directly + nulls.append_non_null(); + values.push(typed_value.value(i)); + } + } + + let nulls = nulls.finish(); + let array = PrimitiveArray::<T>::new(values.into(), nulls) + .with_data_type(self.as_type.data_type().clone()); + Ok(Arc::new(array)) + } + + fn typed( + &self, + _variant_array: &VariantArray, + _metadata: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // if the types match exactly, we can just return the typed_value + if typed_value.data_type() == self.as_type.data_type() { + Ok(typed_value.clone()) + } else { + // TODO: try to cast the typed_value to the desired type? Review Comment: ```suggestion // TODO: try to cast the typed_value to the desired type? // https://github.com/apache/arrow-rs/issues/8086 ``` ########## parquet-variant-compute/src/variant_get/output/variant.rs: ########## @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::{VariantArray, VariantArrayBuilder}; +use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, DataType}; +use parquet_variant::{Variant, VariantPath}; +use std::sync::Arc; + +/// Outputs VariantArrays +pub(super) struct VariantOutputBuilder<'a> { + /// What path to extract + path: VariantPath<'a>, +} + +impl<'a> VariantOutputBuilder<'a> { + pub(super) fn new(path: VariantPath<'a>) -> Self { + Self { path } + } +} + +impl<'a> OutputBuilder for VariantOutputBuilder<'a> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + // TODO(perf): can reuse the metadata field here to avoid re-creating it + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // in this case dispatch on the typed_value and + // TODO macro'ize this using downcast! to handle all other primitive types + // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) + let mut array_builder = VariantArrayBuilder::new(variant_array.len()); + match typed_value.data_type() { + DataType::Int32 => { + let primitive_array = typed_value.as_primitive::<Int32Type>(); + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + array_builder.append_null(); + continue; + } + + if typed_value.is_null(i) { + // fall back to the value (variant) field + // (TODO could copy the variant bytes directly) + let value = variant_array.value(i); + array_builder.append_variant(value); + continue; + } + + // otherwise we have a typed value, so we can use it directly + let int_value = primitive_array.value(i); + array_builder.append_variant(Variant::from(int_value)); + } + } + dt => { + return Err(ArrowError::NotYetImplemented(format!( + "variant_get fully_shredded with typed_value={dt} is not implemented yet", + ))); + } + }; + Ok(Arc::new(array_builder.build())) + } + + fn typed( + &self, + variant_array: &VariantArray, + // TODO(perf): can reuse the metadata field here to avoid re-creating it + _metadata: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // in this case dispatch on the typed_value and + // TODO macro'ize this using downcast! to handle all other primitive types + // TODO(perf): avoid builders entirely (and write the raw variant directly as we know the metadata is the same) + let mut array_builder = VariantArrayBuilder::new(variant_array.len()); + match typed_value.data_type() { + DataType::Int32 => { + let primitive_array = typed_value.as_primitive::<Int32Type>(); + for i in 0..variant_array.len() { + if primitive_array.is_null(i) { + array_builder.append_null(); + continue; + } + + let int_value = primitive_array.value(i); + array_builder.append_variant(Variant::from(int_value)); + } + } + dt => { + return Err(ArrowError::NotYetImplemented(format!( Review Comment: - Filed https://github.com/apache/arrow-rs/issues/8086 for the remaining types ```suggestion // https://github.com/apache/arrow-rs/issues/8086 return Err(ArrowError::NotYetImplemented(format!( ``` ########## parquet-variant-compute/src/variant_get/output/primitive.rs: ########## @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::VariantArray; +use arrow::error::Result; + +use arrow::array::{ + Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, + PrimitiveArray, +}; +use arrow::compute::{cast_with_options, CastOptions}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::{Variant, VariantPath}; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Trait for Arrow primitive types that can be used in the output builder +/// +/// This just exists to add a generic way to convert from Variant to the primitive type +pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType { + /// Try to extract the primitive value from a Variant, returning None if it + /// cannot be converted + /// + /// TODO: figure out how to handle coercion/casting + fn from_variant(variant: &Variant) -> Option<Self::Native>; +} + +/// Outputs Primitive arrays +pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> { + /// What path to extract + path: VariantPath<'a>, + /// Returned output type + as_type: FieldRef, + /// Controls the casting behavior (e.g. error vs substituting null on cast error). + cast_options: CastOptions<'a>, + /// Phantom data for the primitive type + _phantom: PhantomData<T>, +} + +impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { + pub(super) fn new( + path: VariantPath<'a>, + as_type: FieldRef, + cast_options: CastOptions<'a>, + ) -> Self { + Self { + path, + as_type, + cast_options, + _phantom: PhantomData, + } + } +} + +impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, T> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // build up the output array element by element + let mut nulls = NullBufferBuilder::new(variant_array.len()); + let mut values = Vec::with_capacity(variant_array.len()); + let typed_value = + cast_with_options(typed_value, self.as_type.data_type(), &self.cast_options)?; + // downcast to the primitive array (e.g. Int32Array, Float64Array, etc) + let typed_value = typed_value.as_primitive::<T>(); + + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } + + // if the typed value is null, decode the variant and extract the value + if typed_value.is_null(i) { + // todo follow path + let variant = variant_array.value(i); + let Some(value) = T::from_variant(&variant) else { + if self.cast_options.safe { + // safe mode: append null if we can't convert + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } else { + return Err(ArrowError::CastError(format!( + "Failed to extract primitive of type {} from variant {:?} at path {:?}", + self.as_type.data_type(), + variant, + self.path + ))); + } + }; + + nulls.append_non_null(); + values.push(value) + } else { + // otherwise we have a typed value, so we can use it directly + nulls.append_non_null(); + values.push(typed_value.value(i)); + } + } + + let nulls = nulls.finish(); + let array = PrimitiveArray::<T>::new(values.into(), nulls) + .with_data_type(self.as_type.data_type().clone()); + Ok(Arc::new(array)) + } + + fn fully_shredded( + &self, + _variant_array: &VariantArray, + _metadata: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // if the types match exactly, we can just return the typed_value + if typed_value.data_type() == self.as_type.data_type() { + Ok(typed_value.clone()) + } else { + // TODO: try to cast the typed_value to the desired type? + Err(ArrowError::NotYetImplemented(format!( Review Comment: - filed https://github.com/apache/arrow-rs/issues/8086 ########## parquet-variant-compute/src/variant_get/output/primitive.rs: ########## @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::variant_get::output::OutputBuilder; +use crate::VariantArray; +use arrow::error::Result; + +use arrow::array::{ + Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, + PrimitiveArray, +}; +use arrow::compute::{cast_with_options, CastOptions}; +use arrow::datatypes::Int32Type; +use arrow_schema::{ArrowError, FieldRef}; +use parquet_variant::{Variant, VariantPath}; +use std::marker::PhantomData; +use std::sync::Arc; + +/// Trait for Arrow primitive types that can be used in the output builder +/// +/// This just exists to add a generic way to convert from Variant to the primitive type +pub(super) trait ArrowPrimitiveVariant: ArrowPrimitiveType { + /// Try to extract the primitive value from a Variant, returning None if it + /// cannot be converted + /// + /// TODO: figure out how to handle coercion/casting + fn from_variant(variant: &Variant) -> Option<Self::Native>; +} + +/// Outputs Primitive arrays +pub(super) struct PrimitiveOutputBuilder<'a, T: ArrowPrimitiveVariant> { + /// What path to extract + path: VariantPath<'a>, + /// Returned output type + as_type: FieldRef, + /// Controls the casting behavior (e.g. error vs substituting null on cast error). + cast_options: CastOptions<'a>, + /// Phantom data for the primitive type + _phantom: PhantomData<T>, +} + +impl<'a, T: ArrowPrimitiveVariant> PrimitiveOutputBuilder<'a, T> { + pub(super) fn new( + path: VariantPath<'a>, + as_type: FieldRef, + cast_options: CastOptions<'a>, + ) -> Self { + Self { + path, + as_type, + cast_options, + _phantom: PhantomData, + } + } +} + +impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a, T> { + fn partially_shredded( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + _value_field: &BinaryViewArray, + typed_value: &ArrayRef, + ) -> arrow::error::Result<ArrayRef> { + // build up the output array element by element + let mut nulls = NullBufferBuilder::new(variant_array.len()); + let mut values = Vec::with_capacity(variant_array.len()); + let typed_value = + cast_with_options(typed_value, self.as_type.data_type(), &self.cast_options)?; + // downcast to the primitive array (e.g. Int32Array, Float64Array, etc) + let typed_value = typed_value.as_primitive::<T>(); + + for i in 0..variant_array.len() { + if variant_array.is_null(i) { + nulls.append_null(); + values.push(T::default_value()); // not used, placeholder + continue; + } + + // if the typed value is null, decode the variant and extract the value + if typed_value.is_null(i) { + // todo follow path Review Comment: - Tracking in https://github.com/apache/arrow-rs/issues/8083 ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -135,36 +140,189 @@ impl VariantArray { self.inner } + /// Return the shredding state of this `VariantArray` + pub fn shredding_state(&self) -> &ShreddingState { + &self.shredding_state + } + /// Return the [`Variant`] instance stored at the given row /// - /// Panics if the index is out of bounds. + /// Consistently with other Arrow arrays types, this API requires you to + /// check for nulls first using [`Self::is_valid`]. + /// + /// # Panics + /// * if the index is out of bounds + /// * if the array value is null + /// + /// If this is a shredded variant but has no value at the shredded location, it + /// will return [`Variant::Null`]. + /// + /// + /// # Performance Note + /// + /// This is certainly not the most efficient way to access values in a + /// `VariantArray`, but it is useful for testing and debugging. /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant { - let metadata = self.metadata_field().as_binary_view().value(index); - let value = self.value_field().as_binary_view().value(index); - Variant::new(metadata, value) + match &self.shredding_state { + ShreddingState::Unshredded { metadata, value } => { + Variant::new(metadata.value(index), value.value(index)) + } + ShreddingState::FullyShredded { + metadata: _, + typed_value, + } => { + if typed_value.is_null(index) { + Variant::Null + } else { + typed_value_to_variant(typed_value, index) + } + } + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => { + if typed_value.is_null(index) { + Variant::new(metadata.value(index), value.value(index)) + } else { + typed_value_to_variant(typed_value, index) + } + } + } } - fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> { - array.column_by_name("metadata").cloned() + /// Return a reference to the metadata field of the [`StructArray`] + pub fn metadata_field(&self) -> &BinaryViewArray { + self.shredding_state.metadata_field() } - fn find_value_field(array: &StructArray) -> Option<ArrayRef> { - array.column_by_name("value").cloned() + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> Option<&BinaryViewArray> { + self.shredding_state.value_field() } - /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.metadata_ref + /// Return a reference to the typed_value field of the `StructArray`, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + self.shredding_state.typed_value_field() } +} - /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.value_ref +/// Variant arrays can be shredded in one of three states, encoded here +#[derive(Debug)] +pub enum ShreddingState { + /// This variant has no typed_value field + Unshredded { + metadata: BinaryViewArray, + value: BinaryViewArray, + }, + /// This variant has a typed_value field and no value field + /// meaning it is fully shredded (aka the value is stored in typed_value) + FullyShredded { + metadata: BinaryViewArray, + typed_value: ArrayRef, + }, + /// This variant has both a value field and a typed_value field + /// meaning it is partially shredded: first the typed_value is used, and + /// if that is null, the value field is used. + PartiallyShredded { + metadata: BinaryViewArray, + value: BinaryViewArray, + typed_value: ArrayRef, + }, +} + +impl ShreddingState { + /// try to create a new `ShreddingState` from the given fields + pub fn try_new( + metadata: BinaryViewArray, + value: Option<BinaryViewArray>, + typed_value: Option<ArrayRef>, + ) -> Result<Self, ArrowError> { + match (metadata, value, typed_value) { + (metadata, Some(value), Some(typed_value)) => Ok(Self::PartiallyShredded { + metadata, + value, + typed_value, + }), + (metadata, Some(value), None) => Ok(Self::Unshredded { metadata, value }), + (metadata, None, Some(typed_value)) => Ok(Self::FullyShredded { + metadata, + typed_value, + }), + (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from( + "VariantArray has neither value nor typed_value field", Review Comment: - https://github.com/apache/arrow-rs/issues/8088 ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -135,36 +139,217 @@ impl VariantArray { self.inner } + /// Return the shredding state of this `VariantArray` + pub fn shredding_state(&self) -> &ShreddingState { + &self.shredding_state + } + /// Return the [`Variant`] instance stored at the given row /// - /// Panics if the index is out of bounds. + /// Consistently with other Arrow arrays types, this API requires you to + /// check for nulls first using [`Self::is_valid`]. + /// + /// # Panics + /// * if the index is out of bounds + /// * if the array value is null + /// + /// If this is a shredded variant but has no value at the shredded location, it + /// will return [`Variant::Null`]. + /// + /// + /// # Performance Note + /// + /// This is certainly not the most efficient way to access values in a + /// `VariantArray`, but it is useful for testing and debugging. /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant<'_, '_> { - let metadata = self.metadata_field().as_binary_view().value(index); - let value = self.value_field().as_binary_view().value(index); - Variant::new(metadata, value) + match &self.shredding_state { + ShreddingState::Unshredded { metadata, value } => { + Variant::new(metadata.value(index), value.value(index)) + } + ShreddingState::Typed { typed_value, .. } => { + if typed_value.is_null(index) { + Variant::Null + } else { + typed_value_to_variant(typed_value, index) + } + } + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => { + if typed_value.is_null(index) { + Variant::new(metadata.value(index), value.value(index)) + } else { + typed_value_to_variant(typed_value, index) + } + } + } } - fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> { - array.column_by_name("metadata").cloned() + /// Return a reference to the metadata field of the [`StructArray`] + pub fn metadata_field(&self) -> &BinaryViewArray { + self.shredding_state.metadata_field() } - fn find_value_field(array: &StructArray) -> Option<ArrayRef> { - array.column_by_name("value").cloned() + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> Option<&BinaryViewArray> { + self.shredding_state.value_field() } - /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.metadata_ref + /// Return a reference to the typed_value field of the `StructArray`, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + self.shredding_state.typed_value_field() } +} - /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.value_ref +/// Represents the shredding state of a [`VariantArray`] +/// +/// [`VariantArray`]s can be shredded according to the [Parquet Variant +/// Shredding Spec]. Shredding means that the actual value is stored in a typed +/// `typed_field` instead of the generic `value` field. +/// +/// Both value and typed_value are optional fields used together to encode a +/// single value. Values in the two fields must be interpreted according to the +/// following table (see [Parquet Variant Shredding Spec] for more details): +/// +/// | value | typed_value | Meaning | +/// |----------|--------------|---------| +/// | null | null | The value is missing; only valid for shredded object fields | +/// | non-null | null | The value is present and may be any type, including `null` | +/// | null | non-null | The value is present and is the shredded type | +/// | non-null | non-null | The value is present and is a partially shredded object | +/// +/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding +#[derive(Debug)] +pub enum ShreddingState { + // TODO: add missing state where there is neither value nor typed_value Review Comment: ```suggestion // TODO: add missing state where there is neither value nor typed_value // https://github.com/apache/arrow-rs/issues/8088 ``` ########## parquet-variant-compute/src/variant_array.rs: ########## @@ -135,36 +142,195 @@ impl VariantArray { self.inner } + /// Return the shredding state of this `VariantArray` + pub fn shredding_state(&self) -> &ShreddingState { + &self.shredding_state + } + /// Return the [`Variant`] instance stored at the given row /// - /// Panics if the index is out of bounds. + /// Consistently with other Arrow arrays types, this API requires you to + /// check for nulls first using [`Self::is_valid`]. + /// + /// # Panics + /// * if the index is out of bounds + /// * if the array value is null + /// + /// If this is a shredded variant but has no value at the shredded location, it + /// will return [`Variant::Null`]. + /// + /// + /// # Performance Note + /// + /// This is certainly not the most efficient way to access values in a + /// `VariantArray`, but it is useful for testing and debugging. /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. pub fn value(&self, index: usize) -> Variant { - let metadata = self.metadata_field().as_binary_view().value(index); - let value = self.value_field().as_binary_view().value(index); - Variant::new(metadata, value) + match &self.shredding_state { + ShreddingState::Unshredded { metadata, value } => { + Variant::new(metadata.value(index), value.value(index)) + } + ShreddingState::FullyShredded { typed_value, .. } => { + if typed_value.is_null(index) { + Variant::Null + } else { + typed_value_to_variant(typed_value, index) + } + } + ShreddingState::PartiallyShredded { + metadata, + value, + typed_value, + } => { + if typed_value.is_null(index) { + Variant::new(metadata.value(index), value.value(index)) + } else { + typed_value_to_variant(typed_value, index) + } + } + } } - fn find_metadata_field(array: &StructArray) -> Option<ArrayRef> { - array.column_by_name("metadata").cloned() + /// Return a reference to the metadata field of the [`StructArray`] + pub fn metadata_field(&self) -> &BinaryViewArray { + self.shredding_state.metadata_field() } - fn find_value_field(array: &StructArray) -> Option<ArrayRef> { - array.column_by_name("value").cloned() + /// Return a reference to the value field of the `StructArray` + pub fn value_field(&self) -> Option<&BinaryViewArray> { + self.shredding_state.value_field() } - /// Return a reference to the metadata field of the [`StructArray`] - pub fn metadata_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.metadata_ref + /// Return a reference to the typed_value field of the `StructArray`, if present + pub fn typed_value_field(&self) -> Option<&ArrayRef> { + self.shredding_state.typed_value_field() } +} - /// Return a reference to the value field of the `StructArray` - pub fn value_field(&self) -> &ArrayRef { - // spec says fields order is not guaranteed, so we search by name - &self.value_ref +/// Variant arrays can be shredded in one of three states, encoded here +#[derive(Debug)] +pub enum ShreddingState { Review Comment: I am not sure I filed a ticket to track adding AllNull: - https://github.com/apache/arrow-rs/issues/8088 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
