scovich commented on code in PR #8166: URL: https://github.com/apache/arrow-rs/pull/8166#discussion_r2313774965
########## parquet-variant-compute/src/variant_get/mod.rs: ########## @@ -15,51 +15,240 @@ // specific language governing permissions and limitations // under the License. use arrow::{ - array::{Array, ArrayRef}, + array::{self, Array, ArrayRef, BinaryViewArray, StructArray}, compute::CastOptions, + datatypes::Field, error::Result, }; -use arrow_schema::{ArrowError, FieldRef}; -use parquet_variant::VariantPath; +use arrow_schema::{ArrowError, DataType, FieldRef}; +use parquet_variant::{VariantPath, VariantPathElement}; use crate::variant_array::ShreddingState; -use crate::variant_get::output::instantiate_output_builder; -use crate::VariantArray; +use crate::{variant_array::ShreddedVariantFieldArray, VariantArray}; + +use std::sync::Arc; mod output; +pub(crate) enum ShreddedPathStep<'a> { + /// Path step succeeded, return the new shredding state + Success(&'a ShreddingState), + /// The path element is not present in the `typed_value` column and there is no `value` column, + /// so we know it does not exist. It, and all paths under it, are all-NULL. + Missing, + /// The path element is not present in the `typed_value` column and must be retrieved from the `value` + /// column instead. The caller should be prepared to handle any value, including the requested + /// type, an arbitrary "wrong" type, or `Variant::Null`. + NotShredded, +} + +/// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try to take one path step +/// deeper. For a `VariantPathElement::Field`, the step fails if there is no `typed_value` at this +/// level, or if `typed_value` is not a struct, or if the requested field name does not exist. +/// +/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. 
+pub(crate) fn follow_shredded_path_element<'a>( + shredding_state: &'a ShreddingState, + path_element: &VariantPathElement<'_>, + cast_options: &CastOptions, +) -> Result<ShreddedPathStep<'a>> { + // If the requested path element is not present in `typed_value`, and `value` is missing, then + // we know it does not exist; it, and all paths under it, are all-NULL. + let missing_path_step = || { + let Some(_value_field) = shredding_state.value_field() else { + return ShreddedPathStep::Missing; + }; + ShreddedPathStep::NotShredded + }; + + let Some(typed_value) = shredding_state.typed_value_field() else { + return Ok(missing_path_step()); + }; + + match path_element { + VariantPathElement::Field { name } => { + // Try to step into the requested field name of a struct. + // First, try to downcast to StructArray + let Some(struct_array) = typed_value.as_any().downcast_ref::<StructArray>() else { + // Downcast failure - if strict cast options are enabled, this should be an error + if !cast_options.safe { Review Comment: Good catch. This scenario could happen if e.g. we asked for `v:a.b.c::INT` and `v.typed_value.a.typed_value` is shredded as something other than struct. That's not an error at all, there's no rule that the path we asked for has to match the shredding of the underlying data. It just means we need to fetch the value from `v.typed_value.a.value` instead. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org