scovich commented on code in PR #8481:
URL: https://github.com/apache/arrow-rs/pull/8481#discussion_r2392855788


##########
parquet-variant-compute/src/unshred_variant.rs:
##########
@@ -0,0 +1,520 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Module for unshredding VariantArray by folding typed_value columns back into the value column.
+
+use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
+use arrow::array::{
+    Array, AsArray as _, BinaryViewArray, BooleanArray, FixedSizeBinaryArray, NullBufferBuilder,
+    PrimitiveArray, StringArray, StructArray,
+};
+use arrow::buffer::NullBuffer;
+use arrow::datatypes::{
+    ArrowPrimitiveType, DataType, Date32Type, Float32Type, Float64Type, Int16Type, Int32Type,
+    Int64Type, Int8Type, Time64MicrosecondType, TimeUnit, TimestampMicrosecondType,
+    TimestampNanosecondType,
+};
+use arrow::error::{ArrowError, Result};
+use arrow::temporal_conversions::time64us_to_time;
+use chrono::{DateTime, Utc};
+use indexmap::IndexMap;
+use parquet_variant::{ObjectFieldBuilder, Variant, VariantBuilderExt, VariantMetadata};
+use uuid::Uuid;
+
+/// Removes all (nested) typed_value columns from a VariantArray by converting them back to binary
+/// variant and merging the resulting values back into the value column.
+///
+/// This function efficiently converts a shredded VariantArray back to an unshredded form where all
+/// data resides in the value column.
+///
+/// # Arguments
+/// * `array` - The VariantArray to unshred
+///
+/// # Returns
+/// A new VariantArray with all data in the value column and no typed_value column
+///
+/// # Errors
+/// - If the shredded data contains spec violations (e.g., field name conflicts)
+/// - If unsupported data types are encountered in typed_value columns
+pub fn unshred_variant(array: &VariantArray) -> Result<VariantArray> {
+    // Check if already unshredded (optimization for common case)
+    if array.typed_value_field().is_none() && array.value_field().is_some() {
+        return Ok(array.clone());
+    }
+
+    // NOTE: None/None at top-level is technically invalid, but the shredding spec requires us to
+    // emit `Variant::Null` when a required value is missing.
+    let mut row_builder = make_unshred_variant_row_builder(array.shredding_state().borrow())?
+        .unwrap_or_else(|| UnshredVariantRowBuilder::null(array.nulls()));
+
+    let metadata = array.metadata_field();
+    let mut value_builder = VariantValueArrayBuilder::new(array.len());
+    let mut null_builder = NullBufferBuilder::new(array.len());
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            value_builder.append_null();
+            null_builder.append_null();
+        } else {
+            let metadata = VariantMetadata::new(metadata.value(i));
+            let mut value_builder = value_builder.builder_ext(&metadata);
+            row_builder.append_row(&mut value_builder, &metadata, i)?;
+            null_builder.append_non_null();
+        }
+    }
+
+    let value = value_builder.build()?;
+    Ok(VariantArray::from_parts(
+        metadata.clone(),
+        Some(value),
+        None,
+        null_builder.finish(),
+    ))
+}
+
+/// Row builder for converting shredded VariantArray rows back to unshredded form
+enum UnshredVariantRowBuilder<'a> {
+    PrimitiveInt8(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Int8Type>>),
+    PrimitiveInt16(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Int16Type>>),
+    PrimitiveInt32(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Int32Type>>),
+    PrimitiveInt64(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Int64Type>>),
+    PrimitiveFloat32(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Float32Type>>),
+    PrimitiveFloat64(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Float64Type>>),
+    PrimitiveDate32(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Date32Type>>),
+    PrimitiveTime64(UnshredPrimitiveRowBuilder<'a, PrimitiveArray<Time64MicrosecondType>>),
+    TimestampMicrosecond(TimestampUnshredRowBuilder<'a, TimestampMicrosecondType>),
+    TimestampNanosecond(TimestampUnshredRowBuilder<'a, TimestampNanosecondType>),
+    PrimitiveBoolean(UnshredPrimitiveRowBuilder<'a, BooleanArray>),
+    PrimitiveString(UnshredPrimitiveRowBuilder<'a, StringArray>),
+    PrimitiveBinaryView(UnshredPrimitiveRowBuilder<'a, BinaryViewArray>),
+    PrimitiveUuid(UnshredPrimitiveRowBuilder<'a, FixedSizeBinaryArray>),
+    Struct(StructUnshredVariantBuilder<'a>),
+    ValueOnly(ValueOnlyUnshredVariantBuilder<'a>),
+    Null(NullUnshredVariantBuilder<'a>),
+}
+
+impl<'a> UnshredVariantRowBuilder<'a> {
+    /// Creates an all-null row builder.
+    fn null(nulls: Option<&'a NullBuffer>) -> Self {
+        Self::Null(NullUnshredVariantBuilder::new(nulls))
+    }
+
+    /// Appends a single row at the given value index to the supplied builder.
+    fn append_row(
+        &mut self,
+        builder: &mut impl VariantBuilderExt,
+        metadata: &VariantMetadata,
+        index: usize,
+    ) -> Result<()> {
+        match self {
+            Self::PrimitiveInt8(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveInt16(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveInt32(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveInt64(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveFloat32(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveFloat64(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveDate32(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveTime64(b) => b.append_row(builder, metadata, index),
+            Self::TimestampMicrosecond(b) => b.append_row(builder, metadata, index),
+            Self::TimestampNanosecond(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveBoolean(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveString(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveBinaryView(b) => b.append_row(builder, metadata, index),
+            Self::PrimitiveUuid(b) => b.append_row(builder, metadata, index),
+            Self::Struct(b) => b.append_row(builder, metadata, index),
+            Self::ValueOnly(b) => b.append_row(builder, metadata, index),
+            Self::Null(b) => b.append_row(builder, metadata, index),
+        }
+    }
+}
+
+/// Factory function to create the appropriate row builder for given field components
+/// Returns None for None/None case - caller decides how to handle based on context
+fn make_unshred_variant_row_builder<'a>(

Review Comment:
   It was very easy to change this to a `try_new_opt`, so I went ahead and did 
it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to