This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new c4e154f38b Support Shredded Lists/Array in `variant_get` (#8354)
c4e154f38b is described below

commit c4e154f38b487b3bdefd2d3c7c429bf12a2bcb81
Author: Konstantin Tarasov <[email protected]>
AuthorDate: Mon May 25 07:18:33 2026 -0400

    Support Shredded Lists/Array in `variant_get` (#8354)
    
    # Which issue does this PR close?
    
    - closes #9443.
    
    # Rationale for this change
    
    `variant_get` should be able to follow index path elements (for example
    `a[1]`) into shredded list-like `VariantArray`s.
    
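    # What changes are included in this PR?
    
    - `follow_shredded_path_element` now handles `VariantPathElement::Index`
      when `typed_value` is a `List`, `LargeList`, `ListView`, or
      `LargeListView`, by taking the element at the requested index from each
      row.
    - Indexing a non-list `typed_value`, or past the end of a row's list,
      yields a missing/null result rather than an error.
    - Adds unit tests for list-like index access, nested lists, lists in
      structs, structs in lists, and out-of-range index parsing in
      `VariantPath`.
    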
    # Are these changes tested?
    
    Yes, unit tested.
    
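    # Are there any user-facing changes?
    
    Yes. `variant_get` with an index path step on a shredded variant no longer
    returns `NotYetImplemented`; it returns the indexed element, or null when
    the step does not match.
    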
    ---------
    
    Co-authored-by: Congxian Qiu <[email protected]>
    Co-authored-by: Ryan Johnson <[email protected]>
---
 parquet-variant-compute/src/variant_get.rs | 338 +++++++++++++++++++++++++++--
 parquet-variant/src/path.rs                |   9 +
 2 files changed, 334 insertions(+), 13 deletions(-)

diff --git a/parquet-variant-compute/src/variant_get.rs 
b/parquet-variant-compute/src/variant_get.rs
index 774da0e72e..38c577564d 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -15,17 +15,20 @@
 // specific language governing permissions and limitations
 // under the License.
 use arrow::{
-    array::{self, Array, ArrayRef, StructArray, make_array},
+    array::{
+        self, Array, ArrayRef, GenericListArray, GenericListViewArray, 
ListLikeArray, StructArray,
+        UInt64Array, make_array,
+    },
     buffer::NullBuffer,
-    compute::CastOptions,
+    compute::{CastOptions, take},
     datatypes::Field,
     error::Result,
 };
 use arrow_schema::{ArrowError, DataType, FieldRef};
 use parquet_variant::{VariantPath, VariantPathElement};
 
+use crate::ShreddingState;
 use crate::VariantArray;
-use crate::variant_array::ShreddingState;
 use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
 
 use arrow::array::AsArray;
@@ -43,12 +46,70 @@ pub(crate) enum ShreddedPathStep {
     NotShredded,
 }
 
+/// Build the next shredding state by taking one list-like element (at 
`index`) per input row.
+///
+fn take_list_like_index_as_shredding_state<L: ListLikeArray + 'static>(
+    typed_value: &dyn Array,
+    index: usize,
+) -> Result<Option<ShreddingState>> {
+    let list_array = typed_value.as_any().downcast_ref::<L>().ok_or_else(|| {
+        ArrowError::ComputeError(format!(
+            "Expected array type '{}' while handling list-like path step, got 
'{}'",
+            std::any::type_name::<L>(),
+            typed_value.data_type()
+        ))
+    })?;
+
+    let values = list_array.values();
+
+    let Some(struct_array) = values.as_struct_opt() else {
+        return Ok(None);
+    };
+    let shredding_state = ShreddingState::try_from(struct_array)?;
+
+    let value_array = shredding_state.value_field();
+    let typed_array = shredding_state.typed_value_field();
+
+    // If list elements have neither typed nor fallback value, this path step 
is missing.
+    if value_array.is_none() && typed_array.is_none() {
+        return Ok(None);
+    }
+
+    let mut take_indices = Vec::with_capacity(list_array.len());
+    for row in 0..list_array.len() {
+        let row_range = list_array.element_range(row);
+        let take_index = (index < row_range.len()).then(|| (row_range.start + 
index) as u64);
+        take_indices.push(take_index);
+    }
+
+    let index_array = UInt64Array::from(take_indices);
+
+    // Gather both typed and fallback values at the requested element index.
+    let taken_value = value_array
+        .map(|value| take(value, &index_array, None))
+        .transpose()?;
+    let taken_typed = typed_array
+        .map(|typed| take(typed, &index_array, None))
+        .transpose()?;
+
+    Ok(Some(ShreddingState::new(taken_value, taken_typed)))
+}
+
 /// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try 
to take one path step
 /// deeper. For a `VariantPathElement::Field`, if there is no `typed_value` at 
this level, if
 /// `typed_value` is not a struct, or if the requested field name does not 
exist, traversal returns
 /// a missing-path step (`Missing` or `NotShredded` depending on whether 
`value` exists).
 ///
-/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe 
not even possible.
+/// Safe-cast behavior (`cast_options.safe = true`):
+/// - Type mismatch during path traversal (for example field access on 
non-struct, index access on
+///   non-list) returns [`ShreddedPathStep::Missing`] or 
[`ShreddedPathStep::NotShredded`], allowing
+///   the caller to continue with null/fallback semantics.
+/// - List index out-of-bounds produces nulls for the corresponding rows.
+///
+/// Unsafe-cast behavior (`cast_options.safe = false`):
+/// - Field access on non-struct returns [`ArrowError::CastError`].
+/// - List index path steps follow JSONPath semantics and return missing/null 
for non-list or
+///   out-of-bounds rows.
 pub(crate) fn follow_shredded_path_element(
     shredding_state: &ShreddingState,
     path_element: &VariantPathElement<'_>,
@@ -69,7 +130,7 @@ pub(crate) fn follow_shredded_path_element(
         VariantPathElement::Field { name } => {
             // Try to step into the requested field name of a struct.
             // First, try to downcast to StructArray
-            let Some(struct_array) = 
typed_value.as_any().downcast_ref::<StructArray>() else {
+            let Some(struct_array) = typed_value.as_struct_opt() else {
                 // Object field path step follows JSONPath semantics and 
returns missing path step (NotShredded/Missing) on non-struct path
                 return Ok(missing_path_step());
             };
@@ -93,12 +154,30 @@ pub(crate) fn follow_shredded_path_element(
             let state = ShreddingState::try_from(struct_array)?;
             Ok(ShreddedPathStep::Success(state))
         }
-        VariantPathElement::Index { .. } => {
-            // TODO: Support array indexing. Among other things, it will 
require slicing not
-            // only the array we have here, but also the corresponding 
metadata and null masks.
-            Err(ArrowError::NotYetImplemented(
-                "Pathing into shredded variant array index".into(),
-            ))
+        VariantPathElement::Index { index } => {
+            let state = match typed_value.data_type() {
+                DataType::List(_) => take_list_like_index_as_shredding_state::<
+                    GenericListArray<i32>,
+                >(typed_value.as_ref(), *index)?,
+                DataType::LargeList(_) => 
take_list_like_index_as_shredding_state::<
+                    GenericListArray<i64>,
+                >(typed_value.as_ref(), *index)?,
+                DataType::ListView(_) => 
take_list_like_index_as_shredding_state::<
+                    GenericListViewArray<i32>,
+                >(typed_value.as_ref(), *index)?,
+                DataType::LargeListView(_) => 
take_list_like_index_as_shredding_state::<
+                    GenericListViewArray<i64>,
+                >(typed_value.as_ref(), *index)?,
+                _ => {
+                    // JSONPath semantics: indexing a non-list yields no match.
+                    return Ok(missing_path_step());
+                }
+            };
+
+            match state {
+                Some(state) => Ok(ShreddedPathStep::Success(state)),
+                None => Ok(missing_path_step()),
+            }
         }
     }
 }
@@ -356,7 +435,8 @@ mod test {
     use super::{GetOptions, variant_get};
     use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
     use crate::{
-        VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant, 
shred_variant,
+        ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder, 
cast_to_variant, json_to_variant,
+        shred_variant,
     };
     use arrow::array::{
         Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray, 
Date32Array,
@@ -448,7 +528,7 @@ mod test {
     fn get_primitive_variant_inside_object_of_list() {
         single_variant_get_test(
             r#"{"some_field": [1234]}"#,
-            VariantPath::try_from("some_field").unwrap().join(0),
+            VariantPath::try_from("some_field[0]").unwrap(),
             "1234",
         );
     }
@@ -1741,6 +1821,7 @@ mod test {
             Some(nulls),
         ))
     }
+
     /// This test manually constructs a shredded variant array representing 
objects
     /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" 
field
     /// as VariantArray using variant_get.
@@ -1777,6 +1858,210 @@ mod test {
         assert_eq!(&result, &expected);
     }
 
+    type ShreddedListLikeArrayGen = fn() -> ArrayRef;
+    type ShreddedListLikeCase = (&'static str, ShreddedListLikeArrayGen);
+
+    fn shredded_list_like_cases() -> [ShreddedListLikeCase; 4] {
+        [
+            ("list", shredded_list_variant_array),
+            ("large_list", shredded_large_list_variant_array),
+            ("list_view", shredded_list_view_variant_array),
+            ("large_list_view", shredded_large_list_view_variant_array),
+        ]
+    }
+
+    #[test]
+    fn test_shredded_list_like_index_access_from_value_field() {
+        let options = GetOptions::new_with_path(VariantPath::from(1));
+
+        for (case, array_gen) in shredded_list_like_cases() {
+            let array = array_gen();
+            let result = variant_get(&array, options.clone()).unwrap();
+            let result_variant = VariantArray::try_new(&result).unwrap();
+
+            assert_eq!(result_variant.value(0), Variant::from("drama"), 
"{case}");
+            assert_eq!(result_variant.value(1).as_int64(), Some(123), 
"{case}");
+        }
+    }
+
+    #[test]
+    fn test_shredded_list_like_index_out_of_bounds_unsafe_cast_returns_null() {
+        let options =
+            
GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions {
+                safe: false,
+                ..Default::default()
+            });
+
+        for (case, array_gen) in shredded_list_like_cases() {
+            let result = variant_get(&array_gen(), options.clone()).unwrap();
+            let result_variant = VariantArray::try_new(&result).unwrap();
+            assert_eq!(result_variant.value(0), Variant::Null, "{case}");
+            assert_eq!(result_variant.value(1), Variant::Null, "{case}");
+        }
+    }
+
+    /// Test extracting shredded list-like field with type conversion.
+    #[test]
+    fn test_shredded_list_like_as_string() {
+        let field = Field::new("typed_value", DataType::Utf8, false);
+        let options = GetOptions::new_with_path(VariantPath::from(0))
+            .with_as_type(Some(FieldRef::from(field)));
+        let expected: ArrayRef = 
Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")]));
+
+        for (case, array_gen) in shredded_list_like_cases() {
+            let result = variant_get(&array_gen(), options.clone()).unwrap();
+            assert_eq!(&result, &expected, "{case}");
+        }
+    }
+
+    #[test]
+    fn test_shredded_list_like_index_access_from_value_field_as_int64() {
+        let field = Field::new("typed_value", DataType::Int64, true);
+        let options = GetOptions::new_with_path(VariantPath::from(1))
+            .with_as_type(Some(FieldRef::from(field)));
+        let expected: ArrayRef = Arc::new(Int64Array::from(vec![None, 
Some(123)]));
+
+        for (case, array_gen) in shredded_list_like_cases() {
+            let result = variant_get(&array_gen(), options.clone()).unwrap();
+            // "drama" -> NULL, 123 -> 123.
+            assert_eq!(&result, &expected, "{case}");
+        }
+    }
+
+    #[test]
+    fn test_shredded_list_in_struct_index_access() {
+        let array = shredded_struct_with_list_variant_array();
+        let options = 
GetOptions::new_with_path(VariantPath::try_from("a[1]").unwrap());
+        let result = variant_get(&array, options).unwrap();
+        let result_variant = VariantArray::try_new(&result).unwrap();
+
+        assert_eq!(result_variant.value(0), Variant::from("drama"));
+        assert_eq!(result_variant.value(1).as_int64(), Some(123));
+    }
+
+    #[test]
+    fn test_shredded_struct_in_list_field_access() {
+        let array = shredded_list_of_struct_variant_array();
+        let field = Field::new("x", DataType::Int32, true);
+        let path = VariantPath::from(0).join("x");
+        let options = 
GetOptions::new_with_path(path).with_as_type(Some(FieldRef::from(field)));
+        let result = variant_get(&array, options).unwrap();
+
+        let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), 
Some(3)]));
+        assert_eq!(&result, &expected);
+    }
+
+    #[test]
+    fn test_shredded_list_of_lists_index_access() {
+        let array = shredded_list_of_lists_variant_array();
+        let path = VariantPath::from(0).join(1);
+
+        let result = variant_get(&array, 
GetOptions::new_with_path(path.clone())).unwrap();
+        let result_variant = VariantArray::try_new(&result).unwrap();
+        assert_eq!(result_variant.value(0), Variant::from("b"));
+        assert_eq!(result_variant.value(1).as_int64(), Some(123));
+
+        let field = Field::new("typed_value", DataType::Int64, true);
+        let casted = variant_get(
+            &array,
+            
GetOptions::new_with_path(path).with_as_type(Some(FieldRef::from(field))),
+        )
+        .unwrap();
+        let expected: ArrayRef = Arc::new(Int64Array::from(vec![None, 
Some(123)]));
+        assert_eq!(&casted, &expected);
+    }
+
+    /// Helper to create a shredded list-like variant array used by list index 
tests.
+    ///
+    /// Rows:
+    /// 1. `["comedy", "drama"]` (fully shred-able as `Utf8`)
+    /// 2. `["horror", 123]` (partially shredded, with fallback for the 
numeric element)
+    fn shredded_list_like_variant_array(list_schema: DataType) -> ArrayRef {
+        let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+            Some(r#"["comedy", "drama"]"#),
+            Some(r#"["horror", 123]"#),
+        ]));
+        let input = json_to_variant(&json_rows).unwrap();
+
+        let shredded = shred_variant(&input, &list_schema).unwrap();
+        ArrayRef::from(shredded)
+    }
+
+    fn shredded_list_of_lists_variant_array() -> ArrayRef {
+        let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+            Some(r#"[["a", "b"], ["c", "d"]]"#),
+            Some(r#"[["x", 123], ["y", "z"]]"#),
+        ]));
+        let input = json_to_variant(&json_rows).unwrap();
+
+        let inner_list = DataType::List(Arc::new(Field::new("item", 
DataType::Utf8, true)));
+        let outer_list = DataType::List(Arc::new(Field::new("item", 
inner_list, true)));
+        let shredded = shred_variant(&input, &outer_list).unwrap();
+        ArrayRef::from(shredded)
+    }
+
+    fn shredded_list_variant_array() -> ArrayRef {
+        shredded_list_like_variant_array(DataType::List(Arc::new(Field::new(
+            "item",
+            DataType::Utf8,
+            true,
+        ))))
+    }
+
+    fn shredded_large_list_variant_array() -> ArrayRef {
+        
shredded_list_like_variant_array(DataType::LargeList(Arc::new(Field::new(
+            "item",
+            DataType::Utf8,
+            true,
+        ))))
+    }
+
+    fn shredded_list_view_variant_array() -> ArrayRef {
+        
shredded_list_like_variant_array(DataType::ListView(Arc::new(Field::new(
+            "item",
+            DataType::Utf8,
+            true,
+        ))))
+    }
+
+    fn shredded_large_list_view_variant_array() -> ArrayRef {
+        
shredded_list_like_variant_array(DataType::LargeListView(Arc::new(Field::new(
+            "item",
+            DataType::Utf8,
+            true,
+        ))))
+    }
+
+    fn shredded_struct_with_list_variant_array() -> ArrayRef {
+        let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+            Some(r#"{"a": ["comedy", "drama"]}"#),
+            Some(r#"{"a": ["horror", 123]}"#),
+        ]));
+        let input = json_to_variant(&json_rows).unwrap();
+
+        let list_schema = DataType::List(Arc::new(Field::new("item", 
DataType::Utf8, true)));
+        let shredding_schema = ShreddedSchemaBuilder::default()
+            .with_path("a", &list_schema)
+            .unwrap()
+            .build();
+        let shredded = shred_variant(&input, &shredding_schema).unwrap();
+        ArrayRef::from(shredded)
+    }
+
+    fn shredded_list_of_struct_variant_array() -> ArrayRef {
+        let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+            Some(r#"[{"x": 1}, {"x": 2}]"#),
+            Some(r#"[{"x": 3}, {"y": 4}]"#),
+        ]));
+        let input = json_to_variant(&json_rows).unwrap();
+
+        let struct_type =
+            DataType::Struct(Fields::from(vec![Field::new("x", 
DataType::Int32, true)]));
+        let list_schema = DataType::List(Arc::new(Field::new("item", 
struct_type, true)));
+        let shredded = shred_variant(&input, &list_schema).unwrap();
+        ArrayRef::from(shredded)
+    }
+
     /// Helper function to create a shredded variant array representing objects
     ///
     /// This creates an array that represents:
@@ -2494,6 +2779,33 @@ mod test {
         }
     }
 
+    #[test]
+    fn test_strict_cast_options_index_on_non_list_returns_null() {
+        use arrow::compute::CastOptions;
+        use arrow::datatypes::{DataType, Field};
+        use parquet_variant::VariantPath;
+        use std::sync::Arc;
+
+        // Use existing test data that has Int32 typed_value at the top level.
+        let variant_array = perfectly_shredded_int32_variant_array();
+        let options = GetOptions {
+            path: VariantPath::from(0),
+            as_type: Some(Arc::new(Field::new("result", DataType::Int32, 
true))),
+            cast_options: CastOptions {
+                safe: false,
+                ..Default::default()
+            },
+        };
+
+        let variant_array_ref: Arc<dyn Array> = variant_array.clone();
+        let result = variant_get(&variant_array_ref, options).unwrap();
+
+        assert_eq!(result.len(), 3);
+        assert!(result.is_null(0));
+        assert!(result.is_null(1));
+        assert!(result.is_null(2));
+    }
+
     #[test]
     fn test_error_message_boolean_type_display() {
         let mut builder = VariantArrayBuilder::new(1);
diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs
index 8e68d9efad..0837e49f8a 100644
--- a/parquet-variant/src/path.rs
+++ b/parquet-variant/src/path.rs
@@ -346,5 +346,14 @@ mod tests {
             err.to_string(),
             "Parser error: Invalid token in bracket request: `abc`. Expected a 
quoted string or a number(e.g., `['field']` or `[123]`)"
         );
+
+        // Out-of-range integer indexes are invalid path tokens.
+        let too_large_index = (usize::MAX as u128) + 1;
+        let err = 
VariantPath::try_from(format!("[{too_large_index}]").as_str()).unwrap_err();
+        assert!(
+            err.to_string()
+                .contains("Parser error: Invalid token in bracket request"),
+            "{err}"
+        );
     }
 }

Reply via email to