klion26 commented on code in PR #8354:
URL: https://github.com/apache/arrow-rs/pull/8354#discussion_r2591683489


##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -97,14 +98,67 @@ pub(crate) fn follow_shredded_path_element<'a>(
             })?;
 
             let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
+            Ok(ShreddedPathStep::Success(state.into()))
         }
-        VariantPathElement::Index { .. } => {
+        VariantPathElement::Index { index } => {
             // TODO: Support array indexing. Among other things, it will 
require slicing not
             // only the array we have here, but also the corresponding 
metadata and null masks.
-            Err(ArrowError::NotYetImplemented(
-                "Pathing into shredded variant array index".into(),
-            ))
+            let Some(list_array) = 
typed_value.as_any().downcast_ref::<GenericListArray<i32>>()
+            else {
+                // Downcast failure - if strict cast options are enabled, this 
should be an error
+                if !cast_options.safe {
+                    return Err(ArrowError::CastError(format!(
+                        "Cannot access index '{}' on non-list type: {}",
+                        index,
+                        typed_value.data_type()
+                    )));
+                }
+                // With safe cast options, return NULL (missing_path_step)
+                return Ok(missing_path_step());
+            };
+
+            let offsets = list_array.offsets();
+            let values = list_array.values(); // This is a StructArray
+
+            let Some(struct_array) = 
values.as_any().downcast_ref::<StructArray>() else {
+                return Ok(missing_path_step());
+            };
+
+            let Some(typed_array) = struct_array.column_by_name("typed_value") 
else {
+                return Ok(missing_path_step());
+            };
+
+            // Build the list of indices to take
+            let mut take_indices = Vec::with_capacity(list_array.len());
+            for i in 0..list_array.len() {
+                let start = offsets[i] as usize;
+                let end = offsets[i + 1] as usize;
+                let len = end - start;
+
+                if *index < len {
+                    take_indices.push(Some((start + index) as u32));
+                } else {
+                    take_indices.push(None);
+                }
+            }
+
+            let index_array = UInt32Array::from(take_indices);
+
+            // Use Arrow compute kernel to gather elements
+            let taken = take(typed_array, &index_array, None)?;

Review Comment:
   It seems this will create a new array. I'm not sure whether we can use some
kind of "view" here to avoid the extra allocation.



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -97,14 +98,67 @@ pub(crate) fn follow_shredded_path_element<'a>(
             })?;
 
             let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
+            Ok(ShreddedPathStep::Success(state.into()))
         }
-        VariantPathElement::Index { .. } => {
+        VariantPathElement::Index { index } => {
             // TODO: Support array indexing. Among other things, it will 
require slicing not
             // only the array we have here, but also the corresponding 
metadata and null masks.
-            Err(ArrowError::NotYetImplemented(
-                "Pathing into shredded variant array index".into(),
-            ))
+            let Some(list_array) = 
typed_value.as_any().downcast_ref::<GenericListArray<i32>>()
+            else {
+                // Downcast failure - if strict cast options are enabled, this 
should be an error
+                if !cast_options.safe {
+                    return Err(ArrowError::CastError(format!(
+                        "Cannot access index '{}' on non-list type: {}",
+                        index,
+                        typed_value.data_type()
+                    )));
+                }
+                // With safe cast options, return NULL (missing_path_step)
+                return Ok(missing_path_step());
+            };
+
+            let offsets = list_array.offsets();
+            let values = list_array.values(); // This is a StructArray
+
+            let Some(struct_array) = 
values.as_any().downcast_ref::<StructArray>() else {
+                return Ok(missing_path_step());
+            };
+
+            let Some(typed_array) = struct_array.column_by_name("typed_value") 
else {
+                return Ok(missing_path_step());
+            };
+
+            // Build the list of indices to take
+            let mut take_indices = Vec::with_capacity(list_array.len());
+            for i in 0..list_array.len() {
+                let start = offsets[i] as usize;
+                let end = offsets[i + 1] as usize;
+                let len = end - start;
+
+                if *index < len {
+                    take_indices.push(Some((start + index) as u32));
+                } else {
+                    take_indices.push(None);

Review Comment:
   Does this branch mean the index is out of bounds for the current list?
Currently we return `null`; I'm not sure whether we need to return an error in
this case. Whether or not we return an error, maybe we can document the behavior
somewhere.



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -97,14 +98,67 @@ pub(crate) fn follow_shredded_path_element<'a>(
             })?;
 
             let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
+            Ok(ShreddedPathStep::Success(state.into()))
         }
-        VariantPathElement::Index { .. } => {
+        VariantPathElement::Index { index } => {
             // TODO: Support array indexing. Among other things, it will 
require slicing not
             // only the array we have here, but also the corresponding 
metadata and null masks.
-            Err(ArrowError::NotYetImplemented(
-                "Pathing into shredded variant array index".into(),
-            ))
+            let Some(list_array) = 
typed_value.as_any().downcast_ref::<GenericListArray<i32>>()
+            else {
+                // Downcast failure - if strict cast options are enabled, this 
should be an error
+                if !cast_options.safe {
+                    return Err(ArrowError::CastError(format!(
+                        "Cannot access index '{}' on non-list type: {}",
+                        index,
+                        typed_value.data_type()
+                    )));
+                }
+                // With safe cast options, return NULL (missing_path_step)
+                return Ok(missing_path_step());
+            };
+
+            let offsets = list_array.offsets();
+            let values = list_array.values(); // This is a StructArray
+
+            let Some(struct_array) = 
values.as_any().downcast_ref::<StructArray>() else {
+                return Ok(missing_path_step());

Review Comment:
   Double-checking here: according to the [variant shredding spec](https://github.com/apache/parquet-format/blob/master/VariantShredding.md#arrays),
   each list `element` is a required group. Does this `return Ok(missing_path_step());` mean the input is not a valid variant?
   
   ```
   optional group tags (VARIANT) {
     required binary metadata;
     optional binary value;
     optional group typed_value (LIST) {   # must be optional to allow a null 
list
       repeated group list {
         required group element {          # shredded element
           optional binary value;
           optional binary typed_value (STRING);
         }
       }
     }
   }
   ```



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -1646,7 +1701,96 @@ mod test {
         let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), 
Some(42)]));
         assert_eq!(&result, &expected);
     }
+    /// This test manually constructs a shredded variant array representing 
lists
+    /// like ["comedy", "drama"] and ["horror", 123]
+    /// as VariantArray using variant_get.
+    #[test]
+    fn test_shredded_list_index_access() {
+        let array = shredded_list_variant_array();
+        // Test: Extract the 0 index field as VariantArray first
+        let options = GetOptions::new_with_path(VariantPath::from(0));
+        let result = variant_get(&array, options).unwrap();
+        let result_variant = VariantArray::try_new(&result).unwrap();
+        assert_eq!(result_variant.len(), 2);
 
+        // Row 0: expect 0 index = "comedy"
+        assert_eq!(result_variant.value(0), Variant::from("comedy"));
+        // Row 1: expect 0 index = "horror"
+        assert_eq!(result_variant.value(1), Variant::from("horror"));
+    }

Review Comment:
   We may need to add a blank line between tests/functions (above and below)
for better readability.



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -97,14 +98,67 @@ pub(crate) fn follow_shredded_path_element<'a>(
             })?;
 
             let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
+            Ok(ShreddedPathStep::Success(state.into()))
         }
-        VariantPathElement::Index { .. } => {
+        VariantPathElement::Index { index } => {
             // TODO: Support array indexing. Among other things, it will 
require slicing not
             // only the array we have here, but also the corresponding 
metadata and null masks.
-            Err(ArrowError::NotYetImplemented(
-                "Pathing into shredded variant array index".into(),
-            ))
+            let Some(list_array) = 
typed_value.as_any().downcast_ref::<GenericListArray<i32>>()
+            else {
+                // Downcast failure - if strict cast options are enabled, this 
should be an error
+                if !cast_options.safe {
+                    return Err(ArrowError::CastError(format!(
+                        "Cannot access index '{}' on non-list type: {}",
+                        index,
+                        typed_value.data_type()
+                    )));
+                }
+                // With safe cast options, return NULL (missing_path_step)
+                return Ok(missing_path_step());
+            };
+
+            let offsets = list_array.offsets();
+            let values = list_array.values(); // This is a StructArray
+
+            let Some(struct_array) = 
values.as_any().downcast_ref::<StructArray>() else {
+                return Ok(missing_path_step());
+            };
+
+            let Some(typed_array) = struct_array.column_by_name("typed_value") 
else {
+                return Ok(missing_path_step());
+            };
+
+            // Build the list of indices to take
+            let mut take_indices = Vec::with_capacity(list_array.len());
+            for i in 0..list_array.len() {
+                let start = offsets[i] as usize;
+                let end = offsets[i + 1] as usize;
+                let len = end - start;
+
+                if *index < len {

Review Comment:
   Please correct me if I'm wrong. Here we _assume that **all**_ the values
will be in the `typed_value` column, and use the indices collected here to
retrieve the final value.
   
   What if the value is located in the `value` column instead of the
`typed_value` column? (Changing the test `test_shredded_list_as_string` from
`VariantPath::from(0)` to `VariantPath::from(1)` shows this.)



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -1646,7 +1701,96 @@ mod test {
         let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), 
Some(42)]));
         assert_eq!(&result, &expected);
     }
+    /// This test manually constructs a shredded variant array representing 
lists
+    /// like ["comedy", "drama"] and ["horror", 123]
+    /// as VariantArray using variant_get.
+    #[test]
+    fn test_shredded_list_index_access() {
+        let array = shredded_list_variant_array();
+        // Test: Extract the 0 index field as VariantArray first
+        let options = GetOptions::new_with_path(VariantPath::from(0));
+        let result = variant_get(&array, options).unwrap();
+        let result_variant = VariantArray::try_new(&result).unwrap();
+        assert_eq!(result_variant.len(), 2);
 
+        // Row 0: expect 0 index = "comedy"
+        assert_eq!(result_variant.value(0), Variant::from("comedy"));

Review Comment:
   Do we need to cover the case where `list[index]` is located in the `value`
field of the input variant (such as `VariantPath::from(1)` here for
`["horror", 123]`)?
   
   Maybe we can add some more test cases:
   - the requested index value is located in the `typed_value` column
   - the requested index value is located in the `value` column
   - some nested structures (list in struct, or struct in list)
   - ...



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -97,14 +98,67 @@ pub(crate) fn follow_shredded_path_element<'a>(
             })?;
 
             let state = BorrowedShreddingState::try_from(struct_array)?;
-            Ok(ShreddedPathStep::Success(state))
+            Ok(ShreddedPathStep::Success(state.into()))
         }
-        VariantPathElement::Index { .. } => {
+        VariantPathElement::Index { index } => {
             // TODO: Support array indexing. Among other things, it will 
require slicing not
             // only the array we have here, but also the corresponding 
metadata and null masks.
-            Err(ArrowError::NotYetImplemented(
-                "Pathing into shredded variant array index".into(),
-            ))
+            let Some(list_array) = 
typed_value.as_any().downcast_ref::<GenericListArray<i32>>()

Review Comment:
   Is there any chance the list length exceeds the range of `i32`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to