This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new c4e154f38b Support Shredded Lists/Array in `variant_get` (#8354)
c4e154f38b is described below
commit c4e154f38b487b3bdefd2d3c7c429bf12a2bcb81
Author: Konstantin Tarasov <[email protected]>
AuthorDate: Mon May 25 07:18:33 2026 -0400
Support Shredded Lists/Array in `variant_get` (#8354)
# Which issue does this PR close?
- closes #9443.
# Rationale for this change
`variant_get` should be able to follow index path elements (for example `[0]`)
into shredded list values of a `VariantArray`.
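# What changes are included in this PR?
- `follow_shredded_path_element` now handles `VariantPathElement::Index` for
  `List`, `LargeList`, `ListView` and `LargeListView` typed values instead of
  returning `NotYetImplemented`.
- Indexing a non-list value, or past the end of a row's list, yields a
  missing/null result rather than an error.
- A `VariantPath` parser test covers bracket indexes larger than `usize::MAX`.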
# Are these changes tested?
Yes, unit tested.
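# Are there any user-facing changes?
Yes. `variant_get` with an index path step on a shredded list no longer fails
with `NotYetImplemented`; it returns the selected element, or null when the
index is out of bounds.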
---------
Co-authored-by: Congxian Qiu <[email protected]>
Co-authored-by: Ryan Johnson <[email protected]>
---
parquet-variant-compute/src/variant_get.rs | 338 +++++++++++++++++++++++++++--
parquet-variant/src/path.rs | 9 +
2 files changed, 334 insertions(+), 13 deletions(-)
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 774da0e72e..38c577564d 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -15,17 +15,20 @@
// specific language governing permissions and limitations
// under the License.
use arrow::{
- array::{self, Array, ArrayRef, StructArray, make_array},
+ array::{
+ self, Array, ArrayRef, GenericListArray, GenericListViewArray,
ListLikeArray, StructArray,
+ UInt64Array, make_array,
+ },
buffer::NullBuffer,
- compute::CastOptions,
+ compute::{CastOptions, take},
datatypes::Field,
error::Result,
};
use arrow_schema::{ArrowError, DataType, FieldRef};
use parquet_variant::{VariantPath, VariantPathElement};
+use crate::ShreddingState;
use crate::VariantArray;
-use crate::variant_array::ShreddingState;
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
use arrow::array::AsArray;
@@ -43,12 +46,70 @@ pub(crate) enum ShreddedPathStep {
NotShredded,
}
+/// Build the next shredding state by taking one list-like element (at
`index`) per input row.
+///
+fn take_list_like_index_as_shredding_state<L: ListLikeArray + 'static>(
+ typed_value: &dyn Array,
+ index: usize,
+) -> Result<Option<ShreddingState>> {
+ let list_array = typed_value.as_any().downcast_ref::<L>().ok_or_else(|| {
+ ArrowError::ComputeError(format!(
+ "Expected array type '{}' while handling list-like path step, got
'{}'",
+ std::any::type_name::<L>(),
+ typed_value.data_type()
+ ))
+ })?;
+
+ let values = list_array.values();
+
+ let Some(struct_array) = values.as_struct_opt() else {
+ return Ok(None);
+ };
+ let shredding_state = ShreddingState::try_from(struct_array)?;
+
+ let value_array = shredding_state.value_field();
+ let typed_array = shredding_state.typed_value_field();
+
+ // If list elements have neither typed nor fallback value, this path step
is missing.
+ if value_array.is_none() && typed_array.is_none() {
+ return Ok(None);
+ }
+
+ let mut take_indices = Vec::with_capacity(list_array.len());
+ for row in 0..list_array.len() {
+ let row_range = list_array.element_range(row);
+ let take_index = (index < row_range.len()).then(|| (row_range.start +
index) as u64);
+ take_indices.push(take_index);
+ }
+
+ let index_array = UInt64Array::from(take_indices);
+
+ // Gather both typed and fallback values at the requested element index.
+ let taken_value = value_array
+ .map(|value| take(value, &index_array, None))
+ .transpose()?;
+ let taken_typed = typed_array
+ .map(|typed| take(typed, &index_array, None))
+ .transpose()?;
+
+ Ok(Some(ShreddingState::new(taken_value, taken_typed)))
+}
+
/// Given a shredded variant field -- a `(value?, typed_value?)` pair -- try
to take one path step
/// deeper. For a `VariantPathElement::Field`, if there is no `typed_value` at
this level, if
/// `typed_value` is not a struct, or if the requested field name does not
exist, traversal returns
/// a missing-path step (`Missing` or `NotShredded` depending on whether
`value` exists).
///
-/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe
not even possible.
+/// Safe-cast behavior (`cast_options.safe = true`):
+/// - Type mismatch during path traversal (for example field access on
non-struct, index access on
+/// non-list) returns [`ShreddedPathStep::Missing`] or
[`ShreddedPathStep::NotShredded`], allowing
+/// the caller to continue with null/fallback semantics.
+/// - List index out-of-bounds produces nulls for the corresponding rows.
+///
+/// Unsafe-cast behavior (`cast_options.safe = false`):
+/// - Field access on non-struct returns a missing-path step (JSONPath semantics), not an error.
+/// - List index path steps follow JSONPath semantics and return missing/null
for non-list or
+/// out-of-bounds rows.
pub(crate) fn follow_shredded_path_element(
shredding_state: &ShreddingState,
path_element: &VariantPathElement<'_>,
@@ -69,7 +130,7 @@ pub(crate) fn follow_shredded_path_element(
VariantPathElement::Field { name } => {
// Try to step into the requested field name of a struct.
// First, try to downcast to StructArray
- let Some(struct_array) =
typed_value.as_any().downcast_ref::<StructArray>() else {
+ let Some(struct_array) = typed_value.as_struct_opt() else {
// Object field path step follows JSONPath semantics and
returns missing path step (NotShredded/Missing) on non-struct path
return Ok(missing_path_step());
};
@@ -93,12 +154,30 @@ pub(crate) fn follow_shredded_path_element(
let state = ShreddingState::try_from(struct_array)?;
Ok(ShreddedPathStep::Success(state))
}
- VariantPathElement::Index { .. } => {
- // TODO: Support array indexing. Among other things, it will
require slicing not
- // only the array we have here, but also the corresponding
metadata and null masks.
- Err(ArrowError::NotYetImplemented(
- "Pathing into shredded variant array index".into(),
- ))
+ VariantPathElement::Index { index } => {
+ let state = match typed_value.data_type() {
+ DataType::List(_) => take_list_like_index_as_shredding_state::<
+ GenericListArray<i32>,
+ >(typed_value.as_ref(), *index)?,
+ DataType::LargeList(_) =>
take_list_like_index_as_shredding_state::<
+ GenericListArray<i64>,
+ >(typed_value.as_ref(), *index)?,
+ DataType::ListView(_) =>
take_list_like_index_as_shredding_state::<
+ GenericListViewArray<i32>,
+ >(typed_value.as_ref(), *index)?,
+ DataType::LargeListView(_) =>
take_list_like_index_as_shredding_state::<
+ GenericListViewArray<i64>,
+ >(typed_value.as_ref(), *index)?,
+ _ => {
+ // JSONPath semantics: indexing a non-list yields no match.
+ return Ok(missing_path_step());
+ }
+ };
+
+ match state {
+ Some(state) => Ok(ShreddedPathStep::Success(state)),
+ None => Ok(missing_path_step()),
+ }
}
}
}
@@ -356,7 +435,8 @@ mod test {
use super::{GetOptions, variant_get};
use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
use crate::{
- VariantArray, VariantArrayBuilder, cast_to_variant, json_to_variant,
shred_variant,
+ ShreddedSchemaBuilder, VariantArray, VariantArrayBuilder,
cast_to_variant, json_to_variant,
+ shred_variant,
};
use arrow::array::{
Array, ArrayRef, AsArray, BinaryArray, BinaryViewArray, BooleanArray,
Date32Array,
@@ -448,7 +528,7 @@ mod test {
fn get_primitive_variant_inside_object_of_list() {
single_variant_get_test(
r#"{"some_field": [1234]}"#,
- VariantPath::try_from("some_field").unwrap().join(0),
+ VariantPath::try_from("some_field[0]").unwrap(),
"1234",
);
}
@@ -1741,6 +1821,7 @@ mod test {
Some(nulls),
))
}
+
/// This test manually constructs a shredded variant array representing
objects
/// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x"
field
/// as VariantArray using variant_get.
@@ -1777,6 +1858,210 @@ mod test {
assert_eq!(&result, &expected);
}
+ type ShreddedListLikeArrayGen = fn() -> ArrayRef;
+ type ShreddedListLikeCase = (&'static str, ShreddedListLikeArrayGen);
+
+ fn shredded_list_like_cases() -> [ShreddedListLikeCase; 4] {
+ [
+ ("list", shredded_list_variant_array),
+ ("large_list", shredded_large_list_variant_array),
+ ("list_view", shredded_list_view_variant_array),
+ ("large_list_view", shredded_large_list_view_variant_array),
+ ]
+ }
+
+ #[test]
+ fn test_shredded_list_like_index_access_from_value_field() {
+ let options = GetOptions::new_with_path(VariantPath::from(1));
+
+ for (case, array_gen) in shredded_list_like_cases() {
+ let array = array_gen();
+ let result = variant_get(&array, options.clone()).unwrap();
+ let result_variant = VariantArray::try_new(&result).unwrap();
+
+ assert_eq!(result_variant.value(0), Variant::from("drama"),
"{case}");
+ assert_eq!(result_variant.value(1).as_int64(), Some(123),
"{case}");
+ }
+ }
+
+ #[test]
+ fn test_shredded_list_like_index_out_of_bounds_unsafe_cast_returns_null() {
+ let options =
+
GetOptions::new_with_path(VariantPath::from(10)).with_cast_options(CastOptions {
+ safe: false,
+ ..Default::default()
+ });
+
+ for (case, array_gen) in shredded_list_like_cases() {
+ let result = variant_get(&array_gen(), options.clone()).unwrap();
+ let result_variant = VariantArray::try_new(&result).unwrap();
+ assert_eq!(result_variant.value(0), Variant::Null, "{case}");
+ assert_eq!(result_variant.value(1), Variant::Null, "{case}");
+ }
+ }
+
+ /// Test extracting shredded list-like field with type conversion.
+ #[test]
+ fn test_shredded_list_like_as_string() {
+ let field = Field::new("typed_value", DataType::Utf8, false);
+ let options = GetOptions::new_with_path(VariantPath::from(0))
+ .with_as_type(Some(FieldRef::from(field)));
+ let expected: ArrayRef =
Arc::new(StringArray::from(vec![Some("comedy"), Some("horror")]));
+
+ for (case, array_gen) in shredded_list_like_cases() {
+ let result = variant_get(&array_gen(), options.clone()).unwrap();
+ assert_eq!(&result, &expected, "{case}");
+ }
+ }
+
+ #[test]
+ fn test_shredded_list_like_index_access_from_value_field_as_int64() {
+ let field = Field::new("typed_value", DataType::Int64, true);
+ let options = GetOptions::new_with_path(VariantPath::from(1))
+ .with_as_type(Some(FieldRef::from(field)));
+ let expected: ArrayRef = Arc::new(Int64Array::from(vec![None,
Some(123)]));
+
+ for (case, array_gen) in shredded_list_like_cases() {
+ let result = variant_get(&array_gen(), options.clone()).unwrap();
+ // "drama" -> NULL, 123 -> 123.
+ assert_eq!(&result, &expected, "{case}");
+ }
+ }
+
+ #[test]
+ fn test_shredded_list_in_struct_index_access() {
+ let array = shredded_struct_with_list_variant_array();
+ let options =
GetOptions::new_with_path(VariantPath::try_from("a[1]").unwrap());
+ let result = variant_get(&array, options).unwrap();
+ let result_variant = VariantArray::try_new(&result).unwrap();
+
+ assert_eq!(result_variant.value(0), Variant::from("drama"));
+ assert_eq!(result_variant.value(1).as_int64(), Some(123));
+ }
+
+ #[test]
+ fn test_shredded_struct_in_list_field_access() {
+ let array = shredded_list_of_struct_variant_array();
+ let field = Field::new("x", DataType::Int32, true);
+ let path = VariantPath::from(0).join("x");
+ let options =
GetOptions::new_with_path(path).with_as_type(Some(FieldRef::from(field)));
+ let result = variant_get(&array, options).unwrap();
+
+ let expected: ArrayRef = Arc::new(Int32Array::from(vec![Some(1),
Some(3)]));
+ assert_eq!(&result, &expected);
+ }
+
+ #[test]
+ fn test_shredded_list_of_lists_index_access() {
+ let array = shredded_list_of_lists_variant_array();
+ let path = VariantPath::from(0).join(1);
+
+ let result = variant_get(&array,
GetOptions::new_with_path(path.clone())).unwrap();
+ let result_variant = VariantArray::try_new(&result).unwrap();
+ assert_eq!(result_variant.value(0), Variant::from("b"));
+ assert_eq!(result_variant.value(1).as_int64(), Some(123));
+
+ let field = Field::new("typed_value", DataType::Int64, true);
+ let casted = variant_get(
+ &array,
+
GetOptions::new_with_path(path).with_as_type(Some(FieldRef::from(field))),
+ )
+ .unwrap();
+ let expected: ArrayRef = Arc::new(Int64Array::from(vec![None,
Some(123)]));
+ assert_eq!(&casted, &expected);
+ }
+
+ /// Helper to create a shredded list-like variant array used by list index
tests.
+ ///
+ /// Rows:
+ /// 1. `["comedy", "drama"]` (fully shred-able as `Utf8`)
+ /// 2. `["horror", 123]` (partially shredded, with fallback for the
numeric element)
+ fn shredded_list_like_variant_array(list_schema: DataType) -> ArrayRef {
+ let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+ Some(r#"["comedy", "drama"]"#),
+ Some(r#"["horror", 123]"#),
+ ]));
+ let input = json_to_variant(&json_rows).unwrap();
+
+ let shredded = shred_variant(&input, &list_schema).unwrap();
+ ArrayRef::from(shredded)
+ }
+
+ fn shredded_list_of_lists_variant_array() -> ArrayRef {
+ let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+ Some(r#"[["a", "b"], ["c", "d"]]"#),
+ Some(r#"[["x", 123], ["y", "z"]]"#),
+ ]));
+ let input = json_to_variant(&json_rows).unwrap();
+
+ let inner_list = DataType::List(Arc::new(Field::new("item",
DataType::Utf8, true)));
+ let outer_list = DataType::List(Arc::new(Field::new("item",
inner_list, true)));
+ let shredded = shred_variant(&input, &outer_list).unwrap();
+ ArrayRef::from(shredded)
+ }
+
+ fn shredded_list_variant_array() -> ArrayRef {
+ shredded_list_like_variant_array(DataType::List(Arc::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ ))))
+ }
+
+ fn shredded_large_list_variant_array() -> ArrayRef {
+
shredded_list_like_variant_array(DataType::LargeList(Arc::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ ))))
+ }
+
+ fn shredded_list_view_variant_array() -> ArrayRef {
+
shredded_list_like_variant_array(DataType::ListView(Arc::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ ))))
+ }
+
+ fn shredded_large_list_view_variant_array() -> ArrayRef {
+
shredded_list_like_variant_array(DataType::LargeListView(Arc::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ ))))
+ }
+
+ fn shredded_struct_with_list_variant_array() -> ArrayRef {
+ let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+ Some(r#"{"a": ["comedy", "drama"]}"#),
+ Some(r#"{"a": ["horror", 123]}"#),
+ ]));
+ let input = json_to_variant(&json_rows).unwrap();
+
+ let list_schema = DataType::List(Arc::new(Field::new("item",
DataType::Utf8, true)));
+ let shredding_schema = ShreddedSchemaBuilder::default()
+ .with_path("a", &list_schema)
+ .unwrap()
+ .build();
+ let shredded = shred_variant(&input, &shredding_schema).unwrap();
+ ArrayRef::from(shredded)
+ }
+
+ fn shredded_list_of_struct_variant_array() -> ArrayRef {
+ let json_rows: ArrayRef = Arc::new(StringArray::from(vec![
+ Some(r#"[{"x": 1}, {"x": 2}]"#),
+ Some(r#"[{"x": 3}, {"y": 4}]"#),
+ ]));
+ let input = json_to_variant(&json_rows).unwrap();
+
+ let struct_type =
+ DataType::Struct(Fields::from(vec![Field::new("x",
DataType::Int32, true)]));
+ let list_schema = DataType::List(Arc::new(Field::new("item",
struct_type, true)));
+ let shredded = shred_variant(&input, &list_schema).unwrap();
+ ArrayRef::from(shredded)
+ }
+
/// Helper function to create a shredded variant array representing objects
///
/// This creates an array that represents:
@@ -2494,6 +2779,33 @@ mod test {
}
}
+ #[test]
+ fn test_strict_cast_options_index_on_non_list_returns_null() {
+ use arrow::compute::CastOptions;
+ use arrow::datatypes::{DataType, Field};
+ use parquet_variant::VariantPath;
+ use std::sync::Arc;
+
+ // Use existing test data that has Int32 typed_value at the top level.
+ let variant_array = perfectly_shredded_int32_variant_array();
+ let options = GetOptions {
+ path: VariantPath::from(0),
+ as_type: Some(Arc::new(Field::new("result", DataType::Int32,
true))),
+ cast_options: CastOptions {
+ safe: false,
+ ..Default::default()
+ },
+ };
+
+ let variant_array_ref: Arc<dyn Array> = variant_array.clone();
+ let result = variant_get(&variant_array_ref, options).unwrap();
+
+ assert_eq!(result.len(), 3);
+ assert!(result.is_null(0));
+ assert!(result.is_null(1));
+ assert!(result.is_null(2));
+ }
+
#[test]
fn test_error_message_boolean_type_display() {
let mut builder = VariantArrayBuilder::new(1);
diff --git a/parquet-variant/src/path.rs b/parquet-variant/src/path.rs
index 8e68d9efad..0837e49f8a 100644
--- a/parquet-variant/src/path.rs
+++ b/parquet-variant/src/path.rs
@@ -346,5 +346,14 @@ mod tests {
err.to_string(),
"Parser error: Invalid token in bracket request: `abc`. Expected a
quoted string or a number(e.g., `['field']` or `[123]`)"
);
+
+ // Out-of-range integer indexes are invalid path tokens.
+ let too_large_index = (usize::MAX as u128) + 1;
+ let err =
VariantPath::try_from(format!("[{too_large_index}]").as_str()).unwrap_err();
+ assert!(
+ err.to_string()
+ .contains("Parser error: Invalid token in bracket request"),
+ "{err}"
+ );
}
}