This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new df89537ea9 [Variant] remove `BorrowedShreddingState` (#9791)
df89537ea9 is described below
commit df89537ea98dd2b8228fdcb94cfd6a6b92935081
Author: Konstantin Tarasov <[email protected]>
AuthorDate: Fri May 22 14:24:20 2026 -0400
[Variant] remove `BorrowedShreddingState` (#9791)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9790.
# Rationale for this change
Check issue
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
- Drop `BorrowedShreddingState`
- Replace it with `ShreddingState`
- ~~Removed the lifetimes in `unshred_variant` as they required helpers
to cover recursive `ShreddingState` handling.~~
- ~~Lifetimes removal introduces clone on `NullBuffer`. Extra 3 usize
(24 bytes) per `Array`. Only used in `NullUnshredVariantBuilder`~~
Removed the only place where `NullBuffer` was stored. No regression.
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
# Are these changes tested?
Yes, unit tests.
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
-->
# Are there any user-facing changes?
No.
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
parquet-variant-compute/src/lib.rs | 2 +-
parquet-variant-compute/src/unshred_variant.rs | 58 +++++++++-----------
parquet-variant-compute/src/variant_array.rs | 75 +++-----------------------
parquet-variant-compute/src/variant_get.rs | 16 +++---
4 files changed, 41 insertions(+), 110 deletions(-)
diff --git a/parquet-variant-compute/src/lib.rs
b/parquet-variant-compute/src/lib.rs
index b05d0e0236..066fe15b7c 100644
--- a/parquet-variant-compute/src/lib.rs
+++ b/parquet-variant-compute/src/lib.rs
@@ -51,7 +51,7 @@ mod variant_array_builder;
mod variant_get;
mod variant_to_arrow;
-pub use variant_array::{BorrowedShreddingState, ShreddingState, VariantArray,
VariantType};
+pub use variant_array::{ShreddingState, VariantArray, VariantType};
pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder};
pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options};
diff --git a/parquet-variant-compute/src/unshred_variant.rs
b/parquet-variant-compute/src/unshred_variant.rs
index 7bf31e2256..f4bfa73bde 100644
--- a/parquet-variant-compute/src/unshred_variant.rs
+++ b/parquet-variant-compute/src/unshred_variant.rs
@@ -17,15 +17,14 @@
//! Module for unshredding VariantArray by folding typed_value columns back
into the value column.
-use crate::variant_array::binary_array_value;
-use crate::{BorrowedShreddingState, VariantArray, VariantValueArrayBuilder};
+use crate::variant_array::{binary_array_value, validate_binary_array};
+use crate::{VariantArray, VariantValueArrayBuilder};
use arrow::array::{
Array, ArrayRef, AsArray as _, BinaryArray, BinaryViewArray, BooleanArray,
FixedSizeBinaryArray, FixedSizeListArray, GenericListArray,
GenericListViewArray,
LargeBinaryArray, LargeStringArray, ListLikeArray, PrimitiveArray,
StringArray,
StringViewArray, StructArray,
};
-use arrow::buffer::NullBuffer;
use arrow::datatypes::{
ArrowPrimitiveType, DataType, Date32Type, Decimal32Type, Decimal64Type,
Decimal128Type,
DecimalType, Float32Type, Float64Type, Int8Type, Int16Type, Int32Type,
Int64Type,
@@ -67,8 +66,8 @@ pub fn unshred_variant(array: &VariantArray) ->
Result<VariantArray> {
// NOTE: None/None at top-level is technically invalid, but the shredding
spec requires us to
// emit `Variant::Null` when a required value is missing.
let nulls = array.nulls();
- let mut row_builder =
UnshredVariantRowBuilder::try_new_opt(array.shredding_state().borrow())?
- .unwrap_or_else(|| UnshredVariantRowBuilder::null(nulls));
+ let mut row_builder = UnshredVariantRowBuilder::try_new_opt(array.inner())?
+ .unwrap_or_else(UnshredVariantRowBuilder::null);
let metadata = array.metadata_field();
let mut value_builder = VariantValueArrayBuilder::new(array.len());
@@ -126,13 +125,13 @@ enum UnshredVariantRowBuilder<'a> {
FixedSizeList(ListUnshredVariantBuilder<'a, FixedSizeListArray>),
Struct(StructUnshredVariantBuilder<'a>),
ValueOnly(ValueOnlyUnshredVariantBuilder<'a>),
- Null(NullUnshredVariantBuilder<'a>),
+ Null(NullUnshredVariantBuilder),
}
impl<'a> UnshredVariantRowBuilder<'a> {
/// Creates an all-null row builder.
- fn null(nulls: Option<&'a NullBuffer>) -> Self {
- Self::Null(NullUnshredVariantBuilder::new(nulls))
+ fn null() -> Self {
+ Self::Null(NullUnshredVariantBuilder)
}
/// Appends a single row at the given value index to the supplied builder.
@@ -175,12 +174,17 @@ impl<'a> UnshredVariantRowBuilder<'a> {
}
}
- /// Creates a new UnshredVariantRowBuilder from shredding state
- /// Returns None for None/None case - caller decides how to handle based
on context
- fn try_new_opt(shredding_state: BorrowedShreddingState<'a>) ->
Result<Option<Self>> {
- let value = shredding_state.value_field();
- let typed_value = shredding_state.typed_value_field();
- let Some(typed_value) = typed_value else {
+ /// Creates a new UnshredVariantRowBuilder from the `(value, typed_value)`
pair of a shredded
+ /// variant struct. Returns None for the None/None case - caller decides
how to handle based on
+ /// context.
+ fn try_new_opt(inner_struct: &'a StructArray) -> Result<Option<Self>> {
+ let value = if let Some(value_col) =
inner_struct.column_by_name("value") {
+ validate_binary_array(value_col.as_ref(), "value")?;
+ Some(value_col)
+ } else {
+ None
+ };
+ let Some(typed_value) = inner_struct.column_by_name("typed_value")
else {
// Copy the value across directly, if present. Else caller decides
what to do.
return Ok(value.map(|v|
Self::ValueOnly(ValueOnlyUnshredVariantBuilder::new(v))));
};
@@ -289,27 +293,17 @@ impl<'a> UnshredVariantRowBuilder<'a> {
}
}
-/// Builder for arrays with neither typed_value nor value (all
NULL/Variant::Null)
-struct NullUnshredVariantBuilder<'a> {
- nulls: Option<&'a NullBuffer>,
-}
-
-impl<'a> NullUnshredVariantBuilder<'a> {
- fn new(nulls: Option<&'a NullBuffer>) -> Self {
- Self { nulls }
- }
+/// Builder for arrays with neither typed_value nor value (all Variant::Null)
+struct NullUnshredVariantBuilder;
+impl NullUnshredVariantBuilder {
fn append_row(
&mut self,
builder: &mut impl VariantBuilderExt,
_metadata: &VariantMetadata,
- index: usize,
+ _index: usize,
) -> Result<()> {
- if self.nulls.is_some_and(|nulls| nulls.is_null(index)) {
- builder.append_null();
- } else {
- builder.append_value(Variant::Null);
- }
+ builder.append_value(Variant::Null);
Ok(())
}
}
@@ -590,7 +584,7 @@ impl<'a> StructUnshredVariantBuilder<'a> {
field_array.data_type()
)));
};
- let field_unshredder =
UnshredVariantRowBuilder::try_new_opt(field_array.try_into()?)?;
+ let field_unshredder =
UnshredVariantRowBuilder::try_new_opt(field_array)?;
field_unshredders.insert(field.name().as_ref(), field_unshredder);
}
@@ -670,8 +664,8 @@ impl<'a, L: ListLikeArray> ListUnshredVariantBuilder<'a, L>
{
//
// NOTE: A None/None array element is technically invalid, but the
shredding spec
// requires us to emit `Variant::Null` when a required value is
missing.
- let element_unshredder =
UnshredVariantRowBuilder::try_new_opt(element_values.try_into()?)?
- .unwrap_or_else(|| UnshredVariantRowBuilder::null(None));
+ let element_unshredder =
UnshredVariantRowBuilder::try_new_opt(element_values)?
+ .unwrap_or_else(UnshredVariantRowBuilder::null);
Ok(Self {
value,
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index 5f52939e71..a6ee281002 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -64,7 +64,7 @@ pub(crate) fn variant_from_arrays_at<'m, 'v>(
}
/// Validates that an array has a binary-like data type.
-fn validate_binary_array(array: &dyn Array, field_name: &str) -> Result<()> {
+pub(crate) fn validate_binary_array(array: &dyn Array, field_name: &str) ->
Result<()> {
match array.data_type() {
DataType::Binary | DataType::LargeBinary | DataType::BinaryView =>
Ok(()),
_ => Err(ArrowError::InvalidArgumentError(format!(
@@ -843,14 +843,6 @@ impl ShreddingState {
self.typed_value.as_ref()
}
- /// Returns a borrowed version of this shredding state
- pub fn borrow(&self) -> BorrowedShreddingState<'_> {
- BorrowedShreddingState {
- value: self.value_field(),
- typed_value: self.typed_value_field(),
- }
- }
-
/// Slice all the underlying arrays
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
@@ -860,74 +852,19 @@ impl ShreddingState {
}
}
-/// Similar to [`ShreddingState`] except it holds borrowed references of the
target arrays. Useful
-/// for avoiding clone operations when the caller does not need a
self-standing shredding state.
-#[derive(Clone, Debug)]
-pub struct BorrowedShreddingState<'a> {
- value: Option<&'a ArrayRef>,
- typed_value: Option<&'a ArrayRef>,
-}
-
-impl<'a> BorrowedShreddingState<'a> {
- /// Create a new `BorrowedShreddingState` from the given `value` and
`typed_value` fields
- ///
- /// Note you can create a `BorrowedShreddingState` from a &[`StructArray`]
using
- /// `BorrowedShreddingState::try_from(&struct_array)`, for example:
- ///
- /// ```no_run
- /// # use arrow::array::StructArray;
- /// # use parquet_variant_compute::BorrowedShreddingState;
- /// # fn get_struct_array() -> StructArray {
- /// # unimplemented!()
- /// # }
- /// let struct_array: StructArray = get_struct_array();
- /// let shredding_state =
BorrowedShreddingState::try_from(&struct_array).unwrap();
- /// ```
- pub fn new(value: Option<&'a ArrayRef>, typed_value: Option<&'a ArrayRef>)
-> Self {
- Self { value, typed_value }
- }
-
- /// Return a reference to the value field, if present
- pub fn value_field(&self) -> Option<&'a ArrayRef> {
- self.value
- }
-
- /// Return a reference to the typed_value field, if present
- pub fn typed_value_field(&self) -> Option<&'a ArrayRef> {
- self.typed_value
- }
-}
-
-impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> {
+impl TryFrom<&StructArray> for ShreddingState {
type Error = ArrowError;
- fn try_from(inner_struct: &'a StructArray) -> Result<Self> {
+ fn try_from(inner_struct: &StructArray) -> Result<Self> {
// The `value` column need not exist, but if it does it must be a
binary type.
let value = if let Some(value_col) =
inner_struct.column_by_name("value") {
validate_binary_array(value_col.as_ref(), "value")?;
- Some(value_col)
+ Some(value_col.clone())
} else {
None
};
- let typed_value = inner_struct.column_by_name("typed_value");
- Ok(BorrowedShreddingState::new(value, typed_value))
- }
-}
-
-impl TryFrom<&StructArray> for ShreddingState {
- type Error = ArrowError;
-
- fn try_from(inner_struct: &StructArray) -> Result<Self> {
- Ok(BorrowedShreddingState::try_from(inner_struct)?.into())
- }
-}
-
-impl From<BorrowedShreddingState<'_>> for ShreddingState {
- fn from(state: BorrowedShreddingState<'_>) -> Self {
- ShreddingState {
- value: state.value_field().cloned(),
- typed_value: state.typed_value_field().cloned(),
- }
+ let typed_value = inner_struct.column_by_name("typed_value").cloned();
+ Ok(ShreddingState::new(value, typed_value))
}
}
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index f76dc4e446..774da0e72e 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -25,15 +25,15 @@ use arrow_schema::{ArrowError, DataType, FieldRef};
use parquet_variant::{VariantPath, VariantPathElement};
use crate::VariantArray;
-use crate::variant_array::BorrowedShreddingState;
+use crate::variant_array::ShreddingState;
use crate::variant_to_arrow::make_variant_to_arrow_row_builder;
use arrow::array::AsArray;
use std::sync::Arc;
-pub(crate) enum ShreddedPathStep<'a> {
+pub(crate) enum ShreddedPathStep {
/// Path step succeeded, return the new shredding state
- Success(BorrowedShreddingState<'a>),
+ Success(ShreddingState),
/// The path element is not present in the `typed_value` column and there
is no `value` column,
/// so we know it does not exist. It, and all paths under it, are all-NULL.
Missing,
@@ -49,11 +49,11 @@ pub(crate) enum ShreddedPathStep<'a> {
/// a missing-path step (`Missing` or `NotShredded` depending on whether
`value` exists).
///
/// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe
not even possible.
-pub(crate) fn follow_shredded_path_element<'a>(
- shredding_state: &BorrowedShreddingState<'a>,
+pub(crate) fn follow_shredded_path_element(
+ shredding_state: &ShreddingState,
path_element: &VariantPathElement<'_>,
_cast_options: &CastOptions,
-) -> Result<ShreddedPathStep<'a>> {
+) -> Result<ShreddedPathStep> {
// If the requested path element is not present in `typed_value`, and
`value` is missing, then
// we know it does not exist; it, and all paths under it, are all-NULL.
let missing_path_step = || match shredding_state.value_field() {
@@ -90,7 +90,7 @@ pub(crate) fn follow_shredded_path_element<'a>(
))
})?;
- let state = BorrowedShreddingState::try_from(struct_array)?;
+ let state = ShreddingState::try_from(struct_array)?;
Ok(ShreddedPathStep::Success(state))
}
VariantPathElement::Index { .. } => {
@@ -154,7 +154,7 @@ fn shredded_get_path(
// Peel away the prefix of path elements that traverses the shredded parts
of this variant
// column. Shredding will traverse the rest of the path on a per-row basis.
- let mut shredding_state = input.shredding_state().borrow();
+ let mut shredding_state = input.shredding_state().clone();
let mut accumulated_nulls = input.inner().nulls().cloned();
let mut path_index = 0;
for path_element in path {