scovich commented on code in PR #8831:
URL: https://github.com/apache/arrow-rs/pull/8831#discussion_r2611945009
##########
parquet-variant-compute/src/arrow_to_variant.rs:
##########
@@ -553,20 +553,42 @@ impl<'a, L: ListLikeArray> ListArrowToVariantBuilder<'a,
L> {
}
}
-/// Trait for list-like arrays that can provide element ranges
+/// Trait for list-like arrays that can provide common helpers
pub(crate) trait ListLikeArray: Array {
+ type OffsetSize: OffsetSizeTrait;
+
/// Get the values array
- fn values(&self) -> &dyn Array;
+ fn values(&self) -> &ArrayRef;
+
+ /// Get the offsets backing the list values
+ #[cfg(test)]
+ fn value_offsets(&self) -> Option<&[Self::OffsetSize]>;
+
+ /// Size (number of values) for the element at `index`
+ #[cfg(test)]
+ fn value_size(&self, index: usize) -> Self::OffsetSize;
Review Comment:
Given that we anyway have to upcast ArrayRef to a concrete type before
downcasting to this trait, I'd prefer to either extract the needed values
directly at that time (it's anyway just used by a central test helper), or
define a test-only trait like `TestListLikeArray: ListLikeArray` and house the
extra methods there.
##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -236,6 +253,285 @@ impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> {
}
}
+pub(crate) struct VariantToShreddedArrayVariantRowBuilder<'a> {
+ value_builder: VariantValueArrayBuilder,
+ typed_value_builder: ArrayVariantToArrowRowBuilder<'a>,
+}
+
+impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ Ok(Self {
+ value_builder: VariantValueArrayBuilder::new(capacity),
+ typed_value_builder: ArrayVariantToArrowRowBuilder::try_new(
+ data_type,
+ cast_options,
+ capacity,
+ )?,
+ })
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.value_builder.append_value(Variant::Null);
+ self.typed_value_builder.append_null();
+ Ok(())
+ }
+
+ fn append_value(&mut self, variant: Variant<'_, '_>) -> Result<bool> {
+ // If the variant is not an array, typed_value must be null.
+ // If the variant is an array, value must be null.
+ match variant {
+ Variant::List(list) => {
+ self.value_builder.append_null();
+ self.typed_value_builder.append_value(list)?;
+ Ok(true)
+ }
+ other => {
+ self.value_builder.append_value(other);
+ self.typed_value_builder.append_null();
+ Ok(false)
+ }
+ }
+ }
+
+ fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)>
{
+ Ok((
+ self.value_builder.build()?,
+ self.typed_value_builder.finish()?,
+ // All elements of an array must be present (not missing) because
+ // the array Variant encoding does not allow missing elements
+ None,
+ ))
+ }
+}
+
+enum ArrayVariantToArrowRowBuilder<'a> {
+ List(VariantToListArrowRowBuilder<'a, i32>),
+ LargeList(VariantToListArrowRowBuilder<'a, i64>),
+ ListView(VariantToListViewArrowRowBuilder<'a, i32>),
+ LargeListView(VariantToListViewArrowRowBuilder<'a, i64>),
+}
+
+impl<'a> ArrayVariantToArrowRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ use ArrayVariantToArrowRowBuilder::*;
+
+ let builder = match data_type {
+ DataType::List(field) =>
List(VariantToListArrowRowBuilder::try_new(
+ field.clone(),
+ field.data_type(),
+ cast_options,
+ capacity,
+ )?),
+ DataType::LargeList(field) =>
LargeList(VariantToListArrowRowBuilder::try_new(
+ field.clone(),
+ field.data_type(),
+ cast_options,
+ capacity,
+ )?),
+ DataType::ListView(field) =>
ListView(VariantToListViewArrowRowBuilder::try_new(
+ field.clone(),
+ field.data_type(),
+ cast_options,
+ capacity,
+ )?),
+ DataType::LargeListView(field) => {
+ LargeListView(VariantToListViewArrowRowBuilder::try_new(
+ field.clone(),
+ field.data_type(),
+ cast_options,
+ capacity,
+ )?)
+ }
+ other => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Casting to {other:?} is not applicable for array Variant
types"
+ )));
+ }
+ };
+ Ok(builder)
+ }
+
+ fn append_null(&mut self) {
+ match self {
+ Self::List(builder) => builder.append_null(),
+ Self::LargeList(builder) => builder.append_null(),
+ Self::ListView(builder) => builder.append_null(),
+ Self::LargeListView(builder) => builder.append_null(),
+ }
+ }
+
+ fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
+ match self {
+ Self::List(builder) => builder.append_value(list),
+ Self::LargeList(builder) => builder.append_value(list),
+ Self::ListView(builder) => builder.append_value(list),
+ Self::LargeListView(builder) => builder.append_value(list),
+ }
+ }
+
+ fn finish(self) -> Result<ArrayRef> {
+ match self {
+ Self::List(builder) => builder.finish(),
+ Self::LargeList(builder) => builder.finish(),
+ Self::ListView(builder) => builder.finish(),
+ Self::LargeListView(builder) => builder.finish(),
+ }
+ }
+}
+
+struct VariantToListArrowRowBuilder<'a, O: OffsetSizeTrait +
ArrowNativeTypeOp> {
+ field: FieldRef,
+ offsets: Vec<O>,
+ element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
+ nulls: NullBufferBuilder,
+ current_offset: O,
+}
+
+impl<'a, O: OffsetSizeTrait + ArrowNativeTypeOp>
VariantToListArrowRowBuilder<'a, O> {
+ fn try_new(
+ field: FieldRef,
+ element_data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ let mut offsets =
Vec::with_capacity(capacity.checked_add(1).ok_or_else(|| {
+ ArrowError::ComputeError(
+ "Capacity exceeded usize::MAX when reserving list
offsets".to_string(),
+ )
+ })?);
+ offsets.push(O::ZERO);
+ let element_builder =
make_variant_to_shredded_variant_arrow_row_builder(
+ element_data_type,
+ cast_options,
+ capacity,
+ false,
+ )?;
+ Ok(Self {
+ field,
+ offsets,
+ element_builder: Box::new(element_builder),
+ nulls: NullBufferBuilder::new(capacity),
+ current_offset: O::ZERO,
+ })
+ }
+
+ fn append_null(&mut self) {
+ self.offsets.push(self.current_offset);
+ self.nulls.append_null();
+ }
+
+ fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
+ for element in list.iter() {
+ self.element_builder.append_value(element)?;
+ self.current_offset = self.current_offset.add_checked(O::ONE)?;
+ }
+ self.offsets.push(self.current_offset);
+ self.nulls.append_non_null();
+ Ok(())
+ }
+
+ fn finish(mut self) -> Result<ArrayRef> {
+ let (value, typed_value, nulls) = self.element_builder.finish()?;
+ let element_array =
+ ShreddedVariantFieldArray::from_parts(Some(value),
Some(typed_value), nulls);
+ let field = Arc::new(
+ self.field
+ .as_ref()
+ .clone()
+ .with_data_type(element_array.data_type().clone()),
+ );
+ let offsets = OffsetBuffer::<O>::new(ScalarBuffer::from(self.offsets));
+ let list_array = GenericListArray::<O>::new(
+ field,
+ offsets,
+ ArrayRef::from(element_array),
+ self.nulls.finish(),
+ );
+ Ok(Arc::new(list_array))
+ }
+}
+
+struct VariantToListViewArrowRowBuilder<'a, O: OffsetSizeTrait +
ArrowNativeTypeOp> {
+ field: FieldRef,
+ offsets: Vec<O>,
+ sizes: Vec<O>,
+ element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
+ nulls: NullBufferBuilder,
+ current_offset: O,
+}
Review Comment:
This is _super_ similar to the variant list builder. AFAICT, the only
meaningful difference is how offsets and sizes are handled? And for this
specific use case, we could compute both from the normal `offsets` array, e.g.:
```
offsets: [0, a, b, ..., y, z]
```
translates in view form to
```
offsets: [0, a, b, ..., y]
sizes: [a-0, b-a, ..., z-y]
```
Thus, a single implementation (generic over `const IS_VIEW: bool`) could
have a uniform implementation for everything except the final array build?
```rust
if IS_VIEW {
let n = self.offsets.len();
let sizes = Vec::with_capacity(n - 1)
for i in 1..n {
sizes.push(self.offsets[i] - self.offsets[i-1]);
}
self.offsets.pop(); // drop the now-unneeded extra entry
// create a ListViewArray from offsets and sizes
} else {
// create a ListArray from offsets
}
```
The only downside is allocating one extra element of capacity in case of a
list view... but that seems highly unlikely to matter in practice?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]