scovich commented on code in PR #8831:
URL: https://github.com/apache/arrow-rs/pull/8831#discussion_r2615384267
##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -240,6 +257,223 @@ impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> {
}
}
+pub(crate) struct VariantToShreddedArrayVariantRowBuilder<'a> {
+ value_builder: VariantValueArrayBuilder,
+ typed_value_builder: ArrayVariantToArrowRowBuilder<'a>,
+}
+
+impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ Ok(Self {
+ value_builder: VariantValueArrayBuilder::new(capacity),
+ typed_value_builder: ArrayVariantToArrowRowBuilder::try_new(
+ data_type,
+ cast_options,
+ capacity,
+ )?,
+ })
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.value_builder.append_value(Variant::Null);
+ self.typed_value_builder.append_null();
+ Ok(())
+ }
+
+ fn append_value(&mut self, variant: Variant<'_, '_>) -> Result<bool> {
+ // If the variant is not an array, typed_value must be null.
+ // If the variant is an array, value must be null.
+ match variant {
+ Variant::List(list) => {
+ self.value_builder.append_null();
+ self.typed_value_builder.append_value(list)?;
+ Ok(true)
+ }
+ other => {
+ self.value_builder.append_value(other);
+ self.typed_value_builder.append_null();
+ Ok(false)
+ }
+ }
+ }
+
+ fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)>
{
+ Ok((
+ self.value_builder.build()?,
+ self.typed_value_builder.finish()?,
+ // All elements of an array must be present (not missing) because
+ // the array Variant encoding does not allow missing elements
+ None,
+ ))
+ }
+}
+
+enum ArrayVariantToArrowRowBuilder<'a> {
+ List(VariantToListArrowRowBuilder<'a, i32, false>),
+ LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
+ ListView(VariantToListArrowRowBuilder<'a, i32, true>),
+ LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
+}
+
+impl<'a> ArrayVariantToArrowRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ use ArrayVariantToArrowRowBuilder::*;
+
+ macro_rules! make_list_builder {
Review Comment:
A one-line comment might help readers quickly understand what the macro is
accomplishing and why?
(it can absolutely be inferred by studying the code, the comment just makes
it faster)
##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -240,6 +257,223 @@ impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> {
}
}
+pub(crate) struct VariantToShreddedArrayVariantRowBuilder<'a> {
+ value_builder: VariantValueArrayBuilder,
+ typed_value_builder: ArrayVariantToArrowRowBuilder<'a>,
+}
+
+impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ Ok(Self {
+ value_builder: VariantValueArrayBuilder::new(capacity),
+ typed_value_builder: ArrayVariantToArrowRowBuilder::try_new(
+ data_type,
+ cast_options,
+ capacity,
+ )?,
+ })
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.value_builder.append_value(Variant::Null);
+ self.typed_value_builder.append_null();
+ Ok(())
+ }
+
+ fn append_value(&mut self, variant: Variant<'_, '_>) -> Result<bool> {
+ // If the variant is not an array, typed_value must be null.
+ // If the variant is an array, value must be null.
+ match variant {
+ Variant::List(list) => {
+ self.value_builder.append_null();
+ self.typed_value_builder.append_value(list)?;
+ Ok(true)
+ }
+ other => {
+ self.value_builder.append_value(other);
+ self.typed_value_builder.append_null();
+ Ok(false)
+ }
+ }
+ }
+
+ fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)>
{
+ Ok((
+ self.value_builder.build()?,
+ self.typed_value_builder.finish()?,
+ // All elements of an array must be present (not missing) because
+ // the array Variant encoding does not allow missing elements
+ None,
+ ))
+ }
+}
+
+enum ArrayVariantToArrowRowBuilder<'a> {
+ List(VariantToListArrowRowBuilder<'a, i32, false>),
+ LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
+ ListView(VariantToListArrowRowBuilder<'a, i32, true>),
+ LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
+}
+
+impl<'a> ArrayVariantToArrowRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ use ArrayVariantToArrowRowBuilder::*;
+
+ macro_rules! make_list_builder {
+ ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => {
+ $variant(VariantToListArrowRowBuilder::<$offset,
$is_view>::try_new(
+ $field.clone(),
+ $field.data_type(),
+ cast_options,
+ capacity,
+ )?)
+ };
+ }
+
+ let builder = match data_type {
+ DataType::List(field) => make_list_builder!(List, i32, false,
field),
+ DataType::LargeList(field) => make_list_builder!(LargeList, i64,
false, field),
+ DataType::ListView(field) => make_list_builder!(ListView, i32,
true, field),
+ DataType::LargeListView(field) =>
make_list_builder!(LargeListView, i64, true, field),
+ other => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Casting to {other:?} is not applicable for array Variant
types"
+ )));
+ }
+ };
+ Ok(builder)
+ }
+
+ fn append_null(&mut self) {
+ match self {
+ Self::List(builder) => builder.append_null(),
+ Self::LargeList(builder) => builder.append_null(),
+ Self::ListView(builder) => builder.append_null(),
+ Self::LargeListView(builder) => builder.append_null(),
+ }
+ }
+
+ fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
+ match self {
+ Self::List(builder) => builder.append_value(list),
+ Self::LargeList(builder) => builder.append_value(list),
+ Self::ListView(builder) => builder.append_value(list),
+ Self::LargeListView(builder) => builder.append_value(list),
+ }
+ }
+
+ fn finish(self) -> Result<ArrayRef> {
+ match self {
+ Self::List(builder) => builder.finish(),
+ Self::LargeList(builder) => builder.finish(),
+ Self::ListView(builder) => builder.finish(),
+ Self::LargeListView(builder) => builder.finish(),
+ }
+ }
+}
+
+struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool>
+where
+ O: OffsetSizeTrait + ArrowNativeTypeOp,
+{
+ field: FieldRef,
+ offsets: Vec<O>,
+ element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
+ nulls: NullBufferBuilder,
+ current_offset: O,
+}
+
+impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW>
+where
+ O: OffsetSizeTrait + ArrowNativeTypeOp,
+{
+ fn try_new(
+ field: FieldRef,
+ element_data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ let mut offsets =
Vec::with_capacity(capacity.checked_add(1).ok_or_else(|| {
+ ArrowError::ComputeError(
+ "Capacity exceeded usize::MAX when reserving list
offsets".to_string(),
+ )
Review Comment:
aside:
[Vec::with_capacity](https://doc.rust-lang.org/std/vec/struct.Vec.html#method.with_capacity)
will anyway panic for any value larger than `isize::MAX`, so a normal or
saturating add should suffice here (if we're willing to live with the panic).
If we really want to prevent a panic, we should verify that `capacity <
usize::from(isize::MAX)` and then use normal math for `capacity + 1`.
##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -240,6 +257,223 @@ impl<'a> VariantToShreddedPrimitiveVariantRowBuilder<'a> {
}
}
+pub(crate) struct VariantToShreddedArrayVariantRowBuilder<'a> {
+ value_builder: VariantValueArrayBuilder,
+ typed_value_builder: ArrayVariantToArrowRowBuilder<'a>,
+}
+
+impl<'a> VariantToShreddedArrayVariantRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ Ok(Self {
+ value_builder: VariantValueArrayBuilder::new(capacity),
+ typed_value_builder: ArrayVariantToArrowRowBuilder::try_new(
+ data_type,
+ cast_options,
+ capacity,
+ )?,
+ })
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.value_builder.append_value(Variant::Null);
+ self.typed_value_builder.append_null();
+ Ok(())
+ }
+
+ fn append_value(&mut self, variant: Variant<'_, '_>) -> Result<bool> {
+ // If the variant is not an array, typed_value must be null.
+ // If the variant is an array, value must be null.
+ match variant {
+ Variant::List(list) => {
+ self.value_builder.append_null();
+ self.typed_value_builder.append_value(list)?;
+ Ok(true)
+ }
+ other => {
+ self.value_builder.append_value(other);
+ self.typed_value_builder.append_null();
+ Ok(false)
+ }
+ }
+ }
+
+ fn finish(self) -> Result<(BinaryViewArray, ArrayRef, Option<NullBuffer>)>
{
+ Ok((
+ self.value_builder.build()?,
+ self.typed_value_builder.finish()?,
+ // All elements of an array must be present (not missing) because
+ // the array Variant encoding does not allow missing elements
+ None,
+ ))
+ }
+}
+
+enum ArrayVariantToArrowRowBuilder<'a> {
+ List(VariantToListArrowRowBuilder<'a, i32, false>),
+ LargeList(VariantToListArrowRowBuilder<'a, i64, false>),
+ ListView(VariantToListArrowRowBuilder<'a, i32, true>),
+ LargeListView(VariantToListArrowRowBuilder<'a, i64, true>),
+}
+
+impl<'a> ArrayVariantToArrowRowBuilder<'a> {
+ fn try_new(
+ data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ use ArrayVariantToArrowRowBuilder::*;
+
+ macro_rules! make_list_builder {
+ ($variant:ident, $offset:ty, $is_view:expr, $field:ident) => {
+ $variant(VariantToListArrowRowBuilder::<$offset,
$is_view>::try_new(
+ $field.clone(),
+ $field.data_type(),
+ cast_options,
+ capacity,
+ )?)
+ };
+ }
+
+ let builder = match data_type {
+ DataType::List(field) => make_list_builder!(List, i32, false,
field),
+ DataType::LargeList(field) => make_list_builder!(LargeList, i64,
false, field),
+ DataType::ListView(field) => make_list_builder!(ListView, i32,
true, field),
+ DataType::LargeListView(field) =>
make_list_builder!(LargeListView, i64, true, field),
+ other => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Casting to {other:?} is not applicable for array Variant
types"
+ )));
+ }
+ };
+ Ok(builder)
+ }
+
+ fn append_null(&mut self) {
+ match self {
+ Self::List(builder) => builder.append_null(),
+ Self::LargeList(builder) => builder.append_null(),
+ Self::ListView(builder) => builder.append_null(),
+ Self::LargeListView(builder) => builder.append_null(),
+ }
+ }
+
+ fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
+ match self {
+ Self::List(builder) => builder.append_value(list),
+ Self::LargeList(builder) => builder.append_value(list),
+ Self::ListView(builder) => builder.append_value(list),
+ Self::LargeListView(builder) => builder.append_value(list),
+ }
+ }
+
+ fn finish(self) -> Result<ArrayRef> {
+ match self {
+ Self::List(builder) => builder.finish(),
+ Self::LargeList(builder) => builder.finish(),
+ Self::ListView(builder) => builder.finish(),
+ Self::LargeListView(builder) => builder.finish(),
+ }
+ }
+}
+
+struct VariantToListArrowRowBuilder<'a, O, const IS_VIEW: bool>
+where
+ O: OffsetSizeTrait + ArrowNativeTypeOp,
+{
+ field: FieldRef,
+ offsets: Vec<O>,
+ element_builder: Box<VariantToShreddedVariantRowBuilder<'a>>,
+ nulls: NullBufferBuilder,
+ current_offset: O,
+}
+
+impl<'a, O, const IS_VIEW: bool> VariantToListArrowRowBuilder<'a, O, IS_VIEW>
+where
+ O: OffsetSizeTrait + ArrowNativeTypeOp,
+{
+ fn try_new(
+ field: FieldRef,
+ element_data_type: &'a DataType,
+ cast_options: &'a CastOptions,
+ capacity: usize,
+ ) -> Result<Self> {
+ let mut offsets =
Vec::with_capacity(capacity.checked_add(1).ok_or_else(|| {
+ ArrowError::ComputeError(
+ "Capacity exceeded usize::MAX when reserving list
offsets".to_string(),
+ )
+ })?);
+ offsets.push(O::ZERO);
+ let element_builder =
make_variant_to_shredded_variant_arrow_row_builder(
+ element_data_type,
+ cast_options,
+ capacity,
+ false,
+ )?;
+ Ok(Self {
+ field,
+ offsets,
+ element_builder: Box::new(element_builder),
+ nulls: NullBufferBuilder::new(capacity),
+ current_offset: O::ZERO,
+ })
+ }
+
+ fn append_null(&mut self) {
+ self.offsets.push(self.current_offset);
+ self.nulls.append_null();
+ }
+
+ fn append_value(&mut self, list: VariantList<'_, '_>) -> Result<()> {
+ for element in list.iter() {
+ self.element_builder.append_value(element)?;
+ self.current_offset = self.current_offset.add_checked(O::ONE)?;
+ }
+ self.offsets.push(self.current_offset);
+ self.nulls.append_non_null();
+ Ok(())
+ }
+
+ fn finish(mut self) -> Result<ArrayRef> {
+ let (value, typed_value, nulls) = self.element_builder.finish()?;
+ let element_array =
+ ShreddedVariantFieldArray::from_parts(Some(value),
Some(typed_value), nulls);
+ let field = Arc::new(
+ self.field
+ .as_ref()
+ .clone()
+ .with_data_type(element_array.data_type().clone()),
+ );
+
+ if IS_VIEW {
+ let mut sizes =
Vec::with_capacity(self.offsets.len().saturating_sub(1));
Review Comment:
```suggestion
// NOTE: `offsets` is never empty (constructor pushes an entry)
let mut sizes = Vec::with_capacity(self.offsets.len() - 1);
```
(which is also why the `pop` below can't panic)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]