klion26 commented on code in PR #8796:
URL: https://github.com/apache/arrow-rs/pull/8796#discussion_r2498459579
##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -123,13 +123,39 @@ pub(crate) fn
make_variant_to_shredded_variant_arrow_row_builder<'a>(
"Shredding variant array values as arrow lists".to_string(),
));
}
- _ => {
+ // Supported shredded primitive types, see Variant shredding spec:
+ //
https://github.com/apache/parquet-format/blob/master/VariantShredding.md#shredded-value-types
+ DataType::Boolean
Review Comment:
Does this mean we can no longer do the type cast after this change? I'm not
sure if this is OK.
Currently, we do the type check in
`make_primitive_variant_to_arrow_row_builder` with the match arms, so maybe we
don't need to add it here; adding the type check in two places seems like it
will increase the maintenance burden.
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -210,129 +210,111 @@ impl<'a> VariantToArrowRowBuilder<'a> {
}
}
-/// Creates a primitive row builder, returning Err if the requested data type
is not primitive.
+/// Creates a row builder that converts primitive `Variant` values into the
requested Arrow data type.
pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
data_type: &'a DataType,
cast_options: &'a CastOptions,
capacity: usize,
) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
use PrimitiveVariantToArrowRowBuilder::*;
- let builder = match data_type {
- DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options,
capacity)),
- DataType::Boolean =>
Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
- DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Decimal32(precision, scale) =>
Decimal32(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Decimal64(precision, scale) =>
Decimal64(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Decimal128(precision, scale) =>
Decimal128(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Decimal256(precision, scale) =>
Decimal256(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
- ),
- DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Time64(TimeUnit::Microsecond) =>
Time(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::FixedSizeBinary(16) => {
- Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
- }
- DataType::FixedSizeBinary(size) => {
- return Err(ArrowError::InvalidArgumentError(format!(
- "FixedSizeBinary({size}) is not a valid variant shredding
type. Only FixedSizeBinary(16) for UUID is supported."
- )));
- }
- DataType::Utf8 =>
String(VariantToStringArrowBuilder::new(cast_options, capacity)),
- DataType::LargeUtf8 => {
- LargeString(VariantToStringArrowBuilder::new(cast_options,
capacity))
- }
- DataType::Utf8View =>
StringView(VariantToStringArrowBuilder::new(cast_options, capacity)),
- _ if data_type.is_primitive() => {
- return Err(ArrowError::NotYetImplemented(format!(
- "Primitive data_type {data_type:?} not yet implemented"
- )));
- }
- _ => {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Not a primitive type: {data_type:?}"
- )));
- }
- };
+ let builder =
+ match data_type {
+ DataType::Null =>
Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)),
+ DataType::Boolean => {
+ Boolean(VariantToBooleanArrowRowBuilder::new(cast_options,
capacity))
+ }
+ DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float16 =>
Float16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float32 =>
Float32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float64 =>
Float64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Decimal32(precision, scale) => Decimal32(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Decimal64(precision, scale) => Decimal64(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Decimal128(precision, scale) => Decimal128(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Decimal256(precision, scale) => Decimal256(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Timestamp(TimeUnit::Microsecond, None) =>
TimestampMicroNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, None) =>
TimestampNanoNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Time64(TimeUnit::Microsecond) => Time(
+ VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
+ ),
+ DataType::FixedSizeBinary(16) => {
+ Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
+ }
+ DataType::Utf8 =>
String(VariantToStringArrowBuilder::new(cast_options, capacity)),
+ DataType::LargeUtf8 => {
+ LargeString(VariantToStringArrowBuilder::new(cast_options,
capacity))
+ }
+ DataType::Utf8View => {
+ StringView(VariantToStringArrowBuilder::new(cast_options,
capacity))
+ }
+ _ => {
+ return Err(ArrowError::NotYetImplemented(format!(
Review Comment:
Maybe we shouldn't remove the `_ if data_type.is_primitive()` arm for now. It
seems that arm is for types we _may support but have not yet_ implemented, while
the `_` match arm is for invalid types.
After #8768 was merged, we completed the 1-1 mapping (plus some transforms for
some types) for all `Variant` primitive types, but we _may_ support some
`DataType`s here which are not valid variant primitives (e.g. Timestamp with a
different unit). Keeping the `_ if data_type.is_primitive()` arm lets us know
that we _may_ support the requested type; we can remove it once we have reached
a conclusion.
I am sorting out the possible conversions and will create an issue to discuss
them once that work is done.
##########
parquet-variant-compute/src/shred_variant.rs:
##########
@@ -123,13 +123,39 @@ pub(crate) fn
make_variant_to_shredded_variant_arrow_row_builder<'a>(
"Shredding variant array values as arrow lists".to_string(),
));
}
- _ => {
+ // Supported shredded primitive types, see Variant shredding spec:
+ //
https://github.com/apache/parquet-format/blob/master/VariantShredding.md#shredded-value-types
+ DataType::Boolean
+ | DataType::Int8
+ | DataType::Int16
+ | DataType::Int32
+ | DataType::Int64
+ | DataType::Float32
+ | DataType::Float64
+ | DataType::Decimal32(..)
+ | DataType::Decimal64(..)
+ | DataType::Decimal128(..)
+ | DataType::Date32
+ | DataType::Time64(TimeUnit::Microsecond)
+ | DataType::Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _)
+ | DataType::Binary
+ | DataType::BinaryView
+ | DataType::Utf8
+ | DataType::Utf8View
+ | DataType::FixedSizeBinary(16) // UUID
+ => {
let builder =
make_primitive_variant_to_arrow_row_builder(data_type,
cast_options, capacity)?;
let typed_value_builder =
VariantToShreddedPrimitiveVariantRowBuilder::new(builder,
capacity, top_level);
VariantToShreddedVariantRowBuilder::Primitive(typed_value_builder)
}
+ DataType::FixedSizeBinary(_) => {
+ return Err(ArrowError::InvalidArgumentError(format!("{data_type}
is not a valid variant shredding type. Only FixedSizeBinary(16) for UUID is
supported.")))
Review Comment:
Do we need to distinguish this from the `_` match arm?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]