liamzwbao commented on code in PR #8796:
URL: https://github.com/apache/arrow-rs/pull/8796#discussion_r2499977917
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -210,129 +210,111 @@ impl<'a> VariantToArrowRowBuilder<'a> {
}
}
-/// Creates a primitive row builder, returning Err if the requested data type
is not primitive.
+/// Creates a row builder that converts primitive `Variant` values into the
requested Arrow data type.
pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
data_type: &'a DataType,
cast_options: &'a CastOptions,
capacity: usize,
) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
use PrimitiveVariantToArrowRowBuilder::*;
- let builder = match data_type {
- DataType::Null => Null(VariantToNullArrowRowBuilder::new(cast_options,
capacity)),
- DataType::Boolean =>
Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
- DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Decimal32(precision, scale) =>
Decimal32(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Decimal64(precision, scale) =>
Decimal64(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Decimal128(precision, scale) =>
Decimal128(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Decimal256(precision, scale) =>
Decimal256(VariantToDecimalArrowRowBuilder::new(
- cast_options,
- capacity,
- *precision,
- *scale,
- )?),
- DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
- ),
- DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Time64(TimeUnit::Microsecond) =>
Time(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::FixedSizeBinary(16) => {
- Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
- }
- DataType::FixedSizeBinary(size) => {
- return Err(ArrowError::InvalidArgumentError(format!(
- "FixedSizeBinary({size}) is not a valid variant shredding
type. Only FixedSizeBinary(16) for UUID is supported."
- )));
- }
- DataType::Utf8 =>
String(VariantToStringArrowBuilder::new(cast_options, capacity)),
- DataType::LargeUtf8 => {
- LargeString(VariantToStringArrowBuilder::new(cast_options,
capacity))
- }
- DataType::Utf8View =>
StringView(VariantToStringArrowBuilder::new(cast_options, capacity)),
- _ if data_type.is_primitive() => {
- return Err(ArrowError::NotYetImplemented(format!(
- "Primitive data_type {data_type:?} not yet implemented"
- )));
- }
- _ => {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Not a primitive type: {data_type:?}"
- )));
- }
- };
+ let builder =
+ match data_type {
+ DataType::Null =>
Null(VariantToNullArrowRowBuilder::new(cast_options, capacity)),
+ DataType::Boolean => {
+ Boolean(VariantToBooleanArrowRowBuilder::new(cast_options,
capacity))
+ }
+ DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float16 =>
Float16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float32 =>
Float32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float64 =>
Float64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Decimal32(precision, scale) => Decimal32(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Decimal64(precision, scale) => Decimal64(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Decimal128(precision, scale) => Decimal128(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Decimal256(precision, scale) => Decimal256(
+ VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
+ ),
+ DataType::Timestamp(TimeUnit::Microsecond, None) =>
TimestampMicroNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, None) =>
TimestampNanoNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Time64(TimeUnit::Microsecond) => Time(
+ VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity),
+ ),
+ DataType::FixedSizeBinary(16) => {
+ Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
+ }
+ DataType::Utf8 =>
String(VariantToStringArrowBuilder::new(cast_options, capacity)),
+ DataType::LargeUtf8 => {
+ LargeString(VariantToStringArrowBuilder::new(cast_options,
capacity))
+ }
+ DataType::Utf8View => {
+ StringView(VariantToStringArrowBuilder::new(cast_options,
capacity))
+ }
+ _ => {
+ return Err(ArrowError::NotYetImplemented(format!(
Review Comment:
Actually
[`data_type.is_primitive()`](https://github.com/apache/arrow-rs/blob/ec75d000c66609b400dc8a62cf8c8b05afc048a7/arrow-schema/src/datatype.rs#L529-L531)
is not checking **Parquet primitive**, that's why I wanted to remove it to
avoid confusion. It checks **arrow primitive** which only includes numeric and
temporal. If we were to keep it, we shouldn't add Binary or String into this
builder as they are not **arrow primitive**.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]