scovich commented on code in PR #8666:
URL: https://github.com/apache/arrow-rs/pull/8666#discussion_r2445900714
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -192,94 +197,111 @@ pub(crate) fn
make_primitive_variant_to_arrow_row_builder<'a>(
) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
use PrimitiveVariantToArrowRowBuilder::*;
- let builder =
- match data_type {
- DataType::Boolean => {
- Boolean(VariantToBooleanArrowRowBuilder::new(cast_options,
capacity))
- }
- DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float16 =>
Float16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float32 =>
Float32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float64 =>
Float64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Decimal32(precision, scale) => Decimal32(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Decimal64(precision, scale) => Decimal64(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Decimal128(precision, scale) => Decimal128(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Decimal256(precision, scale) => Decimal256(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Timestamp(TimeUnit::Microsecond, None) =>
TimestampMicroNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
- ),
- DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, None) =>
TimestampNanoNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- _ if data_type.is_primitive() => {
- return Err(ArrowError::NotYetImplemented(format!(
- "Primitive data_type {data_type:?} not yet implemented"
- )));
- }
- _ => {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Not a primitive type: {data_type:?}"
- )));
- }
- };
+ let builder = match data_type {
+ DataType::Boolean =>
Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
+ DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Decimal32(precision, scale) =>
Decimal32(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Decimal64(precision, scale) =>
Decimal64(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Decimal128(precision, scale) =>
Decimal128(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Decimal256(precision, scale) =>
Decimal256(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::FixedSizeBinary(16) => {
+ Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
+ }
+ DataType::FixedSizeBinary(size) => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinary({size}) is not a valid variant shredding
type. Only FixedSizeBinary(16) for UUID is supported."
Review Comment:
Just double checking -- we have reliable ways of identifying Decimal128 and
Decimal256 (they have proper arrow data types), so we don't need to ever
interpret raw FixedSizeBinary as decimal, even tho it _is_ a valid physical
representation for those larger decimal types?
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -504,6 +526,50 @@ where
}
}
+/// Builder for converting variant values to FixedSizeBinary(16) for UUIDs
+pub(crate) struct VariantToUuidArrowRowBuilder<'a> {
+ builder: FixedSizeBinaryBuilder,
+ cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToUuidArrowRowBuilder<'a> {
+ fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+ Self {
+ builder: FixedSizeBinaryBuilder::with_capacity(capacity, 16),
+ cast_options,
+ }
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.builder.append_null();
+ Ok(())
+ }
+
+ fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
+ match value.as_uuid() {
+ Some(uuid) => {
+ self.builder
+ .append_value(uuid.as_bytes())
+ .map_err(|e| ArrowError::ExternalError(Box::new(e)))?;
+
+ Ok(true)
+ }
+ None if self.cast_options.safe => {
+ self.builder.append_null();
+ Ok(false)
+ }
+ None => Err(ArrowError::CastError(format!(
+ "Failed to extract UUID from variant {:?}",
+ value
Review Comment:
```suggestion
"Failed to extract UUID from variant {value:?}",
```
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -192,94 +197,111 @@ pub(crate) fn
make_primitive_variant_to_arrow_row_builder<'a>(
) -> Result<PrimitiveVariantToArrowRowBuilder<'a>> {
use PrimitiveVariantToArrowRowBuilder::*;
- let builder =
- match data_type {
- DataType::Boolean => {
- Boolean(VariantToBooleanArrowRowBuilder::new(cast_options,
capacity))
- }
- DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float16 =>
Float16(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float32 =>
Float32(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Float64 =>
Float64(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- DataType::Decimal32(precision, scale) => Decimal32(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Decimal64(precision, scale) => Decimal64(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Decimal128(precision, scale) => Decimal128(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Decimal256(precision, scale) => Decimal256(
- VariantToDecimalArrowRowBuilder::new(cast_options, capacity,
*precision, *scale)?,
- ),
- DataType::Timestamp(TimeUnit::Microsecond, None) =>
TimestampMicroNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
- ),
- DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, None) =>
TimestampNanoNtz(
- VariantToTimestampNtzArrowRowBuilder::new(cast_options,
capacity),
- ),
- DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
- VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
- ),
- DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
- cast_options,
- capacity,
- )),
- _ if data_type.is_primitive() => {
- return Err(ArrowError::NotYetImplemented(format!(
- "Primitive data_type {data_type:?} not yet implemented"
- )));
- }
- _ => {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Not a primitive type: {data_type:?}"
- )));
- }
- };
+ let builder = match data_type {
+ DataType::Boolean =>
Boolean(VariantToBooleanArrowRowBuilder::new(cast_options, capacity)),
+ DataType::Int8 => Int8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int16 => Int16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int32 => Int32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Int64 => Int64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt8 => UInt8(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt16 => UInt16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt32 => UInt32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::UInt64 => UInt64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float16 => Float16(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float32 => Float32(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Float64 => Float64(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::Decimal32(precision, scale) =>
Decimal32(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Decimal64(precision, scale) =>
Decimal64(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Decimal128(precision, scale) =>
Decimal128(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Decimal256(precision, scale) =>
Decimal256(VariantToDecimalArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ *precision,
+ *scale,
+ )?),
+ DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicroNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => TimestampMicro(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, None) => TimestampNanoNtz(
+ VariantToTimestampNtzArrowRowBuilder::new(cast_options, capacity),
+ ),
+ DataType::Timestamp(TimeUnit::Nanosecond, tz) => TimestampNano(
+ VariantToTimestampArrowRowBuilder::new(cast_options, capacity,
tz.clone()),
+ ),
+ DataType::Date32 => Date(VariantToPrimitiveArrowRowBuilder::new(
+ cast_options,
+ capacity,
+ )),
+ DataType::FixedSizeBinary(16) => {
+ Uuid(VariantToUuidArrowRowBuilder::new(cast_options, capacity))
+ }
+ DataType::FixedSizeBinary(size) => {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "FixedSizeBinary({size}) is not a valid variant shredding
type. Only FixedSizeBinary(16) for UUID is supported."
+ )));
+ }
Review Comment:
This is a weird quick of `fmt` that I keep running into -- it seems to flip
unpredictably back and forth between two forms:
```rust
let builder = match data_type {
```
vs
```rust
let builder =
match data_type {
```
IMO it's a bug in `fmt`. I _think_ it's intentional because changes in
indentation can impact whether certain things fit on one line vs. several? For
example, indenting _further_ can ironically _reduce_ line counts because of
"fun" interactions with the rule that every arg goes on its own line, if they
don't all fit on the same line as the open+close parens.
Case in point -- this PR actually has _more_ lines, because de-denting the
four DecimalXX match arms caused them to put every arg on its own line. Even
tho de-denting also saved a couple lines for the Boolean match arm.
Anyway, I usually just review with whitespace ignored when I see this...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]