scovich commented on code in PR #8516:
URL: https://github.com/apache/arrow-rs/pull/8516#discussion_r2392087582


##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for 
Variant<'_, '_> {
     }
 }
 
+impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
+            Variant::TimestampNtzMicros(ndt) => 
Some(ndt.and_utc().timestamp_micros()),
+            _ => None,
+        }
+    }
+}
+
+impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
+            Variant::TimestampNtzNanos(ndt) => 
ndt.and_utc().timestamp_nanos_opt(),
+            _ => None,
+        }
+    }
+}
+
+impl VariantAsPrimitive<datatypes::Date32Type> for Variant<'_, '_> {
+    fn as_primitive(&self) -> Option<i32> {
+        // The number of days from 0001-01-01 to 1970-01-01.
+        const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163;
+        self.as_naive_date()
+            .map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH)

Review Comment:
   Should we just use `Date32Type::to_naive_date`? 
   
   It has `unwrap` calls internally, but I'm _fairly_ certain it cannot 
actually panic because i32 range isn't big enough to overflow anything.



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -697,7 +699,7 @@ mod test {
     }
 
     macro_rules! perfectly_shredded_to_arrow_primitive_test {
-        ($name:ident, $primitive_type:ident, 
$perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
+        ($name:ident, $primitive_type:expr, 
$perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {

Review Comment:
   Why this change? I don't see any expressions being passed?



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -190,6 +211,29 @@ pub(crate) fn 
make_primitive_variant_to_arrow_row_builder<'a>(
             cast_options,
             capacity,
         )),
+        DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+            let target_type = DataType::Timestamp(TimeUnit::Microsecond, 
tz.clone());
+
+            
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
+                cast_options,
+                capacity,
+                Some(target_type),
+            ))

Review Comment:
   nit: I think it's actually smaller code (fewer lines _and_ shorter lines) to 
_not_ factor out `target_type`?
   
   ```suggestion
               
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
                   cast_options,
                   capacity,
                   Some(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())),
               ))
   ```
   (again below)



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
     }
 
     fn finish(mut self) -> Result<ArrayRef> {
-        Ok(Arc::new(self.builder.finish()))
+        let array: PrimitiveArray<T> = self.builder.finish();
+
+        if let Some(target_type) = self.target_data_type {
+            let data = array.into_data();
+            let new_data = data.into_builder().data_type(target_type).build()?;
+            let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+            return Ok(Arc::new(array_with_new_type));
+        }

Review Comment:
   I don't think I understand why we need this juggling of data types, when all 
the temporal types [impl 
ArrowPrimitiveType](https://docs.rs/arrow/latest/arrow/array/trait.ArrowPrimitiveType.html#implementors)?



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
     }
 
     fn finish(mut self) -> Result<ArrayRef> {
-        Ok(Arc::new(self.builder.finish()))
+        let array: PrimitiveArray<T> = self.builder.finish();
+
+        if let Some(target_type) = self.target_data_type {
+            let data = array.into_data();
+            let new_data = data.into_builder().data_type(target_type).build()?;
+            let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+            return Ok(Arc::new(array_with_new_type));
+        }

Review Comment:
   ```suggestion
           let mut array = self.builder.finish();
   
           if let Some(target_type) = self.target_data_type {
               let data = array.into_data();
               let new_data = 
data.into_builder().data_type(target_type).build()?;
               array = new_data.into();
           }
   ```



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",
     }
 }
 
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+    builder: arrow::array::BooleanBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: arrow::array::BooleanBuilder::with_capacity(capacity),
+            cast_options,
+        }
+    }
+
+    fn append_null(&mut self) -> Result<()> {
+        self.builder.append_null();
+        Ok(())
+    }
+
+    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
+        if let Some(v) = value.as_boolean() {
+            self.builder.append_value(v);
+            Ok(true)
+        } else {
+            if !self.cast_options.safe {
+                // Unsafe casting: return error on conversion failure
+                return Err(ArrowError::CastError(format!(
+                    "Failed to extract boolean from variant {:?} at path 
VariantPath([])",
+                    value
+                )));
+            }
+            // Safe casting: append null on conversion failure
+            self.builder.append_null();
+            Ok(false)
+        }
+    }
+
+    fn finish(mut self) -> Result<ArrayRef> {
+        Ok(Arc::new(self.builder.finish()))
+    }
+}
+
 /// Builder for converting variant values to primitive values
 pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType> 
{
     builder: arrow::array::PrimitiveBuilder<T>,
     cast_options: &'a CastOptions<'a>,
+    // this used to change the data type of the resulting array, e.g. to add 
timezone info
+    target_data_type: Option<DataType>,
 }
 
 impl<'a, T: ArrowPrimitiveType> VariantToPrimitiveArrowRowBuilder<'a, T> {
     fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self::new_with_target_type(cast_options, capacity, None)
+    }
+
+    fn new_with_target_type(
+        cast_options: &'a CastOptions<'a>,
+        capacity: usize,
+        target_data_type: Option<DataType>,
+    ) -> Self {

Review Comment:
   Would it be safer/cleaner to limit this to timestamp types only?
   ```rust
   impl<'a, T: ArrowTimestampType> VariantToPrimitiveArrowRowBuilder<'a, T> {
       fn with_target_type(mut self, data_type: DataType) -> Self {
           self.target_data_type = Some(data_type)
           self
       }
   }
   ```
   and then e.g.
   ```rust
           DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicro(
               VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity)
           ),
           DataType::Timestamp(TimeUnit::Microsecond, _) => TimestampMicro(
               VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity)
                   .with_target_type(data_type.clone()) // preserve timezone 
info
           ),
   ```
   (If the array -> data -> builder -> data -> array conversion is super cheap, 
then we don't need the first match arm above)
   
   I don't know a good way to factor out or isolate the other c ode paths, but 
at least this would prevent them from activating for any unexpected types?



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -87,12 +99,17 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             Float16(b) => b.append_value(value),
             Float32(b) => b.append_value(value),
             Float64(b) => b.append_value(value),
+            Boolean(b) => b.append_value(value),

Review Comment:
   nit: Why not ordered like the others?



##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for 
Variant<'_, '_> {
     }
 }
 
+impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
+            Variant::TimestampNtzMicros(ndt) => 
Some(ndt.and_utc().timestamp_micros()),
+            _ => None,
+        }
+    }
+}
+
+impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
+            Variant::TimestampNtzNanos(ndt) => 
ndt.and_utc().timestamp_nanos_opt(),
+            _ => None,

Review Comment:
   We could also widen micros to nanos here, if we wanted?



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",
     }
 }
 
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+    builder: arrow::array::BooleanBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: arrow::array::BooleanBuilder::with_capacity(capacity),

Review Comment:
   This is virtually identical to the primitive row builder... AFAICT the only 
syntactic differences are:
   * The generics (or lack there of)
   * The array builder's type name
   * The method invoked to convert a variant value to the target type
   
   I almost wonder if it would be worthwhile to define a macro that can cover 
both, similar to `define_row_builder!` macro in arrow_to_variant.rs? 
   * Arrow decimal data types are not primitive and will eventually need 
similar treatment?
   * Could more easily define temporal builders with the necessary logic 
without "polluting" normal primitive builders (tho the custom finisher logic 
would be add extra complexity to an already complex macro, so maybe not worth 
it)
   
   Invoking such a macro might look like:
   ```rust
   define_row_builder!(
       struct VariantToBooleanArrowRowBuilder<'a> {
           builder: BooleanBuilder,
       },
       |v| v.as_boolean()
   )
   ```
   and 
   ```rust
   define_row_builder!(
       struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType>
       where
           for<'m, 'v> Variant<'m, 'v>: VariantAsPrimitive<T>,
       {
           builder: PrimitiveBuilder<T>,
       },
       |v| v.as_primitive()
   )
   ```



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",
     }
 }
 
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+    builder: arrow::array::BooleanBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: arrow::array::BooleanBuilder::with_capacity(capacity),

Review Comment:
   > We _would_ still need to figure out the best way to handle that time zone 
issue, tho.
   
   Oh... 
[PrimitiveBuilder::with_timezone_opt](https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveBuilder.html#method.with_timezone_opt)
 looks rather ideal! 
   
   In terms of the other macro, the `|array|` "lambda" arg would change to 
something like this for normal primitives:
   ```rust
   |capacity| -> PrimitiveArrayBuilder<T>  { 
PrimitiveArrayBuilder::with_capacity(capacity) }
   ```
   and this for the timestamp types:
   ```rust
   |capacity, tz: Option<Arc<str>>| -> PrimitiveArrayBuilder<T>  { 
       PrimitiveArrayBuilder::with_capacity(capacity).with_timezone_opt(tz)
   }
   ```
   ... which "merely" requires teaching the "lambda" to accept additional 
optional args.
   
   Thoughts?



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -190,6 +211,29 @@ pub(crate) fn 
make_primitive_variant_to_arrow_row_builder<'a>(
             cast_options,
             capacity,
         )),
+        DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+            let target_type = DataType::Timestamp(TimeUnit::Microsecond, 
tz.clone());
+
+            
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
+                cast_options,
+                capacity,
+                Some(target_type),
+            ))

Review Comment:
   ... but actually, isn't this just:
   ```suggestion
               
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
                   cast_options,
                   capacity,
                   Some(data_type.clone())
               ))
   ```



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",
     }
 }
 
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+    builder: arrow::array::BooleanBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: arrow::array::BooleanBuilder::with_capacity(capacity),

Review Comment:
   Actually, this little exploration exposed a bad (IMO) trait design, I made a 
quick PR to fix it:
   * https://github.com/apache/arrow-rs/pull/8519
   
   After that fix merges, the macro should be vastly easier to define -- 
basically copy+paste the definition from the other module and tweak it as 
needed.
   
   And then you can use the macro to define not only timestamp and boolean 
builders, but also builders for the other temporal and decimal types.
   
   We _would_ still need to figure out the best way to handle that time zone 
issue, tho.



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
     }
 
     fn finish(mut self) -> Result<ArrayRef> {
-        Ok(Arc::new(self.builder.finish()))
+        let array: PrimitiveArray<T> = self.builder.finish();
+
+        if let Some(target_type) = self.target_data_type {
+            let data = array.into_data();
+            let new_data = data.into_builder().data_type(target_type).build()?;
+            let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+            return Ok(Arc::new(array_with_new_type));
+        }

Review Comment:
   Oh! It's reinstating timezone info that would otherwise be lost



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",

Review Comment:
   Missing the boolean case?
   
   But actually, I suspect we can get just rid of `get_type_name`, relying on 
`T::DATA_TYPE` and the shiny 
[new](https://github.com/apache/arrow-rs/pull/8290) `impl Display for DataType` 
instead?
   
   (that cleanup would probably be a separate PR tho)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to