scovich commented on code in PR #8516:
URL: https://github.com/apache/arrow-rs/pull/8516#discussion_r2402000279


##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -38,12 +38,33 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType {
     fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
 }
 
+/// Extension trait for Arrow timestamp types that can extract their native 
value from a Variant
+/// We can't use [`PrimitiveFromVariant`] directly because we might need to 
use methods that
+/// are only available on [`ArrowTimestampType`] (such as with_timezone_opt)
+pub(crate) trait TimestampFromVariant: ArrowTimestampType {
+    fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
+}
+
 /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive 
types
 macro_rules! impl_primitive_from_variant {
-    ($arrow_type:ty, $variant_method:ident) => {
+    ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => {
         impl PrimitiveFromVariant for $arrow_type {
             fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> 
{
-                variant.$variant_method()
+                let value = variant.$variant_method();
+                $( let value = value.map($cast_fn); )?
+                value
+            }
+        }
+    };
+    ($arrow_type:ty, $( $variant_type:pat => $variant_method:ident, 
$cast_fn:expr ),+ $(,)?) => {
+        impl TimestampFromVariant for $arrow_type {
+            fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> 
{
+                match variant {
+                    $(
+                        $variant_type => 
variant.$variant_method().map($cast_fn),
+                    )+
+                    _ => None
+                }

Review Comment:
   ```suggestion
       };
   }
   macro_rules! impl_timestamp_from_variant {
       ($timestamp_type:ty, $variant_method:ident, ntz=$ntz) => {
           impl TimestampFromVariant<$ntz> for $timestamp_type {
               fn from_variant(variant: &Variant<'_, '_>) -> 
Option<Self::Native> {
                   variant.$variant_method().map(Self::make_value)
   ```
   where
   ```rust
   pub(crate) trait TimestampFromVariant<const NTZ: bool>: ArrowTimestampType {
   ```
   and
   ```rust
   /// Extension trait that lets `ArrowTimestampType` handle `DateTime<Utc>` 
like `NaiveDateTime`
   trait MakeValueTz: ArrowTimestampType {
       fn make_value(timestamp: DateTime<Utc>) -> Option<i64> {
           Self::make_value(*timestamp_type.naive_utc())
       }
   }
   impl<T: ArrowTimestampType> MakeValueTz for T {}
   ```
   
   See https://github.com/apache/arrow-rs/pull/8516#discussion_r2399107704 for 
context -- this would take a generic const bool param, and would no longer need 
the pattern matching because each impl could call the appropriate 
`Variant::as_xxx` method, e.g.:
   ```rust
   impl_timestamp_from_variant!(
       datatypes::TimestampMicrosecondType, 
       as_timestamp_ntz_micros, 
       ntz=true
   );
   impl_timestamp_from_variant!(
       datatypes::TimestampMicrosecondType, 
       as_timestamp_micros, 
       ntz=false
   );
   ```



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,55 +324,90 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
+        "arrow_array::types::Date32Type" => "Date32",
         _ => "Unknown",
     }
 }
 
-/// Builder for converting variant values to primitive values
-pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: 
PrimitiveFromVariant> {
-    builder: arrow::array::PrimitiveBuilder<T>,
-    cast_options: &'a CastOptions<'a>,
-}
-
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
-    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
-        Self {
-            builder: PrimitiveBuilder::<T>::with_capacity(capacity),
-            cast_options,
+macro_rules! define_variant_to_primitive_builder {
+    (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
+    |$array_param:ident $(, $field:ident: $field_type:ty)?| -> 
$builder_name:ident $(< $array_type:ty >)? { $init_expr: expr },
+    |$value: ident| $value_transform:expr,
+    type_name: $type_name:expr) => {
+        pub(crate) struct $name<$lifetime $(, $generic : $bound )?>
+        {
+            builder: $builder_name $(<$array_type>)?,
+            cast_options: &$lifetime CastOptions<$lifetime>,
         }
-    }
-}
 
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
-    fn append_null(&mut self) -> Result<()> {
-        self.builder.append_null();
-        Ok(())
-    }
+        impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic 
)?> {
+            fn new(
+                cast_options: &$lifetime CastOptions<$lifetime>,
+                $array_param: usize
+                // add this so that $init_expr can use it
+                $(, $field: $field_type)?) -> Self {
+                Self {
+                    builder: $init_expr,
+                    cast_options,
+                }
+            }
 
-    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
-        if let Some(v) = T::from_variant(value) {
-            self.builder.append_value(v);
-            Ok(true)
-        } else {
-            if !self.cast_options.safe {
-                // Unsafe casting: return error on conversion failure
-                return Err(ArrowError::CastError(format!(
-                    "Failed to extract primitive of type {} from variant {:?} 
at path VariantPath([])",
-                    get_type_name::<T>(),
-                    value
-                )));
+            fn append_null(&mut self) -> Result<()> {
+                self.builder.append_null();
+                Ok(())
             }
-            // Safe casting: append null on conversion failure
-            self.builder.append_null();
-            Ok(false)
-        }
-    }
 
-    fn finish(mut self) -> Result<ArrayRef> {
-        Ok(Arc::new(self.builder.finish()))
+            fn append_value(&mut self, $value: &Variant<'_, '_>) -> 
Result<bool> {
+                if let Some(v) = $value_transform {
+                    self.builder.append_value(v);
+                    Ok(true)
+                } else {
+                    if !self.cast_options.safe {
+                        // Unsafe casting: return error on conversion failure
+                        return Err(ArrowError::CastError(format!(
+                            "Failed to extract primitive of type {} from 
variant {:?} at path VariantPath([])",
+                            $type_name,
+                            $value
+                        )));
+                    }
+                    // Safe casting: append null on conversion failure
+                    self.builder.append_null();
+                    Ok(false)
+                }
+            }
+
+            fn finish(mut self) -> Result<ArrayRef> {
+                Ok(Arc::new(self.builder.finish()))
+            }
+        }
     }
 }
 
+define_variant_to_primitive_builder!(
+    struct VariantToBooleanArrowRowBuilder<'a>
+    |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
+    |value|  value.as_boolean(),
+    type_name: "Boolean"
+);
+
+define_variant_to_primitive_builder!(
+    struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
+    |capacity| -> PrimitiveBuilder<T> { 
PrimitiveBuilder::<T>::with_capacity(capacity) },
+    |value| T::from_variant(value),
+    type_name: get_type_name::<T>()
+);
+
+define_variant_to_primitive_builder!(

Review Comment:
   Actually it looks like we _do_ need the `TimestampFromVariant` trait, in 
order to handle UTC vs. NTZ timestamps. See other comment.



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,55 +324,90 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
+        "arrow_array::types::Date32Type" => "Date32",
         _ => "Unknown",
     }
 }
 
-/// Builder for converting variant values to primitive values
-pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: 
PrimitiveFromVariant> {
-    builder: arrow::array::PrimitiveBuilder<T>,
-    cast_options: &'a CastOptions<'a>,
-}
-
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
-    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
-        Self {
-            builder: PrimitiveBuilder::<T>::with_capacity(capacity),
-            cast_options,
+macro_rules! define_variant_to_primitive_builder {
+    (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
+    |$array_param:ident $(, $field:ident: $field_type:ty)?| -> 
$builder_name:ident $(< $array_type:ty >)? { $init_expr: expr },
+    |$value: ident| $value_transform:expr,
+    type_name: $type_name:expr) => {
+        pub(crate) struct $name<$lifetime $(, $generic : $bound )?>
+        {
+            builder: $builder_name $(<$array_type>)?,
+            cast_options: &$lifetime CastOptions<$lifetime>,
         }
-    }
-}
 
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
-    fn append_null(&mut self) -> Result<()> {
-        self.builder.append_null();
-        Ok(())
-    }
+        impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic 
)?> {
+            fn new(
+                cast_options: &$lifetime CastOptions<$lifetime>,
+                $array_param: usize
+                // add this so that $init_expr can use it
+                $(, $field: $field_type)?) -> Self {
+                Self {
+                    builder: $init_expr,
+                    cast_options,
+                }
+            }
 
-    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
-        if let Some(v) = T::from_variant(value) {
-            self.builder.append_value(v);
-            Ok(true)
-        } else {
-            if !self.cast_options.safe {
-                // Unsafe casting: return error on conversion failure
-                return Err(ArrowError::CastError(format!(
-                    "Failed to extract primitive of type {} from variant {:?} 
at path VariantPath([])",
-                    get_type_name::<T>(),
-                    value
-                )));
+            fn append_null(&mut self) -> Result<()> {
+                self.builder.append_null();
+                Ok(())
             }
-            // Safe casting: append null on conversion failure
-            self.builder.append_null();
-            Ok(false)
-        }
-    }
 
-    fn finish(mut self) -> Result<ArrayRef> {
-        Ok(Arc::new(self.builder.finish()))
+            fn append_value(&mut self, $value: &Variant<'_, '_>) -> 
Result<bool> {
+                if let Some(v) = $value_transform {
+                    self.builder.append_value(v);
+                    Ok(true)
+                } else {
+                    if !self.cast_options.safe {
+                        // Unsafe casting: return error on conversion failure
+                        return Err(ArrowError::CastError(format!(
+                            "Failed to extract primitive of type {} from 
variant {:?} at path VariantPath([])",
+                            $type_name,
+                            $value
+                        )));
+                    }
+                    // Safe casting: append null on conversion failure
+                    self.builder.append_null();
+                    Ok(false)
+                }
+            }
+
+            fn finish(mut self) -> Result<ArrayRef> {
+                Ok(Arc::new(self.builder.finish()))
+            }
+        }
     }
 }
 
+define_variant_to_primitive_builder!(
+    struct VariantToBooleanArrowRowBuilder<'a>
+    |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
+    |value|  value.as_boolean(),
+    type_name: "Boolean"
+);
+
+define_variant_to_primitive_builder!(
+    struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
+    |capacity| -> PrimitiveBuilder<T> { 
PrimitiveBuilder::<T>::with_capacity(capacity) },
+    |value| T::from_variant(value),
+    type_name: get_type_name::<T>()
+);
+
+define_variant_to_primitive_builder!(

Review Comment:
   I'm pretty sure we don't actually need a trait for that, and instead can do:
   ```rust
   struct VariantToTimestampArrowRowBuilder<'a, T:PrimitiveFromVariant + 
ArrowTimestampType>
   ```
   `PrimitiveFromVariant` provides `from_variant`, and `ArrowTimestampType` 
provides `with_timezone_opt`.
   
   However, the rust macro system is pretty bad at trait bounds involving 
multiple types, so we'd either need to add `where` support to the macro 
(similar to the other macro):
   ```rust
   struct VariantToTimestampArrowRowBuilder<'a, T:PrimitiveFromVariant>
   where T: ArrowTimestampType,
   ```
   
   or we'd keep `TimestampFromVariant` as a simple marker trait:
   ```rust
   trait TimestampFromVariant: PrimitiveFromVariant + ArrowTimestampType {}
   ```
   (`where` support in the macro is cleaner IMO)



##########
parquet-variant/src/variant.rs:
##########
@@ -561,6 +561,72 @@ impl<'m, 'v> Variant<'m, 'v> {
         }
     }
 
+    /// Converts this variant to a `i64` representing microseconds since the 
Unix epoch if possible.
+    /// This is useful when convert the variant to arrow types.
+    ///
+    /// Returns Some(i64) for [`Variant::TimestampMicros`] and 
[`Variant::TimestampNtzMicros`],
+    /// None for the other variant types.
+    ///
+    /// ```
+    /// use parquet_variant::Variant;
+    /// use chrono::NaiveDate;
+    ///
+    /// // you can extract an i64 from Variant::TimestampMicros
+    /// let datetime = NaiveDate::from_ymd_opt(2025, 10, 
03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap().and_utc();
+    /// let v1 = Variant::from(datetime);
+    /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000));
+    ///
+    /// // or Variant::TimestampNtzMicros
+    /// let datetime_ntz = NaiveDate::from_ymd_opt(2025, 10, 
03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap();
+    /// let v2 = Variant::from(datetime_ntz);
+    /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000));
+    ///
+    /// // but not from other variants
+    /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 10, 
03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc();
+    /// let v3 = Variant::from(datetime_nanos);
+    /// assert_eq!(v3.as_timestamp_micros(), None);
+    /// ```
+    pub fn as_timestamp_micros(&self) -> Option<i64> {
+        match *self {
+            Variant::TimestampMicros(d) => Some(d.timestamp_micros()),
+            Variant::TimestampNtzMicros(d) => 
Some(d.and_utc().timestamp_micros()),

Review Comment:
   This is a lossy conversion... we probably need to keep separate normal vs. 
ntz versions of these methods, where e.g. `as_timestamp_micros` is _not_ 
willing to work with `Variant::TimestampNtzMicros`. TBD whether e.g. 
`as_timestamp_ntz_micros` is willing to return a `Variant::TimestampMicros` 
(well-defined but technically lossy)



##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -60,6 +65,44 @@ impl_primitive_from_variant!(datatypes::UInt64Type, as_u64);
 impl_primitive_from_variant!(datatypes::Float16Type, as_f16);
 impl_primitive_from_variant!(datatypes::Float32Type, as_f32);
 impl_primitive_from_variant!(datatypes::Float64Type, as_f64);
+impl_primitive_from_variant!(
+    datatypes::Date32Type,
+    as_naive_date,
+    Date32Type::from_naive_date
+);
+
+pub(crate) trait TimestampFromVariant: ArrowTimestampType {
+    fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
+}
+
+macro_rules! impl_timestamp_from_variant {
+    ($timestamp_type:ty, {
+        $(($variant_pattern:pat, $conversion:expr)),+ $(,)?
+    }) => {
+        impl TimestampFromVariant for $timestamp_type {
+            fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> 
{
+                match variant {
+                    $(
+                        $variant_pattern => $conversion,
+                    )+
+                    _ => None,
+                }
+            }
+        }
+    };
+}
+
+impl_timestamp_from_variant!(TimestampMicrosecondType, {
+    (Variant::TimestampMicros(t), Some(t.timestamp_micros())),
+    (Variant::TimestampNtzMicros(t), Some(t.and_utc().timestamp_micros())),
+});
+
+impl_timestamp_from_variant!(TimestampNanosecondType, {
+    (Variant::TimestampMicros(t), Some(t.timestamp_micros()).map(|t| t * 
1000)),
+    (Variant::TimestampNtzMicros(t), 
Some(t.and_utc().timestamp_micros()).map(|t| t * 1000)),
+    (Variant::TimestampNanos(t), t.timestamp_nanos_opt()),
+    (Variant::TimestampNtzNanos(t), t.and_utc().timestamp_nanos_opt()),
+});

Review Comment:
   Ah this is tricky, I see what you mean. 
   
   Stepping back a bit:
   * `NaiveDateTime` methods seem to be deprecated in favor of `DateTime<Utc>`
   * The NTZ types only pass `NaiveDateTime` as a marker (they all have to call 
`and_utc()` to produce a `DateTime<Utc>` before doing anything else)
   * We can safely widen micros to nanos
   * We can "safely" convert a TZ type to an NTZ type (discards the timezone 
info) but not the other way around (wouldn't know which TZ to infer). However, 
it is a [narrowing 
conversion](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#timestamps-with-an-unset--empty-timezone)
 which the `Variant::as_xxx` methods usually try to avoid. Maybe we shouldn't 
allow TZ <-> NTZ conversions at all?
   * But arrow doesn't distinguish physically between TZ and NTZ (that 
information is embedded in the `DataType` enum variants, not e.g. 
`TimestampMicrosecondType`), so it needs a `from_variant` implementation that 
works for both. 
   * 
[ArrowTimestampType::make_value](https://docs.rs/arrow/latest/arrow/datatypes/trait.ArrowTimestampType.html#tymethod.make_value)
 looks quite useful, and it takes a `NaiveDateTime` as input. So maybe the 
correct approach will be to add `Variant::as_timestamp[_ntz]_[micros|nanos]` 
methods, which respectively return `DateTime<Utc>` and `NaiveDateTime`, and 
then make `TimestampFromVariant` generic over `const NTZ: bool` so we can 
actually have all four implementations we need.



##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -38,12 +38,33 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType {
     fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
 }
 
+/// Extension trait for Arrow timestamp types that can extract their native 
value from a Variant
+/// We can't use [`PrimitiveFromVariant`] directly because we might need to 
use methods that
+/// are only available on [`ArrowTimestampType`] (such as with_timezone_opt)
+pub(crate) trait TimestampFromVariant: ArrowTimestampType {
+    fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
+}
+
 /// Macro to generate PrimitiveFromVariant implementations for Arrow primitive 
types
 macro_rules! impl_primitive_from_variant {
-    ($arrow_type:ty, $variant_method:ident) => {
+    ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => {
         impl PrimitiveFromVariant for $arrow_type {
             fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> 
{
-                variant.$variant_method()
+                let value = variant.$variant_method();
+                $( let value = value.map($cast_fn); )?
+                value
+            }
+        }
+    };
+    ($arrow_type:ty, $( $variant_type:pat => $variant_method:ident, 
$cast_fn:expr ),+ $(,)?) => {
+        impl TimestampFromVariant for $arrow_type {
+            fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native> 
{
+                match variant {
+                    $(
+                        $variant_type => 
variant.$variant_method().map($cast_fn),
+                    )+
+                    _ => None
+                }

Review Comment:
   The above approach has one big annoyance, tho -- it would require defining 
two timestamp row builder types, one timezone-aware and the other not (= four 
enum variants). Where the current code only needs one (= two enum variants). 
   
   ```rust
   define_variant_to_primitive_builder
       struct VariantToTimestampArrowRowBuilder<'a, T: 
TimestampFromVariant<false>>
       |capacity, tz: Arc<str> | -> PrimitiveBuilder<T> {
           PrimitiveBuilder::<T>::with_capacity(capacity).with_timezone(tz)
       },
       |value| T::from_variant(value),
       type_name: get_type_name::<T>()
   );
   
   define_variant_to_primitive_builder
       struct VariantToTimestampNtzArrowRowBuilder<'a, T: 
TimestampFromVariant<true>>
       |capacity| -> PrimitiveBuilder<T> { 
PrimitiveBuilder::<T>::with_capacity(capacity) },
       |value| T::from_variant(value),
       type_name: get_type_name::<T>()
   );
   ```
   
   But the current code also allows invalid conversions, such as interpreting 
an NTZ timestamp as UTC, because the current `as_timestamp_xxx` methods are too 
narrow of a waist and lose information.
   
   @alamb what's your take? Am I overthinking this?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to