scovich commented on code in PR #8516:
URL: https://github.com/apache/arrow-rs/pull/8516#discussion_r2402000279
##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -38,12 +38,33 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
}
+/// Extension trait for Arrow timestamp types that can extract their native
value from a Variant
+/// We can't use [`PrimitiveFromVariant`] directly because we might need to
use methods that
+/// are only available on [`ArrowTimestampType`] (such as with_timezone_opt)
+pub(crate) trait TimestampFromVariant: ArrowTimestampType {
+ fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
+}
+
/// Macro to generate PrimitiveFromVariant implementations for Arrow primitive
types
macro_rules! impl_primitive_from_variant {
- ($arrow_type:ty, $variant_method:ident) => {
+ ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => {
impl PrimitiveFromVariant for $arrow_type {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>
{
- variant.$variant_method()
+ let value = variant.$variant_method();
+ $( let value = value.map($cast_fn); )?
+ value
+ }
+ }
+ };
+ ($arrow_type:ty, $( $variant_type:pat => $variant_method:ident,
$cast_fn:expr ),+ $(,)?) => {
+ impl TimestampFromVariant for $arrow_type {
+ fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>
{
+ match variant {
+ $(
+ $variant_type =>
variant.$variant_method().map($cast_fn),
+ )+
+ _ => None
+ }
Review Comment:
```suggestion
};
}
macro_rules! impl_timestamp_from_variant {
($timestamp_type:ty, $variant_method:ident, ntz=$ntz) => {
impl TimestampFromVariant<$ntz> for $timestamp_type {
fn from_variant(variant: &Variant<'_, '_>) ->
Option<Self::Native> {
variant.$variant_method().map(Self::make_value)
```
where
```rust
pub(crate) trait TimestampFromVariant<const NTZ: bool>: ArrowTimestampType {
```
and
```rust
/// Extension trait that lets `ArrowTimestampType` handle `DateTime<Utc>`
like `NaiveDateTime`
trait MakeValueTz: ArrowTimestampType {
fn make_value(timestamp: DateTime<Utc>) -> Option<i64> {
Self::make_value(*timestamp_type.naive_utc())
}
}
impl<T: ArrowTimestampType> MakeValueTz for T {}
```
See https://github.com/apache/arrow-rs/pull/8516#discussion_r2399107704 for
context -- this would take a generic const bool param, and would no longer need
the pattern matching because each impl could call the appropriate
`Variant::as_xxx` method, e.g.:
```rust
impl_timestamp_from_variant!(
datatypes::TimestampMicrosecondType,
as_timestamp_ntz_micros,
ntz=true
);
impl_timestamp_from_variant!(
datatypes::TimestampMicrosecondType,
as_timestamp_micros,
ntz=false
);
```
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,55 +324,90 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
+ "arrow_array::types::Date32Type" => "Date32",
_ => "Unknown",
}
}
-/// Builder for converting variant values to primitive values
-pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T:
PrimitiveFromVariant> {
- builder: arrow::array::PrimitiveBuilder<T>,
- cast_options: &'a CastOptions<'a>,
-}
-
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
- fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
- Self {
- builder: PrimitiveBuilder::<T>::with_capacity(capacity),
- cast_options,
+macro_rules! define_variant_to_primitive_builder {
+ (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
+ |$array_param:ident $(, $field:ident: $field_type:ty)?| ->
$builder_name:ident $(< $array_type:ty >)? { $init_expr: expr },
+ |$value: ident| $value_transform:expr,
+ type_name: $type_name:expr) => {
+ pub(crate) struct $name<$lifetime $(, $generic : $bound )?>
+ {
+ builder: $builder_name $(<$array_type>)?,
+ cast_options: &$lifetime CastOptions<$lifetime>,
}
- }
-}
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
- fn append_null(&mut self) -> Result<()> {
- self.builder.append_null();
- Ok(())
- }
+ impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic
)?> {
+ fn new(
+ cast_options: &$lifetime CastOptions<$lifetime>,
+ $array_param: usize
+ // add this so that $init_expr can use it
+ $(, $field: $field_type)?) -> Self {
+ Self {
+ builder: $init_expr,
+ cast_options,
+ }
+ }
- fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
- if let Some(v) = T::from_variant(value) {
- self.builder.append_value(v);
- Ok(true)
- } else {
- if !self.cast_options.safe {
- // Unsafe casting: return error on conversion failure
- return Err(ArrowError::CastError(format!(
- "Failed to extract primitive of type {} from variant {:?}
at path VariantPath([])",
- get_type_name::<T>(),
- value
- )));
+ fn append_null(&mut self) -> Result<()> {
+ self.builder.append_null();
+ Ok(())
}
- // Safe casting: append null on conversion failure
- self.builder.append_null();
- Ok(false)
- }
- }
- fn finish(mut self) -> Result<ArrayRef> {
- Ok(Arc::new(self.builder.finish()))
+ fn append_value(&mut self, $value: &Variant<'_, '_>) ->
Result<bool> {
+ if let Some(v) = $value_transform {
+ self.builder.append_value(v);
+ Ok(true)
+ } else {
+ if !self.cast_options.safe {
+ // Unsafe casting: return error on conversion failure
+ return Err(ArrowError::CastError(format!(
+ "Failed to extract primitive of type {} from
variant {:?} at path VariantPath([])",
+ $type_name,
+ $value
+ )));
+ }
+ // Safe casting: append null on conversion failure
+ self.builder.append_null();
+ Ok(false)
+ }
+ }
+
+ fn finish(mut self) -> Result<ArrayRef> {
+ Ok(Arc::new(self.builder.finish()))
+ }
+ }
}
}
+define_variant_to_primitive_builder!(
+ struct VariantToBooleanArrowRowBuilder<'a>
+ |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
+ |value| value.as_boolean(),
+ type_name: "Boolean"
+);
+
+define_variant_to_primitive_builder!(
+ struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
+ |capacity| -> PrimitiveBuilder<T> {
PrimitiveBuilder::<T>::with_capacity(capacity) },
+ |value| T::from_variant(value),
+ type_name: get_type_name::<T>()
+);
+
+define_variant_to_primitive_builder!(
Review Comment:
Actually it looks like we _do_ need the `TimestampFromVariant` trait, in
order to handle UTC vs. NTZ timestamps. See other comment.
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,55 +324,90 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
+ "arrow_array::types::Date32Type" => "Date32",
_ => "Unknown",
}
}
-/// Builder for converting variant values to primitive values
-pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T:
PrimitiveFromVariant> {
- builder: arrow::array::PrimitiveBuilder<T>,
- cast_options: &'a CastOptions<'a>,
-}
-
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
- fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
- Self {
- builder: PrimitiveBuilder::<T>::with_capacity(capacity),
- cast_options,
+macro_rules! define_variant_to_primitive_builder {
+ (struct $name:ident<$lifetime:lifetime $(, $generic:ident: $bound:path )?>
+ |$array_param:ident $(, $field:ident: $field_type:ty)?| ->
$builder_name:ident $(< $array_type:ty >)? { $init_expr: expr },
+ |$value: ident| $value_transform:expr,
+ type_name: $type_name:expr) => {
+ pub(crate) struct $name<$lifetime $(, $generic : $bound )?>
+ {
+ builder: $builder_name $(<$array_type>)?,
+ cast_options: &$lifetime CastOptions<$lifetime>,
}
- }
-}
-impl<'a, T: PrimitiveFromVariant> VariantToPrimitiveArrowRowBuilder<'a, T> {
- fn append_null(&mut self) -> Result<()> {
- self.builder.append_null();
- Ok(())
- }
+ impl<$lifetime $(, $generic: $bound+ )?> $name<$lifetime $(, $generic
)?> {
+ fn new(
+ cast_options: &$lifetime CastOptions<$lifetime>,
+ $array_param: usize
+ // add this so that $init_expr can use it
+ $(, $field: $field_type)?) -> Self {
+ Self {
+ builder: $init_expr,
+ cast_options,
+ }
+ }
- fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
- if let Some(v) = T::from_variant(value) {
- self.builder.append_value(v);
- Ok(true)
- } else {
- if !self.cast_options.safe {
- // Unsafe casting: return error on conversion failure
- return Err(ArrowError::CastError(format!(
- "Failed to extract primitive of type {} from variant {:?}
at path VariantPath([])",
- get_type_name::<T>(),
- value
- )));
+ fn append_null(&mut self) -> Result<()> {
+ self.builder.append_null();
+ Ok(())
}
- // Safe casting: append null on conversion failure
- self.builder.append_null();
- Ok(false)
- }
- }
- fn finish(mut self) -> Result<ArrayRef> {
- Ok(Arc::new(self.builder.finish()))
+ fn append_value(&mut self, $value: &Variant<'_, '_>) ->
Result<bool> {
+ if let Some(v) = $value_transform {
+ self.builder.append_value(v);
+ Ok(true)
+ } else {
+ if !self.cast_options.safe {
+ // Unsafe casting: return error on conversion failure
+ return Err(ArrowError::CastError(format!(
+ "Failed to extract primitive of type {} from
variant {:?} at path VariantPath([])",
+ $type_name,
+ $value
+ )));
+ }
+ // Safe casting: append null on conversion failure
+ self.builder.append_null();
+ Ok(false)
+ }
+ }
+
+ fn finish(mut self) -> Result<ArrayRef> {
+ Ok(Arc::new(self.builder.finish()))
+ }
+ }
}
}
+define_variant_to_primitive_builder!(
+ struct VariantToBooleanArrowRowBuilder<'a>
+ |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },
+ |value| value.as_boolean(),
+ type_name: "Boolean"
+);
+
+define_variant_to_primitive_builder!(
+ struct VariantToPrimitiveArrowRowBuilder<'a, T:PrimitiveFromVariant>
+ |capacity| -> PrimitiveBuilder<T> {
PrimitiveBuilder::<T>::with_capacity(capacity) },
+ |value| T::from_variant(value),
+ type_name: get_type_name::<T>()
+);
+
+define_variant_to_primitive_builder!(
Review Comment:
I'm pretty sure we don't actually need a trait for that, and instead can do:
```rust
struct VariantToTimestampArrowRowBuilder<'a, T:PrimitiveFromVariant +
ArrowTimestampType>
```
`PrimitiveFromVariant` provides `from_variant`, and `ArrowTimestampType`
provides `with_timezone_opt`.
However, the rust macro system is pretty bad at trait bounds involving
multiple types, so we'd either need to add `where` support to the macro
(similar to the other macro):
```rust
struct VariantToTimestampArrowRowBuilder<'a, T:PrimitiveFromVariant>
where T: ArrowTimestampType,
```
or we'd keep `TimestampFromVariant` as a simple marker trait:
```rust
trait TimestampFromVariant: PrimitiveFromVariant + ArrowTimestampType {}
```
(`where` support in the macro is cleaner IMO)
##########
parquet-variant/src/variant.rs:
##########
@@ -561,6 +561,72 @@ impl<'m, 'v> Variant<'m, 'v> {
}
}
+ /// Converts this variant to a `i64` representing microseconds since the
Unix epoch if possible.
+ /// This is useful when convert the variant to arrow types.
+ ///
+ /// Returns Some(i64) for [`Variant::TimestampMicros`] and
[`Variant::TimestampNtzMicros`],
+ /// None for the other variant types.
+ ///
+ /// ```
+ /// use parquet_variant::Variant;
+ /// use chrono::NaiveDate;
+ ///
+ /// // you can extract an i64 from Variant::TimestampMicros
+ /// let datetime = NaiveDate::from_ymd_opt(2025, 10,
03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap().and_utc();
+ /// let v1 = Variant::from(datetime);
+ /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000));
+ ///
+ /// // or Variant::TimestampNtzMicros
+ /// let datetime_ntz = NaiveDate::from_ymd_opt(2025, 10,
03).unwrap().and_hms_milli_opt(12, 34, 56, 789).unwrap();
+ /// let v2 = Variant::from(datetime_ntz);
+ /// assert_eq!(v1.as_timestamp_micros(), Some(1759494896789000));
+ ///
+ /// // but not from other variants
+ /// let datetime_nanos = NaiveDate::from_ymd_opt(2025, 10,
03).unwrap().and_hms_nano_opt(12, 34, 56, 789123456).unwrap().and_utc();
+ /// let v3 = Variant::from(datetime_nanos);
+ /// assert_eq!(v3.as_timestamp_micros(), None);
+ /// ```
+ pub fn as_timestamp_micros(&self) -> Option<i64> {
+ match *self {
+ Variant::TimestampMicros(d) => Some(d.timestamp_micros()),
+ Variant::TimestampNtzMicros(d) =>
Some(d.and_utc().timestamp_micros()),
Review Comment:
This is a lossy conversion... we probably need to keep separate normal vs.
ntz versions of these methods, where e.g. `as_timestamp_micros` is _not_
willing to work with `Variant::TimestampNtzMicros`. TBD whether e.g.
`as_timestamp_ntz_micros` is willing to return a `Variant::TimestampMicros`
(well-defined but technically lossy)
##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -60,6 +65,44 @@ impl_primitive_from_variant!(datatypes::UInt64Type, as_u64);
impl_primitive_from_variant!(datatypes::Float16Type, as_f16);
impl_primitive_from_variant!(datatypes::Float32Type, as_f32);
impl_primitive_from_variant!(datatypes::Float64Type, as_f64);
+impl_primitive_from_variant!(
+ datatypes::Date32Type,
+ as_naive_date,
+ Date32Type::from_naive_date
+);
+
+pub(crate) trait TimestampFromVariant: ArrowTimestampType {
+ fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
+}
+
+macro_rules! impl_timestamp_from_variant {
+ ($timestamp_type:ty, {
+ $(($variant_pattern:pat, $conversion:expr)),+ $(,)?
+ }) => {
+ impl TimestampFromVariant for $timestamp_type {
+ fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>
{
+ match variant {
+ $(
+ $variant_pattern => $conversion,
+ )+
+ _ => None,
+ }
+ }
+ }
+ };
+}
+
+impl_timestamp_from_variant!(TimestampMicrosecondType, {
+ (Variant::TimestampMicros(t), Some(t.timestamp_micros())),
+ (Variant::TimestampNtzMicros(t), Some(t.and_utc().timestamp_micros())),
+});
+
+impl_timestamp_from_variant!(TimestampNanosecondType, {
+ (Variant::TimestampMicros(t), Some(t.timestamp_micros()).map(|t| t *
1000)),
+ (Variant::TimestampNtzMicros(t),
Some(t.and_utc().timestamp_micros()).map(|t| t * 1000)),
+ (Variant::TimestampNanos(t), t.timestamp_nanos_opt()),
+ (Variant::TimestampNtzNanos(t), t.and_utc().timestamp_nanos_opt()),
+});
Review Comment:
Ah this is tricky, I see what you mean.
Stepping back a bit:
* `NaiveDateTime` methods seem to be deprecated in favor of `DateTime<Utc>`
* The NTZ types only pass `NaiveDateTime` as a marker (they all have to call
`and_utc()` to produce a `DateTime<Utc>` before doing anything else)
* We can safely widen micros to nanos
* We can "safely" convert a TZ type to an NTZ type (discards the timezone
info) but not the other way around (wouldn't know which TZ to infer). However,
it is a [narrowing
conversion](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#timestamps-with-an-unset--empty-timezone)
which the `Variant::as_xxx` methods usually try to avoid. Maybe we shouldn't
allow TZ <-> NTZ conversions at all?
* But arrow doesn't distinguish physically between TZ and NTZ (that
information is embedded in the `DataType` enum variants, not e.g.
`TimestampMicrosecondType`), so it needs a `from_variant` implementation that
works for both.
*
[ArrowTimestampType::make_value](https://docs.rs/arrow/latest/arrow/datatypes/trait.ArrowTimestampType.html#tymethod.make_value)
looks quite useful, and it takes a `NaiveDateTime` as input. So maybe the
correct approach will be to add `Variant::as_timestamp[_ntz]_[micros|nanos]`
methods, which respectively return `DateTime<Utc>` and `NaiveDateTime`, and
then make `TimestampFromVariant` generic over `const NTZ: bool` so we can
actually have all four implementations we need.
##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -38,12 +38,33 @@ pub(crate) trait PrimitiveFromVariant: ArrowPrimitiveType {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
}
+/// Extension trait for Arrow timestamp types that can extract their native
value from a Variant
+/// We can't use [`PrimitiveFromVariant`] directly because we might need to
use methods that
+/// are only available on [`ArrowTimestampType`] (such as with_timezone_opt)
+pub(crate) trait TimestampFromVariant: ArrowTimestampType {
+ fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>;
+}
+
/// Macro to generate PrimitiveFromVariant implementations for Arrow primitive
types
macro_rules! impl_primitive_from_variant {
- ($arrow_type:ty, $variant_method:ident) => {
+ ($arrow_type:ty, $variant_method:ident $(, $cast_fn:expr)?) => {
impl PrimitiveFromVariant for $arrow_type {
fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>
{
- variant.$variant_method()
+ let value = variant.$variant_method();
+ $( let value = value.map($cast_fn); )?
+ value
+ }
+ }
+ };
+ ($arrow_type:ty, $( $variant_type:pat => $variant_method:ident,
$cast_fn:expr ),+ $(,)?) => {
+ impl TimestampFromVariant for $arrow_type {
+ fn from_variant(variant: &Variant<'_, '_>) -> Option<Self::Native>
{
+ match variant {
+ $(
+ $variant_type =>
variant.$variant_method().map($cast_fn),
+ )+
+ _ => None
+ }
Review Comment:
The above approach has one big annoyance, tho -- it would require defining
two timestamp row builder types, one timezone-aware and the other not (= four
enum variants). Where the current code only needs one (= two enum variants).
```rust
define_variant_to_primitive_builder
struct VariantToTimestampArrowRowBuilder<'a, T:
TimestampFromVariant<false>>
|capacity, tz: Arc<str> | -> PrimitiveBuilder<T> {
PrimitiveBuilder::<T>::with_capacity(capacity).with_timezone(tz)
},
|value| T::from_variant(value),
type_name: get_type_name::<T>()
);
define_variant_to_primitive_builder
struct VariantToTimestampNtzArrowRowBuilder<'a, T:
TimestampFromVariant<true>>
|capacity| -> PrimitiveBuilder<T> {
PrimitiveBuilder::<T>::with_capacity(capacity) },
|value| T::from_variant(value),
type_name: get_type_name::<T>()
);
```
But the current code also allows invalid conversions, such as interpreting
an NTZ timestamp as UTC, because the current `as_timestamp_xxx` methods are too
narrow of a waist and lose information.
@alamb what's your take? Am I overthinking this?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]