klion26 commented on code in PR #8516:
URL: https://github.com/apache/arrow-rs/pull/8516#discussion_r2394302559


##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for 
Variant<'_, '_> {
     }
 }
 
+impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
+            Variant::TimestampNtzMicros(ndt) => 
Some(ndt.and_utc().timestamp_micros()),
+            _ => None,
+        }
+    }
+}
+
+impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
+            Variant::TimestampNtzNanos(ndt) => 
ndt.and_utc().timestamp_nanos_opt(),
+            _ => None,

Review Comment:
   Yes, we can widen micros to nanos if wanted. will add this.



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -87,12 +99,17 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             Float16(b) => b.append_value(value),
             Float32(b) => b.append_value(value),
             Float64(b) => b.append_value(value),
+            Boolean(b) => b.append_value(value),

Review Comment:
   Ah, good catch, will change it



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
     }
 
     fn finish(mut self) -> Result<ArrayRef> {
-        Ok(Arc::new(self.builder.finish()))
+        let array: PrimitiveArray<T> = self.builder.finish();
+
+        if let Some(target_type) = self.target_data_type {
+            let data = array.into_data();
+            let new_data = data.into_builder().data_type(target_type).build()?;
+            let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+            return Ok(Arc::new(array_with_new_type));
+        }

Review Comment:
   Fix it



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -190,6 +211,29 @@ pub(crate) fn 
make_primitive_variant_to_arrow_row_builder<'a>(
             cast_options,
             capacity,
         )),
+        DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+            let target_type = DataType::Timestamp(TimeUnit::Microsecond, 
tz.clone());
+
+            
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
+                cast_options,
+                capacity,
+                Some(target_type),
+            ))

Review Comment:
   Fixed it



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",
     }
 }
 
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+    builder: arrow::array::BooleanBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: arrow::array::BooleanBuilder::with_capacity(capacity),

Review Comment:
   Thanks very much for the detailed description; it is definitely much better 
than the current solution. will try this way.
   
   I've tried to make `VariantToPrimitiveArrowRowBuilder` generic by extracting 
the `value.as_primitive` in `append_value` to some `converter` so that boolean 
just needs to implement a 
`converter`(https://github.com/klion26/arrow-rs/commit/7533c7ee310b0f1f4d829a766978d1ac72a293db),
 but stick with the current version as it did not seem much better.



##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -697,7 +699,7 @@ mod test {
     }
 
     macro_rules! perfectly_shredded_to_arrow_primitive_test {
-        ($name:ident, $primitive_type:ident, 
$perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
+        ($name:ident, $primitive_type:expr, 
$perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {

Review Comment:
   Change this because we have parameters like 
`DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("+00:00")))` and 
`ident` doesn't support this.
   Is there any better solution here?



##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for 
Variant<'_, '_> {
     }
 }
 
+impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
+            Variant::TimestampNtzMicros(ndt) => 
Some(ndt.and_utc().timestamp_micros()),
+            _ => None,
+        }
+    }
+}
+
+impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_, 
'_> {
+    fn as_primitive(&self) -> Option<i64> {
+        match self {
+            Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
+            Variant::TimestampNtzNanos(ndt) => 
ndt.and_utc().timestamp_nanos_opt(),
+            _ => None,
+        }
+    }
+}
+
+impl VariantAsPrimitive<datatypes::Date32Type> for Variant<'_, '_> {
+    fn as_primitive(&self) -> Option<i32> {
+        // The number of days from 0001-01-01 to 1970-01-01.
+        const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163;
+        self.as_naive_date()
+            .map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH)

Review Comment:
   Will change to use it.
   
   Did not notice this function. Choosing the current solution seems to have a 
better performance(vs a handle write logic like `Date32Type::from_naive_date`). 
Using the existing function is better for maintenance, and we can improve the 
function if needed.



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",

Review Comment:
   Currently, the function `get_type_name` is only used in 
`VariantToPrimitiveArrowRowBuilder`, and `Boolean` is another separate builder, 
so I didn't add `boolean` here.
   
   Will follow the cleanup logic.



##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str 
{
         "arrow_array::types::Float32Type" => "Float32",
         "arrow_array::types::Float64Type" => "Float64",
         "arrow_array::types::Float16Type" => "Float16",
+        "arrow_array::types::TimestampMicrosecondType" => 
"Timestamp(Microsecond)",
+        "arrow_array::types::TimestampNanosecondType" => 
"Timestamp(Nanosecond)",
         _ => "Unknown",
     }
 }
 
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+    builder: arrow::array::BooleanBuilder,
+    cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+    fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self {
+            builder: arrow::array::BooleanBuilder::with_capacity(capacity),
+            cast_options,
+        }
+    }
+
+    fn append_null(&mut self) -> Result<()> {
+        self.builder.append_null();
+        Ok(())
+    }
+
+    fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
+        if let Some(v) = value.as_boolean() {
+            self.builder.append_value(v);
+            Ok(true)
+        } else {
+            if !self.cast_options.safe {
+                // Unsafe casting: return error on conversion failure
+                return Err(ArrowError::CastError(format!(
+                    "Failed to extract boolean from variant {:?} at path 
VariantPath([])",
+                    value
+                )));
+            }
+            // Safe casting: append null on conversion failure
+            self.builder.append_null();
+            Ok(false)
+        }
+    }
+
+    fn finish(mut self) -> Result<ArrayRef> {
+        Ok(Arc::new(self.builder.finish()))
+    }
+}
+
 /// Builder for converting variant values to primitive values
 pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType> 
{
     builder: arrow::array::PrimitiveBuilder<T>,
     cast_options: &'a CastOptions<'a>,
+    // this used to change the data type of the resulting array, e.g. to add 
timezone info
+    target_data_type: Option<DataType>,
 }
 
 impl<'a, T: ArrowPrimitiveType> VariantToPrimitiveArrowRowBuilder<'a, T> {
     fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+        Self::new_with_target_type(cast_options, capacity, None)
+    }
+
+    fn new_with_target_type(
+        cast_options: &'a CastOptions<'a>,
+        capacity: usize,
+        target_data_type: Option<DataType>,
+    ) -> Self {

Review Comment:
   Yes, it's better than the current solution, will improve it.
   
   Seems `ArrayDataBuiler::build` may consume some time, will add a separate 
match arm for this 
   
   ```
   ArrayDataBuilder::build(self) -> Result<ArrayData, ArrowError> {
     ...
           if align_buffers {
               data.align_buffers();
           }
   
           // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
           if !skip_validation.get() || cfg!(feature = "force_validate") {
               data.validate_data()?;
           }
   ...
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to