scovich commented on code in PR #8516:
URL: https://github.com/apache/arrow-rs/pull/8516#discussion_r2392087582
##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for
Variant<'_, '_> {
}
}
+impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_,
'_> {
+ fn as_primitive(&self) -> Option<i64> {
+ match self {
+ Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
+ Variant::TimestampNtzMicros(ndt) =>
Some(ndt.and_utc().timestamp_micros()),
+ _ => None,
+ }
+ }
+}
+
+impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_,
'_> {
+ fn as_primitive(&self) -> Option<i64> {
+ match self {
+ Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
+ Variant::TimestampNtzNanos(ndt) =>
ndt.and_utc().timestamp_nanos_opt(),
+ _ => None,
+ }
+ }
+}
+
+impl VariantAsPrimitive<datatypes::Date32Type> for Variant<'_, '_> {
+ fn as_primitive(&self) -> Option<i32> {
+ // The number of days from 0001-01-01 to 1970-01-01.
+ const DAYS_FROM_CE_TO_UNIX_EPOCH: i32 = 719163;
+ self.as_naive_date()
+ .map(|d| d.num_days_from_ce() - DAYS_FROM_CE_TO_UNIX_EPOCH)
Review Comment:
Should we just use `Date32Type::to_naive_date`?
It has `unwrap` calls internally, but I'm _fairly_ certain it cannot
actually panic because i32 range isn't big enough to overflow anything.
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -697,7 +699,7 @@ mod test {
}
macro_rules! perfectly_shredded_to_arrow_primitive_test {
- ($name:ident, $primitive_type:ident,
$perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
+ ($name:ident, $primitive_type:expr,
$perfectly_shredded_array_gen_fun:ident, $expected_array:expr) => {
Review Comment:
Why this change? I don't see any expressions being passed?
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -190,6 +211,29 @@ pub(crate) fn
make_primitive_variant_to_arrow_row_builder<'a>(
cast_options,
capacity,
)),
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+ let target_type = DataType::Timestamp(TimeUnit::Microsecond,
tz.clone());
+
+
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
+ cast_options,
+ capacity,
+ Some(target_type),
+ ))
Review Comment:
nit: I think it's actually smaller code (fewer lines _and_ shorter lines) to
_not_ factor out `target_type`?
```suggestion
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
cast_options,
capacity,
Some(DataType::Timestamp(TimeUnit::Microsecond, tz.clone())),
))
```
(again below)
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
}
fn finish(mut self) -> Result<ArrayRef> {
- Ok(Arc::new(self.builder.finish()))
+ let array: PrimitiveArray<T> = self.builder.finish();
+
+ if let Some(target_type) = self.target_data_type {
+ let data = array.into_data();
+ let new_data = data.into_builder().data_type(target_type).build()?;
+ let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+ return Ok(Arc::new(array_with_new_type));
+ }
Review Comment:
I don't think I understand why we need this juggling of data types, when all
the temporal types [impl
ArrowPrimitiveType](https://docs.rs/arrow/latest/arrow/array/trait.ArrowPrimitiveType.html#implementors)?
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
}
fn finish(mut self) -> Result<ArrayRef> {
- Ok(Arc::new(self.builder.finish()))
+ let array: PrimitiveArray<T> = self.builder.finish();
+
+ if let Some(target_type) = self.target_data_type {
+ let data = array.into_data();
+ let new_data = data.into_builder().data_type(target_type).build()?;
+ let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+ return Ok(Arc::new(array_with_new_type));
+ }
Review Comment:
```suggestion
let mut array = self.builder.finish();
if let Some(target_type) = self.target_data_type {
let data = array.into_data();
let new_data =
data.into_builder().data_type(target_type).build()?;
array = new_data.into();
}
```
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
_ => "Unknown",
}
}
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+ builder: arrow::array::BooleanBuilder,
+ cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+ fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+ Self {
+ builder: arrow::array::BooleanBuilder::with_capacity(capacity),
+ cast_options,
+ }
+ }
+
+ fn append_null(&mut self) -> Result<()> {
+ self.builder.append_null();
+ Ok(())
+ }
+
+ fn append_value(&mut self, value: &Variant<'_, '_>) -> Result<bool> {
+ if let Some(v) = value.as_boolean() {
+ self.builder.append_value(v);
+ Ok(true)
+ } else {
+ if !self.cast_options.safe {
+ // Unsafe casting: return error on conversion failure
+ return Err(ArrowError::CastError(format!(
+ "Failed to extract boolean from variant {:?} at path
VariantPath([])",
+ value
+ )));
+ }
+ // Safe casting: append null on conversion failure
+ self.builder.append_null();
+ Ok(false)
+ }
+ }
+
+ fn finish(mut self) -> Result<ArrayRef> {
+ Ok(Arc::new(self.builder.finish()))
+ }
+}
+
/// Builder for converting variant values to primitive values
pub(crate) struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType>
{
builder: arrow::array::PrimitiveBuilder<T>,
cast_options: &'a CastOptions<'a>,
+ // this used to change the data type of the resulting array, e.g. to add
timezone info
+ target_data_type: Option<DataType>,
}
impl<'a, T: ArrowPrimitiveType> VariantToPrimitiveArrowRowBuilder<'a, T> {
fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+ Self::new_with_target_type(cast_options, capacity, None)
+ }
+
+ fn new_with_target_type(
+ cast_options: &'a CastOptions<'a>,
+ capacity: usize,
+ target_data_type: Option<DataType>,
+ ) -> Self {
Review Comment:
Would it be safer/cleaner to limit this to timestamp types only?
```rust
impl<'a, T: ArrowTimestampType> VariantToPrimitiveArrowRowBuilder<'a, T> {
fn with_target_type(mut self, data_type: DataType) -> Self {
self.target_data_type = Some(data_type)
self
}
}
```
and then e.g.
```rust
DataType::Timestamp(TimeUnit::Microsecond, None) => TimestampMicro(
VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity)
),
DataType::Timestamp(TimeUnit::Microsecond, _) => TimestampMicro(
VariantToPrimitiveArrowRowBuilder::new(cast_options, capacity)
.with_target_type(data_type.clone()) // preserve timezone
info
),
```
(If the array -> data -> builder -> data -> array conversion is super cheap,
then we don't need the first match arm above)
I don't know a good way to factor out or isolate the other c ode paths, but
at least this would prevent them from activating for any unexpected types?
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -87,12 +99,17 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
Float16(b) => b.append_value(value),
Float32(b) => b.append_value(value),
Float64(b) => b.append_value(value),
+ Boolean(b) => b.append_value(value),
Review Comment:
nit: Why not ordered like the others?
##########
parquet-variant-compute/src/type_conversion.rs:
##########
@@ -98,6 +99,34 @@ impl VariantAsPrimitive<datatypes::UInt64Type> for
Variant<'_, '_> {
}
}
+impl VariantAsPrimitive<datatypes::TimestampMicrosecondType> for Variant<'_,
'_> {
+ fn as_primitive(&self) -> Option<i64> {
+ match self {
+ Variant::TimestampMicros(dt) => Some(dt.timestamp_micros()),
+ Variant::TimestampNtzMicros(ndt) =>
Some(ndt.and_utc().timestamp_micros()),
+ _ => None,
+ }
+ }
+}
+
+impl VariantAsPrimitive<datatypes::TimestampNanosecondType> for Variant<'_,
'_> {
+ fn as_primitive(&self) -> Option<i64> {
+ match self {
+ Variant::TimestampNanos(dt) => dt.timestamp_nanos_opt(),
+ Variant::TimestampNtzNanos(ndt) =>
ndt.and_utc().timestamp_nanos_opt(),
+ _ => None,
Review Comment:
We could also widen micros to nanos here, if we wanted?
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
_ => "Unknown",
}
}
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+ builder: arrow::array::BooleanBuilder,
+ cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+ fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+ Self {
+ builder: arrow::array::BooleanBuilder::with_capacity(capacity),
Review Comment:
This is virtually identical to the primitive row builder... AFAICT the only
syntactic differences are:
* The generics (or lack there of)
* The array builder's type name
* The method invoked to convert a variant value to the target type
I almost wonder if it would be worthwhile to define a macro that can cover
both, similar to `define_row_builder!` macro in arrow_to_variant.rs?
* Arrow decimal data types are not primitive and will eventually need
similar treatment?
* Could more easily define temporal builders with the necessary logic
without "polluting" normal primitive builders (tho the custom finisher logic
would be add extra complexity to an already complex macro, so maybe not worth
it)
Invoking such a macro might look like:
```rust
define_row_builder!(
struct VariantToBooleanArrowRowBuilder<'a> {
builder: BooleanBuilder,
},
|v| v.as_boolean()
)
```
and
```rust
define_row_builder!(
struct VariantToPrimitiveArrowRowBuilder<'a, T: ArrowPrimitiveType>
where
for<'m, 'v> Variant<'m, 'v>: VariantAsPrimitive<T>,
{
builder: PrimitiveBuilder<T>,
},
|v| v.as_primitive()
)
```
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
_ => "Unknown",
}
}
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+ builder: arrow::array::BooleanBuilder,
+ cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+ fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+ Self {
+ builder: arrow::array::BooleanBuilder::with_capacity(capacity),
Review Comment:
> We _would_ still need to figure out the best way to handle that time zone
issue, tho.
Oh...
[PrimitiveBuilder::with_timezone_opt](https://docs.rs/arrow/latest/arrow/array/struct.PrimitiveBuilder.html#method.with_timezone_opt)
looks rather ideal!
In terms of the other macro, the `|array|` "lambda" arg would change to
something like this for normal primitives:
```rust
|capacity| -> PrimitiveArrayBuilder<T> {
PrimitiveArrayBuilder::with_capacity(capacity) }
```
and this for the timestamp types:
```rust
|capacity, tz: Option<Arc<str>>| -> PrimitiveArrayBuilder<T> {
PrimitiveArrayBuilder::with_capacity(capacity).with_timezone_opt(tz)
}
```
... which "merely" requires teaching the "lambda" to accept additional
optional args.
Thoughts?
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -190,6 +211,29 @@ pub(crate) fn
make_primitive_variant_to_arrow_row_builder<'a>(
cast_options,
capacity,
)),
+ DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+ let target_type = DataType::Timestamp(TimeUnit::Microsecond,
tz.clone());
+
+
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
+ cast_options,
+ capacity,
+ Some(target_type),
+ ))
Review Comment:
... but actually, isn't this just:
```suggestion
TimestampMicro(VariantToPrimitiveArrowRowBuilder::new_with_target_type(
cast_options,
capacity,
Some(data_type.clone())
))
```
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
_ => "Unknown",
}
}
+/// Builder for converting variant values to boolean values
+/// Boolean is not primitive types in Arrow, so we need a separate builder
+pub(crate) struct VariantToBooleanArrowRowBuilder<'a> {
+ builder: arrow::array::BooleanBuilder,
+ cast_options: &'a CastOptions<'a>,
+}
+
+impl<'a> VariantToBooleanArrowRowBuilder<'a> {
+ fn new(cast_options: &'a CastOptions<'a>, capacity: usize) -> Self {
+ Self {
+ builder: arrow::array::BooleanBuilder::with_capacity(capacity),
Review Comment:
Actually, this little exploration exposed a bad (IMO) trait design, I made a
quick PR to fix it:
* https://github.com/apache/arrow-rs/pull/8519
After that fix merges, the macro should be vastly easier to define --
basically copy+paste the definition from the other module and tweak it as
needed.
And then you can use the macro to define not only timestamp and boolean
builders, but also builders for the other temporal and decimal types.
We _would_ still need to figure out the best way to handle that time zone
issue, tho.
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -342,7 +442,16 @@ where
}
fn finish(mut self) -> Result<ArrayRef> {
- Ok(Arc::new(self.builder.finish()))
+ let array: PrimitiveArray<T> = self.builder.finish();
+
+ if let Some(target_type) = self.target_data_type {
+ let data = array.into_data();
+ let new_data = data.into_builder().data_type(target_type).build()?;
+ let array_with_new_type = PrimitiveArray::<T>::from(new_data);
+ return Ok(Arc::new(array_with_new_type));
+ }
Review Comment:
Oh! It's reinstating timezone info that would otherwise be lost
##########
parquet-variant-compute/src/variant_to_arrow.rs:
##########
@@ -293,21 +337,77 @@ fn get_type_name<T: ArrowPrimitiveType>() -> &'static str
{
"arrow_array::types::Float32Type" => "Float32",
"arrow_array::types::Float64Type" => "Float64",
"arrow_array::types::Float16Type" => "Float16",
+ "arrow_array::types::TimestampMicrosecondType" =>
"Timestamp(Microsecond)",
+ "arrow_array::types::TimestampNanosecondType" =>
"Timestamp(Nanosecond)",
_ => "Unknown",
Review Comment:
Missing the boolean case?
But actually, I suspect we can get just rid of `get_type_name`, relying on
`T::DATA_TYPE` and the shiny
[new](https://github.com/apache/arrow-rs/pull/8290) `impl Display for DataType`
instead?
(that cleanup would probably be a separate PR tho)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]