scovich commented on code in PR #8438:
URL: https://github.com/apache/arrow-rs/pull/8438#discussion_r2377285348
##########
parquet-variant-compute/src/variant_array.rs:
##########
@@ -891,28 +864,116 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, '
 ///
 /// So cast them to get the right type.
 fn cast_to_binary_view_arrays(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
-    let new_type = rewrite_to_view_types(array.data_type());
-    cast(array, &new_type)
+    let new_type = canonicalize_and_verify_data_type(array.data_type())?;
+    cast(array, new_type.as_ref())
+}
+
+/// Validates whether a given arrow decimal is a valid variant decimal
+///
+/// NOTE: By a strict reading of the "decimal table" in the [shredding spec], each decimal type
+/// should have a lower bound on precision as well as an upper bound (i.e. Decimal16 with precision
+/// 5 is invalid because Decimal4 can cover it). But the variant shredding integration tests
+/// specifically expect such cases to succeed, so we only enforce the upper bound here.
+///
+/// [shredding spec]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md#encoding-types
+fn is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool {
+    *p <= max_precision && (0..=*p as i8).contains(s)
 }
 /// replaces all instances of Binary with BinaryView in a DataType
-fn rewrite_to_view_types(data_type: &DataType) -> DataType {
-    match data_type {
-        DataType::Binary => DataType::BinaryView,
-        DataType::List(field) => DataType::List(rewrite_field_type(field)),
-        DataType::Struct(fields) => {
-            DataType::Struct(fields.iter().map(rewrite_field_type).collect())
-        }
-        _ => data_type.clone(),
+fn canonicalize_and_verify_data_type(
+    data_type: &DataType,
+) -> Result<Cow<'_, DataType>, ArrowError> {
+    use DataType::*;
+
+    // helper macros
+    macro_rules! fail {
+        () => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Illegal shredded value type: {data_type}"
+            )))
+        };
     }
+    macro_rules! borrow {
+        () => {
+            Cow::Borrowed(data_type)
+        };
+    }
+
+    let new_data_type = match data_type {
+        // Primitive arrow types that have a direct variant counterpart are allowed
+        Null | Boolean => borrow!(),
+        Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => borrow!(),
+
+        // Unsigned integers and Float16 are not allowed at all
+        UInt8 | UInt16 | UInt32 | UInt64 | Float16 => fail!(),
+
+        // Most decimal types are allowed, with restrictions on precision and scale
+        Decimal32(p, s) if is_valid_variant_decimal(p, s, 9) => borrow!(),
+        Decimal64(p, s) if is_valid_variant_decimal(p, s, 18) => borrow!(),
+        Decimal128(p, s) if is_valid_variant_decimal(p, s, 38) => borrow!(),
+        Decimal32(..) | Decimal64(..) | Decimal128(..) | Decimal256(..) => fail!(),
+
+        // Only microsecond and nanosecond timestamps are supported
+        Timestamp(TimeUnit::Microsecond | TimeUnit::Nanosecond, _) => borrow!(),
+        Timestamp(TimeUnit::Millisecond | TimeUnit::Second, _) => fail!(),
+
+        // Only 32-bit dates and 64-bit microsecond times are supported.
+        Date32 | Time64(TimeUnit::Microsecond) => borrow!(),
+        Date64 | Time32(_) | Time64(_) | Duration(_) | Interval(_) => fail!(),
+
+        // Binary and string are allowed.
+        // NOTE: We force Binary to BinaryView because that's what the parquet reader returns.
+        Binary => Cow::Owned(DataType::BinaryView),
+        BinaryView | Utf8 => borrow!(),
+
+        // UUID maps to 16-byte fixed-size binary; no other width is allowed
+        FixedSizeBinary(16) => borrow!(),
+        FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
+
+        // We can _possibly_ support (some of) these some day?
+        LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => {
+            fail!()
+        }
+
+        // Lists and structs are supported; maps and unions are not
+        List(field) => match canonicalize_and_verify_field(field)? {
+            Cow::Borrowed(_) => borrow!(),
+            Cow::Owned(new_field) => Cow::Owned(DataType::List(new_field)),
+        },
+        Struct(fields) => {
+            // Avoid allocation unless at least one field was rewritten
Review Comment:
Observation: The original code unconditionally built a new type from the ground up, which defeated the purpose of having `Fields` store `Arc<Field>` (rebuilding from scratch breaks sharing across long chains of nested types). So this code collects the set of fields changed by deeper layers, and only constructs a new struct if at least one field actually changed. In the common case where no fields changed, the hashmap is empty (no allocations) and we just return a borrowed version of the input data type.
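The hunk is truncated at the commented line, so for illustration here is a minimal, self-contained sketch of the pattern the comment describes, not the PR's actual code. `canonicalize_field` is a hypothetical stand-in for the PR's `canonicalize_and_verify_field`, whose full signature is not visible above:

```rust
use std::borrow::Cow;
use std::collections::HashMap;
use std::sync::Arc;

use arrow_schema::{ArrowError, DataType, FieldRef, Fields};

// Hypothetical stand-in for the PR's recursive per-field rewrite: returns
// Borrowed when nothing changed, Owned only when the type had to be rewritten.
fn canonicalize_field(field: &FieldRef) -> Result<Cow<'_, FieldRef>, ArrowError> {
    match field.data_type() {
        DataType::Binary => Ok(Cow::Owned(Arc::new(
            field.as_ref().clone().with_data_type(DataType::BinaryView),
        ))),
        _ => Ok(Cow::Borrowed(field)),
    }
}

// Rebuild a Struct type only if at least one child field actually changed.
fn canonicalize_struct<'a>(
    data_type: &'a DataType,
    fields: &Fields,
) -> Result<Cow<'a, DataType>, ArrowError> {
    // Collect only the fields that recursion actually rewrote.
    let mut changed: HashMap<usize, FieldRef> = HashMap::new();
    for (i, field) in fields.iter().enumerate() {
        if let Cow::Owned(new_field) = canonicalize_field(field)? {
            changed.insert(i, new_field);
        }
    }
    // Common case: the map stayed empty, so borrow the input unchanged.
    if changed.is_empty() {
        return Ok(Cow::Borrowed(data_type));
    }
    // Otherwise rebuild, cheaply cloning the Arc of every untouched child.
    let new_fields: Fields = fields
        .iter()
        .enumerate()
        .map(|(i, f)| changed.remove(&i).unwrap_or_else(|| Arc::clone(f)))
        .collect();
    Ok(Cow::Owned(DataType::Struct(new_fields)))
}
```

Keying the map by field index means the unchanged-input path allocates nothing, while the rebuild path reuses the `Arc` of every untouched child instead of deep-copying it.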
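On the decimal side, a few hedged examples of the bounds `is_valid_variant_decimal` enforces in the hunk above (upper precision caps of 9/18/38 and scale restricted to `0..=precision`; the values are chosen purely for illustration):

```rust
// Signature from the diff: is_valid_variant_decimal(p: &u8, s: &i8, max_precision: u8) -> bool
assert!(is_valid_variant_decimal(&9, &2, 9));   // Decimal32: precision 9, scale 2 is accepted
assert!(!is_valid_variant_decimal(&10, &2, 9)); // precision 10 exceeds the Decimal32 cap
assert!(!is_valid_variant_decimal(&9, &-1, 9)); // negative scales are rejected
assert!(!is_valid_variant_decimal(&5, &6, 9));  // scale may not exceed precision
```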