alamb commented on code in PR #9997:
URL: https://github.com/apache/arrow-rs/pull/9997#discussion_r3283543399


##########
parquet/src/parquet_macros.rs:
##########
@@ -260,6 +260,85 @@ macro_rules! thrift_union {
     }
 }
 
+/// Macro used to generate Rust enums for Thrift unions where variants are a 
mix of unit and
+/// tuple types. This version allows for unknown variants for forwards 
compatibility.
+///
+/// Use of this macro requires modifying the thrift IDL. For variants with 
empty structs as their
+/// type, delete the typename (i.e. `1: EmptyStruct Var1;` becomes `1: Var1`). 
For variants with a
+/// non-empty type, the typename must be contained within parens (e.g. `1: 
MyType Var1;` becomes
+/// `1: (MyType) Var1;`).
+///
+/// This macro allows for specifying lifetime annotations for the resulting 
`enum` and its fields.
+///
+/// When utilizing this macro the Thrift serialization traits and structs need 
to be in scope.
+#[macro_export]
+#[allow(clippy::crate_in_macro_def)]
+macro_rules! thrift_union_with_unknown {
+    ($(#[$($def_attrs:tt)*])* union $identifier:ident $(< $lt:lifetime >)? { 
$($(#[$($field_attrs:tt)*])* $field_id:literal : $( ( $field_type:ident $(< 
$element_type:ident >)? $(< $field_lt:lifetime >)?) )? $field_name:ident 
$(;)?)* }) => {
+        $(#[cfg_attr(not(doctest), $($def_attrs)*)])*
+        #[derive(Clone, Debug, Eq, PartialEq)]
+        #[allow(non_camel_case_types)]
+        #[allow(non_snake_case)]
+        #[allow(missing_docs)]
+        pub enum $identifier $(<$lt>)? {
+            $($(#[cfg_attr(not(doctest), $($field_attrs)*)])* $field_name $( ( 
$crate::__thrift_union_type!{$field_type $($field_lt)? $($element_type)?} ) 
)?),*,
+            _Unknown {
+                /// The field id encountered when parsing the unknown variant.
+                field_id: i16,
+            },
+        }
+
+        impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for 
$identifier $(<$lt>)? {
+            fn read_thrift(prot: &mut R) -> Result<Self> {
+                let field_ident = prot.read_field_begin(0)?;
+                if field_ident.field_type == FieldType::Stop {
+                    return Err(general_err!("Received empty union from remote 
{}", stringify!($identifier)));
+                }
+                let ret = match field_ident.id {
+                    $($field_id => {
+                        let val = $crate::__thrift_read_variant!(prot, 
$field_name $($field_type $($element_type)?)?);
+                        val
+                    })*
+                    _ => {
+                        prot.skip(field_ident.field_type)?;
+                        Self::_Unknown {

Review Comment:
   I don't think this logic is tested (skipping unknown fields)



##########
parquet/src/basic.rs:
##########
@@ -183,383 +186,166 @@ union TimeUnit {
 // ----------------------------------------------------------------------
 // Mirrors thrift union `LogicalType`
 
-// private structs for decoding logical type
-
 thrift_struct!(
-struct DecimalType {
+pub struct DecimalType {
+  /// The number of digits in the decimal.
   1: required i32 scale
+  /// The location of the decimal point.
   2: required i32 precision
 }
 );
 
 thrift_struct!(
-struct TimestampType {
+pub struct TimestampType {
+  /// Whether the timestamp is adjusted to UTC.
   1: required bool is_adjusted_to_u_t_c
+  /// The unit of time.
   2: required TimeUnit unit
 }
 );
 
-// they are identical
-use TimestampType as TimeType;
+/// Identical to [`TimestampType`]
+pub use TimestampType as TimeType;
 
 thrift_struct!(
-struct IntType {
+pub struct IntType {
+  /// The number of bits in the integer.
   1: required i8 bit_width
+  /// Whether the integer is signed.
   2: required bool is_signed
 }
 );
 
 thrift_struct!(
-struct VariantType {
-  // The version of the variant specification that the variant was
-  // written with.
+pub struct VariantType {
+  /// The version of the variant specification that the variant was
+  /// written with.
   1: optional i8 specification_version
 }
 );
 
 thrift_struct!(
-struct GeometryType<'a> {
-  1: optional string<'a> crs;
+pub struct GeometryType {
+  /// A custom CRS. If unset the CRS `OGC:CRS84` should be used, which means 
that the geometries
+  /// must be stored in longitude, latitude based on the WGS84 datum.
+  1: optional string crs;
 }
 );
 
 thrift_struct!(
-struct GeographyType<'a> {
-  1: optional string<'a> crs;
+pub struct GeographyType {
+  /// A custom CRS. If unset the CRS `OGC:CRS84` should be used.
+  1: optional string crs;
+  /// An optional algorithm can be set to correctly interpret edges 
interpolation
+  /// of the geometries. If unset, the `SPHERICAL` algorithm should be used.
   2: optional EdgeInterpolationAlgorithm algorithm;
 }
 );
 
-// TODO(ets): should we switch to tuple variants so we can use
-// the thrift macros?
+impl GeographyType {
+    /// Accessor for the `GeographyType::algorithm` field. If this field is 
not set, this
+    /// function returns the default value (currently 
[`EdgeInterpolationAlgorithm::SPHERICAL`]
+    /// per the Parquet [specification]).
+    ///
+    /// [specification]: 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#geography
+    pub fn algorithm(&self) -> Option<EdgeInterpolationAlgorithm> {
+        self.algorithm.or(Some(Default::default()))
+    }
+}
 
+thrift_union_with_unknown!(
 /// Logical types used by version 2.4.0+ of the Parquet format.
 ///
 /// This is an *entirely new* struct as of version
 /// 4.0.0. The struct previously named `LogicalType` was renamed to
 /// [`ConvertedType`]. Please see the README.md for more details.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub enum LogicalType {
-    /// A UTF8 encoded string.
-    String,
-    /// A map of key-value pairs.
-    Map,
-    /// A list of elements.
-    List,
-    /// A set of predefined values.
-    Enum,
-    /// A decimal value with a specified scale and precision.
-    Decimal {
-        /// The number of digits in the decimal.
-        scale: i32,
-        /// The location of the decimal point.
-        precision: i32,
-    },
-    /// A date stored as days since Unix epoch.
-    Date,
-    /// A time stored as [`TimeUnit`] since midnight.
-    Time {
-        /// Whether the time is adjusted to UTC.
-        is_adjusted_to_u_t_c: bool,
-        /// The unit of time.
-        unit: TimeUnit,
-    },
-    /// A timestamp stored as [`TimeUnit`] since Unix epoch.
-    Timestamp {
-        /// Whether the timestamp is adjusted to UTC.
-        is_adjusted_to_u_t_c: bool,
-        /// The unit of time.
-        unit: TimeUnit,
-    },
-    /// An integer with a specified bit width and signedness.
-    Integer {
-        /// The number of bits in the integer.
-        bit_width: i8,
-        /// Whether the integer is signed.
-        is_signed: bool,
-    },
-    /// An unknown logical type.
-    Unknown,
-    /// A JSON document.
-    Json,
-    /// A BSON document.
-    Bson,
-    /// A UUID.
-    Uuid,
-    /// A 16-bit floating point number.
-    Float16,
-    /// A Variant value.
-    Variant {
-        /// The version of the variant specification that the variant was 
written with.
-        specification_version: Option<i8>,
-    },
-    /// A geospatial feature in the Well-Known Binary (WKB) format with 
linear/planar edges interpolation.
-    Geometry {
-        /// A custom CRS. If unset the defaults to `OGC:CRS84`, which means 
that the geometries
-        /// must be stored in longitude, latitude based on the WGS84 datum.
-        crs: Option<String>,
-    },
-    /// A geospatial feature in the WKB format with an explicit 
(non-linear/non-planar) edges interpolation.
-    Geography {
-        /// A custom CRS. If unset the defaults to `OGC:CRS84`.
-        crs: Option<String>,
-        /// An optional algorithm can be set to correctly interpret edges 
interpolation
-        /// of the geometries. If unset, the algorithm defaults to `SPHERICAL`.
-        algorithm: Option<EdgeInterpolationAlgorithm>,
-    },
-    /// For forward compatibility; used when an unknown union value is 
encountered.
-    _Unknown {
-        /// The field id encountered when parsing the unknown logical type.
-        field_id: i16,
-    },
+union LogicalType {
+   /// A UTF8 encoded string.
+   1:  String
+   /// A map of key-value pairs.
+   2:  Map
+   /// A list of elements.
+   3:  List
+   /// A set of predefined values.
+   4:  Enum
+   /// A decimal value with a specified scale and precision.
+   5:  (DecimalType) Decimal
+   /// A date stored as days since Unix epoch.
+   6:  Date
+   /// A time stored as [`TimeUnit`] since midnight.
+   7:  (TimeType) Time
+   /// A timestamp stored as [`TimeUnit`] since Unix epoch.
+   8:  (TimestampType) Timestamp
+   // 9: reserved for INTERVAL
+   /// An integer with a specified bit width and signedness.
+   10: (IntType) Integer
+   /// An unknown logical type.
+   11: Unknown
+   /// A JSON document.
+   12: Json
+   /// A BSON document.
+   13: Bson
+   /// A UUID.
+   14: Uuid
+   /// A 16-bit floating point number.
+   15: Float16
+   /// A Variant value.
+   16: (VariantType) Variant
+   /// A geospatial feature in the Well-Known Binary (WKB) format with 
linear/planar edges interpolation.
+   17: (GeometryType) Geometry
+   /// A geospatial feature in the WKB format with an explicit 
(non-linear/non-planar) edges interpolation.
+   18: (GeographyType) Geography
 }
+);
 
 impl LogicalType {
     /// Create a [`LogicalType::Integer`] variant with the given `bit_width` 
and `is_signed`
     pub fn integer(bit_width: i8, is_signed: bool) -> Self {
-        Self::Integer {
+        Self::Integer(IntType {
             bit_width,
             is_signed,
-        }
+        })
     }
 
     /// Create a [`LogicalType::Decimal`] variant with the given `scale` and 
`precision`
     pub fn decimal(scale: i32, precision: i32) -> Self {
-        Self::Decimal { scale, precision }
+        Self::Decimal(DecimalType { scale, precision })
     }
 
     /// Create a [`LogicalType::Time`] variant with the given 
`is_adjusted_to_u_t_c` and `unit`
     pub fn time(is_adjusted_to_u_t_c: bool, unit: TimeUnit) -> Self {
-        Self::Time {
+        Self::Time(TimeType {
             is_adjusted_to_u_t_c,
             unit,
-        }
+        })
     }
 
     /// Create a [`LogicalType::Timestamp`] variant with the given 
`is_adjusted_to_u_t_c` and `unit`
     pub fn timestamp(is_adjusted_to_u_t_c: bool, unit: TimeUnit) -> Self {
-        Self::Timestamp {
+        Self::Timestamp(TimestampType {
             is_adjusted_to_u_t_c,
             unit,
-        }
+        })
     }
 
     /// Create a [`LogicalType::Variant`] variant with the given 
`specification_version`
     pub fn variant(specification_version: Option<i8>) -> Self {
-        Self::Variant {
+        Self::Variant(VariantType {
             specification_version,
-        }
+        })
     }
 
     /// Create a [`LogicalType::Geometry`] variant with the given `crs`
     pub fn geometry(crs: Option<String>) -> Self {
-        Self::Geometry { crs }
+        Self::Geometry(GeometryType { crs })
     }
 
     /// Create a [`LogicalType::Geography`] variant with the given `crs` and 
`algorithm`
     pub fn geography(crs: Option<String>, algorithm: 
Option<EdgeInterpolationAlgorithm>) -> Self {
-        Self::Geography { crs, algorithm }
-    }
-}
-
-impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a, R> for LogicalType {

Review Comment:
   this is the custom parser that is now being generated by macros



##########
parquet/src/basic.rs:
##########
@@ -183,383 +186,166 @@ union TimeUnit {
 // ----------------------------------------------------------------------
 // Mirrors thrift union `LogicalType`
 
-// private structs for decoding logical type
-
 thrift_struct!(
-struct DecimalType {
+pub struct DecimalType {
+  /// The number of digits in the decimal.

Review Comment:
   I think scale is the location of the decimal place and precision is the 
number of digits (in other words I think these comments are backwards for the 
two fields)
   
   However, it appears they were backwards on the old fields too 😆 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to