This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 223ee58d09 [Parquet] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`, deprecate `get_logical_type` (#8789)
223ee58d09 is described below

commit 223ee58d099e87e9e666e83cd316bd36128f3377
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Nov 11 17:20:18 2025 -0500

    [Parquet] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`, deprecate `get_logical_type` (#8789)
    
    # Which issue does this PR close?
    
    - Builds on https://github.com/apache/arrow-rs/pull/8786 from @etseidl
    
    # Rationale for this change
    
    @etseidl suggested:
    > A further optimization would be to change ColumnOrder::get_sort_order
    to take an Option<&LogicalType>, but that is a breaking API change.
    
    # What changes are included in this PR?
    
    1. Deprecate `ColumnOrder::get_sort_order`
    2. Add `ColumnOrder::sort_order_for_type` that returns the same value
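
    As a rough, untested sketch of how a call site migrates (assuming a release
    that contains this change, and a `&ColumnDescriptor` obtained from the
    file's `SchemaDescriptor`):

    ```rust
    use parquet::basic::{ColumnOrder, SortOrder};
    use parquet::schema::types::ColumnDescriptor;

    fn column_sort_order(col: &ColumnDescriptor) -> SortOrder {
        // Deprecated path: `get_sort_order` takes `Option<LogicalType>`, so the
        // caller had to clone via `col.logical_type()`.
        // New path: borrow the logical type instead of cloning it.
        ColumnOrder::sort_order_for_type(
            col.logical_type_ref(),
            col.converted_type(),
            col.physical_type(),
        )
    }
    ```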
    
    # Are these changes tested?
    
    Yes by CI
    
    Note I was not able to observe any changes in benchmark performance with
    this, so I am not sure it is worth doing
    
    # Are there any user-facing changes?
    There is a new API, but I think not many people use this (low-level)
    API, so the disruption should be minimal
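
    For callers that still need an owned `LogicalType`, a minimal sketch of the
    replacement for the deprecated accessor, following the
    `logical_type_ref().cloned()` pattern used elsewhere in this diff:

    ```rust
    use parquet::basic::LogicalType;
    use parquet::schema::types::ColumnDescriptor;

    // Borrow first, and clone only when an owned value is actually required.
    fn owned_logical_type(col: &ColumnDescriptor) -> Option<LogicalType> {
        col.logical_type_ref().cloned()
    }
    ```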
    
    ---------
    
    Co-authored-by: Ed Seidl <[email protected]>
---
 parquet/src/arrow/arrow_reader/mod.rs   |  9 ++++++---
 parquet/src/arrow/schema/extension.rs   |  4 ++--
 parquet/src/arrow/schema/mod.rs         | 10 +++++-----
 parquet/src/arrow/schema/primitive.rs   | 24 ++++++++++++++----------
 parquet/src/basic.rs                    | 14 +++++++++++++-
 parquet/src/file/metadata/thrift/mod.rs |  8 ++++----
 parquet/src/file/metadata/writer.rs     |  4 ++--
 parquet/src/file/serialized_reader.rs   |  8 ++++----
 parquet/src/geospatial/accumulator.rs   |  2 +-
 parquet/src/schema/printer.rs           |  4 ++--
 parquet/src/schema/types.rs             | 27 +++++++++++++++++++--------
 parquet/src/variant.rs                  |  4 ++--
 parquet/tests/geospatial.rs             | 27 ++++++++++-----------------
 13 files changed, 84 insertions(+), 61 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index c6f2eaeff4..6122b861f4 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -5096,10 +5096,13 @@ mod tests {
             .expect("Error creating reader builder");
 
         let schema = builder.metadata().file_metadata().schema_descr();
-        assert_eq!(schema.column(0).logical_type(), Some(LogicalType::String));
         assert_eq!(
-            schema.column(1).logical_type(),
-            Some(LogicalType::_Unknown { field_id: 2555 })
+            schema.column(0).logical_type_ref(),
+            Some(&LogicalType::String)
+        );
+        assert_eq!(
+            schema.column(1).logical_type_ref(),
+            Some(&LogicalType::_Unknown { field_id: 2555 })
         );
         assert_eq!(schema.column(1).physical_type(), PhysicalType::BYTE_ARRAY);
 
diff --git a/parquet/src/arrow/schema/extension.rs b/parquet/src/arrow/schema/extension.rs
index 9adec8b6c1..fe3e856a6c 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -39,7 +39,7 @@ pub(crate) fn try_add_extension_type(
     mut arrow_field: Field,
     parquet_type: &Type,
 ) -> Result<Field, ParquetError> {
-    let Some(parquet_logical_type) = parquet_type.get_basic_info().logical_type() else {
+    let Some(parquet_logical_type) = parquet_type.get_basic_info().logical_type_ref() else {
         return Ok(arrow_field);
     };
     match parquet_logical_type {
@@ -65,7 +65,7 @@ pub(crate) fn try_add_extension_type(
 ///
 /// This is used to preallocate the metadata hashmap size
 pub(crate) fn has_extension_type(parquet_type: &Type) -> bool {
-    let Some(parquet_logical_type) = parquet_type.get_basic_info().logical_type() else {
+    let Some(parquet_logical_type) = parquet_type.get_basic_info().logical_type_ref() else {
         return false;
     };
     match parquet_logical_type {
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 75603ac866..1d8d2e1fd3 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -1840,7 +1840,7 @@ mod tests {
                // This is because the Arrow conversion always sets logical type,
                 // even if there wasn't originally one.
                 // This is not an issue, but is an inconvenience for this test.
-                match a.logical_type() {
+                match a.logical_type_ref() {
                     Some(_) => {
                         assert_eq!(a, b)
                     }
@@ -2216,8 +2216,8 @@ mod tests {
         let parquet_schema = ArrowSchemaConverter::new().convert(&arrow_schema)?;
 
         assert_eq!(
-            parquet_schema.column(0).logical_type(),
-            Some(LogicalType::Uuid)
+            parquet_schema.column(0).logical_type_ref(),
+            Some(&LogicalType::Uuid)
         );
 
         let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
@@ -2237,8 +2237,8 @@ mod tests {
         let parquet_schema = ArrowSchemaConverter::new().convert(&arrow_schema)?;
 
         assert_eq!(
-            parquet_schema.column(0).logical_type(),
-            Some(LogicalType::Json)
+            parquet_schema.column(0).logical_type_ref(),
+            Some(&LogicalType::Json)
         );
 
         let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index c9f6482c90..8959081bcb 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -164,7 +164,7 @@ fn decimal_256_type(scale: i32, precision: i32) -> Result<DataType> {
 }
 
fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataType> {
-    match (info.logical_type(), info.converted_type()) {
+    match (info.logical_type_ref(), info.converted_type()) {
         (None, ConvertedType::NONE) => Ok(DataType::Int32),
         (
             Some(
@@ -183,7 +183,9 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataTy
             (32, false) => Ok(DataType::UInt32),
             _ => Err(arrow_err!("Cannot create INT32 physical type from {:?}", t)),
         },
-        (Some(LogicalType::Decimal { scale, precision }), _) => decimal_128_type(scale, precision),
+        (Some(LogicalType::Decimal { scale, precision }), _) => {
+            decimal_128_type(*scale, *precision)
+        }
         (Some(LogicalType::Date), _) => Ok(DataType::Date32),
         (Some(LogicalType::Time { unit, .. }), _) => match unit {
            ParquetTimeUnit::MILLIS => Ok(DataType::Time32(TimeUnit::Millisecond)),
@@ -212,7 +214,7 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataTy
 }
 
fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataType> {
-    match (info.logical_type(), info.converted_type()) {
+    match (info.logical_type_ref(), info.converted_type()) {
         (None, ConvertedType::NONE) => Ok(DataType::Int64),
         (
             Some(LogicalType::Integer {
@@ -243,7 +245,7 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataTy
                 ParquetTimeUnit::MICROS => TimeUnit::Microsecond,
                 ParquetTimeUnit::NANOS => TimeUnit::Nanosecond,
             },
-            if is_adjusted_to_u_t_c {
+            if *is_adjusted_to_u_t_c {
                 Some("UTC".into())
             } else {
                 None
@@ -260,7 +262,9 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataTy
             TimeUnit::Microsecond,
             Some("UTC".into()),
         )),
-        (Some(LogicalType::Decimal { scale, precision }), _) => decimal_128_type(scale, precision),
+        (Some(LogicalType::Decimal { scale, precision }), _) => {
+            decimal_128_type(*scale, *precision)
+        }
         (None, ConvertedType::DECIMAL) => decimal_128_type(scale, precision),
         (logical, converted) => Err(arrow_err!(
             "Unable to convert parquet INT64 logical type {:?} or converted 
type {}",
@@ -271,7 +275,7 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) -> Result<DataTy
 }
 
fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result<DataType> {
-    match (info.logical_type(), info.converted_type()) {
+    match (info.logical_type_ref(), info.converted_type()) {
         (Some(LogicalType::String), _) => Ok(DataType::Utf8),
         (Some(LogicalType::Json), _) => Ok(DataType::Utf8),
         (Some(LogicalType::Bson), _) => Ok(DataType::Binary),
@@ -290,7 +294,7 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result<D
                 precision: p,
             }),
             _,
-        ) => decimal_type(s, p),
+        ) => decimal_type(*s, *p),
         (None, ConvertedType::DECIMAL) => decimal_type(scale, precision),
         (logical, converted) => Err(arrow_err!(
             "Unable to convert parquet BYTE_ARRAY logical type {:?} or 
converted type {}",
@@ -307,12 +311,12 @@ fn from_fixed_len_byte_array(
     type_length: i32,
 ) -> Result<DataType> {
     // TODO: This should check the type length for the decimal and interval types
-    match (info.logical_type(), info.converted_type()) {
+    match (info.logical_type_ref(), info.converted_type()) {
         (Some(LogicalType::Decimal { scale, precision }), _) => {
             if type_length <= 16 {
-                decimal_128_type(scale, precision)
+                decimal_128_type(*scale, *precision)
             } else {
-                decimal_256_type(scale, precision)
+                decimal_256_type(*scale, *precision)
             }
         }
         (None, ConvertedType::DECIMAL) => {
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index eaa889bb99..9566454cb0 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -1132,12 +1132,24 @@ pub enum ColumnOrder {
 
 impl ColumnOrder {
     /// Returns sort order for a physical/logical type.
+    #[deprecated(
+        since = "57.1.0",
+        note = "use `ColumnOrder::sort_order_for_type` instead"
+    )]
     pub fn get_sort_order(
         logical_type: Option<LogicalType>,
         converted_type: ConvertedType,
         physical_type: Type,
     ) -> SortOrder {
-        // TODO: Should this take converted and logical type, for compatibility?
+        Self::sort_order_for_type(logical_type.as_ref(), converted_type, physical_type)
+    }
+
+    /// Returns sort order for a physical/logical type.
+    pub fn sort_order_for_type(
+        logical_type: Option<&LogicalType>,
+        converted_type: ConvertedType,
+        physical_type: Type,
+    ) -> SortOrder {
         match logical_type {
             Some(logical) => match logical {
             LogicalType::String | LogicalType::Enum | LogicalType::Json | LogicalType::Bson => {
diff --git a/parquet/src/file/metadata/thrift/mod.rs b/parquet/src/file/metadata/thrift/mod.rs
index 175a152839..6fdb0e0c4b 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -823,8 +823,8 @@ pub(crate) fn parquet_metadata_from_bytes(
     let column_orders = column_orders.map(|mut cos| {
         for (i, column) in schema_descr.columns().iter().enumerate() {
             if let ColumnOrder::TYPE_DEFINED_ORDER(_) = cos[i] {
-                let sort_order = ColumnOrder::get_sort_order(
-                    column.logical_type(),
+                let sort_order = ColumnOrder::sort_order_for_type(
+                    column.logical_type_ref(),
                     column.converted_type(),
                     column.physical_type(),
                 );
@@ -1396,7 +1396,7 @@ fn write_schema_helper<W: Write>(
                 } else {
                     None
                 },
-                logical_type: basic_info.logical_type(),
+                logical_type: basic_info.logical_type_ref().cloned(),
             };
             element.write_thrift(writer)
         }
@@ -1424,7 +1424,7 @@ fn write_schema_helper<W: Write>(
                 } else {
                     None
                 },
-                logical_type: basic_info.logical_type(),
+                logical_type: basic_info.logical_type_ref().cloned(),
             };
 
             element.write_thrift(writer)?;
diff --git a/parquet/src/file/metadata/writer.rs b/parquet/src/file/metadata/writer.rs
index 124bc11bdd..5bb5f5cd85 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -156,8 +156,8 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
             .columns()
             .iter()
             .map(|col| {
-                let sort_order = ColumnOrder::get_sort_order(
-                    col.logical_type(),
+                let sort_order = ColumnOrder::sort_order_for_type(
+                    col.logical_type_ref(),
                     col.converted_type(),
                     col.physical_type(),
                 );
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 990b2f4f16..ce608c7717 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -2735,12 +2735,12 @@ mod tests {
 
         let schema = reader.metadata().file_metadata().schema_descr();
         assert_eq!(
-            schema.column(0).logical_type(),
-            Some(basic::LogicalType::String)
+            schema.column(0).logical_type_ref(),
+            Some(&basic::LogicalType::String)
         );
         assert_eq!(
-            schema.column(1).logical_type(),
-            Some(basic::LogicalType::_Unknown { field_id: 2555 })
+            schema.column(1).logical_type_ref(),
+            Some(&basic::LogicalType::_Unknown { field_id: 2555 })
         );
         assert_eq!(schema.column(1).physical_type(), Type::BYTE_ARRAY);
 
diff --git a/parquet/src/geospatial/accumulator.rs b/parquet/src/geospatial/accumulator.rs
index c0dcbf3c7b..d25a47930f 100644
--- a/parquet/src/geospatial/accumulator.rs
+++ b/parquet/src/geospatial/accumulator.rs
@@ -118,7 +118,7 @@ pub struct DefaultGeoStatsAccumulatorFactory {}
 impl GeoStatsAccumulatorFactory for DefaultGeoStatsAccumulatorFactory {
     fn new_accumulator(&self, _descr: &ColumnDescPtr) -> Box<dyn GeoStatsAccumulator> {
         #[cfg(feature = "geospatial")]
-        if let Some(crate::basic::LogicalType::Geometry { .. }) = _descr.logical_type() {
+        if let Some(crate::basic::LogicalType::Geometry { .. }) = _descr.logical_type_ref() {
             Box::new(ParquetGeoStatsAccumulator::default())
         } else {
             Box::new(VoidGeoStatsAccumulator::default())
diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs
index 838d84a1b1..4ad3b6b93e 100644
--- a/parquet/src/schema/printer.rs
+++ b/parquet/src/schema/printer.rs
@@ -400,7 +400,7 @@ impl Printer<'_> {
                 // Also print logical type if it is available
                 // If there is a logical type, do not print converted type
                 let logical_type_str = print_logical_and_converted(
-                    basic_info.logical_type().as_ref(),
+                    basic_info.logical_type_ref(),
                     basic_info.converted_type(),
                     precision,
                     scale,
@@ -426,7 +426,7 @@ impl Printer<'_> {
                         write!(self.output, "[{}] ", basic_info.id());
                     }
                     let logical_str = print_logical_and_converted(
-                        basic_info.logical_type().as_ref(),
+                        basic_info.logical_type_ref(),
                         basic_info.converted_type(),
                         0,
                         0,
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 8845f0f1ee..85f3ed4897 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -211,8 +211,8 @@ impl Type {
     pub(crate) fn is_list(&self) -> bool {
         if self.is_group() {
             let basic_info = self.get_basic_info();
-            if let Some(logical_type) = basic_info.logical_type() {
-                return logical_type == LogicalType::List;
+            if let Some(logical_type) = basic_info.logical_type_ref() {
+                return logical_type == &LogicalType::List;
             }
             return basic_info.converted_type() == ConvertedType::LIST;
         }
@@ -710,6 +710,10 @@ impl BasicTypeInfo {
     ///
     /// Note that this function will clone the `LogicalType`. If performance is a concern,
     /// use [`Self::logical_type_ref`] instead.
+    #[deprecated(
+        since = "57.1.0",
+        note = "use `BasicTypeInfo::logical_type_ref` instead (LogicalType 
cloning is non trivial)"
+    )]
     pub fn logical_type(&self) -> Option<LogicalType> {
         // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
         self.logical_type.clone()
@@ -919,8 +923,15 @@ impl ColumnDescriptor {
     ///
     /// Note that this function will clone the `LogicalType`. If performance is a concern,
     /// use [`Self::logical_type_ref`] instead.
+    #[deprecated(
+        since = "57.1.0",
+        note = "use `ColumnDescriptor::logical_type_ref` instead (LogicalType 
cloning is non trivial)"
+    )]
     pub fn logical_type(&self) -> Option<LogicalType> {
-        self.primitive_type.get_basic_info().logical_type()
+        self.primitive_type
+            .get_basic_info()
+            .logical_type_ref()
+            .cloned()
     }
 
     /// Returns a reference to the [`LogicalType`] for this column.
@@ -966,8 +977,8 @@ impl ColumnDescriptor {
 
     /// Returns the sort order for this column
     pub fn sort_order(&self) -> SortOrder {
-        ColumnOrder::get_sort_order(
-            self.logical_type(),
+        ColumnOrder::sort_order_for_type(
+            self.logical_type_ref(),
             self.converted_type(),
             self.physical_type(),
         )
@@ -1417,8 +1428,8 @@ mod tests {
             let basic_info = tp.get_basic_info();
             assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
             assert_eq!(
-                basic_info.logical_type(),
-                Some(LogicalType::Integer {
+                basic_info.logical_type_ref(),
+                Some(&LogicalType::Integer {
                     bit_width: 32,
                     is_signed: true
                 })
@@ -1768,7 +1779,7 @@ mod tests {
         assert!(tp.is_group());
         assert!(!tp.is_primitive());
         assert_eq!(basic_info.repetition(), Repetition::REPEATED);
-        assert_eq!(basic_info.logical_type(), Some(LogicalType::List));
+        assert_eq!(basic_info.logical_type_ref(), Some(&LogicalType::List));
         assert_eq!(basic_info.converted_type(), ConvertedType::LIST);
         assert_eq!(basic_info.id(), 1);
         assert_eq!(tp.get_fields().len(), 2);
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
index d2d62eea6b..b7dde6b3c8 100644
--- a/parquet/src/variant.rs
+++ b/parquet/src/variant.rs
@@ -198,8 +198,8 @@ mod tests {
         assert_eq!(field.name(), "data");
         // data should have been written with the Variant logical type
         assert_eq!(
-            field.get_basic_info().logical_type(),
-            Some(crate::basic::LogicalType::Variant {
+            field.get_basic_info().logical_type_ref(),
+            Some(&crate::basic::LogicalType::Variant {
                 specification_version: None
             })
         );
diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs
index 614acbcf45..b8c623a2a8 100644
--- a/parquet/tests/geospatial.rs
+++ b/parquet/tests/geospatial.rs
@@ -66,28 +66,21 @@ fn test_read_logical_type() {
 
     for (geospatial_file, expected_type) in expected_logical_type {
         let metadata = read_metadata(geospatial_file);
-        let logical_type = metadata
-            .file_metadata()
-            .schema_descr()
-            .column(1)
-            .logical_type()
-            .unwrap();
-
-        assert_eq!(logical_type, expected_type);
+        let column_descr = metadata.file_metadata().schema_descr().column(1);
+        let logical_type = column_descr.logical_type_ref().unwrap();
+
+        assert_eq!(logical_type, &expected_type);
     }
 
     // The crs value may also contain arbitrary values (in this case some JSON
     // a bit too lengthy to type out)
     let metadata = read_metadata("crs-arbitrary-value.parquet");
-    let logical_type = metadata
-        .file_metadata()
-        .schema_descr()
-        .column(1)
-        .logical_type()
-        .unwrap();
+    let column_descr = metadata.file_metadata().schema_descr().column(1);
+    let logical_type = column_descr.logical_type_ref().unwrap();
 
     if let LogicalType::Geometry { crs } = logical_type {
-        let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap();
+        let crs = crs.as_ref();
+        let crs_parsed: Value = serde_json::from_str(crs.unwrap()).unwrap();
         assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070);
     } else {
         panic!("Expected geometry type but got {logical_type:?}");
@@ -103,8 +96,8 @@ fn test_read_geospatial_statistics() {
     //    optional binary field_id=-1 wkt (String);
     //    optional binary field_id=-1 geometry (Geometry(crs=));
     let fields = metadata.file_metadata().schema().get_fields();
-    let logical_type = fields[2].get_basic_info().logical_type().unwrap();
-    assert_eq!(logical_type, LogicalType::Geometry { crs: None });
+    let logical_type = fields[2].get_basic_info().logical_type_ref().unwrap();
+    assert_eq!(logical_type, &LogicalType::Geometry { crs: None });
 
     let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
     assert!(geo_statistics.is_some());
