This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 40300ca6f1 perf: Speed up Parquet file writing (10%, back to speed of 
56) (#8786)
40300ca6f1 is described below

commit 40300ca6f17f35cac1da52ae521970655f868425
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Nov 5 12:54:23 2025 -0800

    perf: Speed up Parquet file writing (10%, back to speed of 56) (#8786)
    
    # Which issue does this PR close?
    
    - Partial fix for #8783.
    
    # Rationale for this change
    
    An appreciable slowdown in Parquet writing was noticed. At least part of
    the slowdown seems to stem from changes to `basic::LogicalType` which
    caused cloning the enum to take make longer than previously.
    
    # What changes are included in this PR?
    
    This adds a new `logical_type_ref` call to `BasicTypeInfo` and
    `ColumnDescriptor`. Unlike the existing `logical_type` which returns a
    cloned `Option<LogicalType>`, the new methods return
    `Option<&LogicalType>`. This new function is used in place of
    `logical_type` internally.
    
    # Are these changes tested?
    
    Should be covered by existing tests.
    
    # Are there any user-facing changes?
    No.
    
    A further optimization would be to change `ColumnOrder::get_sort_order`
    to take an `Option<&LogicalType>`, but that is a breaking API change.
---
 parquet/src/column/writer/encoder.rs  |  2 +-
 parquet/src/column/writer/mod.rs      | 14 +++++++-------
 parquet/src/geospatial/accumulator.rs |  2 +-
 parquet/src/record/api.rs             |  2 +-
 parquet/src/schema/types.rs           | 16 ++++++++++++++++
 5 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/parquet/src/column/writer/encoder.rs 
b/parquet/src/column/writer/encoder.rs
index 1cf116dc3c..11d4f3142a 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -375,7 +375,7 @@ fn replace_zero<T: ParquetValueType>(val: &T, descr: 
&ColumnDescriptor, replace:
             T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap()
         }
         Type::FIXED_LEN_BYTE_ARRAY
-            if descr.logical_type() == Some(LogicalType::Float16)
+            if descr.logical_type_ref() == Some(LogicalType::Float16).as_ref()
                 && f16::from_le_bytes(val.as_bytes().try_into().unwrap()) == 
f16::NEG_ZERO =>
         {
             
T::try_from_le_slice(&f16::to_le_bytes(f16::from_f32(replace))).unwrap()
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index fdb94962b6..417c011275 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -868,8 +868,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
             // So truncation of those types could lead to inaccurate min/max 
statistics
             Type::FIXED_LEN_BYTE_ARRAY
                 if !matches!(
-                    self.descr.logical_type(),
-                    Some(LogicalType::Decimal { .. }) | 
Some(LogicalType::Float16)
+                    self.descr.logical_type_ref(),
+                    Some(&LogicalType::Decimal { .. }) | 
Some(&LogicalType::Float16)
                 ) =>
             {
                 true
@@ -882,7 +882,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
 
     /// Returns `true` if this column's logical type is a UTF-8 string.
     fn is_utf8(&self) -> bool {
-        self.get_descriptor().logical_type() == Some(LogicalType::String)
+        self.get_descriptor().logical_type_ref() == Some(&LogicalType::String)
             || self.get_descriptor().converted_type() == ConvertedType::UTF8
     }
 
@@ -1385,7 +1385,7 @@ fn update_max<T: ParquetValueType>(descr: 
&ColumnDescriptor, val: &T, max: &mut
 fn is_nan<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T) -> bool {
     match T::PHYSICAL_TYPE {
         Type::FLOAT | Type::DOUBLE => val != val,
-        Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type() == 
Some(LogicalType::Float16) => {
+        Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type_ref() == 
Some(&LogicalType::Float16) => {
             let val = val.as_bytes();
             let val = f16::from_le_bytes([val[0], val[1]]);
             val.is_nan()
@@ -1421,7 +1421,7 @@ fn compare_greater<T: ParquetValueType>(descr: 
&ColumnDescriptor, a: &T, b: &T)
         Type::INT32 | Type::INT64 => {
             if let Some(LogicalType::Integer {
                 is_signed: false, ..
-            }) = descr.logical_type()
+            }) = descr.logical_type_ref()
             {
                 // need to compare unsigned
                 return compare_greater_unsigned_int(a, b);
@@ -1438,13 +1438,13 @@ fn compare_greater<T: ParquetValueType>(descr: 
&ColumnDescriptor, a: &T, b: &T)
             };
         }
         Type::FIXED_LEN_BYTE_ARRAY | Type::BYTE_ARRAY => {
-            if let Some(LogicalType::Decimal { .. }) = descr.logical_type() {
+            if let Some(LogicalType::Decimal { .. }) = 
descr.logical_type_ref() {
                 return compare_greater_byte_array_decimals(a.as_bytes(), 
b.as_bytes());
             }
             if let ConvertedType::DECIMAL = descr.converted_type() {
                 return compare_greater_byte_array_decimals(a.as_bytes(), 
b.as_bytes());
             }
-            if let Some(LogicalType::Float16) = descr.logical_type() {
+            if let Some(LogicalType::Float16) = descr.logical_type_ref() {
                 return compare_greater_f16(a.as_bytes(), b.as_bytes());
             }
         }
diff --git a/parquet/src/geospatial/accumulator.rs 
b/parquet/src/geospatial/accumulator.rs
index daf7aa959c..c0dcbf3c7b 100644
--- a/parquet/src/geospatial/accumulator.rs
+++ b/parquet/src/geospatial/accumulator.rs
@@ -33,7 +33,7 @@ pub fn try_new_geo_stats_accumulator(
     descr: &ColumnDescPtr,
 ) -> Option<Box<dyn GeoStatsAccumulator>> {
     if !matches!(
-        descr.logical_type(),
+        descr.logical_type_ref(),
         Some(LogicalType::Geometry { .. }) | Some(LogicalType::Geography { .. 
})
     ) {
         return None;
diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs
index 8aa660c3c3..33a1464fa3 100644
--- a/parquet/src/record/api.rs
+++ b/parquet/src/record/api.rs
@@ -756,7 +756,7 @@ impl Field {
                     descr.type_precision(),
                     descr.type_scale(),
                 )),
-                ConvertedType::NONE if descr.logical_type() == 
Some(LogicalType::Float16) => {
+                ConvertedType::NONE if descr.logical_type_ref() == 
Some(&LogicalType::Float16) => {
                     if value.len() != 2 {
                         return Err(general_err!(
                             "Error reading FIXED_LEN_BYTE_ARRAY as FLOAT16. 
Length must be 2, got {}",
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 0dc2a731b9..8845f0f1ee 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -707,11 +707,19 @@ impl BasicTypeInfo {
     }
 
     /// Returns [`LogicalType`] value for the type.
+    ///
+    /// Note that this function will clone the `LogicalType`. If performance 
is a concern,
+    /// use [`Self::logical_type_ref`] instead.
     pub fn logical_type(&self) -> Option<LogicalType> {
         // Unlike ConvertedType, LogicalType cannot implement Copy, thus we 
clone it
         self.logical_type.clone()
     }
 
+    /// Return a reference to the [`LogicalType`] value for the type.
+    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
+        self.logical_type.as_ref()
+    }
+
     /// Returns `true` if id is set, `false` otherwise.
     pub fn has_id(&self) -> bool {
         self.id.is_some()
@@ -908,10 +916,18 @@ impl ColumnDescriptor {
     }
 
     /// Returns [`LogicalType`] for this column.
+    ///
+    /// Note that this function will clone the `LogicalType`. If performance 
is a concern,
+    /// use [`Self::logical_type_ref`] instead.
     pub fn logical_type(&self) -> Option<LogicalType> {
         self.primitive_type.get_basic_info().logical_type()
     }
 
+    /// Returns a reference to the [`LogicalType`] for this column.
+    pub fn logical_type_ref(&self) -> Option<&LogicalType> {
+        self.primitive_type.get_basic_info().logical_type_ref()
+    }
+
     /// Returns physical type for this column.
     /// Note that it will panic if called on a non-primitive type.
     pub fn physical_type(&self) -> PhysicalType {

Reply via email to