This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 40300ca6f1 perf: Speed up Parquet file writing (10%, back to speed of
56) (#8786)
40300ca6f1 is described below
commit 40300ca6f17f35cac1da52ae521970655f868425
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Nov 5 12:54:23 2025 -0800
perf: Speed up Parquet file writing (10%, back to speed of 56) (#8786)
# Which issue does this PR close?
- Partial fix for #8783.
# Rationale for this change
An appreciable slowdown in Parquet writing was noticed. At least part of
the slowdown seems to stem from changes to `basic::LogicalType` which
caused cloning the enum to take make longer than previously.
# What changes are included in this PR?
This adds a new `logical_type_ref` call to `BasicTypeInfo` and
`ColumnDescriptor`. Unlike the existing `logical_type` which returns a
cloned `Option<LogicalType>`, the new methods return
`Option<&LogicalType>`. This new function is used in place of
`logical_type` internally.
# Are these changes tested?
Should be covered by existing tests.
# Are there any user-facing changes?
No.
A further optimization would be to change `ColumnOrder::get_sort_order`
to take an `Option<&LogicalType>`, but that is a breaking API change.
---
parquet/src/column/writer/encoder.rs | 2 +-
parquet/src/column/writer/mod.rs | 14 +++++++-------
parquet/src/geospatial/accumulator.rs | 2 +-
parquet/src/record/api.rs | 2 +-
parquet/src/schema/types.rs | 16 ++++++++++++++++
5 files changed, 26 insertions(+), 10 deletions(-)
diff --git a/parquet/src/column/writer/encoder.rs
b/parquet/src/column/writer/encoder.rs
index 1cf116dc3c..11d4f3142a 100644
--- a/parquet/src/column/writer/encoder.rs
+++ b/parquet/src/column/writer/encoder.rs
@@ -375,7 +375,7 @@ fn replace_zero<T: ParquetValueType>(val: &T, descr:
&ColumnDescriptor, replace:
T::try_from_le_slice(&f64::to_le_bytes(replace as f64)).unwrap()
}
Type::FIXED_LEN_BYTE_ARRAY
- if descr.logical_type() == Some(LogicalType::Float16)
+ if descr.logical_type_ref() == Some(LogicalType::Float16).as_ref()
&& f16::from_le_bytes(val.as_bytes().try_into().unwrap()) ==
f16::NEG_ZERO =>
{
T::try_from_le_slice(&f16::to_le_bytes(f16::from_f32(replace))).unwrap()
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index fdb94962b6..417c011275 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -868,8 +868,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
// So truncation of those types could lead to inaccurate min/max
statistics
Type::FIXED_LEN_BYTE_ARRAY
if !matches!(
- self.descr.logical_type(),
- Some(LogicalType::Decimal { .. }) |
Some(LogicalType::Float16)
+ self.descr.logical_type_ref(),
+ Some(&LogicalType::Decimal { .. }) |
Some(&LogicalType::Float16)
) =>
{
true
@@ -882,7 +882,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
/// Returns `true` if this column's logical type is a UTF-8 string.
fn is_utf8(&self) -> bool {
- self.get_descriptor().logical_type() == Some(LogicalType::String)
+ self.get_descriptor().logical_type_ref() == Some(&LogicalType::String)
|| self.get_descriptor().converted_type() == ConvertedType::UTF8
}
@@ -1385,7 +1385,7 @@ fn update_max<T: ParquetValueType>(descr:
&ColumnDescriptor, val: &T, max: &mut
fn is_nan<T: ParquetValueType>(descr: &ColumnDescriptor, val: &T) -> bool {
match T::PHYSICAL_TYPE {
Type::FLOAT | Type::DOUBLE => val != val,
- Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type() ==
Some(LogicalType::Float16) => {
+ Type::FIXED_LEN_BYTE_ARRAY if descr.logical_type_ref() ==
Some(&LogicalType::Float16) => {
let val = val.as_bytes();
let val = f16::from_le_bytes([val[0], val[1]]);
val.is_nan()
@@ -1421,7 +1421,7 @@ fn compare_greater<T: ParquetValueType>(descr:
&ColumnDescriptor, a: &T, b: &T)
Type::INT32 | Type::INT64 => {
if let Some(LogicalType::Integer {
is_signed: false, ..
- }) = descr.logical_type()
+ }) = descr.logical_type_ref()
{
// need to compare unsigned
return compare_greater_unsigned_int(a, b);
@@ -1438,13 +1438,13 @@ fn compare_greater<T: ParquetValueType>(descr:
&ColumnDescriptor, a: &T, b: &T)
};
}
Type::FIXED_LEN_BYTE_ARRAY | Type::BYTE_ARRAY => {
- if let Some(LogicalType::Decimal { .. }) = descr.logical_type() {
+ if let Some(LogicalType::Decimal { .. }) =
descr.logical_type_ref() {
return compare_greater_byte_array_decimals(a.as_bytes(),
b.as_bytes());
}
if let ConvertedType::DECIMAL = descr.converted_type() {
return compare_greater_byte_array_decimals(a.as_bytes(),
b.as_bytes());
}
- if let Some(LogicalType::Float16) = descr.logical_type() {
+ if let Some(LogicalType::Float16) = descr.logical_type_ref() {
return compare_greater_f16(a.as_bytes(), b.as_bytes());
}
}
diff --git a/parquet/src/geospatial/accumulator.rs
b/parquet/src/geospatial/accumulator.rs
index daf7aa959c..c0dcbf3c7b 100644
--- a/parquet/src/geospatial/accumulator.rs
+++ b/parquet/src/geospatial/accumulator.rs
@@ -33,7 +33,7 @@ pub fn try_new_geo_stats_accumulator(
descr: &ColumnDescPtr,
) -> Option<Box<dyn GeoStatsAccumulator>> {
if !matches!(
- descr.logical_type(),
+ descr.logical_type_ref(),
Some(LogicalType::Geometry { .. }) | Some(LogicalType::Geography { ..
})
) {
return None;
diff --git a/parquet/src/record/api.rs b/parquet/src/record/api.rs
index 8aa660c3c3..33a1464fa3 100644
--- a/parquet/src/record/api.rs
+++ b/parquet/src/record/api.rs
@@ -756,7 +756,7 @@ impl Field {
descr.type_precision(),
descr.type_scale(),
)),
- ConvertedType::NONE if descr.logical_type() ==
Some(LogicalType::Float16) => {
+ ConvertedType::NONE if descr.logical_type_ref() ==
Some(&LogicalType::Float16) => {
if value.len() != 2 {
return Err(general_err!(
"Error reading FIXED_LEN_BYTE_ARRAY as FLOAT16.
Length must be 2, got {}",
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 0dc2a731b9..8845f0f1ee 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -707,11 +707,19 @@ impl BasicTypeInfo {
}
/// Returns [`LogicalType`] value for the type.
+ ///
+ /// Note that this function will clone the `LogicalType`. If performance
is a concern,
+ /// use [`Self::logical_type_ref`] instead.
pub fn logical_type(&self) -> Option<LogicalType> {
// Unlike ConvertedType, LogicalType cannot implement Copy, thus we
clone it
self.logical_type.clone()
}
+ /// Return a reference to the [`LogicalType`] value for the type.
+ pub fn logical_type_ref(&self) -> Option<&LogicalType> {
+ self.logical_type.as_ref()
+ }
+
/// Returns `true` if id is set, `false` otherwise.
pub fn has_id(&self) -> bool {
self.id.is_some()
@@ -908,10 +916,18 @@ impl ColumnDescriptor {
}
/// Returns [`LogicalType`] for this column.
+ ///
+ /// Note that this function will clone the `LogicalType`. If performance
is a concern,
+ /// use [`Self::logical_type_ref`] instead.
pub fn logical_type(&self) -> Option<LogicalType> {
self.primitive_type.get_basic_info().logical_type()
}
+ /// Returns a reference to the [`LogicalType`] for this column.
+ pub fn logical_type_ref(&self) -> Option<&LogicalType> {
+ self.primitive_type.get_basic_info().logical_type_ref()
+ }
+
/// Returns physical type for this column.
/// Note that it will panic if called on a non-primitive type.
pub fn physical_type(&self) -> PhysicalType {