This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 223ee58d09 [Parquet] Avoid copying `LogicalType` in
`ColumnOrder::get_sort_order`, deprecate `get_logical_type` (#8789)
223ee58d09 is described below
commit 223ee58d099e87e9e666e83cd316bd36128f3377
Author: Andrew Lamb <[email protected]>
AuthorDate: Tue Nov 11 17:20:18 2025 -0500
[Parquet] Avoid copying `LogicalType` in `ColumnOrder::get_sort_order`,
deprecate `get_logical_type` (#8789)
# Which issue does this PR close?
- Builds on https://github.com/apache/arrow-rs/pull/8786 from @etseidl
# Rationale for this change
@etseidl suggested:
> A further optimization would be to change ColumnOrder::get_sort_order
to take an Option<&LogicalType>, but that is a breaking API change.
# What changes are included in this PR?
1. Deprecate `ColumnOrder::get_sort_order`
2. Add ColumnOrder::sort_order_for_type` that returns the same value
# Are these changes tested?
Yes by CI
Note I was not able to observe any changes in benchmark performance with
this, so I am not sure it it worth doing
# Are there any user-facing changes?
There is a new API but I think there not many people use this (low
level) API so the disruption should be minimal
---------
Co-authored-by: Ed Seidl <[email protected]>
---
parquet/src/arrow/arrow_reader/mod.rs | 9 ++++++---
parquet/src/arrow/schema/extension.rs | 4 ++--
parquet/src/arrow/schema/mod.rs | 10 +++++-----
parquet/src/arrow/schema/primitive.rs | 24 ++++++++++++++----------
parquet/src/basic.rs | 14 +++++++++++++-
parquet/src/file/metadata/thrift/mod.rs | 8 ++++----
parquet/src/file/metadata/writer.rs | 4 ++--
parquet/src/file/serialized_reader.rs | 8 ++++----
parquet/src/geospatial/accumulator.rs | 2 +-
parquet/src/schema/printer.rs | 4 ++--
parquet/src/schema/types.rs | 27 +++++++++++++++++++--------
parquet/src/variant.rs | 4 ++--
parquet/tests/geospatial.rs | 27 ++++++++++-----------------
13 files changed, 84 insertions(+), 61 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index c6f2eaeff4..6122b861f4 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -5096,10 +5096,13 @@ mod tests {
.expect("Error creating reader builder");
let schema = builder.metadata().file_metadata().schema_descr();
- assert_eq!(schema.column(0).logical_type(), Some(LogicalType::String));
assert_eq!(
- schema.column(1).logical_type(),
- Some(LogicalType::_Unknown { field_id: 2555 })
+ schema.column(0).logical_type_ref(),
+ Some(&LogicalType::String)
+ );
+ assert_eq!(
+ schema.column(1).logical_type_ref(),
+ Some(&LogicalType::_Unknown { field_id: 2555 })
);
assert_eq!(schema.column(1).physical_type(), PhysicalType::BYTE_ARRAY);
diff --git a/parquet/src/arrow/schema/extension.rs
b/parquet/src/arrow/schema/extension.rs
index 9adec8b6c1..fe3e856a6c 100644
--- a/parquet/src/arrow/schema/extension.rs
+++ b/parquet/src/arrow/schema/extension.rs
@@ -39,7 +39,7 @@ pub(crate) fn try_add_extension_type(
mut arrow_field: Field,
parquet_type: &Type,
) -> Result<Field, ParquetError> {
- let Some(parquet_logical_type) =
parquet_type.get_basic_info().logical_type() else {
+ let Some(parquet_logical_type) =
parquet_type.get_basic_info().logical_type_ref() else {
return Ok(arrow_field);
};
match parquet_logical_type {
@@ -65,7 +65,7 @@ pub(crate) fn try_add_extension_type(
///
/// This is used to preallocate the metadata hashmap size
pub(crate) fn has_extension_type(parquet_type: &Type) -> bool {
- let Some(parquet_logical_type) =
parquet_type.get_basic_info().logical_type() else {
+ let Some(parquet_logical_type) =
parquet_type.get_basic_info().logical_type_ref() else {
return false;
};
match parquet_logical_type {
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 75603ac866..1d8d2e1fd3 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -1840,7 +1840,7 @@ mod tests {
// This is because the Arrow conversion always sets logical
type,
// even if there wasn't originally one.
// This is not an issue, but is an inconvenience for this test.
- match a.logical_type() {
+ match a.logical_type_ref() {
Some(_) => {
assert_eq!(a, b)
}
@@ -2216,8 +2216,8 @@ mod tests {
let parquet_schema =
ArrowSchemaConverter::new().convert(&arrow_schema)?;
assert_eq!(
- parquet_schema.column(0).logical_type(),
- Some(LogicalType::Uuid)
+ parquet_schema.column(0).logical_type_ref(),
+ Some(&LogicalType::Uuid)
);
let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
@@ -2237,8 +2237,8 @@ mod tests {
let parquet_schema =
ArrowSchemaConverter::new().convert(&arrow_schema)?;
assert_eq!(
- parquet_schema.column(0).logical_type(),
- Some(LogicalType::Json)
+ parquet_schema.column(0).logical_type_ref(),
+ Some(&LogicalType::Json)
);
let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?;
diff --git a/parquet/src/arrow/schema/primitive.rs
b/parquet/src/arrow/schema/primitive.rs
index c9f6482c90..8959081bcb 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -164,7 +164,7 @@ fn decimal_256_type(scale: i32, precision: i32) ->
Result<DataType> {
}
fn from_int32(info: &BasicTypeInfo, scale: i32, precision: i32) ->
Result<DataType> {
- match (info.logical_type(), info.converted_type()) {
+ match (info.logical_type_ref(), info.converted_type()) {
(None, ConvertedType::NONE) => Ok(DataType::Int32),
(
Some(
@@ -183,7 +183,9 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision:
i32) -> Result<DataTy
(32, false) => Ok(DataType::UInt32),
_ => Err(arrow_err!("Cannot create INT32 physical type from {:?}",
t)),
},
- (Some(LogicalType::Decimal { scale, precision }), _) =>
decimal_128_type(scale, precision),
+ (Some(LogicalType::Decimal { scale, precision }), _) => {
+ decimal_128_type(*scale, *precision)
+ }
(Some(LogicalType::Date), _) => Ok(DataType::Date32),
(Some(LogicalType::Time { unit, .. }), _) => match unit {
ParquetTimeUnit::MILLIS =>
Ok(DataType::Time32(TimeUnit::Millisecond)),
@@ -212,7 +214,7 @@ fn from_int32(info: &BasicTypeInfo, scale: i32, precision:
i32) -> Result<DataTy
}
fn from_int64(info: &BasicTypeInfo, scale: i32, precision: i32) ->
Result<DataType> {
- match (info.logical_type(), info.converted_type()) {
+ match (info.logical_type_ref(), info.converted_type()) {
(None, ConvertedType::NONE) => Ok(DataType::Int64),
(
Some(LogicalType::Integer {
@@ -243,7 +245,7 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision:
i32) -> Result<DataTy
ParquetTimeUnit::MICROS => TimeUnit::Microsecond,
ParquetTimeUnit::NANOS => TimeUnit::Nanosecond,
},
- if is_adjusted_to_u_t_c {
+ if *is_adjusted_to_u_t_c {
Some("UTC".into())
} else {
None
@@ -260,7 +262,9 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision:
i32) -> Result<DataTy
TimeUnit::Microsecond,
Some("UTC".into()),
)),
- (Some(LogicalType::Decimal { scale, precision }), _) =>
decimal_128_type(scale, precision),
+ (Some(LogicalType::Decimal { scale, precision }), _) => {
+ decimal_128_type(*scale, *precision)
+ }
(None, ConvertedType::DECIMAL) => decimal_128_type(scale, precision),
(logical, converted) => Err(arrow_err!(
"Unable to convert parquet INT64 logical type {:?} or converted
type {}",
@@ -271,7 +275,7 @@ fn from_int64(info: &BasicTypeInfo, scale: i32, precision:
i32) -> Result<DataTy
}
fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) ->
Result<DataType> {
- match (info.logical_type(), info.converted_type()) {
+ match (info.logical_type_ref(), info.converted_type()) {
(Some(LogicalType::String), _) => Ok(DataType::Utf8),
(Some(LogicalType::Json), _) => Ok(DataType::Utf8),
(Some(LogicalType::Bson), _) => Ok(DataType::Binary),
@@ -290,7 +294,7 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32,
scale: i32) -> Result<D
precision: p,
}),
_,
- ) => decimal_type(s, p),
+ ) => decimal_type(*s, *p),
(None, ConvertedType::DECIMAL) => decimal_type(scale, precision),
(logical, converted) => Err(arrow_err!(
"Unable to convert parquet BYTE_ARRAY logical type {:?} or
converted type {}",
@@ -307,12 +311,12 @@ fn from_fixed_len_byte_array(
type_length: i32,
) -> Result<DataType> {
// TODO: This should check the type length for the decimal and interval
types
- match (info.logical_type(), info.converted_type()) {
+ match (info.logical_type_ref(), info.converted_type()) {
(Some(LogicalType::Decimal { scale, precision }), _) => {
if type_length <= 16 {
- decimal_128_type(scale, precision)
+ decimal_128_type(*scale, *precision)
} else {
- decimal_256_type(scale, precision)
+ decimal_256_type(*scale, *precision)
}
}
(None, ConvertedType::DECIMAL) => {
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index eaa889bb99..9566454cb0 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -1132,12 +1132,24 @@ pub enum ColumnOrder {
impl ColumnOrder {
/// Returns sort order for a physical/logical type.
+ #[deprecated(
+ since = "57.1.0",
+ note = "use `ColumnOrder::sort_order_for_type` instead"
+ )]
pub fn get_sort_order(
logical_type: Option<LogicalType>,
converted_type: ConvertedType,
physical_type: Type,
) -> SortOrder {
- // TODO: Should this take converted and logical type, for
compatibility?
+ Self::sort_order_for_type(logical_type.as_ref(), converted_type,
physical_type)
+ }
+
+ /// Returns sort order for a physical/logical type.
+ pub fn sort_order_for_type(
+ logical_type: Option<&LogicalType>,
+ converted_type: ConvertedType,
+ physical_type: Type,
+ ) -> SortOrder {
match logical_type {
Some(logical) => match logical {
LogicalType::String | LogicalType::Enum | LogicalType::Json |
LogicalType::Bson => {
diff --git a/parquet/src/file/metadata/thrift/mod.rs
b/parquet/src/file/metadata/thrift/mod.rs
index 175a152839..6fdb0e0c4b 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -823,8 +823,8 @@ pub(crate) fn parquet_metadata_from_bytes(
let column_orders = column_orders.map(|mut cos| {
for (i, column) in schema_descr.columns().iter().enumerate() {
if let ColumnOrder::TYPE_DEFINED_ORDER(_) = cos[i] {
- let sort_order = ColumnOrder::get_sort_order(
- column.logical_type(),
+ let sort_order = ColumnOrder::sort_order_for_type(
+ column.logical_type_ref(),
column.converted_type(),
column.physical_type(),
);
@@ -1396,7 +1396,7 @@ fn write_schema_helper<W: Write>(
} else {
None
},
- logical_type: basic_info.logical_type(),
+ logical_type: basic_info.logical_type_ref().cloned(),
};
element.write_thrift(writer)
}
@@ -1424,7 +1424,7 @@ fn write_schema_helper<W: Write>(
} else {
None
},
- logical_type: basic_info.logical_type(),
+ logical_type: basic_info.logical_type_ref().cloned(),
};
element.write_thrift(writer)?;
diff --git a/parquet/src/file/metadata/writer.rs
b/parquet/src/file/metadata/writer.rs
index 124bc11bdd..5bb5f5cd85 100644
--- a/parquet/src/file/metadata/writer.rs
+++ b/parquet/src/file/metadata/writer.rs
@@ -156,8 +156,8 @@ impl<'a, W: Write> ThriftMetadataWriter<'a, W> {
.columns()
.iter()
.map(|col| {
- let sort_order = ColumnOrder::get_sort_order(
- col.logical_type(),
+ let sort_order = ColumnOrder::sort_order_for_type(
+ col.logical_type_ref(),
col.converted_type(),
col.physical_type(),
);
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index 990b2f4f16..ce608c7717 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -2735,12 +2735,12 @@ mod tests {
let schema = reader.metadata().file_metadata().schema_descr();
assert_eq!(
- schema.column(0).logical_type(),
- Some(basic::LogicalType::String)
+ schema.column(0).logical_type_ref(),
+ Some(&basic::LogicalType::String)
);
assert_eq!(
- schema.column(1).logical_type(),
- Some(basic::LogicalType::_Unknown { field_id: 2555 })
+ schema.column(1).logical_type_ref(),
+ Some(&basic::LogicalType::_Unknown { field_id: 2555 })
);
assert_eq!(schema.column(1).physical_type(), Type::BYTE_ARRAY);
diff --git a/parquet/src/geospatial/accumulator.rs
b/parquet/src/geospatial/accumulator.rs
index c0dcbf3c7b..d25a47930f 100644
--- a/parquet/src/geospatial/accumulator.rs
+++ b/parquet/src/geospatial/accumulator.rs
@@ -118,7 +118,7 @@ pub struct DefaultGeoStatsAccumulatorFactory {}
impl GeoStatsAccumulatorFactory for DefaultGeoStatsAccumulatorFactory {
fn new_accumulator(&self, _descr: &ColumnDescPtr) -> Box<dyn
GeoStatsAccumulator> {
#[cfg(feature = "geospatial")]
- if let Some(crate::basic::LogicalType::Geometry { .. }) =
_descr.logical_type() {
+ if let Some(crate::basic::LogicalType::Geometry { .. }) =
_descr.logical_type_ref() {
Box::new(ParquetGeoStatsAccumulator::default())
} else {
Box::new(VoidGeoStatsAccumulator::default())
diff --git a/parquet/src/schema/printer.rs b/parquet/src/schema/printer.rs
index 838d84a1b1..4ad3b6b93e 100644
--- a/parquet/src/schema/printer.rs
+++ b/parquet/src/schema/printer.rs
@@ -400,7 +400,7 @@ impl Printer<'_> {
// Also print logical type if it is available
// If there is a logical type, do not print converted type
let logical_type_str = print_logical_and_converted(
- basic_info.logical_type().as_ref(),
+ basic_info.logical_type_ref(),
basic_info.converted_type(),
precision,
scale,
@@ -426,7 +426,7 @@ impl Printer<'_> {
write!(self.output, "[{}] ", basic_info.id());
}
let logical_str = print_logical_and_converted(
- basic_info.logical_type().as_ref(),
+ basic_info.logical_type_ref(),
basic_info.converted_type(),
0,
0,
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 8845f0f1ee..85f3ed4897 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -211,8 +211,8 @@ impl Type {
pub(crate) fn is_list(&self) -> bool {
if self.is_group() {
let basic_info = self.get_basic_info();
- if let Some(logical_type) = basic_info.logical_type() {
- return logical_type == LogicalType::List;
+ if let Some(logical_type) = basic_info.logical_type_ref() {
+ return logical_type == &LogicalType::List;
}
return basic_info.converted_type() == ConvertedType::LIST;
}
@@ -710,6 +710,10 @@ impl BasicTypeInfo {
///
/// Note that this function will clone the `LogicalType`. If performance
is a concern,
/// use [`Self::logical_type_ref`] instead.
+ #[deprecated(
+ since = "57.1.0",
+ note = "use `BasicTypeInfo::logical_type_ref` instead (LogicalType
cloning is non trivial)"
+ )]
pub fn logical_type(&self) -> Option<LogicalType> {
// Unlike ConvertedType, LogicalType cannot implement Copy, thus we
clone it
self.logical_type.clone()
@@ -919,8 +923,15 @@ impl ColumnDescriptor {
///
/// Note that this function will clone the `LogicalType`. If performance
is a concern,
/// use [`Self::logical_type_ref`] instead.
+ #[deprecated(
+ since = "57.1.0",
+ note = "use `ColumnDescriptor::logical_type_ref` instead (LogicalType
cloning is non trivial)"
+ )]
pub fn logical_type(&self) -> Option<LogicalType> {
- self.primitive_type.get_basic_info().logical_type()
+ self.primitive_type
+ .get_basic_info()
+ .logical_type_ref()
+ .cloned()
}
/// Returns a reference to the [`LogicalType`] for this column.
@@ -966,8 +977,8 @@ impl ColumnDescriptor {
/// Returns the sort order for this column
pub fn sort_order(&self) -> SortOrder {
- ColumnOrder::get_sort_order(
- self.logical_type(),
+ ColumnOrder::sort_order_for_type(
+ self.logical_type_ref(),
self.converted_type(),
self.physical_type(),
)
@@ -1417,8 +1428,8 @@ mod tests {
let basic_info = tp.get_basic_info();
assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
assert_eq!(
- basic_info.logical_type(),
- Some(LogicalType::Integer {
+ basic_info.logical_type_ref(),
+ Some(&LogicalType::Integer {
bit_width: 32,
is_signed: true
})
@@ -1768,7 +1779,7 @@ mod tests {
assert!(tp.is_group());
assert!(!tp.is_primitive());
assert_eq!(basic_info.repetition(), Repetition::REPEATED);
- assert_eq!(basic_info.logical_type(), Some(LogicalType::List));
+ assert_eq!(basic_info.logical_type_ref(), Some(&LogicalType::List));
assert_eq!(basic_info.converted_type(), ConvertedType::LIST);
assert_eq!(basic_info.id(), 1);
assert_eq!(tp.get_fields().len(), 2);
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
index d2d62eea6b..b7dde6b3c8 100644
--- a/parquet/src/variant.rs
+++ b/parquet/src/variant.rs
@@ -198,8 +198,8 @@ mod tests {
assert_eq!(field.name(), "data");
// data should have been written with the Variant logical type
assert_eq!(
- field.get_basic_info().logical_type(),
- Some(crate::basic::LogicalType::Variant {
+ field.get_basic_info().logical_type_ref(),
+ Some(&crate::basic::LogicalType::Variant {
specification_version: None
})
);
diff --git a/parquet/tests/geospatial.rs b/parquet/tests/geospatial.rs
index 614acbcf45..b8c623a2a8 100644
--- a/parquet/tests/geospatial.rs
+++ b/parquet/tests/geospatial.rs
@@ -66,28 +66,21 @@ fn test_read_logical_type() {
for (geospatial_file, expected_type) in expected_logical_type {
let metadata = read_metadata(geospatial_file);
- let logical_type = metadata
- .file_metadata()
- .schema_descr()
- .column(1)
- .logical_type()
- .unwrap();
-
- assert_eq!(logical_type, expected_type);
+ let column_descr = metadata.file_metadata().schema_descr().column(1);
+ let logical_type = column_descr.logical_type_ref().unwrap();
+
+ assert_eq!(logical_type, &expected_type);
}
// The crs value may also contain arbitrary values (in this case some JSON
// a bit too lengthy to type out)
let metadata = read_metadata("crs-arbitrary-value.parquet");
- let logical_type = metadata
- .file_metadata()
- .schema_descr()
- .column(1)
- .logical_type()
- .unwrap();
+ let column_descr = metadata.file_metadata().schema_descr().column(1);
+ let logical_type = column_descr.logical_type_ref().unwrap();
if let LogicalType::Geometry { crs } = logical_type {
- let crs_parsed: Value = serde_json::from_str(&crs.unwrap()).unwrap();
+ let crs = crs.as_ref();
+ let crs_parsed: Value = serde_json::from_str(crs.unwrap()).unwrap();
assert_eq!(crs_parsed.get("id").unwrap().get("code").unwrap(), 5070);
} else {
panic!("Expected geometry type but got {logical_type:?}");
@@ -103,8 +96,8 @@ fn test_read_geospatial_statistics() {
// optional binary field_id=-1 wkt (String);
// optional binary field_id=-1 geometry (Geometry(crs=));
let fields = metadata.file_metadata().schema().get_fields();
- let logical_type = fields[2].get_basic_info().logical_type().unwrap();
- assert_eq!(logical_type, LogicalType::Geometry { crs: None });
+ let logical_type = fields[2].get_basic_info().logical_type_ref().unwrap();
+ assert_eq!(logical_type, &LogicalType::Geometry { crs: None });
let geo_statistics = metadata.row_group(0).column(2).geo_statistics();
assert!(geo_statistics.is_some());