This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new eaca232c85 [Parquet] Allow reading of files with unknown logical types
(#8777)
eaca232c85 is described below
commit eaca232c85625025d1ce118fcd7f7e2c77c746c0
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Nov 5 06:58:37 2025 -0800
[Parquet] Allow reading of files with unknown logical types (#8777)
# Which issue does this PR close?
- Closes #8776.
# Rationale for this change
See issue #8776 (reading Parquet files that contain unknown logical types).
# What changes are included in this PR?
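Modifies a few conversion functions to account for unknown logical types: `from_byte_array` now maps `LogicalType::_Unknown` to `DataType::Binary`, and `PrimitiveTypeBuilder` accepts `LogicalType::_Unknown` and just uses the physical type.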
# Are these changes tested?
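Yes, tests are added: both the Arrow record batch reader and `SerializedFileReader` read `unknown-logical-type.parquet`, and the schema type builder test checks that an unknown logical type is accepted.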
# Are there any user-facing changes?
No
---
parquet/src/arrow/arrow_reader/mod.rs | 25 ++++++++++++++++++++++++-
parquet/src/arrow/schema/primitive.rs | 1 +
parquet/src/file/serialized_reader.rs | 27 +++++++++++++++++++++++++++
parquet/src/schema/types.rs | 8 ++++++++
4 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 1cc7673a57..e41515d613 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1195,7 +1195,7 @@ mod tests {
};
use crate::arrow::schema::add_encoded_arrow_schema_to_metadata;
use crate::arrow::{ArrowWriter, ProjectionMask};
- use crate::basic::{ConvertedType, Encoding, Repetition, Type as
PhysicalType};
+ use crate::basic::{ConvertedType, Encoding, LogicalType, Repetition, Type
as PhysicalType};
use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE;
use crate::data_type::{
BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray,
FixedLenByteArrayType,
@@ -5019,4 +5019,27 @@ mod tests {
assert!(sbbf.check(&"Hello"));
assert!(!sbbf.check(&"Hello_Not_Exists"));
}
+
+ #[test]
+ fn test_read_unknown_logical_type() {
+ let testdata = arrow::util::test_util::parquet_test_data();
+ let path = format!("{testdata}/unknown-logical-type.parquet");
+ let test_file = File::open(path).unwrap();
+
+ let builder = ParquetRecordBatchReaderBuilder::try_new(test_file)
+ .expect("Error creating reader builder");
+
+ let schema = builder.metadata().file_metadata().schema_descr();
+ assert_eq!(schema.column(0).logical_type(), Some(LogicalType::String));
+ assert_eq!(
+ schema.column(1).logical_type(),
+ Some(LogicalType::_Unknown { field_id: 2555 })
+ );
+ assert_eq!(schema.column(1).physical_type(), PhysicalType::BYTE_ARRAY);
+
+ let mut reader = builder.build().unwrap();
+ let out = reader.next().unwrap().unwrap();
+ assert_eq!(out.num_rows(), 3);
+ assert_eq!(out.num_columns(), 2);
+ }
}
diff --git a/parquet/src/arrow/schema/primitive.rs
b/parquet/src/arrow/schema/primitive.rs
index 564f23a16a..c9f6482c90 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -278,6 +278,7 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32,
scale: i32) -> Result<D
(Some(LogicalType::Enum), _) => Ok(DataType::Binary),
(Some(LogicalType::Geometry { .. }), _) => Ok(DataType::Binary),
(Some(LogicalType::Geography { .. }), _) => Ok(DataType::Binary),
+ (Some(LogicalType::_Unknown { .. }), _) => Ok(DataType::Binary),
(None, ConvertedType::NONE) => Ok(DataType::Binary),
(None, ConvertedType::JSON) => Ok(DataType::Utf8),
(None, ConvertedType::BSON) => Ok(DataType::Binary),
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index ef71b4b6ac..1b866a45cf 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -2696,4 +2696,31 @@ mod tests {
);
}
}
+
+ #[test]
+ fn test_read_unknown_logical_type() {
+ let file = get_test_file("unknown-logical-type.parquet");
+ let reader = SerializedFileReader::new(file).expect("Error opening
file");
+
+ let schema = reader.metadata().file_metadata().schema_descr();
+ assert_eq!(
+ schema.column(0).logical_type(),
+ Some(basic::LogicalType::String)
+ );
+ assert_eq!(
+ schema.column(1).logical_type(),
+ Some(basic::LogicalType::_Unknown { field_id: 2555 })
+ );
+ assert_eq!(schema.column(1).physical_type(), Type::BYTE_ARRAY);
+
+ let mut iter = reader
+ .get_row_iter(None)
+ .expect("Failed to create row iterator");
+
+ let mut num_rows = 0;
+ while iter.next().is_some() {
+ num_rows += 1;
+ }
+ assert_eq!(num_rows, reader.metadata().file_metadata().num_rows());
+ }
}
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 50ae495538..0dc2a731b9 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -418,6 +418,8 @@ impl<'a> PrimitiveTypeBuilder<'a> {
self.name
));
}
+ // unknown logical type means just use physical type
+ (LogicalType::_Unknown { .. }, _) => {}
(a, b) => {
return Err(general_err!(
"Cannot annotate {:?} from {} for field '{}'",
@@ -1714,6 +1716,12 @@ mod tests {
"Parquet error: UUID cannot annotate field 'foo' because it is
not a FIXED_LEN_BYTE_ARRAY(16) field"
);
}
+
+ // test unknown logical types are ok
+ result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
+ .with_logical_type(Some(LogicalType::_Unknown { field_id: 100 }))
+ .build();
+ assert!(result.is_ok());
}
#[test]