This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new eaca232c85 [Parquet] Allow reading of files with unknown logical types (#8777)
eaca232c85 is described below

commit eaca232c85625025d1ce118fcd7f7e2c77c746c0
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Nov 5 06:58:37 2025 -0800

    [Parquet] Allow reading of files with unknown logical types (#8777)
    
    # Which issue does this PR close?
    
    - Closes #8776.
    
    # Rationale for this change
    
    See issue
    
    # What changes are included in this PR?
    
    Modifies a few conversion functions to account for unknown logical types
    
    # Are these changes tested?
    
    Yes, tests are added
    
    # Are there any user-facing changes?
    
    No
---
 parquet/src/arrow/arrow_reader/mod.rs | 25 ++++++++++++++++++++++++-
 parquet/src/arrow/schema/primitive.rs |  1 +
 parquet/src/file/serialized_reader.rs | 27 +++++++++++++++++++++++++++
 parquet/src/schema/types.rs           |  8 ++++++++
 4 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 1cc7673a57..e41515d613 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1195,7 +1195,7 @@ mod tests {
     };
     use crate::arrow::schema::add_encoded_arrow_schema_to_metadata;
     use crate::arrow::{ArrowWriter, ProjectionMask};
-    use crate::basic::{ConvertedType, Encoding, Repetition, Type as PhysicalType};
+    use crate::basic::{ConvertedType, Encoding, LogicalType, Repetition, Type as PhysicalType};
     use crate::column::reader::decoder::REPETITION_LEVELS_BATCH_SIZE;
     use crate::data_type::{
         BoolType, ByteArray, ByteArrayType, DataType, FixedLenByteArray, FixedLenByteArrayType,
@@ -5019,4 +5019,27 @@ mod tests {
         assert!(sbbf.check(&"Hello"));
         assert!(!sbbf.check(&"Hello_Not_Exists"));
     }
+
+    #[test]
+    fn test_read_unknown_logical_type() {
+        let testdata = arrow::util::test_util::parquet_test_data();
+        let path = format!("{testdata}/unknown-logical-type.parquet");
+        let test_file = File::open(path).unwrap();
+
+        let builder = ParquetRecordBatchReaderBuilder::try_new(test_file)
+            .expect("Error creating reader builder");
+
+        let schema = builder.metadata().file_metadata().schema_descr();
+        assert_eq!(schema.column(0).logical_type(), Some(LogicalType::String));
+        assert_eq!(
+            schema.column(1).logical_type(),
+            Some(LogicalType::_Unknown { field_id: 2555 })
+        );
+        assert_eq!(schema.column(1).physical_type(), PhysicalType::BYTE_ARRAY);
+
+        let mut reader = builder.build().unwrap();
+        let out = reader.next().unwrap().unwrap();
+        assert_eq!(out.num_rows(), 3);
+        assert_eq!(out.num_columns(), 2);
+    }
 }
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 564f23a16a..c9f6482c90 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -278,6 +278,7 @@ fn from_byte_array(info: &BasicTypeInfo, precision: i32, scale: i32) -> Result<D
         (Some(LogicalType::Enum), _) => Ok(DataType::Binary),
         (Some(LogicalType::Geometry { .. }), _) => Ok(DataType::Binary),
         (Some(LogicalType::Geography { .. }), _) => Ok(DataType::Binary),
+        (Some(LogicalType::_Unknown { .. }), _) => Ok(DataType::Binary),
         (None, ConvertedType::NONE) => Ok(DataType::Binary),
         (None, ConvertedType::JSON) => Ok(DataType::Utf8),
         (None, ConvertedType::BSON) => Ok(DataType::Binary),
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index ef71b4b6ac..1b866a45cf 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -2696,4 +2696,31 @@ mod tests {
             );
         }
     }
+
+    #[test]
+    fn test_read_unknown_logical_type() {
+        let file = get_test_file("unknown-logical-type.parquet");
+        let reader = SerializedFileReader::new(file).expect("Error opening file");
+
+        let schema = reader.metadata().file_metadata().schema_descr();
+        assert_eq!(
+            schema.column(0).logical_type(),
+            Some(basic::LogicalType::String)
+        );
+        assert_eq!(
+            schema.column(1).logical_type(),
+            Some(basic::LogicalType::_Unknown { field_id: 2555 })
+        );
+        assert_eq!(schema.column(1).physical_type(), Type::BYTE_ARRAY);
+
+        let mut iter = reader
+            .get_row_iter(None)
+            .expect("Failed to create row iterator");
+
+        let mut num_rows = 0;
+        while iter.next().is_some() {
+            num_rows += 1;
+        }
+        assert_eq!(num_rows, reader.metadata().file_metadata().num_rows());
+    }
 }
diff --git a/parquet/src/schema/types.rs b/parquet/src/schema/types.rs
index 50ae495538..0dc2a731b9 100644
--- a/parquet/src/schema/types.rs
+++ b/parquet/src/schema/types.rs
@@ -418,6 +418,8 @@ impl<'a> PrimitiveTypeBuilder<'a> {
                         self.name
                     ));
                 }
+                // unknown logical type means just use physical type
+                (LogicalType::_Unknown { .. }, _) => {}
                 (a, b) => {
                     return Err(general_err!(
                         "Cannot annotate {:?} from {} for field '{}'",
@@ -1714,6 +1716,12 @@ mod tests {
                 "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
             );
         }
+
+        // test unknown logical types are ok
+        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
+            .with_logical_type(Some(LogicalType::_Unknown { field_id: 100 }))
+            .build();
+        assert!(result.is_ok());
     }
 
     #[test]

Reply via email to