This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new fd1c5b391e Safely ignore Parquet fields with unimplemented Thrift 
types (#9974)
fd1c5b391e is described below

commit fd1c5b391e169762a0981870c4e94baa3372d7a3
Author: Ed Seidl <[email protected]>
AuthorDate: Fri May 15 08:19:21 2026 -0700

    Safely ignore Parquet fields with unimplemented Thrift types (#9974)
    
    # Which issue does this PR close?
    
    - Closes #9973.
    
    # Rationale for this change
    
    The thrift decoder should be able to skip fields with unimplemented
    thrift types `set`, `map`, and `uuid`.
    
    # What changes are included in this PR?
    
    Flesh out the thrift enums and add code to the skip function to handle
    the above types.
    
    # Are these changes tested?
    
    Yes, test is added
    
    # Are there any user-facing changes?
    
    No, changes are to internal APIs
---
 parquet/src/parquet_thrift.rs            |  60 +++++++++++++++++++++----------
 parquet/tests/arrow_reader/bad_data.rs   |  11 ++++++
 parquet/tests/arrow_reader/new_types.bin | Bin 0 -> 56 bytes
 3 files changed, 52 insertions(+), 19 deletions(-)

diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index 7567ace08b..cc6390b392 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -157,6 +157,7 @@ pub(crate) enum FieldType {
     Set = 10,
     Map = 11,
     Struct = 12,
+    Uuid = 13,
 }
 
 impl TryFrom<u8> for FieldType {
@@ -176,25 +177,27 @@ impl TryFrom<u8> for FieldType {
             10 => Ok(Self::Set),
             11 => Ok(Self::Map),
             12 => Ok(Self::Struct),
+            13 => Ok(Self::Uuid),
             _ => Err(ThriftProtocolError::InvalidFieldType(value)),
         }
     }
 }
 
-impl TryFrom<ElementType> for FieldType {
-    type Error = ThriftProtocolError;
-    fn try_from(value: ElementType) -> std::result::Result<Self, Self::Error> {
+impl From<ElementType> for FieldType {
+    fn from(value: ElementType) -> Self {
         match value {
-            ElementType::Bool => Ok(Self::BooleanTrue),
-            ElementType::Byte => Ok(Self::Byte),
-            ElementType::I16 => Ok(Self::I16),
-            ElementType::I32 => Ok(Self::I32),
-            ElementType::I64 => Ok(Self::I64),
-            ElementType::Double => Ok(Self::Double),
-            ElementType::Binary => Ok(Self::Binary),
-            ElementType::List => Ok(Self::List),
-            ElementType::Struct => Ok(Self::Struct),
-            _ => Err(ThriftProtocolError::InvalidFieldType(value as u8)),
+            ElementType::Bool => Self::BooleanTrue,
+            ElementType::Byte => Self::Byte,
+            ElementType::I16 => Self::I16,
+            ElementType::I32 => Self::I32,
+            ElementType::I64 => Self::I64,
+            ElementType::Double => Self::Double,
+            ElementType::Binary => Self::Binary,
+            ElementType::List => Self::List,
+            ElementType::Set => Self::Set,
+            ElementType::Map => Self::Map,
+            ElementType::Struct => Self::Struct,
+            ElementType::Uuid => Self::Uuid,
         }
     }
 }
@@ -213,6 +216,7 @@ pub(crate) enum ElementType {
     Set = 10,
     Map = 11,
     Struct = 12,
+    Uuid = 13,
 }
 
 impl TryFrom<u8> for ElementType {
@@ -235,6 +239,7 @@ impl TryFrom<u8> for ElementType {
             10 => Ok(Self::Set),
             11 => Ok(Self::Map),
             12 => Ok(Self::Struct),
+            13 => Ok(Self::Uuid),
             _ => Err(ThriftProtocolError::InvalidElementType(value)),
         }
     }
@@ -499,27 +504,44 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
             FieldType::I64 => self.skip_vlq().map(|_| ()),
             FieldType::Double => self.skip_bytes(8).map(|_| ()),
             FieldType::Binary => self.skip_binary().map(|_| ()),
+            // see 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#struct
             FieldType::Struct => {
-                let mut last_field_id = 0i16;
                 loop {
-                    let field_ident = self.read_field_begin(last_field_id)?;
+                    // we don't need field id for skipping, so always pass 0 
for last id
+                    let field_ident = self.read_field_begin(0)?;
                     if field_ident.field_type == FieldType::Stop {
                         break;
                     }
                     self.skip_till_depth(field_ident.field_type, depth - 1)?;
-                    last_field_id = field_ident.id;
                 }
                 Ok(())
             }
-            FieldType::List => {
+            // lists and sets are encoded the same
+            // see 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#list-and-set
+            FieldType::List | FieldType::Set => {
                 let list_ident = self.read_list_begin()?;
+                let element_type = FieldType::from(list_ident.element_type);
                 for _ in 0..list_ident.size {
-                    let element_type = 
FieldType::try_from(list_ident.element_type)?;
                     self.skip_till_depth(element_type, depth - 1)?;
                 }
                 Ok(())
             }
-            // no list or map types in parquet format
+            // see 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#map
+            FieldType::Map => {
+                let size = i32::try_from(self.read_vlq()?)?;
+                if size > 0 {
+                    let kv = self.read_byte()?;
+                    let key_type = FieldType::from(ElementType::try_from(kv >> 
4)?);
+                    let val_type = FieldType::from(ElementType::try_from(kv & 
0xf)?);
+                    for _ in 0..size {
+                        self.skip_till_depth(key_type, depth - 1)?;
+                        self.skip_till_depth(val_type, depth - 1)?;
+                    }
+                }
+                Ok(())
+            }
+            // see 
https://github.com/apache/thrift/blob/master/doc/specs/thrift-compact-protocol.md#universal-unique-identifier-encoding
+            FieldType::Uuid => self.skip_bytes(16).map(|_| ()),
             _ => Err(ThriftProtocolError::SkipUnsupportedType(field_type)),
         }
     }
diff --git a/parquet/tests/arrow_reader/bad_data.rs 
b/parquet/tests/arrow_reader/bad_data.rs
index 56fddf505d..38fa69cdb1 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -21,6 +21,7 @@ use arrow::util::test_util::parquet_test_data;
 use bytes::Bytes;
 use parquet::arrow::arrow_reader::ArrowReaderBuilder;
 use parquet::errors::ParquetError;
+use parquet::file::metadata::ParquetMetaDataReader;
 use std::collections::HashSet;
 use std::path::PathBuf;
 
@@ -175,6 +176,16 @@ fn non_standard_delta_blocks() {
     }
 }
 
+#[test]
+fn skip_unknown_types() {
+    // test file contains a FileMetaData with unknown fields with
+    // types not currently used by Parquet (uuid, set, map). The
+    // parser should be able to skip these unknown fields without
+    // erroring.
+    let file = Bytes::from_static(include_bytes!("new_types.bin"));
+    ParquetMetaDataReader::decode_metadata(&file).unwrap();
+}
+
 #[cfg(feature = "async")]
 #[tokio::test]
 #[allow(deprecated)]
diff --git a/parquet/tests/arrow_reader/new_types.bin 
b/parquet/tests/arrow_reader/new_types.bin
new file mode 100644
index 0000000000..00d86fd630
Binary files /dev/null and b/parquet/tests/arrow_reader/new_types.bin differ

Reply via email to