This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 6fc14b60d7 Allow reading of improperly constructed empty lists in Parquet metadata (#8827)
6fc14b60d7 is described below

commit 6fc14b60d76cd18e0c89c26a8643e2c5b370c185
Author: Ed Seidl <[email protected]>
AuthorDate: Thu Nov 13 10:02:24 2025 -0800

    Allow reading of improperly constructed empty lists in Parquet metadata (#8827)
    
    # Which issue does this PR close?
    
    - Closes #8826.
    
    # Rationale for this change
    
    As reported in the issue, some writers will use an element type of 0 for
    an empty list. This is not thrift compact protocol spec compliant, but
    many readers (including this crate prior to 57.0.0) tolerate this.
    
    # What changes are included in this PR?
    Adds a special case to `read_list_begin` for a 0 length list with a type
    of 0.
    
    # Are these changes tested?
    
    Yes
    
    # Are there any user-facing changes?
    
    No, internal change only
---
 parquet/src/parquet_thrift.rs | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index f9fa66ee0d..6c82a0bf2c 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -61,7 +61,7 @@ impl From<ThriftProtocolError> for ParquetError {
                 general_err!("Unexpected struct field type {}", value)
             }
             ThriftProtocolError::InvalidElementType(value) => {
-                general_err!("Unexpected list/set element type{}", value)
+                general_err!("Unexpected list/set element type {}", value)
             }
             ThriftProtocolError::FieldDeltaOverflow {
                 field_delta,
@@ -302,6 +302,14 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
     /// Read the [`ListIdentifier`] for a Thrift encoded list.
     fn read_list_begin(&mut self) -> ThriftProtocolResult<ListIdentifier> {
         let header = self.read_byte()?;
+        // some parquet writers will have an element_type of 0 for an empty list.
+        // account for that and return a bogus but valid element_type.
+        if header == 0 {
+            return Ok(ListIdentifier {
+                element_type: ElementType::Byte,
+                size: 0,
+            });
+        }
         let element_type = ElementType::try_from(header & 0x0f)?;
 
         let possible_element_count = (header & 0xF0) >> 4;
@@ -1089,4 +1097,13 @@ pub(crate) mod tests {
         test_roundtrip(TimeUnit::MICROS);
         test_roundtrip(TimeUnit::NANOS);
     }
+
+    #[test]
+    fn test_decode_empty_list() {
+        let data = vec![0u8; 1];
+        let mut prot = ThriftSliceInputProtocol::new(&data);
+        let header = prot.read_list_begin().expect("error reading list header");
+        assert_eq!(header.size, 0);
+        assert_eq!(header.element_type, ElementType::Byte);
+    }
 }

Reply via email to