This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 6fc14b60d7 Allow reading of improperly constructed empty lists in
Parquet metadata (#8827)
6fc14b60d7 is described below
commit 6fc14b60d76cd18e0c89c26a8643e2c5b370c185
Author: Ed Seidl <[email protected]>
AuthorDate: Thu Nov 13 10:02:24 2025 -0800
Allow reading of improperly constructed empty lists in Parquet metadata
(#8827)
# Which issue does this PR close?
- Closes #8826.
# Rationale for this change
As reported in the issue, some writers use an element type of 0 for
an empty list. This does not comply with the Thrift compact protocol
specification, but many readers (including this crate prior to 57.0.0)
tolerate it.
# What changes are included in this PR?
Adds a special case to `read_list_begin` for a 0 length list with a type
of 0.
# Are these changes tested?
Yes
# Are there any user-facing changes?
No, internal change only
---
parquet/src/parquet_thrift.rs | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index f9fa66ee0d..6c82a0bf2c 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -61,7 +61,7 @@ impl From<ThriftProtocolError> for ParquetError {
general_err!("Unexpected struct field type {}", value)
}
ThriftProtocolError::InvalidElementType(value) => {
- general_err!("Unexpected list/set element type{}", value)
+ general_err!("Unexpected list/set element type {}", value)
}
ThriftProtocolError::FieldDeltaOverflow {
field_delta,
@@ -302,6 +302,14 @@ pub(crate) trait ThriftCompactInputProtocol<'a> {
/// Read the [`ListIdentifier`] for a Thrift encoded list.
fn read_list_begin(&mut self) -> ThriftProtocolResult<ListIdentifier> {
let header = self.read_byte()?;
+ // some parquet writers will have an element_type of 0 for an empty list.
+ // account for that and return a bogus but valid element_type.
+ if header == 0 {
+ return Ok(ListIdentifier {
+ element_type: ElementType::Byte,
+ size: 0,
+ });
+ }
let element_type = ElementType::try_from(header & 0x0f)?;
let possible_element_count = (header & 0xF0) >> 4;
@@ -1089,4 +1097,13 @@ pub(crate) mod tests {
test_roundtrip(TimeUnit::MICROS);
test_roundtrip(TimeUnit::NANOS);
}
+
+ #[test]
+ fn test_decode_empty_list() {
+ let data = vec![0u8; 1];
+ let mut prot = ThriftSliceInputProtocol::new(&data);
+ let header = prot.read_list_begin().expect("error reading list header");
+ assert_eq!(header.size, 0);
+ assert_eq!(header.element_type, ElementType::Byte);
+ }
}