This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 6ce4bc8996 Validate encoded Thrift lists match the schema (#9924)
6ce4bc8996 is described below
commit 6ce4bc899644c1b0816072f3260d835a0c14d148
Author: Ed Seidl <[email protected]>
AuthorDate: Thu May 7 11:24:08 2026 -0700
Validate encoded Thrift lists match the schema (#9924)
# Which issue does this PR close?
- Part of #9923.
# Rationale for this change
A first attempt at adding some validation. This will check that the
encoded list element type matches what is expected from the Parquet
schema.
# What changes are included in this PR?
Adds an `ELEMENT_TYPE` to the `ReadThrift` trait for use in validating
data types in `read_thrift_vec`.
# Are these changes tested?
Should be covered by existing. These changes also cause an earlier error
detection in an existing test of malformed data.
# Are there any user-facing changes?
No, just improves error handling
---
parquet/src/basic.rs | 4 +++-
parquet/src/file/metadata/thrift/mod.rs | 9 ++++++++-
parquet/src/file/page_index/offset_index.rs | 3 ++-
parquet/src/file/serialized_reader.rs | 2 +-
parquet/src/parquet_thrift.rs | 30 ++++++++++++++++++++++++++++-
parquet/tests/arrow_reader/bad_data.rs | 2 +-
6 files changed, 44 insertions(+), 6 deletions(-)
diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs
index cc7a16eca0..17e5b9c321 100644
--- a/parquet/src/basic.rs
+++ b/parquet/src/basic.rs
@@ -28,7 +28,7 @@ pub use crate::compression::{BrotliLevel, GzipLevel,
ZstdLevel};
use crate::file::metadata::HeapSize;
use crate::parquet_thrift::{
ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
ThriftCompactOutputProtocol,
- WriteThrift, WriteThriftField,
+ WriteThrift, WriteThriftField, validate_list_type,
};
use crate::{thrift_enum, thrift_struct, thrift_union_all_empty,
write_thrift_field};
@@ -771,6 +771,8 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a,
R> for EncodingMask {
// This reads a Thrift `list<Encoding>` and turns it into a bitmask
let list_ident = prot.read_list_begin()?;
+ // check for enum (encoded as I32)
+ validate_list_type(ElementType::I32, &list_ident)?;
for _ in 0..list_ident.size {
let val = Encoding::read_thrift(prot)?;
mask |= 1 << val as i32;
diff --git a/parquet/src/file/metadata/thrift/mod.rs
b/parquet/src/file/metadata/thrift/mod.rs
index b68af0c485..75fcad871e 100644
--- a/parquet/src/file/metadata/thrift/mod.rs
+++ b/parquet/src/file/metadata/thrift/mod.rs
@@ -51,7 +51,7 @@ use crate::{
parquet_thrift::{
ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
ThriftCompactOutputProtocol, ThriftSliceInputProtocol, WriteThrift,
WriteThriftField,
- read_thrift_vec,
+ read_thrift_vec, validate_list_type,
},
schema::types::{
ColumnDescriptor, SchemaDescriptor, TypePtr, num_nodes,
parquet_schema_from_array,
@@ -387,6 +387,9 @@ fn read_encoding_stats_as_mask<'a>(
// read the vector of stats, setting mask bits for data pages
let mut mask = 0i32;
let list_ident = prot.read_list_begin()?;
+ // check for PageEncodingStats struct
+ validate_list_type(ElementType::Struct, &list_ident)?;
+
for _ in 0..list_ident.size {
let pes = PageEncodingStats::read_thrift(prot)?;
match pes.page_type {
@@ -652,6 +655,8 @@ fn read_row_group(
match field_ident.id {
1 => {
let list_ident = prot.read_list_begin()?;
+ // check for list of struct
+ validate_list_type(ElementType::Struct, &list_ident)?;
if schema_descr.num_columns() != list_ident.size as usize {
return Err(general_err!(
"Column count mismatch. Schema has {} columns while
Row Group has {}",
@@ -801,6 +806,8 @@ pub(crate) fn parquet_metadata_from_bytes(
}
let schema_descr = schema_descr.as_ref().unwrap();
let list_ident = prot.read_list_begin()?;
+ // check for list of struct
+ validate_list_type(ElementType::Struct, &list_ident)?;
let mut rg_vec = Vec::with_capacity(list_ident.size as usize);
// Read row groups and handle ordinal assignment
diff --git a/parquet/src/file/page_index/offset_index.rs
b/parquet/src/file/page_index/offset_index.rs
index b1e30dd459..06d21efb68 100644
--- a/parquet/src/file/page_index/offset_index.rs
+++ b/parquet/src/file/page_index/offset_index.rs
@@ -23,7 +23,7 @@ use std::io::Write;
use crate::parquet_thrift::{
ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
ThriftCompactOutputProtocol,
- WriteThrift, WriteThriftField, read_thrift_vec,
+ WriteThrift, WriteThriftField, read_thrift_vec, validate_list_type,
};
use crate::{
errors::{ParquetError, Result},
@@ -90,6 +90,7 @@ impl OffsetIndexMetaData {
// we have to do this manually because we want to use the fast
PageLocation decoder
let list_ident = prot.read_list_begin()?;
+ validate_list_type(ElementType::Struct, &list_ident)?;
let mut page_locations = Vec::with_capacity(list_ident.size as usize);
for _ in 0..list_ident.size {
page_locations.push(read_page_location(prot)?);
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index 254ccb779a..4b71e3c14e 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -2117,7 +2117,7 @@ mod tests {
let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data));
assert_eq!(
ret.err().unwrap().to_string(),
- "Parquet error: Received empty union from remote ColumnOrder"
+ "Parquet error: Expected list element type of Struct but got List"
);
}
diff --git a/parquet/src/parquet_thrift.rs b/parquet/src/parquet_thrift.rs
index e621f4c498..b9d69b9d52 100644
--- a/parquet/src/parquet_thrift.rs
+++ b/parquet/src/parquet_thrift.rs
@@ -702,9 +702,10 @@ impl<'a, R: ThriftCompactInputProtocol<'a>> ReadThrift<'a,
R> for &'a [u8] {
pub(crate) fn read_thrift_vec<'a, T, R>(prot: &mut R) -> Result<Vec<T>>
where
R: ThriftCompactInputProtocol<'a>,
- T: ReadThrift<'a, R>,
+ T: ReadThrift<'a, R> + WriteThrift,
{
let list_ident = prot.read_list_begin()?;
+ validate_list_type(T::ELEMENT_TYPE, &list_ident)?;
let mut res = Vec::with_capacity(list_ident.size as usize);
for _ in 0..list_ident.size {
let val = T::read_thrift(prot)?;
@@ -713,6 +714,17 @@ where
Ok(res)
}
+pub(crate) fn validate_list_type(expected: ElementType, got: &ListIdentifier)
-> Result<()> {
+ if got.element_type != expected {
+ return Err(general_err!(
+ "Expected list element type of {:?} but got {:?}",
+ expected,
+ got.element_type
+ ));
+ }
+ Ok(())
+}
+
/////////////////////////
// thrift compact output
@@ -1155,4 +1167,20 @@ pub(crate) mod tests {
let result = prot.read_list_begin();
assert!(result.is_err(), "expected error, got {result:?}");
}
+
+ #[test]
+ fn test_read_list_wrong_type() {
+ // list header: 4 elements of `Boolean`
+ let data = [0x42, 0x01];
+ let mut prot = ThriftSliceInputProtocol::new(&data);
+ // try to read as list<i32>
+ let result = read_thrift_vec::<i32, ThriftSliceInputProtocol>(&mut
prot);
+ println!("{result:?}");
+ assert!(
+ result
+ .unwrap_err()
+ .to_string()
+ .contains("Expected list element type of I32 but got Bool")
+ );
+ }
}
diff --git a/parquet/tests/arrow_reader/bad_data.rs
b/parquet/tests/arrow_reader/bad_data.rs
index d9b0d89e2c..56fddf505d 100644
--- a/parquet/tests/arrow_reader/bad_data.rs
+++ b/parquet/tests/arrow_reader/bad_data.rs
@@ -98,7 +98,7 @@ fn test_arrow_gh_41317() {
let err = read_file("ARROW-GH-41317.parquet").unwrap_err();
assert_eq!(
err.to_string(),
- "External: Parquet argument error: Parquet error: StructArrayReader
out of sync in read_records, expected 5 read, got 2"
+ "Parquet error: Expected list element type of I32 but got I16"
);
}