This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 9d0abcc6f4 Return error instead of panic when reading invalid Parquet
metadata (#5382)
9d0abcc6f4 is described below
commit 9d0abcc6f4e11594c23811c2c2d297f2eb2963af
Author: Matthieu Maitre <[email protected]>
AuthorDate: Sun Feb 11 04:27:20 2024 -0800
Return error instead of panic when reading invalid Parquet metadata (#5382)
* Fix handling of invalid metadata
* Don't touch comments
* Fix lint error
---------
Co-authored-by: Matthieu Maitre <[email protected]>
---
parquet/regen.sh | 16 ++++++++++------
parquet/src/file/serialized_reader.rs | 13 +++++++++++++
parquet/src/format.rs | 16 ++++++++--------
3 files changed, 31 insertions(+), 14 deletions(-)
diff --git a/parquet/regen.sh b/parquet/regen.sh
index 9153963433..f2d8158765 100755
--- a/parquet/regen.sh
+++ b/parquet/regen.sh
@@ -21,15 +21,19 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2
SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
-docker run -v $SOURCE_DIR:/thrift/src -it archlinux pacman -Sy --noconfirm
thrift && \
+docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
+ pacman -Sy --noconfirm wget thrift && \
wget
https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift
-O /tmp/parquet.thrift && \
thrift --gen rs /tmp/parquet.thrift && \
- echo "Removing TProcessor" && \
+ echo 'Removing TProcessor' && \
sed -i '/use thrift::server::TProcessor;/d' parquet.rs && \
- echo "Replacing TSerializable" && \
+ echo 'Replacing TSerializable' && \
sed -i 's/impl TSerializable for/impl crate::thrift::TSerializable for/g'
parquet.rs && \
- echo "Rewriting write_to_out_protocol" && \
+ echo 'Rewriting write_to_out_protocol' && \
sed -i 's/fn write_to_out_protocol(&self, o_prot: &mut dyn
TOutputProtocol)/fn write_to_out_protocol<T: TOutputProtocol>(\&self, o_prot:
\&mut T)/g' parquet.rs && \
- echo "Rewriting read_from_in_protocol" && \
+ echo 'Rewriting read_from_in_protocol' && \
sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn
read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
- mv parquet.rs src/format.rs
+ echo 'Rewriting return value expectations' && \
+ sed -i 's/Ok(ret.expect(\"return value should have been
constructed\"))/ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
\"return value should have been constructed\")))/g' parquet.rs && \
+ mv parquet.rs /thrift/src/format.rs
+ "
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index 7e06cde376..5c47e5a83c 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1249,6 +1249,19 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_file_reader_invalid_metadata() {
+ let data = [
+ 255, 172, 1, 0, 50, 82, 65, 73, 1, 0, 0, 0, 169, 168, 168, 162,
87, 255, 16, 0, 0, 0,
+ 80, 65, 82, 49,
+ ];
+ let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data));
+ assert_eq!(
+ ret.err().unwrap().to_string(),
+ "Parquet error: Could not parse metadata: bad data"
+ );
+ }
+
#[test]
// Use java parquet-tools get below pageIndex info
// !```
diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 4700b05dc2..4e1aa0b65b 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -1369,7 +1369,7 @@ impl crate::thrift::TSerializable for TimeUnit {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -1851,7 +1851,7 @@ impl crate::thrift::TSerializable for LogicalType {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -2625,7 +2625,7 @@ impl crate::thrift::TSerializable for
BloomFilterAlgorithm {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -2738,7 +2738,7 @@ impl crate::thrift::TSerializable for BloomFilterHash {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -2850,7 +2850,7 @@ impl crate::thrift::TSerializable for
BloomFilterCompression {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -3846,7 +3846,7 @@ impl crate::thrift::TSerializable for
ColumnCryptoMetaData {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -4300,7 +4300,7 @@ impl crate::thrift::TSerializable for ColumnOrder {
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {
@@ -4873,7 +4873,7 @@ impl crate::thrift::TSerializable for EncryptionAlgorithm
{
)
)
} else {
- Ok(ret.expect("return value should have been constructed"))
+ ret.ok_or_else(||
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData,
"return value should have been constructed")))
}
}
fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) ->
thrift::Result<()> {