This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9d0abcc6f4 Return error instead of panic when reading invalid Parquet 
metadata (#5382)
9d0abcc6f4 is described below

commit 9d0abcc6f4e11594c23811c2c2d297f2eb2963af
Author: Matthieu Maitre <[email protected]>
AuthorDate: Sun Feb 11 04:27:20 2024 -0800

    Return error instead of panic when reading invalid Parquet metadata (#5382)
    
    * Fix handling of invalid metadata
    
    * Don't touch comments
    
    * Fix lint error
    
    ---------
    
    Co-authored-by: Matthieu Maitre <[email protected]>
---
 parquet/regen.sh                      | 16 ++++++++++------
 parquet/src/file/serialized_reader.rs | 13 +++++++++++++
 parquet/src/format.rs                 | 16 ++++++++--------
 3 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/parquet/regen.sh b/parquet/regen.sh
index 9153963433..f2d8158765 100755
--- a/parquet/regen.sh
+++ b/parquet/regen.sh
@@ -21,15 +21,19 @@ REVISION=46cc3a0647d301bb9579ca8dd2cc356caf2a72d2
 
 SOURCE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]:-$0}")" && pwd)"
 
-docker run -v $SOURCE_DIR:/thrift/src -it archlinux pacman -Sy --noconfirm 
thrift  && \
+docker run -v $SOURCE_DIR:/thrift -it archlinux /bin/bash -c "\
+  pacman -Sy --noconfirm wget thrift && \
   wget 
https://raw.githubusercontent.com/apache/parquet-format/$REVISION/src/main/thrift/parquet.thrift
 -O /tmp/parquet.thrift && \
   thrift --gen rs /tmp/parquet.thrift && \
-  echo "Removing TProcessor" && \
+  echo 'Removing TProcessor' && \
   sed -i '/use thrift::server::TProcessor;/d' parquet.rs && \
-  echo "Replacing TSerializable" && \
+  echo 'Replacing TSerializable' && \
   sed -i 's/impl TSerializable for/impl crate::thrift::TSerializable for/g' 
parquet.rs && \
-  echo "Rewriting write_to_out_protocol" && \
+  echo 'Rewriting write_to_out_protocol' && \
   sed -i 's/fn write_to_out_protocol(&self, o_prot: &mut dyn 
TOutputProtocol)/fn write_to_out_protocol<T: TOutputProtocol>(\&self, o_prot: 
\&mut T)/g' parquet.rs && \
-  echo "Rewriting read_from_in_protocol" && \
+  echo 'Rewriting read_from_in_protocol' && \
   sed -i 's/fn read_from_in_protocol(i_prot: &mut dyn TInputProtocol)/fn 
read_from_in_protocol<T: TInputProtocol>(i_prot: \&mut T)/g' parquet.rs && \
-  mv parquet.rs src/format.rs
+  echo 'Rewriting return value expectations' && \
+  sed -i 's/Ok(ret.expect(\"return value should have been 
constructed\"))/ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
\"return value should have been constructed\")))/g' parquet.rs && \
+  mv parquet.rs /thrift/src/format.rs
+  "
diff --git a/parquet/src/file/serialized_reader.rs 
b/parquet/src/file/serialized_reader.rs
index 7e06cde376..5c47e5a83c 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -1249,6 +1249,19 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_file_reader_invalid_metadata() {
+        let data = [
+            255, 172, 1, 0, 50, 82, 65, 73, 1, 0, 0, 0, 169, 168, 168, 162, 
87, 255, 16, 0, 0, 0,
+            80, 65, 82, 49,
+        ];
+        let ret = SerializedFileReader::new(Bytes::copy_from_slice(&data));
+        assert_eq!(
+            ret.err().unwrap().to_string(),
+            "Parquet error: Could not parse metadata: bad data"
+        );
+    }
+
     #[test]
     // Use java parquet-tools get below pageIndex info
     // !```
diff --git a/parquet/src/format.rs b/parquet/src/format.rs
index 4700b05dc2..4e1aa0b65b 100644
--- a/parquet/src/format.rs
+++ b/parquet/src/format.rs
@@ -1369,7 +1369,7 @@ impl crate::thrift::TSerializable for TimeUnit {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -1851,7 +1851,7 @@ impl crate::thrift::TSerializable for LogicalType {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -2625,7 +2625,7 @@ impl crate::thrift::TSerializable for 
BloomFilterAlgorithm {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -2738,7 +2738,7 @@ impl crate::thrift::TSerializable for BloomFilterHash {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -2850,7 +2850,7 @@ impl crate::thrift::TSerializable for 
BloomFilterCompression {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -3846,7 +3846,7 @@ impl crate::thrift::TSerializable for 
ColumnCryptoMetaData {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -4300,7 +4300,7 @@ impl crate::thrift::TSerializable for ColumnOrder {
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {
@@ -4873,7 +4873,7 @@ impl crate::thrift::TSerializable for EncryptionAlgorithm 
{
         )
       )
     } else {
-      Ok(ret.expect("return value should have been constructed"))
+      ret.ok_or_else(|| 
thrift::Error::Protocol(ProtocolError::new(ProtocolErrorKind::InvalidData, 
"return value should have been constructed")))
     }
   }
   fn write_to_out_protocol<T: TOutputProtocol>(&self, o_prot: &mut T) -> 
thrift::Result<()> {

Reply via email to