This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new b5e495dd67 GH-48858: [C++][Parquet] Avoid re-serializing footer for 
signature verification (#48859)
b5e495dd67 is described below

commit b5e495dd6747b6079cbe99f6c210b4b2174e8647
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jan 21 09:21:31 2026 +0100

    GH-48858: [C++][Parquet] Avoid re-serializing footer for signature 
verification (#48859)
    
    ### Rationale for this change
    
    When reading an encrypted Parquet file with a plaintext footer, the Parquet 
reader is able to verify footer integrity by comparing the signature in the 
file with the one computed by encrypting the footer.
    
    However, the way it does this is to first re-serializes the deserialized 
footer using Thrift. This has several issues:
    
    1. it's inefficient
    2. it's not obvious that it will always produce the same Thrift encoding as 
the original, leading to spurious signature verification failures
    3. if the original footer deserializes to invalid enum values, attempting 
to serialize it again will lead to undefined behavior
    
    Reason 3 is what allowed this to be uncovered by OSS-Fuzz (see 
https://oss-fuzz.com/testcase-detail/4740205688193024).
    
    This PR switches to reusing the original serialized metadata.
    
    ### Are these changes tested?
    
    Yes, by existing tests and new fuzz regression file.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #48858
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/file_reader.cc | 124 ++++++++++++++++++-----------------------
 cpp/src/parquet/metadata.cc    |  36 ++++++++++++
 cpp/src/parquet/metadata.h     |  10 +++-
 3 files changed, 99 insertions(+), 71 deletions(-)

diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index b246feaf73..af7ccfd7ad 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -55,6 +55,10 @@ using arrow::internal::AddWithOverflow;
 
 namespace parquet {
 
+using ::arrow::Future;
+using ::arrow::Result;
+using ::arrow::Status;
+
 namespace {
 bool IsColumnChunkFullyDictionaryEncoded(const ColumnChunkMetaData& col) {
   // Check the encoding_stats to see if all data pages are dictionary encoded.
@@ -398,7 +402,7 @@ class SerializedFile : public ParquetFileReader::Contents {
     PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
   }
 
-  ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
+  Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
       const std::vector<int>& row_groups, const std::vector<int>& 
column_indices,
       int64_t hole_size_limit, int64_t range_size_limit) {
     std::vector<::arrow::io::ReadRange> ranges;
@@ -413,10 +417,10 @@ class SerializedFile : public ParquetFileReader::Contents 
{
                                                      range_size_limit);
   }
 
-  ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
-                                 const std::vector<int>& column_indices) const 
{
+  Future<> WhenBuffered(const std::vector<int>& row_groups,
+                        const std::vector<int>& column_indices) const {
     if (!cached_source_) {
-      return ::arrow::Status::Invalid("Must call PreBuffer before 
WhenBuffered");
+      return Status::Invalid("Must call PreBuffer before WhenBuffered");
     }
     std::vector<::arrow::io::ReadRange> ranges;
     for (int row : row_groups) {
@@ -465,23 +469,8 @@ class SerializedFile : public ParquetFileReader::Contents {
       // Fall through
     }
 
-    const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
-        metadata_buffer, metadata_len, std::move(file_decryptor));
-    auto file_decryption_properties = properties_.file_decryption_properties();
-    if (is_encrypted_footer) {
-      // Nothing else to do here.
-      return;
-    } else if (!file_metadata_->is_encryption_algorithm_set()) {  // Non 
encrypted file.
-      if (file_decryption_properties != nullptr) {
-        if (!file_decryption_properties->plaintext_files_allowed()) {
-          throw ParquetException("Applying decryption properties on plaintext 
file");
-        }
-      }
-    } else {
-      // Encrypted file with plaintext footer mode.
-      ParseMetaDataOfEncryptedFileWithPlaintextFooter(
-          file_decryption_properties, metadata_buffer, metadata_len, 
read_metadata_len);
-    }
+    ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, 
is_encrypted_footer,
+                       std::move(file_decryptor));
   }
 
   // Validate the source size and get the initial read size.
@@ -522,16 +511,15 @@ class SerializedFile : public ParquetFileReader::Contents 
{
   }
 
   // Does not throw.
-  ::arrow::Future<> ParseMetaDataAsync() {
+  Future<> ParseMetaDataAsync() {
     int64_t footer_read_size;
     BEGIN_PARQUET_CATCH_EXCEPTIONS
     footer_read_size = GetFooterReadSize();
     END_PARQUET_CATCH_EXCEPTIONS
     // Assumes this is kept alive externally
     return source_->ReadAsync(source_size_ - footer_read_size, 
footer_read_size)
-        .Then([this,
-               footer_read_size](const std::shared_ptr<::arrow::Buffer>& 
footer_buffer)
-                  -> ::arrow::Future<> {
+        .Then([this, footer_read_size](
+                  const std::shared_ptr<::arrow::Buffer>& footer_buffer) -> 
Future<> {
           uint32_t metadata_len;
           BEGIN_PARQUET_CATCH_EXCEPTIONS
           metadata_len = ParseFooterLength(footer_buffer, footer_read_size);
@@ -557,7 +545,7 @@ class SerializedFile : public ParquetFileReader::Contents {
   }
 
   // Continuation
-  ::arrow::Future<> ParseMaybeEncryptedMetaDataAsync(
+  Future<> ParseMaybeEncryptedMetaDataAsync(
       std::shared_ptr<::arrow::Buffer> footer_buffer,
       std::shared_ptr<::arrow::Buffer> metadata_buffer, int64_t 
footer_read_size,
       uint32_t metadata_len) {
@@ -580,26 +568,30 @@ class SerializedFile : public ParquetFileReader::Contents 
{
                  file_decryptor = std::move(file_decryptor)](
                     const std::shared_ptr<::arrow::Buffer>& metadata_buffer) {
             // Continue and read the file footer
-            return ParseMetaDataFinal(metadata_buffer, metadata_len, 
is_encrypted_footer,
-                                      file_decryptor);
+            BEGIN_PARQUET_CATCH_EXCEPTIONS
+            ParseMetaDataFinal(metadata_buffer, metadata_len, 
is_encrypted_footer,
+                               file_decryptor);
+            END_PARQUET_CATCH_EXCEPTIONS
+            return Status::OK();
           });
     }
-    return ParseMetaDataFinal(std::move(metadata_buffer), metadata_len,
-                              is_encrypted_footer, std::move(file_decryptor));
+    BEGIN_PARQUET_CATCH_EXCEPTIONS
+    ParseMetaDataFinal(std::move(metadata_buffer), metadata_len, 
is_encrypted_footer,
+                       std::move(file_decryptor));
+    END_PARQUET_CATCH_EXCEPTIONS
+    return Status::OK();
   }
 
   // Continuation
-  ::arrow::Status ParseMetaDataFinal(
-      std::shared_ptr<::arrow::Buffer> metadata_buffer, uint32_t metadata_len,
-      const bool is_encrypted_footer,
-      std::shared_ptr<InternalFileDecryptor> file_decryptor) {
-    BEGIN_PARQUET_CATCH_EXCEPTIONS
+  void ParseMetaDataFinal(std::shared_ptr<::arrow::Buffer> metadata_buffer,
+                          uint32_t metadata_len, const bool 
is_encrypted_footer,
+                          std::shared_ptr<InternalFileDecryptor> 
file_decryptor) {
     const uint32_t read_metadata_len = ParseUnencryptedFileMetadata(
         metadata_buffer, metadata_len, std::move(file_decryptor));
     auto file_decryption_properties = properties_.file_decryption_properties();
     if (is_encrypted_footer) {
       // Nothing else to do here.
-      return ::arrow::Status::OK();
+      return;
     } else if (!file_metadata_->is_encryption_algorithm_set()) {  // Non 
encrypted file.
       if (file_decryption_properties != nullptr) {
         if (!file_decryption_properties->plaintext_files_allowed()) {
@@ -611,8 +603,6 @@ class SerializedFile : public ParquetFileReader::Contents {
       ParseMetaDataOfEncryptedFileWithPlaintextFooter(
           file_decryption_properties, metadata_buffer, metadata_len, 
read_metadata_len);
     }
-    END_PARQUET_CATCH_EXCEPTIONS
-    return ::arrow::Status::OK();
   }
 
  private:
@@ -707,20 +697,16 @@ void 
SerializedFile::ParseMetaDataOfEncryptedFileWithPlaintextFooter(
     auto file_decryptor = std::make_shared<InternalFileDecryptor>(
         file_decryption_properties, file_aad, algo.algorithm,
         file_metadata_->footer_signing_key_metadata(), 
properties_.memory_pool());
-    // set the InternalFileDecryptor in the metadata as well, as it's used
-    // for signature verification and for ColumnChunkMetaData creation.
-    file_metadata_->set_file_decryptor(std::move(file_decryptor));
+    // Set the InternalFileDecryptor in the metadata as well, as it's used
+    // for ColumnChunkMetaData creation.
+    file_metadata_->set_file_decryptor(file_decryptor);
 
     if (file_decryption_properties->check_plaintext_footer_integrity()) {
-      if (metadata_len - read_metadata_len !=
-          (parquet::encryption::kGcmTagLength + 
parquet::encryption::kNonceLength)) {
-        throw ParquetInvalidOrCorruptedFileException(
-            "Failed reading metadata for encryption signature (requested ",
-            parquet::encryption::kGcmTagLength + 
parquet::encryption::kNonceLength,
-            " bytes but have ", metadata_len - read_metadata_len, " bytes)");
-      }
-
-      if (!file_metadata_->VerifySignature(metadata_buffer->data() + 
read_metadata_len)) {
+      auto serialized_metadata =
+          metadata_buffer->span_as<uint8_t>().subspan(0, read_metadata_len);
+      auto signature = 
metadata_buffer->span_as<uint8_t>().subspan(read_metadata_len);
+      if (!FileMetaData::VerifySignature(serialized_metadata, signature,
+                                         file_decryptor.get())) {
         throw ParquetInvalidOrCorruptedFileException(
             "Parquet crypto signature verification failed");
       }
@@ -804,7 +790,7 @@ std::unique_ptr<ParquetFileReader::Contents> 
ParquetFileReader::Contents::Open(
   return result;
 }
 
-::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>
+Future<std::unique_ptr<ParquetFileReader::Contents>>
 ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
                                        const ReaderProperties& props,
                                        std::shared_ptr<FileMetaData> metadata) 
{
@@ -815,7 +801,7 @@ 
ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
   if (metadata == nullptr) {
     // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
     struct {
-      ::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>> 
operator()() {
+      Result<std::unique_ptr<ParquetFileReader::Contents>> operator()() {
         return std::move(result);
       }
 
@@ -825,7 +811,7 @@ 
ParquetFileReader::Contents::OpenAsync(std::shared_ptr<ArrowInputFile> source,
     return file->ParseMetaDataAsync().Then(std::move(Continuation));
   } else {
     file->set_metadata(std::move(metadata));
-    return 
::arrow::Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
+    return Future<std::unique_ptr<ParquetFileReader::Contents>>::MakeFinished(
         std::move(result));
   }
   END_PARQUET_CATCH_EXCEPTIONS
@@ -855,24 +841,24 @@ std::unique_ptr<ParquetFileReader> 
ParquetFileReader::OpenFile(
   return Open(std::move(source), props, std::move(metadata));
 }
 
-::arrow::Future<std::unique_ptr<ParquetFileReader>> 
ParquetFileReader::OpenAsync(
+Future<std::unique_ptr<ParquetFileReader>> ParquetFileReader::OpenAsync(
     std::shared_ptr<::arrow::io::RandomAccessFile> source, const 
ReaderProperties& props,
     std::shared_ptr<FileMetaData> metadata) {
   BEGIN_PARQUET_CATCH_EXCEPTIONS
   auto fut = SerializedFile::OpenAsync(std::move(source), props, 
std::move(metadata));
   // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
-  auto completed = ::arrow::Future<std::unique_ptr<ParquetFileReader>>::Make();
-  fut.AddCallback([fut, completed](
-                      const 
::arrow::Result<std::unique_ptr<ParquetFileReader::Contents>>&
-                          contents) mutable {
-    if (!contents.ok()) {
-      completed.MarkFinished(contents.status());
-      return;
-    }
-    std::unique_ptr<ParquetFileReader> result = 
std::make_unique<ParquetFileReader>();
-    result->Open(fut.MoveResult().MoveValueUnsafe());
-    completed.MarkFinished(std::move(result));
-  });
+  auto completed = Future<std::unique_ptr<ParquetFileReader>>::Make();
+  fut.AddCallback(
+      [fut, completed](
+          const Result<std::unique_ptr<ParquetFileReader::Contents>>& 
contents) mutable {
+        if (!contents.ok()) {
+          completed.MarkFinished(contents.status());
+          return;
+        }
+        std::unique_ptr<ParquetFileReader> result = 
std::make_unique<ParquetFileReader>();
+        result->Open(fut.MoveResult().MoveValueUnsafe());
+        completed.MarkFinished(std::move(result));
+      });
   return completed;
   END_PARQUET_CATCH_EXCEPTIONS
 }
@@ -919,7 +905,7 @@ void ParquetFileReader::PreBuffer(const std::vector<int>& 
row_groups,
   file->PreBuffer(row_groups, column_indices, ctx, options);
 }
 
-::arrow::Result<std::vector<::arrow::io::ReadRange>> 
ParquetFileReader::GetReadRanges(
+Result<std::vector<::arrow::io::ReadRange>> ParquetFileReader::GetReadRanges(
     const std::vector<int>& row_groups, const std::vector<int>& column_indices,
     int64_t hole_size_limit, int64_t range_size_limit) {
   // Access private methods here
@@ -929,8 +915,8 @@ void ParquetFileReader::PreBuffer(const std::vector<int>& 
row_groups,
                              range_size_limit);
 }
 
-::arrow::Future<> ParquetFileReader::WhenBuffered(
-    const std::vector<int>& row_groups, const std::vector<int>& 
column_indices) const {
+Future<> ParquetFileReader::WhenBuffered(const std::vector<int>& row_groups,
+                                         const std::vector<int>& 
column_indices) const {
   // Access private methods here
   SerializedFile* file =
       ::arrow::internal::checked_cast<SerializedFile*>(contents_.get());
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index 5def0b8329..505ace275b 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -1169,6 +1169,42 @@ void FileMetaData::WriteTo(::arrow::io::OutputStream* 
dst,
   return impl_->WriteTo(dst, encryptor);
 }
 
+bool FileMetaData::VerifySignature(std::span<const uint8_t> 
serialized_metadata,
+                                   std::span<const uint8_t> signature,
+                                   InternalFileDecryptor* file_decryptor) {
+  DCHECK_NE(file_decryptor, nullptr);
+
+  // In plaintext footer, the "signature" is the concatenation of the nonce 
used
+  // for GCM encryption, and the authentication tag obtained after GCM 
encryption.
+  if (signature.size() != encryption::kGcmTagLength + 
encryption::kNonceLength) {
+    throw ParquetInvalidOrCorruptedFileException(
+        "Invalid footer encryption signature (expected ",
+        encryption::kGcmTagLength + encryption::kNonceLength, " bytes, got ",
+        signature.size(), ")");
+  }
+
+  // Encrypt plaintext serialized metadata so as to compute its signature
+  auto nonce = signature.subspan(0, encryption::kNonceLength);
+  auto tag = signature.subspan(encryption::kNonceLength);
+  const SecureString& key = file_decryptor->GetFooterKey();
+  const std::string& aad = 
encryption::CreateFooterAad(file_decryptor->file_aad());
+
+  auto aes_encryptor = encryption::AesEncryptor::Make(
+      file_decryptor->algorithm(), static_cast<int>(key.size()), 
/*metadata=*/true,
+      /*write_length=*/false);
+
+  std::shared_ptr<Buffer> encrypted_buffer =
+      AllocateBuffer(file_decryptor->pool(),
+                     
aes_encryptor->CiphertextLength(serialized_metadata.size()));
+  int32_t encrypted_len = aes_encryptor->SignedFooterEncrypt(
+      serialized_metadata, key.as_span(), str2span(aad), nonce,
+      encrypted_buffer->mutable_span_as<uint8_t>());
+  DCHECK_EQ(encrypted_len, encrypted_buffer->size());
+  // Check computed signature against expected
+  return 0 == memcmp(encrypted_buffer->data() + encrypted_len - 
encryption::kGcmTagLength,
+                     tag.data(), encryption::kGcmTagLength);
+}
+
 class FileCryptoMetaData::FileCryptoMetaDataImpl {
  public:
   FileCryptoMetaDataImpl() = default;
diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h
index 6d9deee106..5db4905bee 100644
--- a/cpp/src/parquet/metadata.h
+++ b/cpp/src/parquet/metadata.h
@@ -21,6 +21,7 @@
 #include <map>
 #include <memory>
 #include <optional>
+#include <span>
 #include <string>
 #include <vector>
 
@@ -322,8 +323,8 @@ class PARQUET_EXPORT FileMetaData {
   EncryptionAlgorithm encryption_algorithm() const;
   const std::string& footer_signing_key_metadata() const;
 
-  /// \brief Verify signature of FileMetaData when file is encrypted but footer
-  /// is not encrypted (plaintext footer).
+  PARQUET_DEPRECATED(
+      "Deprecated in 24.0.0. If you need this functionality, please report an 
issue.")
   bool VerifySignature(const void* signature);
 
   void WriteTo(::arrow::io::OutputStream* dst,
@@ -383,6 +384,11 @@ class PARQUET_EXPORT FileMetaData {
   void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> 
file_decryptor);
   const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
 
+  // Verify the signature of a plaintext footer.
+  static bool VerifySignature(std::span<const uint8_t> serialized_metadata,
+                              std::span<const uint8_t> signature,
+                              InternalFileDecryptor* file_decryptor);
+
   // PIMPL Idiom
   FileMetaData();
   class FileMetaDataImpl;

Reply via email to