This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new f4a63d41eb GH-44042: [C++][Parquet] Limit num-of row-groups when 
building parquet for encrypted file (# 44043)
f4a63d41eb is described below

commit f4a63d41ebbc57566f215c1d1e87fc1647071dae
Author: mwish <[email protected]>
AuthorDate: Fri Jan 24 17:51:50 2025 +0800

    GH-44042: [C++][Parquet] Limit num-of row-groups when building parquet for 
encrypted file (# 44043)
    
    
    
    ### Rationale for this change
    
    Limit num-of row-groups when build parquet
    
    ### What changes are included in this PR?
    
    Limit num-of row-groups when build parquet
    
    ### Are these changes tested?
    
    No
    
    ### Are there any user-facing changes?
    
    No
    
    * GitHub Issue: #44042
    
    Lead-authored-by: mwish <[email protected]>
    Co-authored-by: mwish <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: mwish <[email protected]>
---
 cpp/src/parquet/file_reader.cc | 12 ++++++++----
 cpp/src/parquet/file_writer.cc | 15 +++++++++++++--
 cpp/src/parquet/metadata.cc    | 27 +++++++++++++++------------
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index f422565407..1f3ca8b910 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -266,13 +266,17 @@ class SerializedRowGroup : public 
RowGroupReader::Contents {
     ARROW_DCHECK_NE(meta_decryptor, nullptr);
     ARROW_DCHECK_NE(data_decryptor, nullptr);
 
-    constexpr auto kEncryptedRowGroupsLimit = 32767;
-    if (i > kEncryptedRowGroupsLimit) {
+    constexpr auto kEncryptedOrdinalLimit = 32767;
+    if (ARROW_PREDICT_FALSE(row_group_ordinal_ > kEncryptedOrdinalLimit)) {
       throw ParquetException("Encrypted files cannot contain more than 32767 
row groups");
     }
+    if (ARROW_PREDICT_FALSE(i > kEncryptedOrdinalLimit)) {
+      throw ParquetException("Encrypted files cannot contain more than 32767 
columns");
+    }
 
-    CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
-                      static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+    CryptoContext ctx(col->has_dictionary_page(),
+                      static_cast<int16_t>(row_group_ordinal_), 
static_cast<int16_t>(i),
+                      meta_decryptor, data_decryptor);
     return PageReader::Open(stream, col->num_values(), col->compression(), 
properties_,
                             always_compressed, &ctx);
   }
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index 854e2161f8..f80a095a13 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -359,14 +359,25 @@ class FileSerializer : public ParquetFileWriter::Contents 
{
     if (row_group_writer_) {
       row_group_writer_->Close();
     }
+    int16_t row_group_ordinal = -1;  // row group ordinal not set
+    if (file_encryptor_ != nullptr) {
+      // Parquet thrifts using int16 for row group ordinal, so we can't have 
more than
+      // 32767 row groups in a file.
+      if (num_row_groups_ <= std::numeric_limits<int16_t>::max()) {
+        row_group_ordinal = static_cast<int16_t>(num_row_groups_);
+      } else {
+        throw ParquetException(
+            "Cannot write more than 32767 row groups in an encrypted file");
+      }
+    }
     num_row_groups_++;
     auto rg_metadata = metadata_->AppendRowGroup();
     if (page_index_builder_) {
       page_index_builder_->AppendRowGroup();
     }
     std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
-        sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1), 
properties_.get(),
-        buffered_row_group, file_encryptor_.get(), page_index_builder_.get()));
+        sink_, rg_metadata, row_group_ordinal, properties_.get(), 
buffered_row_group,
+        file_encryptor_.get(), page_index_builder_.get()));
     row_group_writer_ = std::make_unique<RowGroupWriter>(std::move(contents));
     return row_group_writer_.get();
   }
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index f47c614219..9b53da021f 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -219,30 +219,29 @@ const std::string& ColumnCryptoMetaData::key_metadata() 
const {
 // ColumnChunk metadata
 class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
  public:
-  explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
-                                   const ColumnDescriptor* descr,
-                                   int16_t row_group_ordinal, int16_t 
column_ordinal,
-                                   const ReaderProperties& properties,
-                                   const ApplicationVersion* writer_version,
-                                   std::shared_ptr<InternalFileDecryptor> 
file_decryptor)
+  explicit ColumnChunkMetaDataImpl(
+      const format::ColumnChunk* column, const ColumnDescriptor* descr,
+      int16_t row_group_ordinal, int16_t column_ordinal,
+      const ReaderProperties& properties, const ApplicationVersion* 
writer_version,
+      const std::shared_ptr<InternalFileDecryptor>& file_decryptor)
       : column_(column),
         descr_(descr),
         properties_(properties),
         writer_version_(writer_version) {
     column_metadata_ = &column->meta_data;
     if (column->__isset.crypto_metadata) {  // column metadata is encrypted
-      format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
+      const format::ColumnCryptoMetaData& ccmd = column->crypto_metadata;
 
       if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
         if (file_decryptor != nullptr && file_decryptor->properties() != 
nullptr) {
           // should decrypt metadata
           std::shared_ptr<schema::ColumnPath> path = 
std::make_shared<schema::ColumnPath>(
               ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
-          std::string key_metadata = 
ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+          const std::string& key_metadata = 
ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
 
           std::string aad_column_metadata = encryption::CreateModuleAad(
               file_decryptor->file_aad(), encryption::kColumnMetaData, 
row_group_ordinal,
-              column_ordinal, static_cast<int16_t>(-1));
+              column_ordinal, /*page_ordinal=*/static_cast<int16_t>(-1));
           auto decryptor = file_decryptor->GetColumnMetaDecryptor(
               path->ToDotString(), key_metadata, aad_column_metadata);
           auto len = 
static_cast<uint32_t>(column->encrypted_column_metadata.size());
@@ -565,9 +564,11 @@ class RowGroupMetaData::RowGroupMetaDataImpl {
 
   std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
     if (i >= 0 && i < num_columns()) {
+      int16_t row_group_ordinal =
+          row_group_->__isset.ordinal ? row_group_->ordinal : 
static_cast<int16_t>(-1);
       return ColumnChunkMetaData::Make(&row_group_->columns[i], 
schema_->Column(i),
-                                       properties_, writer_version_, 
row_group_->ordinal,
-                                       i, file_decryptor_);
+                                       properties_, writer_version_, 
row_group_ordinal, i,
+                                       file_decryptor_);
     }
     throw ParquetException("The file only has ", num_columns(),
                            " columns, requested metadata for column: ", i);
@@ -1854,7 +1855,9 @@ class 
RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
     row_group_->__set_file_offset(file_offset);
     row_group_->__set_total_compressed_size(total_compressed_size);
     row_group_->__set_total_byte_size(total_bytes_written);
-    row_group_->__set_ordinal(row_group_ordinal);
+    if (row_group_ordinal >= 0) {
+      row_group_->__set_ordinal(row_group_ordinal);
+    }
   }
 
   void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }

Reply via email to