This is an automated email from the ASF dual-hosted git repository.
maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new f4a63d41eb GH-44042: [C++][Parquet] Limit num-of row-groups when
building parquet for encrypted file (# 44043)
f4a63d41eb is described below
commit f4a63d41ebbc57566f215c1d1e87fc1647071dae
Author: mwish <[email protected]>
AuthorDate: Fri Jan 24 17:51:50 2025 +0800
GH-44042: [C++][Parquet] Limit num-of row-groups when building parquet for
encrypted file (# 44043)
### Rationale for this change
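Limit the number of row groups when building an encrypted Parquet file: the row group ordinal is stored as an int16, so an ordinal above 32767 cannot be represented and was previously truncated by a static_cast.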
### What changes are included in this PR?
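The writer now throws a ParquetException instead of silently truncating the row group ordinal once an encrypted file exceeds the int16 range, and only sets the row group ordinal when a file encryptor is present. The reader checks both the row group ordinal and the column ordinal against the 32767 limit before building the CryptoContext, and falls back to -1 when the row group ordinal is not set in the metadata.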
### Are these changes tested?
No
### Are there any user-facing changes?
No
* GitHub Issue: #44042
Lead-authored-by: mwish <[email protected]>
Co-authored-by: mwish <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: mwish <[email protected]>
---
cpp/src/parquet/file_reader.cc | 12 ++++++++----
cpp/src/parquet/file_writer.cc | 15 +++++++++++++--
cpp/src/parquet/metadata.cc | 27 +++++++++++++++------------
3 files changed, 36 insertions(+), 18 deletions(-)
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index f422565407..1f3ca8b910 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -266,13 +266,17 @@ class SerializedRowGroup : public
RowGroupReader::Contents {
ARROW_DCHECK_NE(meta_decryptor, nullptr);
ARROW_DCHECK_NE(data_decryptor, nullptr);
- constexpr auto kEncryptedRowGroupsLimit = 32767;
- if (i > kEncryptedRowGroupsLimit) {
+ constexpr auto kEncryptedOrdinalLimit = 32767;
+ if (ARROW_PREDICT_FALSE(row_group_ordinal_ > kEncryptedOrdinalLimit)) {
throw ParquetException("Encrypted files cannot contain more than 32767
row groups");
}
+ if (ARROW_PREDICT_FALSE(i > kEncryptedOrdinalLimit)) {
+ throw ParquetException("Encrypted files cannot contain more than 32767
columns");
+ }
- CryptoContext ctx(col->has_dictionary_page(), row_group_ordinal_,
- static_cast<int16_t>(i), meta_decryptor, data_decryptor);
+ CryptoContext ctx(col->has_dictionary_page(),
+ static_cast<int16_t>(row_group_ordinal_),
static_cast<int16_t>(i),
+ meta_decryptor, data_decryptor);
return PageReader::Open(stream, col->num_values(), col->compression(),
properties_,
always_compressed, &ctx);
}
diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc
index 854e2161f8..f80a095a13 100644
--- a/cpp/src/parquet/file_writer.cc
+++ b/cpp/src/parquet/file_writer.cc
@@ -359,14 +359,25 @@ class FileSerializer : public ParquetFileWriter::Contents
{
if (row_group_writer_) {
row_group_writer_->Close();
}
+ int16_t row_group_ordinal = -1; // row group ordinal not set
+ if (file_encryptor_ != nullptr) {
+ // Parquet thrifts using int16 for row group ordinal, so we can't have
more than
+ // 32767 row groups in a file.
+ if (num_row_groups_ <= std::numeric_limits<int16_t>::max()) {
+ row_group_ordinal = static_cast<int16_t>(num_row_groups_);
+ } else {
+ throw ParquetException(
+ "Cannot write more than 32767 row groups in an encrypted file");
+ }
+ }
num_row_groups_++;
auto rg_metadata = metadata_->AppendRowGroup();
if (page_index_builder_) {
page_index_builder_->AppendRowGroup();
}
std::unique_ptr<RowGroupWriter::Contents> contents(new RowGroupSerializer(
- sink_, rg_metadata, static_cast<int16_t>(num_row_groups_ - 1),
properties_.get(),
- buffered_row_group, file_encryptor_.get(), page_index_builder_.get()));
+ sink_, rg_metadata, row_group_ordinal, properties_.get(),
buffered_row_group,
+ file_encryptor_.get(), page_index_builder_.get()));
row_group_writer_ = std::make_unique<RowGroupWriter>(std::move(contents));
return row_group_writer_.get();
}
diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc
index f47c614219..9b53da021f 100644
--- a/cpp/src/parquet/metadata.cc
+++ b/cpp/src/parquet/metadata.cc
@@ -219,30 +219,29 @@ const std::string& ColumnCryptoMetaData::key_metadata()
const {
// ColumnChunk metadata
class ColumnChunkMetaData::ColumnChunkMetaDataImpl {
public:
- explicit ColumnChunkMetaDataImpl(const format::ColumnChunk* column,
- const ColumnDescriptor* descr,
- int16_t row_group_ordinal, int16_t
column_ordinal,
- const ReaderProperties& properties,
- const ApplicationVersion* writer_version,
- std::shared_ptr<InternalFileDecryptor>
file_decryptor)
+ explicit ColumnChunkMetaDataImpl(
+ const format::ColumnChunk* column, const ColumnDescriptor* descr,
+ int16_t row_group_ordinal, int16_t column_ordinal,
+ const ReaderProperties& properties, const ApplicationVersion*
writer_version,
+ const std::shared_ptr<InternalFileDecryptor>& file_decryptor)
: column_(column),
descr_(descr),
properties_(properties),
writer_version_(writer_version) {
column_metadata_ = &column->meta_data;
if (column->__isset.crypto_metadata) { // column metadata is encrypted
- format::ColumnCryptoMetaData ccmd = column->crypto_metadata;
+ const format::ColumnCryptoMetaData& ccmd = column->crypto_metadata;
if (ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY) {
if (file_decryptor != nullptr && file_decryptor->properties() !=
nullptr) {
// should decrypt metadata
std::shared_ptr<schema::ColumnPath> path =
std::make_shared<schema::ColumnPath>(
ccmd.ENCRYPTION_WITH_COLUMN_KEY.path_in_schema);
- std::string key_metadata =
ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
+ const std::string& key_metadata =
ccmd.ENCRYPTION_WITH_COLUMN_KEY.key_metadata;
std::string aad_column_metadata = encryption::CreateModuleAad(
file_decryptor->file_aad(), encryption::kColumnMetaData,
row_group_ordinal,
- column_ordinal, static_cast<int16_t>(-1));
+ column_ordinal, /*page_ordinal=*/static_cast<int16_t>(-1));
auto decryptor = file_decryptor->GetColumnMetaDecryptor(
path->ToDotString(), key_metadata, aad_column_metadata);
auto len =
static_cast<uint32_t>(column->encrypted_column_metadata.size());
@@ -565,9 +564,11 @@ class RowGroupMetaData::RowGroupMetaDataImpl {
std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int i) {
if (i >= 0 && i < num_columns()) {
+ int16_t row_group_ordinal =
+ row_group_->__isset.ordinal ? row_group_->ordinal :
static_cast<int16_t>(-1);
return ColumnChunkMetaData::Make(&row_group_->columns[i],
schema_->Column(i),
- properties_, writer_version_,
row_group_->ordinal,
- i, file_decryptor_);
+ properties_, writer_version_,
row_group_ordinal, i,
+ file_decryptor_);
}
throw ParquetException("The file only has ", num_columns(),
" columns, requested metadata for column: ", i);
@@ -1854,7 +1855,9 @@ class
RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl {
row_group_->__set_file_offset(file_offset);
row_group_->__set_total_compressed_size(total_compressed_size);
row_group_->__set_total_byte_size(total_bytes_written);
- row_group_->__set_ordinal(row_group_ordinal);
+ if (row_group_ordinal >= 0) {
+ row_group_->__set_ordinal(row_group_ordinal);
+ }
}
void set_num_rows(int64_t num_rows) { row_group_->num_rows = num_rows; }