wgtmac commented on code in PR #49880:
URL: https://github.com/apache/arrow/pull/49880#discussion_r3248969477
##########
cpp/src/parquet/bloom_filter_writer.cc:
##########
@@ -225,14 +230,47 @@ IndexLocations
BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink)
}
finished_ = true;
+ // Bloom filter ordinals are encoded as int16 in the AAD when encryption is
enabled.
+ constexpr size_t kEncryptedOrdinalLimit =
std::numeric_limits<int16_t>::max(); // 32767
+
IndexLocations locations;
for (size_t i = 0; i != bloom_filters_.size(); ++i) {
auto& row_group_bloom_filters = bloom_filters_[i];
for (const auto& [column_id, filter] : row_group_bloom_filters) {
// TODO(GH-43138): Determine the quality of bloom filter before writing
it.
PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell());
- filter->WriteTo(sink);
+
+ const auto column_path =
schema_->Column(column_id)->path()->ToDotString();
+ std::shared_ptr<Encryptor> meta_encryptor =
+ file_encryptor_ != nullptr
+ ? file_encryptor_->GetColumnMetaEncryptor(column_path)
+ : nullptr;
+ if (meta_encryptor != nullptr) {
+ const auto& column_props =
properties_->column_encryption_properties(column_path);
+ if (column_props != nullptr && column_props->is_encrypted() &&
+ !column_props->is_encrypted_with_footer_key()) {
+ ParquetException::NYI("Bloom filter writing with a dedicated column
key");
+ }
+ if (ARROW_PREDICT_FALSE(i > kEncryptedOrdinalLimit)) {
+ throw ParquetException(
+ "Encrypted files cannot contain more than 32767 row groups");
+ }
+ if (ARROW_PREDICT_FALSE(static_cast<size_t>(column_id) >
+ kEncryptedOrdinalLimit)) {
+ throw ParquetException(
+ "Encrypted files cannot contain more than 32767 columns");
+ }
+ auto* block_filter =
dynamic_cast<BlockSplitBloomFilter*>(filter.get());
+ if (block_filter == nullptr) {
+ throw ParquetException(
+ "Only BlockSplitBloomFilter is supported for encrypted bloom
filters");
+ }
+ block_filter->WriteEncrypted(sink, meta_encryptor.get(),
static_cast<int16_t>(i),
Review Comment:
Column metadata is encrypted when the column is closed (metadata.cc:1765), but
the bloom filter offsets are only set later (metadata.cc:2066). For metadata
encrypted with a column key, readers use the decrypted
encrypted_column_metadata, so they would miss the bloom filter offset/length
unless the order of metadata encryption is corrected.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]