fenfeng9 commented on code in PR #49334:
URL: https://github.com/apache/arrow/pull/49334#discussion_r2966096611
##########
cpp/src/parquet/bloom_filter_reader.cc:
##########
@@ -76,10 +82,32 @@ std::unique_ptr<BloomFilter>
RowGroupBloomFilterReaderImpl::GetColumnBloomFilter
"bloom filter length + bloom filter offset greater than file size");
}
}
- auto stream = ::arrow::io::RandomAccessFile::GetStream(
- input_, *bloom_filter_offset, file_size - *bloom_filter_offset);
+
+ std::unique_ptr<ColumnCryptoMetaData> crypto_metadata =
col_chunk->crypto_metadata();
+ std::unique_ptr<Decryptor> decryptor =
+
InternalFileDecryptor::GetColumnMetaDecryptorFactory(file_decryptor_.get(),
+
crypto_metadata.get())();
+ if (decryptor != nullptr) {
+ constexpr auto kEncryptedOrdinalLimit = 32767;
+ if (ARROW_PREDICT_FALSE(row_group_ordinal_ > kEncryptedOrdinalLimit)) {
+ throw ParquetException("Encrypted files cannot contain more than 32767
row groups");
+ }
+ if (ARROW_PREDICT_FALSE(i > kEncryptedOrdinalLimit)) {
+ throw ParquetException("Encrypted files cannot contain more than 32767
columns");
+ }
+ }
+
+ const int64_t stream_length =
+ bloom_filter_length ? *bloom_filter_length : file_size -
*bloom_filter_offset;
+ auto stream = ::arrow::io::RandomAccessFile::GetStream(input_,
*bloom_filter_offset,
+ stream_length);
auto bloom_filter =
- BlockSplitBloomFilter::Deserialize(properties_, stream->get(),
bloom_filter_length);
+ decryptor != nullptr
+ ? BlockSplitBloomFilter::DeserializeEncrypted(
+ properties_, stream->get(), bloom_filter_length,
decryptor.get(),
+ static_cast<int16_t>(row_group_ordinal_),
static_cast<int16_t>(i))
+ : BlockSplitBloomFilter::Deserialize(properties_, stream->get(),
+ bloom_filter_length);
Review Comment:
Simplified this by using a single metadata decryptor.
The encrypted path now goes through `DeserializeEncrypted(...)`, which reuses
the same decryptor while switching the module AAD between the Bloom filter
header and bitset.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
users@infra.apache.org