wgtmac commented on code in PR #49334:
URL: https://github.com/apache/arrow/pull/49334#discussion_r2973508436
##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -104,6 +126,111 @@ static ::arrow::Status ValidateBloomFilterHeader(
return ::arrow::Status::OK();
}
+} // namespace
+
+BlockSplitBloomFilter BlockSplitBloomFilter::DeserializeEncrypted(
+ const ReaderProperties& properties, ArrowInputStream* input,
+ std::optional<int64_t> bloom_filter_length, Decryptor* decryptor,
+ int16_t row_group_ordinal, int16_t column_ordinal) {
+ if (decryptor == nullptr) {
+ throw ParquetException("Bloom filter decryptor must be provided");
+ }
+
+ ThriftDeserializer deserializer(properties);
+ format::BloomFilterHeader header;
+
+ // Read the length-prefixed ciphertext for the header.
+ PARQUET_ASSIGN_OR_THROW(auto length_buf, input->Read(kCiphertextLengthSize));
+ if (ARROW_PREDICT_FALSE(length_buf->size() < kCiphertextLengthSize)) {
+ std::stringstream ss;
+ ss << "Bloom filter header read failed: expected " << kCiphertextLengthSize
+ << " bytes, got " << length_buf->size();
+ throw ParquetException(ss.str());
+ }
+
+ const int64_t header_cipher_total_len =
+ ParseCiphertextTotalLength(length_buf->data(), length_buf->size());
+ if (ARROW_PREDICT_FALSE(header_cipher_total_len >
+ std::numeric_limits<int32_t>::max())) {
+ throw ParquetException("Bloom filter header ciphertext length overflows int32");
+ }
+ if (bloom_filter_length && header_cipher_total_len > *bloom_filter_length) {
+ throw ParquetException(
+ "Bloom filter length less than encrypted bloom filter header length");
+ }
+ // Read the full header ciphertext and decrypt the Thrift header.
+ auto header_cipher_buf =
+ AllocateBuffer(properties.memory_pool(), header_cipher_total_len);
+ std::memcpy(header_cipher_buf->mutable_data(), length_buf->data(),
+ kCiphertextLengthSize);
+ const int64_t header_cipher_remaining = header_cipher_total_len - kCiphertextLengthSize;
+ PARQUET_ASSIGN_OR_THROW(auto read_size,
+ input->Read(header_cipher_remaining,
+ header_cipher_buf->mutable_data() + kCiphertextLengthSize));
+ if (ARROW_PREDICT_FALSE(read_size < header_cipher_remaining)) {
+ std::stringstream ss;
ss << "Bloom filter header read failed: expected " << header_cipher_remaining
+ << " bytes, got " << read_size;
+ throw ParquetException(ss.str());
+ }
+
+ // Bloom filter header and bitset are separate encrypted modules with different AADs.
+ UpdateDecryptor(decryptor, row_group_ordinal, column_ordinal,
+ encryption::kBloomFilterHeader);
+ uint32_t header_cipher_len = static_cast<uint32_t>(header_cipher_total_len);
+ try {
deserializer.DeserializeMessage(header_cipher_buf->data(), &header_cipher_len,
+ &header, decryptor);
+ DCHECK_EQ(header_cipher_len, header_cipher_total_len);
Review Comment:
Should we throw instead of a debug check just like line 224?
##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -104,6 +126,111 @@ static ::arrow::Status ValidateBloomFilterHeader(
return ::arrow::Status::OK();
}
+} // namespace
+
+BlockSplitBloomFilter BlockSplitBloomFilter::DeserializeEncrypted(
+ const ReaderProperties& properties, ArrowInputStream* input,
+ std::optional<int64_t> bloom_filter_length, Decryptor* decryptor,
+ int16_t row_group_ordinal, int16_t column_ordinal) {
+ if (decryptor == nullptr) {
+ throw ParquetException("Bloom filter decryptor must be provided");
+ }
+
+ ThriftDeserializer deserializer(properties);
+ format::BloomFilterHeader header;
+
+ // Read the length-prefixed ciphertext for the header.
+ PARQUET_ASSIGN_OR_THROW(auto length_buf, input->Read(kCiphertextLengthSize));
Review Comment:
If `bloom_filter_length` is known, we are able to issue a single IO for the
entire bloom filter. I think this is worth doing to add a fast path as it's the
main goal of adding `bloom_filter_length` to the spec.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]