wgtmac commented on code in PR #49334:
URL: https://github.com/apache/arrow/pull/49334#discussion_r2924858353


##########
cpp/src/parquet/bloom_filter.h:
##########
@@ -29,6 +29,8 @@
 
 namespace parquet {
 
+class Decryptor;

Review Comment:
   nit: we can include `parquet/encryption/type_fwd.h` instead



##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -106,7 +126,100 @@ static ::arrow::Status ValidateBloomFilterHeader(
 
 BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
     const ReaderProperties& properties, ArrowInputStream* input,
-    std::optional<int64_t> bloom_filter_length) {
+    std::optional<int64_t> bloom_filter_length, Decryptor* header_decryptor,
+    Decryptor* bitset_decryptor) {
+  if (header_decryptor != nullptr || bitset_decryptor != nullptr) {

Review Comment:
   Perhaps add a separate function like 
`BlockSplitBloomFilter::DeserializeEncrypted`?



##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -106,7 +126,100 @@ static ::arrow::Status ValidateBloomFilterHeader(
 
 BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
     const ReaderProperties& properties, ArrowInputStream* input,
-    std::optional<int64_t> bloom_filter_length) {
+    std::optional<int64_t> bloom_filter_length, Decryptor* header_decryptor,
+    Decryptor* bitset_decryptor) {
+  if (header_decryptor != nullptr || bitset_decryptor != nullptr) {
+    if (header_decryptor == nullptr || bitset_decryptor == nullptr) {
+      throw ParquetException(
+          "Bloom filter decryptors must be both provided or both null");
+    }
+
+    // Encrypted path: header and bitset are encrypted separately.
+    ThriftDeserializer deserializer(properties);
+    format::BloomFilterHeader header;
+
+    // Read the length-prefixed ciphertext for the header.
+    PARQUET_ASSIGN_OR_THROW(auto length_buf, 
input->Read(kCiphertextLengthSize));
+    if (ARROW_PREDICT_FALSE(length_buf->size() < kCiphertextLengthSize)) {
+      throw ParquetException("Bloom filter header read failed: not enough 
data");
+    }
+
+    const int64_t header_cipher_total_len =
+        ParseCiphertextTotalLength(length_buf->data(), length_buf->size());
+    if (ARROW_PREDICT_FALSE(header_cipher_total_len >
+                            std::numeric_limits<int32_t>::max())) {
+      throw ParquetException("Bloom filter header ciphertext length overflows 
int32");
+    }
+    if (bloom_filter_length && header_cipher_total_len > *bloom_filter_length) 
{
+      throw ParquetException(
+          "Bloom filter length less than encrypted bloom filter header 
length");
+    }
+    // Read the full header ciphertext and decrypt the Thrift header.
+    auto header_cipher_buf =
+        AllocateBuffer(properties.memory_pool(), header_cipher_total_len);
+    std::memcpy(header_cipher_buf->mutable_data(), length_buf->data(),
+                kCiphertextLengthSize);
+    const int64_t header_cipher_remaining =
+        header_cipher_total_len - kCiphertextLengthSize;
+    PARQUET_ASSIGN_OR_THROW(
+        auto read_size,
+        input->Read(header_cipher_remaining,

Review Comment:
   I don't remember if `Read` may short read and it's the caller's 
responsibility to use a loop to read enough data. 



##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -106,7 +126,100 @@ static ::arrow::Status ValidateBloomFilterHeader(
 
 BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
     const ReaderProperties& properties, ArrowInputStream* input,
-    std::optional<int64_t> bloom_filter_length) {
+    std::optional<int64_t> bloom_filter_length, Decryptor* header_decryptor,
+    Decryptor* bitset_decryptor) {
+  if (header_decryptor != nullptr || bitset_decryptor != nullptr) {
+    if (header_decryptor == nullptr || bitset_decryptor == nullptr) {
+      throw ParquetException(
+          "Bloom filter decryptors must be both provided or both null");
+    }
+
+    // Encrypted path: header and bitset are encrypted separately.
+    ThriftDeserializer deserializer(properties);
+    format::BloomFilterHeader header;
+
+    // Read the length-prefixed ciphertext for the header.
+    PARQUET_ASSIGN_OR_THROW(auto length_buf, 
input->Read(kCiphertextLengthSize));
+    if (ARROW_PREDICT_FALSE(length_buf->size() < kCiphertextLengthSize)) {
+      throw ParquetException("Bloom filter header read failed: not enough 
data");

Review Comment:
   Let's include the length for debugging purpose.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to