github-actions[bot] commented on code in PR #28527:
URL: https://github.com/apache/doris/pull/28527#discussion_r1428964688


##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];

Review Comment:
   warning: redundant 'SALT' declaration [readability-redundant-declaration]
   
   ```suggestion
   
   ```
   <details>
   <summary>Additional context</summary>
   
   **be/src/vec/exec/format/parquet/parquet_bloom.h:331:** previously declared here
   ```cpp
     static constexpr uint32_t SALT[kBitsSetPerBlock] = {
                               ^
   ```
   
   </details>
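   
   For context on the fix: since C++17 a `static constexpr` data member is implicitly `inline`, so the separate out-of-class definition in the .cpp file is redundant and can simply be deleted. A minimal sketch of the rule, using a hypothetical class rather than the PR's actual types:
   
   ```cpp
   #include <cstdint>
   
   struct Salted {
       // Implicitly inline in C++17: this declaration is also the definition.
       static constexpr std::uint32_t kSalt[2] = {0x12345678U, 0x9abcdef0U};
   };
   
   // Pre-C++17, ODR-use required an out-of-class definition; in C++17 it is
   // redundant, which is exactly what readability-redundant-declaration flags:
   // constexpr std::uint32_t Salted::kSalt[2];
   
   int main() {
       const std::uint32_t* p = Salted::kSalt;  // ODR-use works without the definition
       return p[0] == 0x12345678U ? 0 : 1;
   }
   ```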
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);
+    st = deserialize_thrift_msg(hdr, (uint32_t*)&header_size, true, &header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    st = ValidateBloomFilterHeader(header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    const int32_t bloom_filter_size = header.numBytes;
+    if (bloom_filter_size + header_size <= bytes_read) {
+        // The bloom filter data is entirely contained in the buffer we just read
+        // => just return it.
+        BlockSplitBloomFilter bloom_filter;
+        bloom_filter.Init(hdr + header_size, bloom_filter_size);
+        return bloom_filter;
+    }
+    // We have read a part of the bloom filter already, copy it to the target buffer
+    // and read the remaining part from the InputStream.
+    auto buffer = new (std::nothrow) uint8_t[bloom_filter_size];
+
+    const auto bloom_filter_bytes_in_header = bytes_read - header_size;
+    if (bloom_filter_bytes_in_header > 0) {
+        std::memcpy(buffer, hdr + header_size, bloom_filter_bytes_in_header);
+    }
+
+    const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
+    auto read_size = required_read_size;
+    st = file_reader_s->read_at(bloom_offset + bytes_read,
+        Slice(buffer + bloom_filter_bytes_in_header, required_read_size), &read_size,
+        io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+    BlockSplitBloomFilter bloom_filter;
+    bloom_filter.Init(buffer, bloom_filter_size);
+    return bloom_filter;
+}
+
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);
+    const uint32_t* bitset32 = reinterpret_cast<const uint32_t*>(data_);
+
+    for (int i = 0; i < kBitsSetPerBlock; ++i) {
+        // Calculate mask for key in the given bitset.
+        const uint32_t mask = UINT32_C(0x1) << ((key * SALT[i]) >> 27);
+        if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & mask)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void BlockSplitBloomFilter::InsertHashImpl(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       const auto key = static_cast<uint32_t>(hash);
   ```
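   
   The rewrite is purely cosmetic: when the initializer is a cast, the target type is already spelled on the right-hand side, so `auto` avoids naming it twice. A self-contained before/after sketch (standalone example, not the PR's code):
   
   ```cpp
   #include <cstdint>
   
   int main() {
       const std::uint64_t hash = 0x0123456789abcdefULL;
       const std::uint32_t key_before = static_cast<std::uint32_t>(hash);  // type named twice
       const auto key_after = static_cast<std::uint32_t>(hash);            // clang-tidy's form
       return key_before == key_after ? 0 : 1;  // identical behavior
   }
   ```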
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);

Review Comment:
   warning: use nullptr [modernize-use-nullptr]
   
   ```suggestion
       memcpy(data_, nullptr, num_bytes_);
   ```
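   
   Applying this suggestion verbatim would keep the underlying bug: `memcpy` with a null source pointer is undefined behavior whether it is spelled `0` or `nullptr`. The likely intent (an assumption, not the author's confirmed fix) is `memset` for the zero-fill path, and a real copy from `bitset` in this overload:
   
   ```cpp
   #include <cstdint>
   #include <cstring>
   
   int main() {
       std::uint8_t data[32];
       std::memset(data, 0, sizeof(data));  // zero-initialize a fresh bitset
   
       const std::uint8_t bitset[32] = {1, 2, 3};
       std::memcpy(data, bitset, sizeof(data));  // copy a caller-provided bitset
       return data[2] == 3 ? 0 : 1;
   }
   ```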
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);
+    st = deserialize_thrift_msg(hdr, (uint32_t*)&header_size, true, &header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    st = ValidateBloomFilterHeader(header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    const int32_t bloom_filter_size = header.numBytes;
+    if (bloom_filter_size + header_size <= bytes_read) {
+        // The bloom filter data is entirely contained in the buffer we just read
+        // => just return it.
+        BlockSplitBloomFilter bloom_filter;
+        bloom_filter.Init(hdr + header_size, bloom_filter_size);
+        return bloom_filter;
+    }
+    // We have read a part of the bloom filter already, copy it to the target buffer
+    // and read the remaining part from the InputStream.
+    auto buffer = new (std::nothrow) uint8_t[bloom_filter_size];
+
+    const auto bloom_filter_bytes_in_header = bytes_read - header_size;
+    if (bloom_filter_bytes_in_header > 0) {
+        std::memcpy(buffer, hdr + header_size, bloom_filter_bytes_in_header);
+    }
+
+    const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
+    auto read_size = required_read_size;
+    st = file_reader_s->read_at(bloom_offset + bytes_read,
+        Slice(buffer + bloom_filter_bytes_in_header, required_read_size), &read_size,
+        io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+    BlockSplitBloomFilter bloom_filter;
+    bloom_filter.Init(buffer, bloom_filter_size);
+    return bloom_filter;
+}
+
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       const auto key = static_cast<uint32_t>(hash);
   ```
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.h:
##########
@@ -0,0 +1,355 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gen_cpp/parquet_types.h>
+#include <io/fs/file_reader.h>
+#include <parquet/hasher.h>
+#include <parquet/types.h>
+
+namespace doris::vectorized {
+class BloomFilter {
+public:
+    // Maximum Bloom filter size, it sets to HDFS default block size 128MB
+    // This value will be reconsidered when implementing Bloom filter producer.
+    static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
+
+    /// Determine whether an element exist in set or not.
+    ///
+    /// @param hash the element to contain.
+    /// @return false if value is definitely not in set, and true means PROBABLY
+    /// in set.
+    virtual bool FindHash(uint64_t hash) const = 0;
+
+    /// Insert element to set represented by Bloom filter bitset.
+    /// @param hash the hash of value to insert into Bloom filter.
+    virtual void InsertHash(uint64_t hash) = 0;
+
+    /// Insert elements to set represented by Bloom filter bitset.
+    /// @param hashes the hash values to insert into Bloom filter.
+    /// @param num_values the number of hash values to insert.
+    virtual void InsertHashes(const uint64_t* hashes, int num_values) = 0;
+
+    /// Get the number of bytes of bitset
+    virtual uint32_t GetBitsetSize() const = 0;
+
+    /// Compute hash for 32 bits value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t Hash(int32_t value) const = 0;
+
+    /// Compute hash for 64 bits value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t Hash(int64_t value) const = 0;
+
+    /// Compute hash for float value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t Hash(float value) const = 0;
+
+    /// Compute hash for double value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t Hash(double value) const = 0;
+
+    /// Compute hash for Int96 value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t Hash(const parquet::Int96* value) const = 0;
+
+    /// Compute hash for ByteArray value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t Hash(const parquet::ByteArray* value) const = 0;
+
+    /// Compute hash for fixed byte array value by using its plain encoding result.
+    ///
+    /// @param value the value address.
+    /// @param len the value length.
+    /// @return hash result.
+    virtual uint64_t Hash(const parquet::FLBA* value, uint32_t len) const = 0;
+
+    /// Batch compute hashes for 32 bits values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for 64 bits values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for float values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for double values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for Int96 values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const parquet::Int96* values, int num_values, uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for ByteArray values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const parquet::ByteArray* values, int num_values,
+                        uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for fixed byte array values by using its plain encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param type_len the value length.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should be equal to
+    /// num_values.
+    virtual void Hashes(const parquet::FLBA* values, uint32_t type_len, int num_values,
+                        uint64_t* hashes) const = 0;
+
+    virtual ~BloomFilter() = default;
+
+protected:
+    // Hash strategy available for Bloom filter.
+    enum class HashStrategy : uint32_t { XXHASH = 0 };
+
+    // Bloom filter algorithm.
+    enum class Algorithm : uint32_t { BLOCK = 0 };
+
+    enum class CompressionStrategy : uint32_t { UNCOMPRESSED = 0 };
+};
+
+constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
+
+static inline int64_t NextPower2(int64_t n) {
+    // Taken from
+    // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+    n--;
+    n |= n >> 1;
+    n |= n >> 2;
+    n |= n >> 4;
+    n |= n >> 8;
+    n |= n >> 16;
+    n |= n >> 32;
+    n++;
+    return n;
+}
+
+/// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
+/// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
+/// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
+///
+/// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
+/// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
+class BlockSplitBloomFilter : public BloomFilter {
+ public:
+  /// The constructor of BlockSplitBloomFilter. It uses XXH64 as hash function.
+  explicit BlockSplitBloomFilter();
+
+  /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
+  /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
+  /// rounded up/down to lower/upper bound if num_bytes is out of range and also
+  /// will be rounded up to a power of 2.
+  ///
+  /// @param num_bytes The number of bytes to store Bloom filter bitset.
+  void Init(uint32_t num_bytes);
+
+  /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
+  /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
+  /// which may lead to segfault when performing SIMD instructions. It is the caller's
+  /// responsibility to free the bitset passed in. This is used when reconstructing
+  /// a Bloom filter from a parquet file.
+  ///
+  /// @param bitset The given bitset to initialize the Bloom filter.
+  /// @param num_bytes  The number of bytes of given bitset.
+  void Init(const uint8_t* bitset, uint32_t num_bytes);
+
+  /// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
+  static constexpr uint32_t kMinimumBloomFilterBytes = 32;
+
+  /// Calculate optimal size according to the number of distinct values and false
+  /// positive probability.
+  ///
+  /// @param ndv The number of distinct values.
+  /// @param fpp The false positive probability.
+  /// @return it always return a value between kMinimumBloomFilterBytes and
+  /// kMaximumBloomFilterBytes, and the return value is always a power of 2
+  static uint32_t OptimalNumOfBytes(uint32_t ndv, double fpp) {
+    uint32_t optimal_num_of_bits = OptimalNumOfBits(ndv, fpp);
+    DCHECK(IsMultipleOf8(optimal_num_of_bits));
+    return optimal_num_of_bits >> 3;
+  }
+
+  /// Calculate optimal size according to the number of distinct values and false
+  /// positive probability.
+  ///
+  /// @param ndv The number of distinct values.
+  /// @param fpp The false positive probability.
+  /// @return it always return a value between kMinimumBloomFilterBytes * 8 and
+  /// kMaximumBloomFilterBytes * 8, and the return value is always a power of 16
+  static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
+    DCHECK(fpp > 0.0 && fpp < 1.0);
+    const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
+    uint32_t num_bits;
+
+    // Handle overflow.
+    if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
+      num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
+    } else {
+      num_bits = static_cast<uint32_t>(m);
+    }
+
+    // Round up to lower bound
+    if (num_bits < kMinimumBloomFilterBytes << 3) {
+      num_bits = kMinimumBloomFilterBytes << 3;
+    }
+
+    // Get next power of 2 if bits is not power of 2.
+    if ((num_bits & (num_bits - 1)) != 0) {
+      num_bits = static_cast<uint32_t>(NextPower2(num_bits));
+    }
+
+    // Round down to upper bound
+    if (num_bits > kMaximumBloomFilterBytes << 3) {
+      num_bits = kMaximumBloomFilterBytes << 3;
+    }
+
+    return num_bits;
+  }
+
+  bool FindHash(uint64_t hash) const override;
+  void InsertHash(uint64_t hash) override;
+  void InsertHashes(const uint64_t* hashes, int num_values) override;
+  uint32_t GetBitsetSize() const override { return num_bytes_; }
+
+  uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(float value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(double value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(const parquet::Int96* value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(const parquet::ByteArray* value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(const parquet::FLBA* value, uint32_t len) const override {
+    return hasher_->Hash(value, len);
+  }
+
+  void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const float* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const double* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const parquet::Int96* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const parquet::ByteArray* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const parquet::FLBA* values, uint32_t type_len, int num_values,
+              uint64_t* hashes) const override {
+    hasher_->Hashes(values, type_len, num_values, hashes);
+  }
+
+  uint64_t Hash(const int32_t* value) const { return hasher_->Hash(*value); }

Review Comment:
   warning: annotate this function with 'override' or (rarely) 'final' [modernize-use-override]
   
   ```suggestion
      uint64_t Hash(const int32_t* value) const override { return hasher_->Hash(*value); }
   ```
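   
   The value of `override` is that the compiler rejects silent signature drift: without it, a mismatched parameter or const-qualifier quietly introduces a new overload instead of overriding. A minimal sketch on a hypothetical hierarchy (not the PR's classes):
   
   ```cpp
   #include <cstdint>
   
   struct Base {
       virtual std::uint64_t Hash(const std::int32_t* value) const = 0;
       virtual ~Base() = default;
   };
   
   struct Derived : Base {
       // With 'override', changing this signature becomes a compile error
       // rather than a silently unrelated overload.
       std::uint64_t Hash(const std::int32_t* value) const override { return *value; }
   };
   
   int main() {
       Derived d;
       const std::int32_t v = 7;
       const Base& b = d;
       return b.Hash(&v) == 7 ? 0 : 1;
   }
   ```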
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);

Review Comment:
   warning: use nullptr [modernize-use-nullptr]
   
   ```suggestion
       memcpy(data_, nullptr, num_bytes_);
   ```
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);
+    st = deserialize_thrift_msg(hdr, (uint32_t*)&header_size, true, &header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    st = ValidateBloomFilterHeader(header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    const int32_t bloom_filter_size = header.numBytes;
+    if (bloom_filter_size + header_size <= bytes_read) {
+        // The bloom filter data is entirely contained in the buffer we just read
+        // => just return it.
+        BlockSplitBloomFilter bloom_filter;
+        bloom_filter.Init(hdr + header_size, bloom_filter_size);
+        return bloom_filter;
+    }
+    // We have read a part of the bloom filter already, copy it to the target buffer
+    // and read the remaining part from the InputStream.
+    auto buffer = new (std::nothrow) uint8_t[bloom_filter_size];
+
+    const auto bloom_filter_bytes_in_header = bytes_read - header_size;
+    if (bloom_filter_bytes_in_header > 0) {
+        std::memcpy(buffer, hdr + header_size, bloom_filter_bytes_in_header);
+    }
+
+    const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
+    auto read_size = required_read_size;
+    st = file_reader_s->read_at(bloom_offset + bytes_read,
+        Slice(buffer + bloom_filter_bytes_in_header, required_read_size), &read_size,
+        io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+    BlockSplitBloomFilter bloom_filter;
+    bloom_filter.Init(buffer, bloom_filter_size);
+    return bloom_filter;
+}
+
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);
+    const uint32_t* bitset32 = reinterpret_cast<const uint32_t*>(data_);
+
+    for (int i = 0; i < kBitsSetPerBlock; ++i) {
+        // Calculate mask for key in the given bitset.
+        const uint32_t mask = UINT32_C(0x1) << ((key * SALT[i]) >> 27);
+        if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & mask)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void BlockSplitBloomFilter::InsertHashImpl(uint64_t hash) const {
+    const uint32_t bucket_index =

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       const auto bucket_index =
   ```
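   
   For readers of the flagged expression: it is a multiply-shift range reduction. Instead of `hash % num_blocks`, the high 32 bits of the hash are scaled by the block count, so the result is uniform in `[0, num_blocks)` with no division. A standalone sketch (constants illustrative, not taken from the PR):
   
   ```cpp
   #include <cstdint>
   #include <cstdio>
   
   int main() {
       const std::uint64_t hash = 0x9e3779b97f4a7c15ULL;  // any 64-bit hash value
       const std::uint32_t num_blocks = 1024;  // i.e. num_bytes_ / kBytesPerFilterBlock
       // ((hi32 * n) >> 32) maps hi32 in [0, 2^32) proportionally onto [0, n).
       const auto bucket_index =
               static_cast<std::uint32_t>(((hash >> 32) * num_blocks) >> 32);
       std::printf("bucket %u of %u\n", bucket_index, num_blocks);
       return bucket_index < num_blocks ? 0 : 1;
   }
   ```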
   



##########
be/src/vec/exec/format/parquet/parquet_bloom_predicate.h:
##########
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gen_cpp/parquet_types.h>

Review Comment:
   warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error]
   ```cpp
   #include <gen_cpp/parquet_types.h>
            ^
   ```
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);
+    st = deserialize_thrift_msg(hdr, (uint32_t*)&header_size, true, &header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    st = ValidateBloomFilterHeader(header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    const int32_t bloom_filter_size = header.numBytes;
+    if (bloom_filter_size + header_size <= bytes_read) {
+        // The bloom filter data is entirely contained in the buffer we just read
+        // => just return it.
+        BlockSplitBloomFilter bloom_filter;
+        bloom_filter.Init(hdr + header_size, bloom_filter_size);
+        return bloom_filter;
+    }
+    // We have read a part of the bloom filter already, copy it to the target buffer
+    // and read the remaining part from the InputStream.
+    auto buffer = new (std::nothrow) uint8_t[bloom_filter_size];
+
+    const auto bloom_filter_bytes_in_header = bytes_read - header_size;
+    if (bloom_filter_bytes_in_header > 0) {
+        std::memcpy(buffer, hdr + header_size, bloom_filter_bytes_in_header);
+    }
+
+    const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
+    auto read_size = required_read_size;
+    st = file_reader_s->read_at(bloom_offset + bytes_read,
+        Slice(buffer + bloom_filter_bytes_in_header, required_read_size), &read_size,
+        io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+    BlockSplitBloomFilter bloom_filter;
+    bloom_filter.Init(buffer, bloom_filter_size);
+    return bloom_filter;
+}
+
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+    const uint32_t bucket_index =

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       const auto bucket_index =
   ```
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);
+    st = deserialize_thrift_msg(hdr, (uint32_t*)&header_size, true, &header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    st = ValidateBloomFilterHeader(header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    const int32_t bloom_filter_size = header.numBytes;
+    if (bloom_filter_size + header_size <= bytes_read) {
+        // The bloom filter data is entirely contained in the buffer we just read
+        // => just return it.
+        BlockSplitBloomFilter bloom_filter;
+        bloom_filter.Init(hdr + header_size, bloom_filter_size);
+        return bloom_filter;
+    }
+    // We have read a part of the bloom filter already, copy it to the target buffer
+    // and read the remaining part from the InputStream.
+    auto buffer = new (std::nothrow) uint8_t[bloom_filter_size];
+
+    const auto bloom_filter_bytes_in_header = bytes_read - header_size;
+    if (bloom_filter_bytes_in_header > 0) {
+        std::memcpy(buffer, hdr + header_size, bloom_filter_bytes_in_header);
+    }
+
+    const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
+    auto read_size = required_read_size;
+    st = file_reader_s->read_at(bloom_offset + bytes_read,
+        Slice(buffer + bloom_filter_bytes_in_header, required_read_size), &read_size,
+        io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+    BlockSplitBloomFilter bloom_filter;
+    bloom_filter.Init(buffer, bloom_filter_size);
+    return bloom_filter;
+}
+
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);
+    const uint32_t* bitset32 = reinterpret_cast<const uint32_t*>(data_);

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       const auto* bitset32 = reinterpret_cast<const uint32_t*>(data_);
   ```
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       auto header_size = static_cast<uint32_t>(bytes_read);
   ```
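   The code under review here follows the guess-then-extend read pattern the NOTE comment describes: over-read a fixed-size prefix, decode the header out of it, then fetch exactly the remaining bytes. A self-contained sketch of the same idea, using a hypothetical 4-byte size-prefix "header" in place of the real Thrift-encoded one:

   ```cpp
   #include <algorithm>
   #include <cstdint>
   #include <cstring>
   #include <vector>

   int main() {
       // Hypothetical layout for illustration: a 4-byte size prefix ("header")
       // followed by the payload; the real header is Thrift-encoded.
       std::vector<uint8_t> file(4 + 100, 0xAB);
       const uint32_t payload_size = 100;
       std::memcpy(file.data(), &payload_size, sizeof(payload_size));

       // Phase 1: over-read a prefix assumed large enough to hold the header.
       const size_t guess = 32; // plays the role of kBloomFilterHeaderSizeGuess
       const size_t bytes_read = std::min(guess, file.size());

       // Phase 2: decode the header to learn its size and the payload size.
       uint32_t size = 0;
       std::memcpy(&size, file.data(), sizeof(size));
       const size_t header_size = sizeof(size);

       // Reuse the payload bytes the prefix already covered,
       // then read the rest exactly.
       std::vector<uint8_t> bitset(size);
       const size_t in_prefix =
               std::min(bytes_read - header_size, static_cast<size_t>(size));
       std::memcpy(bitset.data(), file.data() + header_size, in_prefix);
       std::memcpy(bitset.data() + in_prefix,
                   file.data() + header_size + in_prefix, size - in_prefix);
       return bitset.size() == size ? 0 : 1;
   }
   ```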
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.cpp:
##########
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet_bloom.h"
+
+#include <util/thrift_util.h>
+
+#include "parquet/xxhasher.h"
+
+namespace doris::vectorized {
+
+constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
+
+BlockSplitBloomFilter::BlockSplitBloomFilter()
+    : data_(nullptr), num_bytes_(0), hash_strategy_(HashStrategy::XXHASH),
+      algorithm_(Algorithm::BLOCK), compression_strategy_(CompressionStrategy::UNCOMPRESSED) {}
+
+void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
+    if (num_bytes < kMinimumBloomFilterBytes) {
+        num_bytes = kMinimumBloomFilterBytes;
+    }
+
+    if ((num_bytes & (num_bytes - 1)) != 0) {
+        num_bytes = static_cast<uint32_t>(NextPower2(num_bytes));
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+}
+
+void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
+    DCHECK(bitset != nullptr);
+
+    if (num_bytes < kMinimumBloomFilterBytes || num_bytes > kMaximumBloomFilterBytes ||
+        (num_bytes & (num_bytes - 1)) != 0) {
+        //throw ParquetException("Given length of bitset is illegal");
+    }
+
+    num_bytes_ = num_bytes;
+    data_ = new (std::nothrow) uint8_t[num_bytes_];
+    memcpy(data_, 0, num_bytes_);
+
+    this->hasher_ = std::make_unique<parquet::XxHasher>();
+    hash_strategy_ = HashStrategy::XXHASH;
+    algorithm_ = Algorithm::BLOCK;
+    compression_strategy_ = CompressionStrategy::UNCOMPRESSED;
+}
+
+static constexpr uint32_t kBloomFilterHeaderSizeGuess = 256;
+
+static Status ValidateBloomFilterHeader(
+    const tparquet::BloomFilterHeader& header) {
+    if (!header.algorithm.__isset.BLOCK) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter algorithm: " << header.algorithm << 
".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.hash.__isset.XXHASH) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter hash: " << header.hash << ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (!header.compression.__isset.UNCOMPRESSED) {
+        std::stringstream ss;
+        ss << "Unsupported Bloom filter compression: " << header.compression 
<< ".";
+        return Status::InternalError(ss.str());
+    }
+
+    if (header.numBytes <= 0 ||
+        static_cast<uint32_t>(header.numBytes) > BloomFilter::kMaximumBloomFilterBytes) {
+        std::stringstream ss;
+        ss << "Bloom filter size is incorrect: " << header.numBytes << ". Must 
be in range ("
+           << 0 << ", " << BloomFilter::kMaximumBloomFilterBytes << "].";
+        return Status::InternalError(ss.str());
+        }
+
+    return Status::OK();
+}
+
+
+BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
+        io::FileReaderSPtr file_reader_s, int64_t bloom_offset, const io::IOContext* io_context) {
+    // NOTE: we don't know the bloom filter header size upfront, and we can't rely on
+    // InputStream::Peek() which isn't always implemented. Therefore, we must first
+    // Read() with an upper bound estimate of the header size, then once we know
+    // the bloom filter data size, we can Read() the exact number of remaining data bytes.
+    tparquet::BloomFilterHeader header;
+
+    // Read and deserialize bloom filter header
+    size_t bytes_read = kBloomFilterHeaderSizeGuess;
+    if (file_reader_s->size() < kBloomFilterHeaderSizeGuess) {
+        bytes_read = file_reader_s->size();
+    }
+
+    uint8_t hdr[bytes_read];
+    auto st = file_reader_s->read_at(bloom_offset, Slice(hdr, bytes_read), &bytes_read,
+                                           io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    // This gets used, then set by DeserializeThriftMsg
+    uint32_t header_size = static_cast<uint32_t>(bytes_read);
+    st = deserialize_thrift_msg(hdr, (uint32_t*)&header_size, true, &header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    st = ValidateBloomFilterHeader(header);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+
+    const int32_t bloom_filter_size = header.numBytes;
+    if (bloom_filter_size + header_size <= bytes_read) {
+        // The bloom filter data is entirely contained in the buffer we just read
+        // => just return it.
+        BlockSplitBloomFilter bloom_filter;
+        bloom_filter.Init(hdr + header_size, bloom_filter_size);
+        return bloom_filter;
+    }
+    // We have read a part of the bloom filter already, copy it to the target buffer
+    // and read the remaining part from the InputStream.
+    auto buffer = new (std::nothrow) uint8_t[bloom_filter_size];
+
+    const auto bloom_filter_bytes_in_header = bytes_read - header_size;
+    if (bloom_filter_bytes_in_header > 0) {
+        std::memcpy(buffer, hdr + header_size, bloom_filter_bytes_in_header);
+    }
+
+    const auto required_read_size = bloom_filter_size - bloom_filter_bytes_in_header;
+    auto read_size = required_read_size;
+    st = file_reader_s->read_at(bloom_offset + bytes_read,
+        Slice(buffer + bloom_filter_bytes_in_header, required_read_size), &read_size,
+        io_context);
+    if (!st.ok()) {
+        return BlockSplitBloomFilter();
+    }
+    BlockSplitBloomFilter bloom_filter;
+    bloom_filter.Init(buffer, bloom_filter_size);
+    return bloom_filter;
+}
+
+
+bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);
+    const uint32_t* bitset32 = reinterpret_cast<const uint32_t*>(data_);
+
+    for (int i = 0; i < kBitsSetPerBlock; ++i) {
+        // Calculate mask for key in the given bitset.
+        const uint32_t mask = UINT32_C(0x1) << ((key * SALT[i]) >> 27);
+        if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & mask)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void BlockSplitBloomFilter::InsertHashImpl(uint64_t hash) const {
+    const uint32_t bucket_index =
+        static_cast<uint32_t>(((hash >> 32) * (num_bytes_ / kBytesPerFilterBlock)) >> 32);
+    const uint32_t key = static_cast<uint32_t>(hash);
+    uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_);

Review Comment:
   warning: use auto when initializing with a cast to avoid duplicating the type name [modernize-use-auto]
   
   ```suggestion
       auto* bitset32 = reinterpret_cast<uint32_t*>(data_);
   ```
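   Aside from the auto nit, the two lines above are the core of the split-block scheme: the high 32 bits of the hash pick a 32-byte block via a multiply-shift range reduction (no modulo needed), and the low 32 bits, multiplied by eight fixed salts, pick one bit in each of the block's eight 32-bit words. A standalone sketch; the salt values are assumed to match BlockSplitBloomFilter::SALT (they are the Parquet-spec constants):

   ```cpp
   #include <cstdint>
   #include <cstdio>

   // Split-block Bloom filter salts from the Parquet spec (assumption:
   // identical to the SALT table in parquet_bloom.h).
   constexpr uint32_t kSalt[8] = {0x47b6137b, 0x44974d91, 0x8824ad5b,
                                  0xa2b7289d, 0x705495c7, 0x2df1424b,
                                  0x9efc4947, 0x5c6bfb31};

   int main() {
       const uint64_t hash = 0x123456789abcdef0ULL;
       const uint32_t num_blocks = 1024; // num_bytes_ / kBytesPerFilterBlock

       // ((hash >> 32) * num_blocks) >> 32 maps the high half of the hash
       // uniformly onto [0, num_blocks) -- a multiply-shift range reduction.
       const auto bucket = static_cast<uint32_t>(((hash >> 32) * num_blocks) >> 32);
       const auto key = static_cast<uint32_t>(hash);

       for (int i = 0; i < 8; ++i) {
           // (key * salt) >> 27 is a value in [0, 32): one bit per 32-bit word.
           const uint32_t bit = (key * kSalt[i]) >> 27;
           std::printf("block %u, word %d, bit %u\n", bucket, i, bit);
       }
       return 0;
   }
   ```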
   



##########
be/src/vec/exec/format/parquet/parquet_bloom_reader.h:
##########
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gen_cpp/parquet_types.h>

Review Comment:
   warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error]
   ```cpp
   #include <gen_cpp/parquet_types.h>
            ^
   ```
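   This is most likely an analysis-environment issue rather than a source bug: gen_cpp/parquet_types.h is generated from the Thrift definitions at build time, so clang-tidy only resolves it when the generated-sources directory is on its include path (an assumption about the Doris build setup; the warning can usually be disregarded in review).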
   



##########
be/src/vec/exec/format/parquet/vparquet_reader.h:
##########
@@ -205,12 +208,24 @@ class ParquetReader : public GenericReader {
     void _init_chunk_dicts();
     Status _process_dict_filter(bool* filter_group);
     void _init_bloom_filter();
-    Status _process_bloom_filter(bool* filter_group);
+    Status _process_bloom_filter(bool* filter_group, const tparquet::RowGroup& row_group);
     int64_t _get_column_start_offset(const tparquet::ColumnMetaData& column_init_column_readers);
     std::string _meta_cache_key(const std::string& path) { return "meta_" + path; }
     std::vector<io::PrefetchRange> _generate_random_access_ranges(
             const RowGroupReader::RowGroupIndex& group, size_t* avg_io_size);
 
+    bool _filter_by_bloom(const ColumnValueRangeType& col_value_range,
+                                         const FieldSchema* col_schema,
+                                         std::unique_ptr<BloomFilter>& bf);
+    template <PrimitiveType T>
+    std::vector<BloomScanPredicate> _value_range_to_predicate_bloom(
+            const ColumnValueRange<T>& col_value_range);
+    template <PrimitiveType T>
+    static bool _apply_filter_bloom(const ColumnValueRange<T>& col_value_range,
+                                            const BloomScanPredicate predicate,

Review Comment:
   warning: parameter 'predicate' is const-qualified in the function declaration; const-qualification of parameters only has an effect in function definitions [readability-avoid-const-params-in-decls]
   
   ```suggestion
                                               BloomScanPredicate predicate,
   ```
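   The rationale: top-level const on a by-value parameter is not part of the function's signature, so in a declaration it is pure noise; it only matters inside a definition, where it stops the body from reassigning the parameter. A standalone sketch:

   ```cpp
   #include <cstdint>

   // Declaration: writing 'const uint32_t mask' here would change nothing --
   // callers and overload resolution ignore top-level const.
   int count_bits(uint32_t mask);

   // Definition: const is a purely local promise that the body never
   // reassigns 'mask'. The signature is identical to the declaration.
   int count_bits(const uint32_t mask) {
       int n = 0;
       for (uint32_t m = mask; m != 0; m &= m - 1) ++n; // clear lowest set bit
       return n;
   }

   int main() { return count_bits(0xF0) == 4 ? 0 : 1; }
   ```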
   



##########
be/src/vec/exec/format/parquet/parquet_bloom.h:
##########
@@ -0,0 +1,355 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gen_cpp/parquet_types.h>

Review Comment:
   warning: 'gen_cpp/parquet_types.h' file not found [clang-diagnostic-error]
   ```cpp
   #include <gen_cpp/parquet_types.h>
            ^
   ```
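   Same include-path caveat as for parquet_bloom_reader.h above: the header is generated at build time, so this is presumably a clang-tidy setup issue rather than a missing file (assumption).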
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
