Re: [PR] [feature](lakehouse) use parquet bloom for row group pruning [doris]

via GitHub Thu, 04 Jan 2024 13:47:31 -0800


github-actions[bot] commented on code in PR #28527:
URL: https://github.com/apache/doris/pull/28527#discussion_r1442282571



##########
be/src/vec/exec/format/parquet/parquet_bloom.h:
##########
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <io/fs/file_reader.h>
+#include <parquet/hasher.h>
+#include <parquet/types.h>
+
+namespace doris::vectorized {
+// Ref: 
https://github.com/apache/parquet-cpp/blob/642da055adf009652689b20e68a198cffb857651/src/parquet/bloom_filter.h#L35
+class BloomFilter {
+public:
+    // Maximum Bloom filter size, it sets to HDFS default block size 128MB
+    // This value will be reconsidered when implementing Bloom filter producer.
+    static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
+
+    /// Determine whether an element exist in set or not.
+    ///
+    /// @param hash the element to contain.
+    /// @return false if value is definitely not in set, and true means 
PROBABLY
+    /// in set.
+    virtual bool find_hash(uint64_t hash) const = 0;
+
+    /// Insert element to set represented by Bloom filter bitset.
+    /// @param hash the hash of value to insert into Bloom filter.
+    virtual void insert_hash(uint64_t hash) = 0;
+
+    /// Insert elements to set represented by Bloom filter bitset.
+    /// @param hashes the hash values to insert into Bloom filter.
+    /// @param num_values the number of hash values to insert.
+    virtual void insert_hashes(const uint64_t* hashes, int num_values) = 0;
+
+    /// Get the number of bytes of bitset
+    virtual uint32_t get_bitset_size() const = 0;
+
+    /// Compute hash for 32 bits value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t hash(int32_t value) const = 0;
+
+    /// Compute hash for 64 bits value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t hash(int64_t value) const = 0;
+
+    /// Compute hash for float value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t hash(float value) const = 0;
+
+    /// Compute hash for double value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t hash(double value) const = 0;
+
+    /// Compute hash for Int96 value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t hash(const parquet::Int96* value) const = 0;
+
+    /// Compute hash for ByteArray value by using its plain encoding result.
+    ///
+    /// @param value the value to hash.
+    /// @return hash result.
+    virtual uint64_t hash(const parquet::ByteArray* value) const = 0;
+
+    /// Compute hash for fixed byte array value by using its plain encoding 
result.
+    ///
+    /// @param value the value address.
+    /// @param len the value length.
+    /// @return hash result.
+    virtual uint64_t hash(const parquet::FLBA* value, uint32_t len) const = 0;
+
+    /// Batch compute hashes for 32 bits values by using its plain encoding 
result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const int32_t* values, int num_values, uint64_t* 
hashes) const = 0;
+
+    /// Batch compute hashes for 64 bits values by using its plain encoding 
result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const int64_t* values, int num_values, uint64_t* 
hashes) const = 0;
+
+    /// Batch compute hashes for float values by using its plain encoding 
result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const float* values, int num_values, uint64_t* hashes) 
const = 0;
+
+    /// Batch compute hashes for double values by using its plain encoding 
result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const double* values, int num_values, uint64_t* 
hashes) const = 0;
+
+    /// Batch compute hashes for Int96 values by using its plain encoding 
result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const parquet::Int96* values, int num_values, 
uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for ByteArray values by using its plain encoding 
result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const parquet::ByteArray* values, int num_values,
+                        uint64_t* hashes) const = 0;
+
+    /// Batch compute hashes for fixed byte array values by using its plain 
encoding result.
+    ///
+    /// @param values values a pointer to the values to hash.
+    /// @param type_len the value length.
+    /// @param num_values the number of values to hash.
+    /// @param hashes a pointer to the output hash values, its length should 
be equal to
+    /// num_values.
+    virtual void hashes(const parquet::FLBA* values, uint32_t type_len, int 
num_values,
+                        uint64_t* hashes) const = 0;
+
+    virtual ~BloomFilter() = default;
+
+protected:
+    // Hash strategy available for Bloom filter.
+    enum class HashStrategy : uint32_t { XXHASH = 0 };
+
+    // Bloom filter algorithm.
+    enum class Algorithm : uint32_t { BLOCK = 0 };
+
+    enum class CompressionStrategy : uint32_t { UNCOMPRESSED = 0 };
+};
+
+constexpr bool IsMultipleOf8(int64_t n) {
+    return (n & 7) == 0;
+}
+
+static inline int64_t NextPower2(int64_t n) {
+    // Taken from
+    // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+    n--;
+    n |= n >> 1;
+    n |= n >> 2;
+    n |= n >> 4;
+    n |= n >> 8;
+    n |= n >> 16;
+    n |= n >> 32;
+    n++;
+    return n;
+}
+
+/// The BlockSplitBloomFilter is implemented using block-based Bloom filters 
from
+/// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic 
idea is to
+/// hash the item to a tiny Bloom filter which size fit a single cache line or 
smaller.
+///
+/// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
+/// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
+class BlockSplitBloomFilter : public BloomFilter {
+public:
+    /// The constructor of BlockSplitBloomFilter. It uses XXH64 as hash 
function.
+    explicit BlockSplitBloomFilter();
+
+    /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be 
within
+    /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
+    /// rounded up/down to lower/upper bound if num_bytes is out of range and 
also
+    /// will be rounded up to a power of 2.
+    ///
+    /// @param num_bytes The number of bytes to store Bloom filter bitset.
+    void init(uint32_t num_bytes);
+
+    /// Initialize the BlockSplitBloomFilter. It copies the bitset as 
underlying
+    /// bitset because the given bitset may not satisfy the 32-byte alignment 
requirement
+    /// which may lead to segfault when performing SIMD instructions. It is 
the caller's
+    /// responsibility to free the bitset passed in. This is used when 
reconstructing
+    /// a Bloom filter from a parquet file.
+    ///
+    /// @param bitset The given bitset to initialize the Bloom filter.
+    /// @param num_bytes  The number of bytes of given bitset.
+    void init(const uint8_t* bitset, uint32_t num_bytes);
+
+    /// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom 
filter.
+    static constexpr uint32_t kMinimumBloomFilterBytes = 32;
+
+    /// Calculate optimal size according to the number of distinct values and 
false
+    /// positive probability.
+    ///
+    /// @param ndv The number of distinct values.
+    /// @param fpp The false positive probability.
+    /// @return it always return a value between kMinimumBloomFilterBytes and
+    /// kMaximumBloomFilterBytes, and the return value is always a power of 2
+    static uint32_t optimal_num_of_bytes(uint32_t ndv, double fpp) {
+        uint32_t optimal_num_of_bits = optimal_num_of_bits(ndv, fpp);
+        DCHECK(IsMultipleOf8(optimal_num_of_bits));
+        return optimal_num_of_bits >> 3;
+    }
+
+    /// Calculate optimal size according to the number of distinct values and 
false
+    /// positive probability.
+    ///
+    /// @param ndv The number of distinct values.
+    /// @param fpp The false positive probability.
+    /// @return it always return a value between kMinimumBloomFilterBytes * 8 
and
+    /// kMaximumBloomFilterBytes * 8, and the return value is always a power 
of 16
+    static uint32_t optimal_num_of_bits(uint32_t ndv, double fpp) {
+        DCHECK(fpp > 0.0 && fpp < 1.0);
+        const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
+        uint32_t num_bits;
+
+        // Handle overflow.
+        if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
+            num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
+        } else {
+            num_bits = static_cast<uint32_t>(m);
+        }
+
+        // Round up to lower bound
+        if (num_bits < kMinimumBloomFilterBytes << 3) {
+            num_bits = kMinimumBloomFilterBytes << 3;
+        }
+
+        // Get next power of 2 if bits is not power of 2.
+        if ((num_bits & (num_bits - 1)) != 0) {
+            num_bits = static_cast<uint32_t>(NextPower2(num_bits));
+        }
+
+        // Round down to upper bound
+        if (num_bits > kMaximumBloomFilterBytes << 3) {
+            num_bits = kMaximumBloomFilterBytes << 3;
+        }
+
+        return num_bits;
+    }
+
+    bool find_hash(uint64_t hash) const override;
+    void insert_hash(uint64_t hash) override;
+    void insert_hashes(const uint64_t* hashes, int num_values) override;
+    uint32_t get_bitset_size() const override { return num_bytes_; }
+
+    uint64_t hash(int32_t value) const override { return hasher_->Hash(value); 
}
+    uint64_t hash(int64_t value) const override { return hasher_->Hash(value); 
}
+    uint64_t hash(float value) const override { return hasher_->Hash(value); }
+    uint64_t hash(double value) const override { return hasher_->Hash(value); }
+    uint64_t hash(const parquet::Int96* value) const override { return 
hasher_->Hash(value); }
+    uint64_t hash(const parquet::ByteArray* value) const override { return 
hasher_->Hash(value); }
+    uint64_t hash(const parquet::FLBA* value, uint32_t len) const override {
+        return hasher_->Hash(value, len);
+    }
+
+    void hashes(const int32_t* values, int num_values, uint64_t* hashes) const 
override {
+        hasher_->Hashes(values, num_values, hashes);
+    }
+    void hashes(const int64_t* values, int num_values, uint64_t* hashes) const 
override {
+        hasher_->Hashes(values, num_values, hashes);
+    }
+    void hashes(const float* values, int num_values, uint64_t* hashes) const 
override {
+        hasher_->Hashes(values, num_values, hashes);
+    }
+    void hashes(const double* values, int num_values, uint64_t* hashes) const 
override {
+        hasher_->Hashes(values, num_values, hashes);
+    }
+    void hashes(const parquet::Int96* values, int num_values, uint64_t* 
hashes) const override {
+        hasher_->Hashes(values, num_values, hashes);
+    }
+    void hashes(const parquet::ByteArray* values, int num_values, uint64_t* 
hashes) const override {
+        hasher_->Hashes(values, num_values, hashes);
+    }
+    void hashes(const parquet::FLBA* values, uint32_t type_len, int num_values,
+                uint64_t* hashes) const override {
+        hasher_->Hashes(values, type_len, num_values, hashes);
+    }
+
+    uint64_t hash(const int32_t* value) const { return hasher_->Hash(*value); }

Review Comment:
   warning: annotate this function with 'override' or (rarely) 'final' 
[modernize-use-override]
   
   ```suggestion
       uint64_t hash(const int32_t* value) const override { return 
hasher_->Hash(*value); }
   ```
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] [feature](lakehouse) use parquet bloom for row group pruning [doris]

Reply via email to