AlexanderSaydakov commented on code in PR #438: URL: https://github.com/apache/datasketches-cpp/pull/438#discussion_r1717747002
########## filters/include/bloom_filter.hpp: ########## @@ -0,0 +1,757 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef _BLOOM_FILTER_HPP_ +#define _BLOOM_FILTER_HPP_ + +#include <cstdint> +#include <memory> +#include <vector> + +#include "common_defs.hpp" + +namespace datasketches { + +// forward declarations +template<typename A> class bloom_filter_alloc; +template<typename A> class bloom_filter_builder_alloc; + +// aliases with default allocator +using bloom_filter = bloom_filter_alloc<std::allocator<uint8_t>>; +using bloom_filter_builder = bloom_filter_builder_alloc<std::allocator<uint8_t>>; + +/** + * <p>This class provides methods to help estimate the correct parameters when + * creating a Bloom filter, and methods to create the filter using those values.</p> + * + * <p>The underlying math is described in the + * <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'> + * Wikipedia article on Bloom filters</a>.</p> + */ +template<typename Allocator = std::allocator<uint8_t>> +class bloom_filter_builder_alloc { + using A = Allocator; + +public: + /** + * Returns the optimal number of hash functions to given target numbers of distinct items + * and the Bloom filter size in bits. This function will provide a result even if the input + * values exceed the capacity of a single Bloom filter. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param num_filter_bits The intended size of the Bloom Filter in bits + * @return The suggested number of hash functions to use with the filter + */ + static uint16_t suggest_num_hashes(const uint64_t max_distinct_items, const uint64_t num_filter_bits); + + /** + * Returns the optimal number of hash functions to achieve a target false positive probability. + * @param target_false_positive_prob A desired false positive probability per item + * @return The suggested number of hash functions to use with the filter. + */ + static uint16_t suggest_num_hashes(const double target_false_positive_prob); + + /** + * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct + * items and a target false positive probability. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @return The suggested number of bits to use with the filter + */ + static uint64_t suggest_num_filter_bits(const uint64_t max_distinct_items, const double target_false_positive_prob); + + /** + * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, + * using a random base seed for the hash function. + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @param seed A bash hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters + */ + static bloom_filter_alloc<A> create_by_accuracy(const uint64_t max_distinct_items, + const double target_false_positive_prob, + const uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * Creates a Bloom filter with given number of bits and number of hash functions, + * using the provided base seed for the hash function. + * + * @param num_bits The size of the BloomFilter, in bits + * @param num_hashes The number of hash functions to apply to items + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters + */ + static bloom_filter_alloc<A> create_by_size(const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs, + * using a random base seed for the hash function and writing into the provided memory. The filter does + * not take ownership of the memory but does overwrite the full contents. + * + * @param memory A pointer to the memory to use for the filter + * @param length_bytes The length of the memory in bytes + * @param max_distinct_items The maximum expected number of distinct items to add to the filter + * @param target_false_positive_prob A desired false positive probability per item + * @param dstMem A WritableMemory to hold the initialized filter + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new Bloom filter configured for the given input parameters in the provided memory + */ + static bloom_filter_alloc<A> initialize_by_accuracy(void* memory, + const size_t length_bytes, + const uint64_t max_distinct_items, + const double target_false_positive_prob, + const uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * Initializes a Bloom filter with given number of bits and number of hash functions, + * using the provided base seed for the hash function and writing into the provided memory. The filter does + * not take ownership of the memory but does overwrite the full contents. + * + * @param memory A pointer to the memory to use for the filter + * @param length_bytes The length of the memory in bytes + * @param num_bits The size of the BloomFilter, in bits + * @param num_hashes The number of hash functions to apply to items + * @param seed A base hash seed (default: random) + * @param allocator The allocator to use for the filter (default: standard allocator) + * @return A new BloomFilter configured for the given input parameters + */ + static bloom_filter_alloc<A> initialize_by_size(void* memory, + const size_t length_bytes, + const uint64_t num_bits, + const uint16_t num_hashes, + const uint64_t seed = generate_random_seed(), + const Allocator& allocator = Allocator()); + + /** + * @brief Generates a random 64-bit seed value + * + * @return uint64_t a random value over the range of unsigned 64-bit integers + */ + static uint64_t generate_random_seed(); + +private: + static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes); + static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob); +}; + +/** + * <p>A Bloom filter is a data structure that can be used for probabilistic + * set membership.</p> + * + * <p>When querying a Bloom filter, there are no false positives. Specifically: + * When querying an item that has already been inserted to the filter, the filter will + * always indicate that the item is present. There is a chance of false positives, where + * querying an item that has never been presented to the filter will indicate that the + * item has already been seen. Consequently, any query should be interpreted as + * "might have seen."</p> + * + * <p>A standard Bloom filter is unlike typical sketches in that it is not sub-linear + * in size and does not resize itself. A Bloom filter will work up to a target number of + * distinct items, beyond which it will saturate and the false positive rate will start to + * increase. The size of a Bloom filter will be linear in the expected number of + * distinct items.</p> + * + * <p>See the bloom_filter_builder_alloc class for methods to create a filter, especially + * one sized correctly for a target number of distinct elements and a target + * false positive probability.</p> + * + * <p>This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher, + * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.</p> + */ + +template<typename Allocator = std::allocator<uint8_t>> +class bloom_filter_alloc { + using A = Allocator; + +public: + + /** + * This method deserializes a Bloom filter from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param allocator instance of an Allocator + * @return an instance of a Bloom filter + */ + static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator()); + + /** + * This method deserializes a Bloom filter from a given stream. + * @param is input stream + * @param allocator instance of an Allocator + * @return an instance of a Bloom filter + */ + static bloom_filter_alloc deserialize(std::istream& is, const A& allocator = Allocator()); + + /** + * @brief Wraps the provided memory as a read-only Bloom filter. Reads the data in-place and does + * not take ownership of the underlying memory. Does not allow modifying the filter. + * + * @param data The memory to wrap + * @param length_bytes The length of the memory in bytes + * @param allocator instance of an Allocator + * @return a const (read-only) Bloom filter wrapping the provided memory + */ + static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + + /** + * @brief Wraps the provided memory as a writable Bloom filter. Reads the data in-place and does + * not take ownership of the underlying memory. Allows modifying the filter. + * + * @param data the memory to wrap + * @param length_bytes the length of the memory in bytes + * @param allocator instance of an Allocator + * @return a Bloom filter wrapping the provided memory + */ + static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator()); + + /** + * Copy constructor + * @param other filter to be copied + */ + bloom_filter_alloc(const bloom_filter_alloc&); + + /** Move constructor + * @param other filter to be moved + */ + bloom_filter_alloc(bloom_filter_alloc&&) noexcept; + + /** + * Copy assignment + * @param other filter to be copied + * @return reference to this filter + */ + bloom_filter_alloc& operator=(const bloom_filter_alloc& other); + + /** + * Move assignment + * @param other filter to be moved + * @return reference to this filter + */ + bloom_filter_alloc& operator=(bloom_filter_alloc&& other); + + /** + * @brief Destroy the bloom filter alloc object Review Comment: I don't think we want to mention alloc here -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
