jmalkin commented on code in PR #438:
URL: https://github.com/apache/datasketches-cpp/pull/438#discussion_r1718027759


##########
filters/include/bloom_filter.hpp:
##########
@@ -0,0 +1,757 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef _BLOOM_FILTER_HPP_
+#define _BLOOM_FILTER_HPP_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "common_defs.hpp"
+
+namespace datasketches {
+
+// forward declarations
+template<typename A> class bloom_filter_alloc;
+template<typename A> class bloom_filter_builder_alloc;
+
+// aliases with default allocator
+using bloom_filter = bloom_filter_alloc<std::allocator<uint8_t>>;
+using bloom_filter_builder = 
bloom_filter_builder_alloc<std::allocator<uint8_t>>;
+
+/**
+ * <p>This class provides methods to help estimate the correct parameters when
+ * creating a Bloom filter, and methods to create the filter using those 
values.</p>
+ *
+ * <p>The underlying math is described in the
+ * <a 
href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'>
+ * Wikipedia article on Bloom filters</a>.</p>
+ */
+template<typename Allocator = std::allocator<uint8_t>>
+class bloom_filter_builder_alloc {
+  using A = Allocator;
+
+public:
+  /**
+   * Returns the optimal number of hash functions to given target numbers of 
distinct items
+   * and the Bloom filter size in bits. This function will provide a result 
even if the input
+   * values exceed the capacity of a single Bloom filter.
+   * @param max_distinct_items The maximum expected number of distinct items 
to add to the filter
+   * @param num_filter_bits The intended size of the Bloom Filter in bits
+   * @return The suggested number of hash functions to use with the filter
+   */
+  static uint16_t suggest_num_hashes(const uint64_t max_distinct_items, const 
uint64_t num_filter_bits);
+
+  /**
+   * Returns the optimal number of hash functions to achieve a target false 
positive probability.
+   * @param target_false_positive_prob A desired false positive probability 
per item
+   * @return The suggested number of hash functions to use with the filter.
+   */
+  static uint16_t suggest_num_hashes(const double target_false_positive_prob);
+
+  /**
+   * Returns the optimal number of bits to use in a Bloom filter given a 
target number of distinct
+   * items and a target false positive probability.
+   * @param max_distinct_items The maximum expected number of distinct items 
to add to the filter
+   * @param target_false_positive_prob A desired false positive probability 
per item
+   * @return The suggested number of bits to use with the filter
+   */
+  static uint64_t suggest_num_filter_bits(const uint64_t max_distinct_items, 
const double target_false_positive_prob);
+
+  /**
+   * Creates a new Bloom filter with an optimal number of bits and hash 
functions for the given inputs,
+   * using a random base seed for the hash function.
+   * @param max_distinct_items The maximum expected number of distinct items 
to add to the filter
+   * @param target_false_positive_prob A desired false positive probability 
per item
+   * @param seed A bash hash seed (default: random)
+   * @param allocator The allocator to use for the filter (default: standard 
allocator)
+   * @return A new Bloom filter configured for the given input parameters
+   */
+  static bloom_filter_alloc<A> create_by_accuracy(const uint64_t 
max_distinct_items,
+                                                  const double 
target_false_positive_prob,
+                                                  const uint64_t seed = 
generate_random_seed(),
+                                                  const Allocator& allocator = 
Allocator());
+
+  /**
+   * Creates a Bloom filter with given number of bits and number of hash 
functions,
+   * using the provided base seed for the hash function.
+   *
+   * @param num_bits The size of the BloomFilter, in bits
+   * @param num_hashes The number of hash functions to apply to items
+   * @param seed A base hash seed (default: random)
+   * @param allocator The allocator to use for the filter (default: standard 
allocator)
+   * @return A new Bloom filter configured for the given input parameters
+   */
+  static bloom_filter_alloc<A> create_by_size(const uint64_t num_bits,
+                                              const uint16_t num_hashes,
+                                              const uint64_t seed = 
generate_random_seed(),
+                                              const Allocator& allocator = 
Allocator());
+
+  /**
+   * Creates a new Bloom filter with an optimal number of bits and hash 
functions for the given inputs,
+   * using a random base seed for the hash function and writing into the 
provided memory. The filter does
+   * not take ownership of the memory but does overwrite the full contents.
+   *
+   * @param memory A pointer to the memory to use for the filter
+   * @param length_bytes The length of the memory in bytes
+   * @param max_distinct_items The maximum expected number of distinct items 
to add to the filter
+   * @param target_false_positive_prob A desired false positive probability 
per item
+   * @param dstMem A WritableMemory to hold the initialized filter
+   * @param allocator The allocator to use for the filter (default: standard 
allocator)
+   * @return A new Bloom filter configured for the given input parameters in 
the provided memory
+   */
+  static bloom_filter_alloc<A> initialize_by_accuracy(void* memory,
+                                                      const size_t 
length_bytes,
+                                                      const uint64_t 
max_distinct_items,
+                                                      const double 
target_false_positive_prob,
+                                                      const uint64_t seed = 
generate_random_seed(),
+                                                      const Allocator& 
allocator = Allocator());
+
+  /**
+   * Initializes a Bloom filter with given number of bits and number of hash 
functions,
+   * using the provided base seed for the hash function and writing into the 
provided memory. The filter does
+   * not take ownership of the memory but does overwrite the full contents.
+   *
+   * @param memory A pointer to the memory to use for the filter
+   * @param length_bytes The length of the memory in bytes
+   * @param num_bits The size of the BloomFilter, in bits
+   * @param num_hashes The number of hash functions to apply to items
+   * @param seed A base hash seed (default: random)
+   * @param allocator The allocator to use for the filter (default: standard 
allocator)
+   * @return A new BloomFilter configured for the given input parameters
+   */
+  static bloom_filter_alloc<A> initialize_by_size(void* memory,
+                                                  const size_t length_bytes,
+                                                  const uint64_t num_bits,
+                                                  const uint16_t num_hashes,
+                                                  const uint64_t seed = 
generate_random_seed(),
+                                                  const Allocator& allocator = 
Allocator());
+
+  /**
+   * @brief Generates a random 64-bit seed value
+   *
+   * @return uint64_t a random value over the range of unsigned 64-bit integers
+   */
+  static uint64_t generate_random_seed();
+
+private:
+  static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes);
+  static void validate_accuracy_inputs(uint64_t max_distinct_items, double 
target_false_positive_prob);
+};
+
+/**
+ * <p>A Bloom filter is a data structure that can be used for probabilistic
+ * set membership.</p>
+ *
+ * <p>When querying a Bloom filter, there are no false positives. Specifically:
+ * When querying an item that has already been inserted to the filter, the 
filter will
+ * always indicate that the item is present. There is a chance of false 
positives, where
+ * querying an item that has never been presented to the filter will indicate 
that the
+ * item has already been seen. Consequently, any query should be interpreted as
+ * "might have seen."</p>
+ *
+ * <p>A standard Bloom filter is unlike typical sketches in that it is not 
sub-linear
+ * in size and does not resize itself. A Bloom filter will work up to a target 
number of
+ * distinct items, beyond which it will saturate and the false positive rate 
will start to
+ * increase. The size of a Bloom filter will be linear in the expected number 
of
+ * distinct items.</p>
+ *
+ * <p>See the bloom_filter_builder_alloc class for methods to create a filter, 
especially
+ * one sized correctly for a target number of distinct elements and a target
+ * false positive probability.</p>
+ *
+ * <p>This implementation uses xxHash64 and follows the approach in Kirsch and 
Mitzenmacher,
+ * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley 
Interscience, 2008, pp. 187-218.</p>
+ */
+
+template<typename Allocator = std::allocator<uint8_t>>
+class bloom_filter_alloc {
+  using A = Allocator;
+
+public:
+
+  /**
+   * This method deserializes a Bloom filter from a given array of bytes.
+   * @param bytes pointer to the array of bytes
+   * @param size the size of the array
+   * @param allocator instance of an Allocator
+   * @return an instance of a Bloom filter
+   */
+  static bloom_filter_alloc deserialize(const void* bytes, size_t 
length_bytes, const Allocator& allocator = Allocator());
+
+  /**
+   * This method deserializes a Bloom filter from a given stream.
+   * @param is input stream
+   * @param allocator instance of an Allocator
+   * @return an instance of a Bloom filter
+   */
+  static bloom_filter_alloc deserialize(std::istream& is, const A& allocator = 
Allocator());
+
+  /**
+   * @brief Wraps the provided memory as a read-only Bloom filter. Reads the 
data in-place and does
+   * not take ownership of the underlying memory. Does not allow modifying the 
filter.
+   *
+   * @param data The memory to wrap
+   * @param length_bytes The length of the memory in bytes
+   * @param allocator instance of an Allocator
+   * @return a const (read-only) Bloom filter wrapping the provided memory
+   */
+  static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, 
const Allocator& allocator = Allocator());
+
+  /**
+   * @brief Wraps the provided memory as a writable Bloom filter. Reads the 
data in-place and does
+   * not take ownership of the underlying memory. Allows modifying the filter.
+   *
+   * @param data the memory to wrap
+   * @param length_bytes the length of the memory in bytes
+   * @param allocator instance of an Allocator
+   * @return a Bloom filter wrapping the provided memory
+   */
+  static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, 
const Allocator& allocator = Allocator());
+
+  /**
+   * Copy constructor
+   * @param other filter to be copied
+   */
+  bloom_filter_alloc(const bloom_filter_alloc&);
+
+  /** Move constructor
+   * @param other filter to be moved
+   */
+  bloom_filter_alloc(bloom_filter_alloc&&) noexcept;
+
+  /**
+   * Copy assignment
+   * @param other filter to be copied
+   * @return reference to this filter
+   */
+  bloom_filter_alloc& operator=(const bloom_filter_alloc& other);
+
+  /**
+   * Move assignment
+   * @param other filter to be moved
+   * @return reference to this filter
+   */
+  bloom_filter_alloc& operator=(bloom_filter_alloc&& other);
+
+  /**
+   * @brief Destroy the bloom filter alloc object
+   */
+  ~bloom_filter_alloc();
+
+  // This is a convenience alias for users
+  // The type returned by the following serialize method
+  using vector_bytes = std::vector<uint8_t, typename 
std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
+
+  /**
+   * This method serializes the filter as a vector of bytes.
+   * An optional header can be reserved in front of the filter.
+   * It is a blank space of a given size.
+   * This header is used in Datasketches PostgreSQL extension.

Review Comment:
   How is "Some integrations such as PostgreSQL may need this header space."



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to