(datasketches-cpp) branch quotient_filter updated: implemented merge

alsay Wed, 17 Jul 2024 00:28:02 -0700

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch quotient_filter
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git



The following commit(s) were added to refs/heads/quotient_filter by this push:
     new c45d5be  implemented merge
c45d5be is described below

commit c45d5bef4411f7c0ce6208b603c57d4322110771
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Wed Jul 17 00:26:16 2024 -0700

    implemented merge
---
 filters/include/quotient_filter.hpp      | 16 ++++++---
 filters/include/quotient_filter_impl.hpp | 51 +++++++++++++++++++++--------
 filters/test/quotient_filter_test.cpp    | 56 +++++++++++++++++++++++++++++---
 3 files changed, 100 insertions(+), 23 deletions(-)

diff --git a/filters/include/quotient_filter.hpp 
b/filters/include/quotient_filter.hpp
index 10fdf1d..532e1f9 100755
--- a/filters/include/quotient_filter.hpp
+++ b/filters/include/quotient_filter.hpp
@@ -38,11 +38,15 @@ class quotient_filter_alloc {
 public:
   using vector_bytes = std::vector<uint8_t, typename 
std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
 
+  static constexpr float DEFAULT_LOAD_FACTOR = 0.8;
+
   /**
    * @param lg_q
-   * @param num_bits_per_entry length of remainder in bits + 3 metadata bits
+   * @param num_fingerprint_bits length of fingerprint in bits
+   * @param load_factor threshold for the ratio of the number of entries to 
the number of slots for expansion
+   * @param allocator for use by this sketch to allocate memory
    */
-  explicit quotient_filter_alloc(uint8_t lg_q, uint8_t num_bits_per_entry, 
const Allocator& allocator = Allocator());
+  explicit quotient_filter_alloc(uint8_t lg_q, uint8_t num_fingerprint_bits, 
float load_factor = DEFAULT_LOAD_FACTOR, const Allocator& allocator = 
Allocator());
 
   /**
    * Update this filter with given unsigned 64-bit integer.
@@ -91,6 +95,9 @@ public:
    */
   bool query(const void* data, size_t length) const;
 
+
+  void merge(const quotient_filter_alloc& other);
+
   size_t get_num_entries() const;
 
   uint8_t get_lg_q() const;
@@ -119,13 +126,12 @@ public:
 private:
   Allocator allocator_;
   uint8_t lg_q_;
-  uint8_t num_bits_per_entry_;
+  uint8_t num_fingerprint_bits_;
   uint8_t num_expansions_;
+  float load_factor_;
   size_t num_entries_;
   vector_bytes bytes_;
 
-  static constexpr double LOAD_FACTOR = 0.9;
-
   inline size_t get_q() const;
   inline size_t get_slot_mask() const;
   inline uint64_t get_value_mask() const;
diff --git a/filters/include/quotient_filter_impl.hpp 
b/filters/include/quotient_filter_impl.hpp
index 43b280f..887c9e3 100755
--- a/filters/include/quotient_filter_impl.hpp
+++ b/filters/include/quotient_filter_impl.hpp
@@ -83,17 +83,18 @@ static inline uint64_t get_bits(uint8_t bits, const 
uint8_t* ptr, uint8_t offset
 }
 
 template<typename A>
-quotient_filter_alloc<A>::quotient_filter_alloc(uint8_t lg_q, uint8_t 
bits_per_entry, const A& allocator):
+quotient_filter_alloc<A>::quotient_filter_alloc(uint8_t lg_q, uint8_t 
num_fingerprint_bits, float load_factor, const A& allocator):
 allocator_(allocator),
 lg_q_(lg_q),
-num_bits_per_entry_(bits_per_entry),
+num_fingerprint_bits_(num_fingerprint_bits),
 num_expansions_(0),
+load_factor_(load_factor),
 num_entries_(0),
 bytes_(allocator)
 {
   // check input
   // allocate multiples of 8 bytes to match Java
-  bytes_.resize(u64_to_hold_bits(get_q() * bits_per_entry) * sizeof(uint64_t));
+  bytes_.resize(u64_to_hold_bits(get_q() * get_num_bits_per_entry()) * 
sizeof(uint64_t));
 }
 
 template<typename A>
@@ -140,6 +141,31 @@ bool quotient_filter_alloc<A>::query(const void* data, 
size_t length) const {
   return pair.second;
 }
 
+template<typename A>
+void quotient_filter_alloc<A>::merge(const quotient_filter_alloc& other) {
+  if (lg_q_ + num_fingerprint_bits_ != other.lg_q_ + 
other.num_fingerprint_bits_) {
+    throw std::invalid_argument("incompatible sketches in merge");
+  }
+  // find cluster start
+  size_t i = 0;
+  if (!other.is_slot_empty(i)) while (other.get_is_shifted(i)) i = (i - 1) & 
other.get_slot_mask();
+
+  std::queue<size_t> fifo;
+  size_t count = 0;
+  while (count < other.num_entries_) {
+    if (!other.is_slot_empty(i)) {
+      if (other.get_is_occupied(i)) fifo.push(i);
+      const size_t quotient = fifo.front();
+      const uint64_t value = other.get_value(i);
+      const uint64_t hash = quotient << other.num_fingerprint_bits_ | value;
+      insert(quotient_from_hash(hash), value_from_hash(hash));
+      ++count;
+    }
+    i = (i + 1) & other.get_slot_mask();
+    if (!fifo.empty() && !other.get_is_continuation(i)) fifo.pop();
+  }
+}
+
 template<typename A>
 size_t quotient_filter_alloc<A>::get_q() const {
   return static_cast<size_t>(1) << get_lg_q();
@@ -177,12 +203,12 @@ uint8_t quotient_filter_alloc<A>::get_lg_q() const {
 
 template<typename A>
 uint8_t quotient_filter_alloc<A>::get_num_bits_per_entry() const {
-  return num_bits_per_entry_;
+  return num_fingerprint_bits_ + 3;
 }
 
 template<typename A>
 uint8_t quotient_filter_alloc<A>::get_num_bits_in_value() const {
-  return num_bits_per_entry_ - 3;
+  return num_fingerprint_bits_;
 }
 
 template<typename A>
@@ -201,10 +227,11 @@ string<A> quotient_filter_alloc<A>::to_string(bool 
print_entries) const {
   // The stream does not support passing an allocator instance, and 
alternatives are complicated.
   std::ostringstream os;
   os << "### Quotient filter summary:" << std::endl;
-  os << "   LgQ            : " << std::to_string(lg_q_) << std::endl;
-  os << "   Bits per entry : " << std::to_string(num_bits_per_entry_) << 
std::endl;
-  os << "   Num expansions : " << std::to_string(num_expansions_) << std::endl;
-  os << "   Num entries    : " << num_entries_ << std::endl;
+  os << "   LgQ              : " << std::to_string(lg_q_) << std::endl;
+  os << "   Fingerprint bits : " << std::to_string(num_fingerprint_bits_) << 
std::endl;
+  os << "   Load factor      : " << std::to_string(load_factor_) << std::endl;
+  os << "   Num expansions   : " << std::to_string(num_expansions_) << 
std::endl;
+  os << "   Num entries      : " << num_entries_ << std::endl;
   os << "### End filter summary" << std::endl;
 
   if (print_entries) {
@@ -242,12 +269,10 @@ size_t quotient_filter_alloc<A>::find_run_start(size_t 
slot) const {
     slot = (slot - 1) & get_slot_mask();
     if (get_is_occupied(slot)) ++num_runs_to_skip;
   }
-//  std::cout << "cluster start " << slot << " " << num_runs_to_skip << " runs 
to skip\n";
   while (num_runs_to_skip > 0) {
     slot = (slot + 1) & get_slot_mask();
     if (!get_is_continuation(slot)) --num_runs_to_skip;
   }
-//  std::cout << "run start " << slot << "\n";
   return slot;
 }
 
@@ -306,13 +331,13 @@ void quotient_filter_alloc<A>::insert_and_shift(size_t 
quotient, size_t slot, ui
 
   if (is_new_run) set_is_occupied(quotient, true);
   ++num_entries_;
-  if (num_entries_ == static_cast<size_t>(get_q() * LOAD_FACTOR)) expand();
+  if (num_entries_ == static_cast<size_t>(get_q() * load_factor_)) expand();
 }
 
 template<typename A>
 void quotient_filter_alloc<A>::expand() {
   if (get_num_bits_in_value() < 2) throw std::logic_error("for expansion value 
must have at least 2 bits");
-  quotient_filter_alloc other(lg_q_ + 1, num_bits_per_entry_ - 1, allocator_);
+  quotient_filter_alloc other(lg_q_ + 1, num_fingerprint_bits_ - 1, 
load_factor_, allocator_);
 
   // find cluster start
   size_t i = 0;
diff --git a/filters/test/quotient_filter_test.cpp 
b/filters/test/quotient_filter_test.cpp
index b84176e..84e3da1 100755
--- a/filters/test/quotient_filter_test.cpp
+++ b/filters/test/quotient_filter_test.cpp
@@ -26,9 +26,9 @@
 namespace datasketches {
 
 TEST_CASE("empty", "[quotient_filter]") {
-  quotient_filter f(10, 9);
+  quotient_filter f(10, 6);
   REQUIRE(f.get_lg_q() == 10);
-  REQUIRE(f.get_num_bits_per_entry() == 9);
+  REQUIRE(f.get_num_bits_in_value() == 6);
   REQUIRE(f.get_num_entries() == 0);
 }
 
@@ -55,7 +55,7 @@ TEST_CASE("several entries", "[quotient_filter]") {
 
 TEST_CASE("many entries no expansion 1", "[quotient_filter]") {
   quotient_filter f(4, 9);
-  const size_t n = 12;
+  const size_t n = 11;
   for (size_t i = 0; i < n; ++i) f.update(i);
 //  std::cout << f.to_string(true);
 
@@ -146,9 +146,55 @@ TEST_CASE("expansion", "[quotient_filter]") {
   positives = 0;
   for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives;
   REQUIRE(positives < 7);
+}
 
-  for (size_t i = 0; i < n * 2; ++i) if (f.update(i + n)) ++positives;
-  std::cout << f.to_string();
+TEST_CASE("merge empty", "[quotient_filter]") {
+  quotient_filter qf1(4, 3);
+  quotient_filter qf2(4, 3);
+  qf1.merge(qf2);
+  REQUIRE(qf1.get_lg_q() == 4);
+  REQUIRE(qf1.get_num_bits_in_value() == 3);
+  REQUIRE(qf1.get_num_entries() == 0);
+}
+
+TEST_CASE("merge", "[quotient_filter]") {
+  quotient_filter qf1(16, 12);
+  quotient_filter qf2(16, 12);
+  const size_t n = 50000;
+  for (size_t i = 0; i < n / 2; ++i) {
+    qf1.update(i);
+    qf2.update(i + n / 2);
+  }
+  qf1.merge(qf2);
+  REQUIRE(qf1.get_num_expansions() == 0);
+  REQUIRE(qf1.get_num_entries() > n * 0.9999); // allow a few hash collisions
+
+  // query the same keys
+  size_t positives = 0;
+  for (size_t i = 0; i < n; ++i) if (qf1.query(i)) ++positives;
+  REQUIRE(positives == n);
+
+  // query novel keys
+  positives = 0;
+  for (size_t i = 0; i < n; ++i) if (qf1.query(i + n)) ++positives;
+  REQUIRE(positives < 6);
+}
+
+TEST_CASE("merge different configuration", "[quotient_filter]") {
+  quotient_filter qf1(6, 5);
+  quotient_filter qf2(5, 6);
+  for (int i = 0; i < 10; ++i) {
+    qf1.update(i);
+    qf2.update(i);
+  }
+  qf1.merge(qf2);
+  REQUIRE(qf1.get_num_entries() == 10);
+}
+
+TEST_CASE("merge incompatible", "[quotient_filter]") {
+  quotient_filter qf1(6, 5);
+  quotient_filter qf2(6, 6);
+  REQUIRE_THROWS_AS(qf1.merge(qf2), std::invalid_argument);
 }
 
 TEST_CASE("serialize", "[quotient_filter]") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-cpp) branch quotient_filter updated: implemented merge

Reply via email to