This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch quotient_filter
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/quotient_filter by this push:
new c45d5be implemented merge
c45d5be is described below
commit c45d5bef4411f7c0ce6208b603c57d4322110771
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Wed Jul 17 00:26:16 2024 -0700
implemented merge
---
filters/include/quotient_filter.hpp | 16 ++++++---
filters/include/quotient_filter_impl.hpp | 51 +++++++++++++++++++++--------
filters/test/quotient_filter_test.cpp | 56 +++++++++++++++++++++++++++++---
3 files changed, 100 insertions(+), 23 deletions(-)
diff --git a/filters/include/quotient_filter.hpp
b/filters/include/quotient_filter.hpp
index 10fdf1d..532e1f9 100755
--- a/filters/include/quotient_filter.hpp
+++ b/filters/include/quotient_filter.hpp
@@ -38,11 +38,15 @@ class quotient_filter_alloc {
public:
using vector_bytes = std::vector<uint8_t, typename
std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
+ static constexpr float DEFAULT_LOAD_FACTOR = 0.8;
+
/**
* @param lg_q
- * @param num_bits_per_entry length of remainder in bits + 3 metadata bits
+ * @param num_fingerprint_bits length of fingerprint in bits
+ * @param load_factor threshold for the ratio of the number of entries to
the number of slots for expansion
+ * @param allocator for use by this sketch to allocate memory
*/
- explicit quotient_filter_alloc(uint8_t lg_q, uint8_t num_bits_per_entry,
const Allocator& allocator = Allocator());
+ explicit quotient_filter_alloc(uint8_t lg_q, uint8_t num_fingerprint_bits,
float load_factor = DEFAULT_LOAD_FACTOR, const Allocator& allocator =
Allocator());
/**
* Update this filter with given unsigned 64-bit integer.
@@ -91,6 +95,9 @@ public:
*/
bool query(const void* data, size_t length) const;
+
+ void merge(const quotient_filter_alloc& other);
+
size_t get_num_entries() const;
uint8_t get_lg_q() const;
@@ -119,13 +126,12 @@ public:
private:
Allocator allocator_;
uint8_t lg_q_;
- uint8_t num_bits_per_entry_;
+ uint8_t num_fingerprint_bits_;
uint8_t num_expansions_;
+ float load_factor_;
size_t num_entries_;
vector_bytes bytes_;
- static constexpr double LOAD_FACTOR = 0.9;
-
inline size_t get_q() const;
inline size_t get_slot_mask() const;
inline uint64_t get_value_mask() const;
diff --git a/filters/include/quotient_filter_impl.hpp
b/filters/include/quotient_filter_impl.hpp
index 43b280f..887c9e3 100755
--- a/filters/include/quotient_filter_impl.hpp
+++ b/filters/include/quotient_filter_impl.hpp
@@ -83,17 +83,18 @@ static inline uint64_t get_bits(uint8_t bits, const
uint8_t* ptr, uint8_t offset
}
template<typename A>
-quotient_filter_alloc<A>::quotient_filter_alloc(uint8_t lg_q, uint8_t
bits_per_entry, const A& allocator):
+quotient_filter_alloc<A>::quotient_filter_alloc(uint8_t lg_q, uint8_t
num_fingerprint_bits, float load_factor, const A& allocator):
allocator_(allocator),
lg_q_(lg_q),
-num_bits_per_entry_(bits_per_entry),
+num_fingerprint_bits_(num_fingerprint_bits),
num_expansions_(0),
+load_factor_(load_factor),
num_entries_(0),
bytes_(allocator)
{
// check input
// allocate multiples of 8 bytes to match Java
- bytes_.resize(u64_to_hold_bits(get_q() * bits_per_entry) * sizeof(uint64_t));
+ bytes_.resize(u64_to_hold_bits(get_q() * get_num_bits_per_entry()) *
sizeof(uint64_t));
}
template<typename A>
@@ -140,6 +141,31 @@ bool quotient_filter_alloc<A>::query(const void* data,
size_t length) const {
return pair.second;
}
+template<typename A>
+void quotient_filter_alloc<A>::merge(const quotient_filter_alloc& other) {
+ if (lg_q_ + num_fingerprint_bits_ != other.lg_q_ +
other.num_fingerprint_bits_) {
+ throw std::invalid_argument("incompatible sketches in merge");
+ }
+ // find cluster start
+ size_t i = 0;
+ if (!other.is_slot_empty(i)) while (other.get_is_shifted(i)) i = (i - 1) &
other.get_slot_mask();
+
+ std::queue<size_t> fifo;
+ size_t count = 0;
+ while (count < other.num_entries_) {
+ if (!other.is_slot_empty(i)) {
+ if (other.get_is_occupied(i)) fifo.push(i);
+ const size_t quotient = fifo.front();
+ const uint64_t value = other.get_value(i);
+ const uint64_t hash = quotient << other.num_fingerprint_bits_ | value;
+ insert(quotient_from_hash(hash), value_from_hash(hash));
+ ++count;
+ }
+ i = (i + 1) & other.get_slot_mask();
+ if (!fifo.empty() && !other.get_is_continuation(i)) fifo.pop();
+ }
+}
+
template<typename A>
size_t quotient_filter_alloc<A>::get_q() const {
return static_cast<size_t>(1) << get_lg_q();
@@ -177,12 +203,12 @@ uint8_t quotient_filter_alloc<A>::get_lg_q() const {
template<typename A>
uint8_t quotient_filter_alloc<A>::get_num_bits_per_entry() const {
- return num_bits_per_entry_;
+ return num_fingerprint_bits_ + 3;
}
template<typename A>
uint8_t quotient_filter_alloc<A>::get_num_bits_in_value() const {
- return num_bits_per_entry_ - 3;
+ return num_fingerprint_bits_;
}
template<typename A>
@@ -201,10 +227,11 @@ string<A> quotient_filter_alloc<A>::to_string(bool
print_entries) const {
// The stream does not support passing an allocator instance, and
alternatives are complicated.
std::ostringstream os;
os << "### Quotient filter summary:" << std::endl;
- os << " LgQ : " << std::to_string(lg_q_) << std::endl;
- os << " Bits per entry : " << std::to_string(num_bits_per_entry_) <<
std::endl;
- os << " Num expansions : " << std::to_string(num_expansions_) << std::endl;
- os << " Num entries : " << num_entries_ << std::endl;
+ os << " LgQ : " << std::to_string(lg_q_) << std::endl;
+ os << " Fingerprint bits : " << std::to_string(num_fingerprint_bits_) <<
std::endl;
+ os << " Load factor : " << std::to_string(load_factor_) << std::endl;
+ os << " Num expansions : " << std::to_string(num_expansions_) <<
std::endl;
+ os << " Num entries : " << num_entries_ << std::endl;
os << "### End filter summary" << std::endl;
if (print_entries) {
@@ -242,12 +269,10 @@ size_t quotient_filter_alloc<A>::find_run_start(size_t
slot) const {
slot = (slot - 1) & get_slot_mask();
if (get_is_occupied(slot)) ++num_runs_to_skip;
}
-// std::cout << "cluster start " << slot << " " << num_runs_to_skip << " runs
to skip\n";
while (num_runs_to_skip > 0) {
slot = (slot + 1) & get_slot_mask();
if (!get_is_continuation(slot)) --num_runs_to_skip;
}
-// std::cout << "run start " << slot << "\n";
return slot;
}
@@ -306,13 +331,13 @@ void quotient_filter_alloc<A>::insert_and_shift(size_t
quotient, size_t slot, ui
if (is_new_run) set_is_occupied(quotient, true);
++num_entries_;
- if (num_entries_ == static_cast<size_t>(get_q() * LOAD_FACTOR)) expand();
+ if (num_entries_ == static_cast<size_t>(get_q() * load_factor_)) expand();
}
template<typename A>
void quotient_filter_alloc<A>::expand() {
if (get_num_bits_in_value() < 2) throw std::logic_error("for expansion value
must have at least 2 bits");
- quotient_filter_alloc other(lg_q_ + 1, num_bits_per_entry_ - 1, allocator_);
+ quotient_filter_alloc other(lg_q_ + 1, num_fingerprint_bits_ - 1,
load_factor_, allocator_);
// find cluster start
size_t i = 0;
diff --git a/filters/test/quotient_filter_test.cpp
b/filters/test/quotient_filter_test.cpp
index b84176e..84e3da1 100755
--- a/filters/test/quotient_filter_test.cpp
+++ b/filters/test/quotient_filter_test.cpp
@@ -26,9 +26,9 @@
namespace datasketches {
TEST_CASE("empty", "[quotient_filter]") {
- quotient_filter f(10, 9);
+ quotient_filter f(10, 6);
REQUIRE(f.get_lg_q() == 10);
- REQUIRE(f.get_num_bits_per_entry() == 9);
+ REQUIRE(f.get_num_bits_in_value() == 6);
REQUIRE(f.get_num_entries() == 0);
}
@@ -55,7 +55,7 @@ TEST_CASE("several entries", "[quotient_filter]") {
TEST_CASE("many entries no expansion 1", "[quotient_filter]") {
quotient_filter f(4, 9);
- const size_t n = 12;
+ const size_t n = 11;
for (size_t i = 0; i < n; ++i) f.update(i);
// std::cout << f.to_string(true);
@@ -146,9 +146,55 @@ TEST_CASE("expansion", "[quotient_filter]") {
positives = 0;
for (size_t i = 0; i < n; ++i) if (f.query(i + n)) ++positives;
REQUIRE(positives < 7);
+}
- for (size_t i = 0; i < n * 2; ++i) if (f.update(i + n)) ++positives;
- std::cout << f.to_string();
+TEST_CASE("merge empty", "[quotient_filter]") {
+ quotient_filter qf1(4, 3);
+ quotient_filter qf2(4, 3);
+ qf1.merge(qf2);
+ REQUIRE(qf1.get_lg_q() == 4);
+ REQUIRE(qf1.get_num_bits_in_value() == 3);
+ REQUIRE(qf1.get_num_entries() == 0);
+}
+
+TEST_CASE("merge", "[quotient_filter]") {
+ quotient_filter qf1(16, 12);
+ quotient_filter qf2(16, 12);
+ const size_t n = 50000;
+ for (size_t i = 0; i < n / 2; ++i) {
+ qf1.update(i);
+ qf2.update(i + n / 2);
+ }
+ qf1.merge(qf2);
+ REQUIRE(qf1.get_num_expansions() == 0);
+ REQUIRE(qf1.get_num_entries() > n * 0.9999); // allow a few hash collisions
+
+ // query the same keys
+ size_t positives = 0;
+ for (size_t i = 0; i < n; ++i) if (qf1.query(i)) ++positives;
+ REQUIRE(positives == n);
+
+ // query novel keys
+ positives = 0;
+ for (size_t i = 0; i < n; ++i) if (qf1.query(i + n)) ++positives;
+ REQUIRE(positives < 6);
+}
+
+TEST_CASE("merge different configuration", "[quotient_filter]") {
+ quotient_filter qf1(6, 5);
+ quotient_filter qf2(5, 6);
+ for (int i = 0; i < 10; ++i) {
+ qf1.update(i);
+ qf2.update(i);
+ }
+ qf1.merge(qf2);
+ REQUIRE(qf1.get_num_entries() == 10);
+}
+
+TEST_CASE("merge incompatible", "[quotient_filter]") {
+ quotient_filter qf1(6, 5);
+ quotient_filter qf2(6, 6);
+ REQUIRE_THROWS_AS(qf1.merge(qf2), std::invalid_argument);
}
TEST_CASE("serialize", "[quotient_filter]") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]