This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch quotient-filter
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
The following commit(s) were added to refs/heads/quotient-filter by this push:
new 5aa9191f ordered runs, simplified insertion code
5aa9191f is described below
commit 5aa9191fe7dc6def62ed784c3f203e64852703d1
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Tue Jun 4 10:11:48 2024 -0700
ordered runs, simplified insertion code
---
.../filters/quotientfilter/QuotientFilter.java | 116 +++++++++------------
.../filters/quotientfilter/QuotientFilterTest.java | 6 +-
2 files changed, 50 insertions(+), 72 deletions(-)
diff --git
a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java
b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java
index e0a442fc..66b91411 100644
---
a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java
+++
b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java
@@ -312,16 +312,19 @@ public class QuotientFilter extends Filter {
}
// given the start of a run, scan the run and return the index of the
first matching fingerprint
+ // if not found returns the insertion position as bitwise complement to
make it negative
long find_first_fingerprint_in_run(long index, long fingerprint) {
- assert(!is_continuation(index));
- do {
- if (compare(index, fingerprint)) {
- //System.out.println("found matching FP at index " + index);
- return index;
- }
- index = (index + 1) & getMask();
- } while (is_continuation(index));
- return -1;
+ assert(!is_continuation(index));
+ do {
+ final long fingerprintAtIndex = get_fingerprint(index);
+ if (fingerprintAtIndex == fingerprint) {
+ return index;
+ } else if (fingerprintAtIndex > fingerprint) {
+ return ~index;
+ }
+ index = (index + 1) & getMask();
+ } while (is_continuation(index));
+ return ~index;
}
// delete the last matching fingerprint in the run
@@ -354,7 +357,7 @@ public class QuotientFilter extends Filter {
}
long run_start_index = find_run_start(index);
long found_index = find_first_fingerprint_in_run(run_start_index,
fingerprint);
- return found_index > -1;
+ return found_index >= 0;
}
// Given a canonical slot index, find the corresponding run and return all
fingerprints in the run.
@@ -373,74 +376,49 @@ public class QuotientFilter extends Filter {
return set;
}
- // Swaps the fingerprint in a given slot with a new one. Return the
pre-existing fingerprint
- long swap_fingerprints(long index, long new_fingerprint) {
- long existing = get_fingerprint(index);
- set_fingerprint(index, new_fingerprint);
- return existing;
- }
-
- boolean insert_new_run(long canonical_slot, long long_fp) {
- long start_of_this_new_run = find_run_start(canonical_slot);
- boolean slot_initially_empty = is_slot_empty(start_of_this_new_run);
-
- // modify some metadata flags to mark the new run
- set_occupied(canonical_slot, true);
- if (start_of_this_new_run != canonical_slot) {
- set_shifted(start_of_this_new_run, true);
- }
- set_continuation(start_of_this_new_run, false);
-
- // if the slot was initially empty, we can just terminate, as there is
nothing to push to the right
- if (slot_initially_empty) {
- set_fingerprint(start_of_this_new_run, long_fp);
- num_entries++;
- return true;
- }
- return insert_fingerprint_and_push_all_else(long_fp,
start_of_this_new_run, false);
- }
-
- boolean insert(long long_fp, long index) {
+ boolean insert(long fingerprint, long index) {
if (index >= get_num_slots() || num_entries == get_num_slots()) {
return false;
}
+ final long run_start = find_run_start(index);
if (!is_occupied(index)) {
- return insert_new_run(index, long_fp);
+ return insert_fingerprint_and_push_all_else(fingerprint, run_start,
index, true, true);
}
- long run_start_index = find_run_start(index);
- final long found_index = find_first_fingerprint_in_run(run_start_index,
long_fp);
- if (found_index > -1) {
+ final long found_index = find_first_fingerprint_in_run(run_start,
fingerprint);
+ if (found_index >= 0) {
return false;
}
- return insert_fingerprint_and_push_all_else(long_fp, run_start_index,
true);
+ return insert_fingerprint_and_push_all_else(fingerprint, ~found_index,
index, false, ~found_index == run_start);
}
- // insert a fingerprint as the last fingerprint of the run and push all
other entries in the cluster to the right.
- boolean insert_fingerprint_and_push_all_else(long long_fp, long
run_start_index, boolean is_same_run) {
- long current_index = run_start_index;
- boolean is_this_slot_empty;
- boolean temp_continuation = false;
-
- do {
- is_this_slot_empty = is_slot_empty(current_index);
- if (current_index != run_start_index) {
- set_shifted(current_index, true);
- }
- if (current_index != run_start_index && is_same_run &&
!is_continuation(current_index)) {
- is_same_run = false;
- set_continuation(current_index, true);
- long_fp = swap_fingerprints(current_index, long_fp);
- }
- else if (!is_same_run) {
- boolean current_continuation = is_continuation(current_index);
- set_continuation(current_index, temp_continuation);
- temp_continuation = current_continuation;
- long_fp = swap_fingerprints(current_index, long_fp);
- }
- current_index = (current_index + 1) & getMask();
- } while (!is_this_slot_empty);
- num_entries++;
- return true;
+ boolean insert_fingerprint_and_push_all_else(long fingerprint, long index,
long canonical, boolean is_new_run, boolean is_run_start) {
+ boolean existing_is_continuation = is_continuation(index);
+ boolean is_continuation = !is_run_start;
+ boolean is_shifted = index != canonical;
+ boolean force_continuation = !is_new_run && is_run_start;
+ boolean existing_is_empty = is_slot_empty(index);
+ long existing_fingerprint = get_fingerprint(index);
+ while (!existing_is_empty) {
+ set_fingerprint(index, fingerprint);
+ set_continuation(index, is_continuation);
+ set_shifted(index, is_shifted);
+ fingerprint = existing_fingerprint;
+ is_continuation = existing_is_continuation | force_continuation;
+ is_shifted = true;
+ index = (index + 1) & getMask();
+ existing_fingerprint = get_fingerprint(index);
+ existing_is_continuation = is_continuation(index);
+ existing_is_empty = is_slot_empty(index);
+ force_continuation = false;
+ }
+ set_fingerprint(index, fingerprint);
+ set_continuation(index, is_continuation);
+ set_shifted(index, is_shifted);
+ if (is_new_run) {
+ set_occupied(canonical, true);
+ }
+ num_entries++;
+ return true;
}
boolean delete(long fingerprint, long canonical_slot, long
run_start_index, long matching_fingerprint_index) {
diff --git
a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java
b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java
index 4f6486ea..44e9833d 100644
---
a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java
+++
b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java
@@ -67,11 +67,11 @@ public class QuotientFilterTest {
assertEquals(getState(qf, 0), 0);
assertEquals(qf.get_fingerprint(0), 0);
assertEquals(getState(qf, 1), 0b100);
- assertEquals(qf.get_fingerprint(1), B); // this run is not ordered,
which is different from Wikipedia example
+ assertEquals(qf.get_fingerprint(1), A); // this run is not ordered,
which is different from Wikipedia example
assertEquals(getState(qf, 2), 0b111);
- assertEquals(qf.get_fingerprint(2), C);
+ assertEquals(qf.get_fingerprint(2), B);
assertEquals(getState(qf, 3), 0b011);
- assertEquals(qf.get_fingerprint(3), A);
+ assertEquals(qf.get_fingerprint(3), C);
assertEquals(getState(qf, 4), 0b101);
assertEquals(qf.get_fingerprint(4), D);
assertEquals(getState(qf, 5), 0b001);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]