This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch theta
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
The following commit(s) were added to refs/heads/theta by this push:
new 0f3c7dc a-not-b set operation
0f3c7dc is described below
commit 0f3c7dc801a505f79ba3a77f239369b1a8f44a64
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Fri May 24 10:18:22 2019 -0700
a-not-b set operation
---
theta/include/theta_a_not_b.hpp | 43 +++++++
theta/include/theta_a_not_b_impl.hpp | 104 ++++++++++++++++
theta/include/theta_sketch.hpp | 5 +
theta/include/theta_sketch_impl.hpp | 40 +++++--
theta/test/theta_a_not_b_test.cpp | 225 +++++++++++++++++++++++++++++++++++
5 files changed, 410 insertions(+), 7 deletions(-)
diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp
new file mode 100644
index 0000000..7a32adf
--- /dev/null
+++ b/theta/include/theta_a_not_b.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2019, Verizon Media.
+ * Licensed under the terms of the Apache License 2.0. See LICENSE file at the
project root for terms.
+ */
+
+#ifndef THETA_A_NOT_B_HPP_
+#define THETA_A_NOT_B_HPP_
+
+#include <memory>
+#include <functional>
+#include <climits>
+
+#include <theta_sketch.hpp>
+
+namespace datasketches {
+
+/*
+ * author Alexander Saydakov
+ * author Lee Rhodes
+ * author Kevin Lang
+ */
+
+template<typename A>
+class theta_a_not_b_alloc {
+public:
+ explicit theta_a_not_b_alloc(uint64_t seed =
update_theta_sketch_alloc<A>::builder::DEFAULT_SEED);
+
+ compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const
theta_sketch_alloc<A>& b, bool ordered = true) const;
+
+private:
+ typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t>
AllocU64;
+ uint16_t seed_hash_;
+
+};
+
+// alias with default allocator for convenience
+typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
+
+} /* namespace datasketches */
+
+#include "theta_a_not_b_impl.hpp"
+
+# endif
diff --git a/theta/include/theta_a_not_b_impl.hpp
b/theta/include/theta_a_not_b_impl.hpp
new file mode 100644
index 0000000..a794b63
--- /dev/null
+++ b/theta/include/theta_a_not_b_impl.hpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2019, Verizon Media.
+ * Licensed under the terms of the Apache License 2.0. See LICENSE file at the
project root for terms.
+ */
+
+#ifndef THETA_A_NOT_B_IMPL_HPP_
+#define THETA_A_NOT_B_IMPL_HPP_
+
+#include <algorithm>
+
+namespace datasketches {
+
+/*
+ * author Alexander Saydakov
+ * author Lee Rhodes
+ * author Kevin Lang
+ */
+
+template<typename A>
+theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
+seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
+{}
+
+constexpr uint8_t log2(uint32_t n) {
+ return (n > 1) ? 1 + log2(n >> 1) : 0;
+}
+
+constexpr uint8_t lg_size_from_count(uint32_t n, double load_factor) {
+ uint8_t lg = log2(n) + 1;
+ if (n > (1 << lg) * load_factor) lg++;
+ return lg;
+}
+
+template<typename A>
+compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const
theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
+ if (a.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
+ if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed
hash mismatch");
+ if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed
hash mismatch");
+ if (a.get_num_retained() == 0 or b.is_empty()) return
compact_theta_sketch_alloc<A>(a, ordered);
+
+ const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
+ uint64_t* keys = nullptr;
+ uint32_t keys_size = 0;
+ uint32_t count = 0;
+ bool is_empty = a.is_empty();
+
+ if (b.get_num_retained() == 0) {
+ for (auto key: a) if (key < theta) ++count;
+ keys_size = count;
+ keys = AllocU64().allocate(keys_size);
+ std::copy_if(a.begin(), a.end(), keys, [theta](uint64_t key) { return key
< theta; });
+ if (ordered and !a.is_ordered()) std::sort(keys, &keys[keys_size]);
+ if (count == 0 and theta == theta_sketch_alloc<A>::MAX_THETA) is_empty =
true;
+ return compact_theta_sketch_alloc<A>(is_empty, theta, keys, count,
seed_hash_, a.is_ordered() or ordered);
+ }
+
+ keys_size = a.get_num_retained();
+ keys = AllocU64().allocate(keys_size);
+
+ if (a.is_ordered() and b.is_ordered()) { // sort-based
+ const auto end = std::set_difference(a.begin(), a.end(), b.begin(),
b.end(), keys);
+ count = end - keys;
+ } else { // hash-based
+ const uint8_t lg_size = lg_size_from_count(b.get_num_retained(),
update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
+ uint64_t* b_hash_table = AllocU64().allocate(1 << lg_size);
+ std::fill(b_hash_table, &b_hash_table[1 << lg_size], 0);
+ for (auto key: b) {
+ if (key < theta) {
+ update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table,
lg_size);
+ } else if (b.is_ordered()) {
+ break; // early stop
+ }
+ }
+
+ // scan A lookup B
+ for (auto key: a) {
+ if (key < theta) {
+ if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table,
lg_size)) keys[count++] = key;
+ } else if (a.is_ordered()) {
+ break; // early stop
+ }
+ }
+
+ AllocU64().deallocate(b_hash_table, 1 << lg_size);
+ }
+
+ if (count == 0) {
+ AllocU64().deallocate(keys, keys_size);
+ keys = nullptr;
+ if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
+ } else if (count < keys_size) {
+ uint64_t* keys_copy = AllocU64().allocate(count);
+ std::copy(keys, &keys[count], keys_copy);
+ AllocU64().deallocate(keys, keys_size);
+ keys = keys_copy;
+ if (ordered and !a.is_ordered()) std::sort(keys, &keys[count]);
+ }
+
+ return compact_theta_sketch_alloc<A>(is_empty, theta, keys, count,
seed_hash_, a.is_ordered() or ordered);
+}
+
+} /* namespace datasketches */
+
+# endif
diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp
index a72eaf9..5ec618d 100644
--- a/theta/include/theta_sketch.hpp
+++ b/theta/include/theta_sketch.hpp
@@ -24,6 +24,7 @@ template<typename A> class update_theta_sketch_alloc;
template<typename A> class compact_theta_sketch_alloc;
template<typename A> class theta_union_alloc;
template<typename A> class theta_intersection_alloc;
+template<typename A> class theta_a_not_b_alloc;
// for serialization as raw bytes
typedef std::unique_ptr<void, std::function<void(void*)>>
void_ptr_with_deleter;
@@ -165,9 +166,11 @@ private:
void internal_update(uint64_t hash);
friend theta_intersection_alloc<A>;
+ friend theta_a_not_b_alloc<A>;
static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t
lg_nom_size);
static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t
lg_size);
+ static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t
lg_size);
friend theta_sketch_alloc<A>;
static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is,
resize_factor rf, uint8_t lg_nom_size, uint8_t lg_cur_size, uint8_t flags_byte,
uint64_t seed);
@@ -182,6 +185,7 @@ public:
static const uint8_t SKETCH_TYPE = 3;
compact_theta_sketch_alloc(const compact_theta_sketch_alloc<A>& other);
+ compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
compact_theta_sketch_alloc(compact_theta_sketch_alloc<A>&& other) noexcept;
virtual ~compact_theta_sketch_alloc();
@@ -214,6 +218,7 @@ private:
friend update_theta_sketch_alloc<A>;
friend theta_union_alloc<A>;
friend theta_intersection_alloc<A>;
+ friend theta_a_not_b_alloc<A>;
compact_theta_sketch_alloc(bool is_empty, uint64_t theta, uint64_t* keys,
uint32_t num_keys, uint16_t seed_hash, bool is_ordered);
static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is,
uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes,
size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
diff --git a/theta/include/theta_sketch_impl.hpp
b/theta/include/theta_sketch_impl.hpp
index 16d6eca..234585d 100644
--- a/theta/include/theta_sketch_impl.hpp
+++ b/theta/include/theta_sketch_impl.hpp
@@ -562,12 +562,7 @@ void update_theta_sketch_alloc<A>::update(const void*
data, unsigned length) {
template<typename A>
compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool
ordered) const {
- if (this->get_num_retained() == 0) return
compact_theta_sketch_alloc<A>(this->is_empty_, this->theta_, nullptr, 0,
get_seed_hash(), ordered);
- uint64_t* keys = AllocU64().allocate(num_keys_);
- uint32_t i = 0;
- for (auto value: *this) keys[i++] = value;
- if (ordered) std::sort(keys, &keys[num_keys_]);
- return compact_theta_sketch_alloc<A>(false, this->theta_, keys, num_keys_,
get_seed_hash(), ordered);
+ return compact_theta_sketch_alloc<A>(*this, ordered);
}
template<typename A>
@@ -661,6 +656,26 @@ bool
update_theta_sketch_alloc<A>::hash_search_or_insert(uint64_t hash, uint64_t
}
template<typename A>
+bool update_theta_sketch_alloc<A>::hash_search(uint64_t hash, const uint64_t*
table, uint8_t lg_size) {
+ const uint32_t mask = (1 << lg_size) - 1;
+ const uint32_t stride = update_theta_sketch_alloc<A>::get_stride(hash,
lg_size);
+ uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
+ const uint32_t loop_index = cur_probe;
+ do {
+ const uint64_t value = table[cur_probe];
+ if (value == 0) {
+ return false;
+ } else if (value == hash) {
+ return true;
+ }
+ cur_probe = (cur_probe + stride) & mask;
+ } while (cur_probe != loop_index);
+ std::cerr << "hash_search: lg=" << (int)lg_size << ", hash=" << hash <<
std::endl;
+ std::cerr << "cur_probe=" << cur_probe << ", stride=" << stride << std::endl;
+ throw std::logic_error("key not found and search wrapped");
+}
+
+template<typename A>
typename theta_sketch_alloc<A>::const_iterator
update_theta_sketch_alloc<A>::begin() const {
return typename theta_sketch_alloc<A>::const_iterator(keys_, 1 <<
lg_cur_size_, 0);
}
@@ -693,6 +708,18 @@ is_ordered_(other.is_ordered_)
}
template<typename A>
+compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const
theta_sketch_alloc<A>& other, bool ordered):
+theta_sketch_alloc<A>(other),
+keys_(AllocU64().allocate(other.get_num_retained())),
+num_keys_(other.get_num_retained()),
+seed_hash_(other.get_seed_hash()),
+is_ordered_(other.is_ordered() or ordered)
+{
+ std::copy(other.begin(), other.end(), keys_);
+ if (ordered and !other.is_ordered()) std::sort(keys_, &keys_[num_keys_]);
+}
+
+template<typename A>
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(compact_theta_sketch_alloc<A>&&
other) noexcept:
theta_sketch_alloc<A>(std::move(other)),
keys_(nullptr),
@@ -717,7 +744,6 @@ compact_theta_sketch_alloc<A>&
compact_theta_sketch_alloc<A>::operator=(const co
num_keys_ = other.num_keys_;
keys_ = AllocU64().allocate(num_keys_);
}
- keys_(AllocU64().allocate(other.num_keys_)),
seed_hash_ = other.seed_hash_;
is_ordered_ = other.is_ordered_;
std::copy(other.keys_, &other.keys_[num_keys_], keys_);
diff --git a/theta/test/theta_a_not_b_test.cpp
b/theta/test/theta_a_not_b_test.cpp
new file mode 100644
index 0000000..6ec9050
--- /dev/null
+++ b/theta/test/theta_a_not_b_test.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright 2019, Verizon Media.
+ * Licensed under the terms of the Apache License 2.0. See LICENSE file at the
project root for terms.
+ */
+
+#include <cppunit/TestFixture.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+#include <theta_a_not_b.hpp>
+
+namespace datasketches {
+
+class theta_a_not_b_test: public CppUnit::TestFixture {
+
+ CPPUNIT_TEST_SUITE(theta_a_not_b_test);
+ CPPUNIT_TEST(empty);
+ CPPUNIT_TEST(non_empty_no_retained_keys);
+ CPPUNIT_TEST(exact_mode_half_overlap);
+ CPPUNIT_TEST(exact_mode_disjoint);
+ CPPUNIT_TEST(exact_mode_full_overlap);
+ CPPUNIT_TEST(estimation_mode_half_overlap);
+ CPPUNIT_TEST(estimation_mode_disjoint);
+ CPPUNIT_TEST(estimation_mode_full_overlap);
+ CPPUNIT_TEST(seed_mismatch);
+ CPPUNIT_TEST_SUITE_END();
+
+ void empty() {
+ theta_a_not_b a_not_b;
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT_EQUAL(0U, result.get_num_retained());
+ CPPUNIT_ASSERT(result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(0.0, result.get_estimate());
+ }
+
+ void non_empty_no_retained_keys() {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ a.update(1);
+ update_theta_sketch b =
update_theta_sketch::builder().set_p(0.001).build();
+ theta_a_not_b a_not_b;
+
+ // B is still empty
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(1U, result.get_num_retained());
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(1, result.get_theta(), 1e-10);
+ CPPUNIT_ASSERT_EQUAL(1.0, result.get_estimate());
+
+ // B is not empty in estimation mode and no entries
+ b.update(1);
+ CPPUNIT_ASSERT_EQUAL(0U, b.get_num_retained());
+
+ result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(0U, result.get_num_retained());
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(0.001, result.get_theta(), 1e-10);
+ CPPUNIT_ASSERT_EQUAL(0.0, result.get_estimate());
+ }
+
+ void exact_mode_half_overlap() {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 1000; i++) a.update(value++);
+
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ value = 500;
+ for (int i = 0; i < 1000; i++) b.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs, ordered result
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT(result.is_ordered());
+ CPPUNIT_ASSERT_EQUAL(500.0, result.get_estimate());
+
+ // unordered inputs, unordered result
+ result = a_not_b.compute(a, b, false);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT(!result.is_ordered());
+ CPPUNIT_ASSERT_EQUAL(500.0, result.get_estimate());
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), b.compact());
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT(result.is_ordered());
+ CPPUNIT_ASSERT_EQUAL(500.0, result.get_estimate());
+
+ // A is ordered, so the result is ordered regardless
+ result = a_not_b.compute(a.compact(), b, false);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT(result.is_ordered());
+ CPPUNIT_ASSERT_EQUAL(500.0, result.get_estimate());
+}
+
+ void exact_mode_disjoint() {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 1000; i++) a.update(value++);
+
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ for (int i = 0; i < 1000; i++) b.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(1000.0, result.get_estimate());
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), b.compact());
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(1000.0, result.get_estimate());
+ }
+
+ void exact_mode_full_overlap() {
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 1000; i++) sketch.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(sketch, sketch);
+ CPPUNIT_ASSERT(result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(0.0, result.get_estimate());
+
+ // ordered inputs
+ result = a_not_b.compute(sketch.compact(), sketch.compact());
+ CPPUNIT_ASSERT(result.is_empty());
+ CPPUNIT_ASSERT(!result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(0.0, result.get_estimate());
+ }
+
+ void estimation_mode_half_overlap() {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 10000; i++) a.update(value++);
+
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ value = 5000;
+ for (int i = 0; i < 10000; i++) b.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(5000, result.get_estimate(), 5000 * 0.02);
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), b.compact());
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(5000, result.get_estimate(), 5000 * 0.02);
+ }
+
+ void estimation_mode_disjoint() {
+ update_theta_sketch a = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 10000; i++) a.update(value++);
+
+ update_theta_sketch b = update_theta_sketch::builder().build();
+ for (int i = 0; i < 10000; i++) b.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(a, b);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(10000, result.get_estimate(), 10000 * 0.02);
+
+ // ordered inputs
+ result = a_not_b.compute(a.compact(), b.compact());
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_DOUBLES_EQUAL(10000, result.get_estimate(), 10000 * 0.02);
+ }
+
+ void estimation_mode_full_overlap() {
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
+ int value = 0;
+ for (int i = 0; i < 10000; i++) sketch.update(value++);
+
+ theta_a_not_b a_not_b;
+
+ // unordered inputs
+ compact_theta_sketch result = a_not_b.compute(sketch, sketch);
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(0.0, result.get_estimate());
+
+ // ordered inputs
+ result = a_not_b.compute(sketch.compact(), sketch.compact());
+ CPPUNIT_ASSERT(!result.is_empty());
+ CPPUNIT_ASSERT(result.is_estimation_mode());
+ CPPUNIT_ASSERT_EQUAL(0.0, result.get_estimate());
+ }
+
+ void seed_mismatch() {
+ update_theta_sketch sketch = update_theta_sketch::builder().build();
+ sketch.update(1); // non-empty should not be ignored
+ theta_a_not_b a_not_b(123);
+ CPPUNIT_ASSERT_THROW(a_not_b.compute(sketch, sketch),
std::invalid_argument);
+ }
+
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(theta_a_not_b_test);
+
+} /* namespace datasketches */
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]