This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch patch_for_rc4 in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git
commit baef5b8947489d6f02d4499a1c8a617f2d69e16b Author: AlexanderSaydakov <[email protected]> AuthorDate: Fri May 29 16:26:06 2020 -0700 fixed ordered a-not-b --- theta/include/theta_a_not_b_impl.hpp | 65 ++++++++++++++---------------------- theta/test/theta_a_not_b_test.cpp | 24 +++++++++++++ 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/theta/include/theta_a_not_b_impl.hpp b/theta/include/theta_a_not_b_impl.hpp index f080903..8951a8f 100644 --- a/theta/include/theta_a_not_b_impl.hpp +++ b/theta/include/theta_a_not_b_impl.hpp @@ -22,6 +22,8 @@ #include <algorithm> +#include "conditional_back_inserter.hpp" + namespace datasketches { /* @@ -43,54 +45,37 @@ compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch const uint64_t theta = std::min(a.get_theta64(), b.get_theta64()); vector_u64<A> keys; - uint32_t keys_size = 0; - uint32_t count = 0; bool is_empty = a.is_empty(); + auto less_than_theta = [theta](uint64_t key) { return key < theta; }; if (b.get_num_retained() == 0) { - for (auto key: a) if (key < theta) ++count; - keys.resize(count); - std::copy_if(a.begin(), a.end(), &keys[0], [theta](uint64_t key) { return key < theta; }); - if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end()); - if (count == 0 && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true; - return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered); - } - - keys_size = a.get_num_retained(); - keys.resize(keys_size); - - if (a.is_ordered() && b.is_ordered()) { // sort-based - const auto end = std::set_difference(a.begin(), a.end(), b.begin(), b.end(), keys.begin()); - count = end - keys.begin(); - } else { // hash-based - const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD); - vector_u64<A> b_hash_table(1 << lg_size, 0); - for (auto key: b) { - if (key < theta) { - update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size); - } else if (b.is_ordered()) { - break; // early stop + std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than_theta); + } else { + if (a.is_ordered() && b.is_ordered()) { // sort-based + std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than_theta)); + } else { // hash-based + const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD); + vector_u64<A> b_hash_table(1 << lg_size, 0); + for (auto key: b) { + if (key < theta) { + update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size); + } else if (b.is_ordered()) { + break; // early stop + } } - } - // scan A lookup B - for (auto key: a) { - if (key < theta) { - if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys[count++] = key; - } else if (a.is_ordered()) { - break; // early stop + // scan A lookup B + for (auto key: a) { + if (key < theta) { + if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key); + } else if (a.is_ordered()) { + break; // early stop + } } } } - - if (count == 0) { - keys.resize(0); - if (theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true; - } else if (count < keys_size) { - keys.resize(count); - if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end()); - } - + if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true; + if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end()); return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered); } diff --git a/theta/test/theta_a_not_b_test.cpp b/theta/test/theta_a_not_b_test.cpp index 54bd3fa..1ef5255 100644 --- a/theta/test/theta_a_not_b_test.cpp +++ b/theta/test/theta_a_not_b_test.cpp @@ -217,4 +217,28 @@ TEST_CASE("theta a-not-b: seed mismatch", "[theta_a_not_b]") { REQUIRE_THROWS_AS(a_not_b.compute(sketch, sketch), std::invalid_argument); } +TEST_CASE("theta a-not-b: issue #152", "[theta_a_not_b]") { + update_theta_sketch a = update_theta_sketch::builder().build(); + int value = 0; + for (int i = 0; i < 10000; i++) a.update(value++); + + update_theta_sketch b = update_theta_sketch::builder().build(); + value = 5000; + for (int i = 0; i < 25000; i++) b.update(value++); + + theta_a_not_b a_not_b; + + // unordered inputs + compact_theta_sketch result = a_not_b.compute(a, b); + REQUIRE_FALSE(result.is_empty()); + REQUIRE(result.is_estimation_mode()); + REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03)); + + // ordered inputs + result = a_not_b.compute(a.compact(), b.compact()); + REQUIRE_FALSE(result.is_empty()); + REQUIRE(result.is_estimation_mode()); + REQUIRE(result.get_estimate() == Approx(5000).margin(5000 * 0.03)); +} + } /* namespace datasketches */ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
