This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch filter in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit af8e2b9eb70e88983b430d78ea4209691dd77731 Author: AlexanderSaydakov <[email protected]> AuthorDate: Fri May 10 15:28:35 2024 -0700 implemented filter --- tuple/include/tuple_sketch.hpp | 29 +++++++++++++++++- tuple/include/tuple_sketch_impl.hpp | 33 ++++++++++++++++++++ tuple/test/tuple_sketch_test.cpp | 61 +++++++++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp index 383e573..cbfd9f1 100644 --- a/tuple/include/tuple_sketch.hpp +++ b/tuple/include/tuple_sketch.hpp @@ -381,6 +381,15 @@ public: */ compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const; + /** + * Produces a Compact Tuple sketch from this sketch + * by applying a given predicate to each entry. + * @param predicate should return true for the entries to keep + * @return compact sketch with the entries retained according to the predicate + */ + template<typename Predicate> + compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) const; + virtual iterator begin(); virtual iterator end(); virtual const_iterator begin() const; @@ -480,6 +489,25 @@ public: virtual uint32_t get_num_retained() const; virtual uint16_t get_seed_hash() const; + /** + * Produces a Compact Tuple sketch from this sketch + * by applying a given predicate to each entry. + * @param predicate should return true for the entries to keep + * @return compact sketch with the entries retained according to the predicate + */ + template<typename Predicate> + compact_tuple_sketch filter(const Predicate& predicate) const; + + /** + * Produces a Compact Tuple sketch from a given sketch (Update or Compact) + * by applying a given predicate to each entry. + * @param sketch input sketch + * @param predicate should return true for the entries to keep + * @return compact sketch with the entries retained according to the predicate + */ + template<typename Sketch, typename Predicate> + static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& predicate); + /** * This method serializes the sketch into a given stream in a binary form * @param os output stream @@ -579,7 +607,6 @@ protected: template<typename E, typename EK, typename P, typename S, typename CS, typename A> friend class theta_intersection_base; template<typename E, typename EK, typename CS, typename A> friend class theta_set_difference_base; compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries); - }; /// Tuple base builder diff --git a/tuple/include/tuple_sketch_impl.hpp b/tuple/include/tuple_sketch_impl.hpp index 0766e4d..e5bf834 100644 --- a/tuple/include/tuple_sketch_impl.hpp +++ b/tuple/include/tuple_sketch_impl.hpp @@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::compact(bool ordered return compact_tuple_sketch<S, A>(*this, ordered); } +template<typename S, typename U, typename P, typename A> +template<typename Predicate> +compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const Predicate& predicate) const { + return compact_tuple_sketch<S, A>::filter(*this, predicate); +} + template<typename S, typename U, typename P, typename A> void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) const { os << " lg nominal size : " << (int) map_.lg_nom_size_ << std::endl; @@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const { return seed_hash_; } +template<typename S, typename A> +template<typename Predicate> +compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& predicate) const { + return filter(*this, predicate); +} + +template<typename S, typename A> +template<typename Sketch, typename Predicate> +compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& sketch, const Predicate& predicate) { + std::vector<Entry, AllocEntry> entries(sketch.get_allocator()); + entries.reserve(sketch.get_num_retained()); + std::copy_if( + sketch.begin(), + sketch.end(), + std::back_inserter(entries), + [&predicate](const Entry& e) {return predicate(e.second);} + ); + entries.shrink_to_fit(); + return compact_tuple_sketch( + !sketch.is_estimation_mode() && entries.empty(), + sketch.is_ordered(), + sketch.get_seed_hash(), + sketch.get_theta64(), + std::move(entries) + ); +} + // implementation for fixed-size arithmetic types (integral and floating point) template<typename S, typename A> template<typename SD, typename SS, typename std::enable_if<std::is_arithmetic<SS>::value, int>::type> diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp index b5316e8..6e44e28 100644 --- a/tuple/test/tuple_sketch_test.cpp +++ b/tuple/test/tuple_sketch_test.cpp @@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different types of keys", "[tuple_sk REQUIRE(sketch.get_num_retained() == 3); } +TEST_CASE("filter", "[tuple_sketch]") { + auto usk = update_tuple_sketch<int>::builder().build(); + + { // empty update sketch + auto sk = usk.filter([](int){return true;}); + REQUIRE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.get_num_retained() == 0); + } + + { // empty compact sketch + auto sk = usk.compact().filter([](int){return true;}); + REQUIRE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.get_num_retained() == 0); + } + + usk.update(1, 1); + usk.update(1, 1); + usk.update(2, 1); + usk.update(2, 1); + usk.update(3, 1); + + { // exact mode update sketch + auto sk = usk.filter([](int v){return v > 1;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE_FALSE(sk.is_ordered()); + REQUIRE_FALSE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + { // exact mode compact sketch + auto sk = usk.compact().filter([](int v){return v > 1;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE_FALSE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + // only keys 1 and 2 had values of 2, which will become 3 after this update + // some entries are discarded in estimation mode, but these happen to survive + // the process is deterministic, so the test will always work + for (int i = 0; i < 10000; ++i) usk.update(i, 1); + + { // estimation mode update sketch + auto sk = usk.filter([](int v){return v > 2;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE_FALSE(sk.is_ordered()); + REQUIRE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } + + { // estimation mode compact sketch + auto sk = usk.compact().filter([](int v){return v > 2;}); + REQUIRE_FALSE(sk.is_empty()); + REQUIRE(sk.is_ordered()); + REQUIRE(sk.is_estimation_mode()); + REQUIRE(sk.get_num_retained() == 2); + } +} + } /* namespace datasketches */ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
