This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch filter
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit af8e2b9eb70e88983b430d78ea4209691dd77731
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Fri May 10 15:28:35 2024 -0700

    implemented filter
---
 tuple/include/tuple_sketch.hpp      | 29 +++++++++++++++++-
 tuple/include/tuple_sketch_impl.hpp | 33 ++++++++++++++++++++
 tuple/test/tuple_sketch_test.cpp    | 61 +++++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 383e573..cbfd9f1 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -381,6 +381,15 @@ public:
    */
   compact_tuple_sketch<Summary, Allocator> compact(bool ordered = true) const;
 
+  /**
+   * Produces a Compact Tuple sketch from this sketch
+   * by applying a given predicate to each entry.
+   * @param predicate should return true for the entries to keep
+   * @return compact sketch with the entries retained according to the 
predicate
+   */
+  template<typename Predicate>
+  compact_tuple_sketch<Summary, Allocator> filter(const Predicate& predicate) 
const;
+
   virtual iterator begin();
   virtual iterator end();
   virtual const_iterator begin() const;
@@ -480,6 +489,25 @@ public:
   virtual uint32_t get_num_retained() const;
   virtual uint16_t get_seed_hash() const;
 
+  /**
+   * Produces a Compact Tuple sketch from this sketch
+   * by applying a given predicate to each entry.
+   * @param predicate should return true for the entries to keep
+   * @return compact sketch with the entries retained according to the 
predicate
+   */
+  template<typename Predicate>
+  compact_tuple_sketch filter(const Predicate& predicate) const;
+
+  /**
+   * Produces a Compact Tuple sketch from a given sketch (Update or Compact)
+   * by applying a given predicate to each entry.
+   * @param sketch input sketch
+   * @param predicate should return true for the entries to keep
+   * @return compact sketch with the entries retained according to the 
predicate
+   */
+  template<typename Sketch, typename Predicate>
+  static compact_tuple_sketch filter(const Sketch& sketch, const Predicate& 
predicate);
+
   /**
    * This method serializes the sketch into a given stream in a binary form
    * @param os output stream
@@ -579,7 +607,6 @@ protected:
   template<typename E, typename EK, typename P, typename S, typename CS, 
typename A> friend class theta_intersection_base;
   template<typename E, typename EK, typename CS, typename A> friend class 
theta_set_difference_base;
   compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, 
uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
-
 };
 
 /// Tuple base builder
diff --git a/tuple/include/tuple_sketch_impl.hpp 
b/tuple/include/tuple_sketch_impl.hpp
index 0766e4d..e5bf834 100644
--- a/tuple/include/tuple_sketch_impl.hpp
+++ b/tuple/include/tuple_sketch_impl.hpp
@@ -258,6 +258,12 @@ compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, 
A>::compact(bool ordered
   return compact_tuple_sketch<S, A>(*this, ordered);
 }
 
+template<typename S, typename U, typename P, typename A>
+template<typename Predicate>
+compact_tuple_sketch<S, A> update_tuple_sketch<S, U, P, A>::filter(const 
Predicate& predicate) const {
+  return compact_tuple_sketch<S, A>::filter(*this, predicate);
+}
+
 template<typename S, typename U, typename P, typename A>
 void update_tuple_sketch<S, U, P, A>::print_specifics(std::ostringstream& os) 
const {
   os << "   lg nominal size      : " << (int) map_.lg_nom_size_ << std::endl;
@@ -344,6 +350,33 @@ uint16_t compact_tuple_sketch<S, A>::get_seed_hash() const 
{
   return seed_hash_;
 }
 
+template<typename S, typename A>
+template<typename Predicate>
+compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Predicate& 
predicate) const {
+  return filter(*this, predicate);
+}
+
+template<typename S, typename A>
+template<typename Sketch, typename Predicate>
+compact_tuple_sketch<S, A> compact_tuple_sketch<S, A>::filter(const Sketch& 
sketch, const Predicate& predicate) {
+  std::vector<Entry, AllocEntry> entries(sketch.get_allocator());
+  entries.reserve(sketch.get_num_retained());
+  std::copy_if(
+    sketch.begin(),
+    sketch.end(),
+    std::back_inserter(entries),
+    [&predicate](const Entry& e) {return predicate(e.second);}
+  );
+  entries.shrink_to_fit();
+  return compact_tuple_sketch(
+    !sketch.is_estimation_mode() && entries.empty(),
+    sketch.is_ordered(),
+    sketch.get_seed_hash(),
+    sketch.get_theta64(),
+    std::move(entries)
+  );
+}
+
 // implementation for fixed-size arithmetic types (integral and floating point)
 template<typename S, typename A>
 template<typename SD, typename SS, typename 
std::enable_if<std::is_arithmetic<SS>::value, int>::type>
diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp
index b5316e8..6e44e28 100644
--- a/tuple/test/tuple_sketch_test.cpp
+++ b/tuple/test/tuple_sketch_test.cpp
@@ -310,4 +310,65 @@ TEST_CASE("tuple sketch: float, update with different 
types of keys", "[tuple_sk
   REQUIRE(sketch.get_num_retained() == 3);
 }
 
+TEST_CASE("filter", "[tuple_sketch]") {
+  auto usk = update_tuple_sketch<int>::builder().build();
+
+  { // empty update sketch
+    auto sk = usk.filter([](int){return true;});
+    REQUIRE(sk.is_empty());
+    REQUIRE(sk.is_ordered());
+    REQUIRE(sk.get_num_retained() == 0);
+  }
+
+  { // empty compact sketch
+    auto sk = usk.compact().filter([](int){return true;});
+    REQUIRE(sk.is_empty());
+    REQUIRE(sk.is_ordered());
+    REQUIRE(sk.get_num_retained() == 0);
+  }
+
+  usk.update(1, 1);
+  usk.update(1, 1);
+  usk.update(2, 1);
+  usk.update(2, 1);
+  usk.update(3, 1);
+
+  { // exact mode update sketch
+    auto sk = usk.filter([](int v){return v > 1;});
+    REQUIRE_FALSE(sk.is_empty());
+    REQUIRE_FALSE(sk.is_ordered());
+    REQUIRE_FALSE(sk.is_estimation_mode());
+    REQUIRE(sk.get_num_retained() == 2);
+  }
+
+  { // exact mode compact sketch
+    auto sk = usk.compact().filter([](int v){return v > 1;});
+    REQUIRE_FALSE(sk.is_empty());
+    REQUIRE(sk.is_ordered());
+    REQUIRE_FALSE(sk.is_estimation_mode());
+    REQUIRE(sk.get_num_retained() == 2);
+  }
+
+  // only keys 1 and 2 had values of 2, which will become 3 after this update
+  // some entries are discarded in estimation mode, but these happen to survive
+  // the process is deterministic, so the test will always work
+  for (int i = 0; i < 10000; ++i) usk.update(i, 1);
+
+  { // estimation mode update sketch
+    auto sk = usk.filter([](int v){return v > 2;});
+    REQUIRE_FALSE(sk.is_empty());
+    REQUIRE_FALSE(sk.is_ordered());
+    REQUIRE(sk.is_estimation_mode());
+    REQUIRE(sk.get_num_retained() == 2);
+  }
+
+  { // estimation mode compact sketch
+    auto sk = usk.compact().filter([](int v){return v > 2;});
+    REQUIRE_FALSE(sk.is_empty());
+    REQUIRE(sk.is_ordered());
+    REQUIRE(sk.is_estimation_mode());
+    REQUIRE(sk.get_num_retained() == 2);
+  }
+}
+
 } /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to