This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch count_min_python
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit e904bd01d7f92e1d7e801b6569b76ed56a754448
Author: Jon Malkin <[email protected]>
AuthorDate: Wed Apr 5 18:04:54 2023 -0700

    Clean up count min a bit, define to_string method and python wrapper
---
 count/include/count_min.hpp      | 162 +++++++++++++++++++++------------------
 count/include/count_min_impl.hpp |  56 ++++++++++++--
 python/CMakeLists.txt            |   2 +
 python/src/count_wrapper.cpp     |  99 ++++++++++++++++++++++++
 python/src/datasketches.cpp      |   2 +
 5 files changed, 243 insertions(+), 78 deletions(-)

diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp
index 4e9a402..993a60e 100644
--- a/count/include/count_min.hpp
+++ b/count/include/count_min.hpp
@@ -21,29 +21,29 @@ public:
 
   using vector_bytes = std::vector<uint8_t>;
   /**
-  * Creates an instance of the sketch given parameters _num_hashes, 
_num_buckets and hash seed, `seed`.
-  * @param num_hashes : number of hash functions in the sketch. Equivalently 
the number of rows in the array
-  * @param num_buckets : number of buckets that hash functions map into. 
Equivalently the number of columns in the array
-  * @param seed for hash function
-  *
-  * The items inserted into the sketch can be arbitrary type, so long as they 
are hashable via murmurhash.
-  * Only update and estimate methods are added for uint64_t and string types.
-  */
+   * Creates an instance of the sketch given parameters _num_hashes, 
_num_buckets and hash seed, `seed`.
+   * @param num_hashes : number of hash functions in the sketch. Equivalently 
the number of rows in the array
+   * @param num_buckets : number of buckets that hash functions map into. 
Equivalently the number of columns in the array
+   * @param seed for hash function
+   *
+   * The items inserted into the sketch can be arbitrary type, so long as they 
are hashable via murmurhash.
+   * Only update and estimate methods are added for uint64_t and string types.
+   */
   count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = 
DEFAULT_SEED) ;
 
   /**
-  * @return configured _num_hashes of this sketch
-  */
+   * @return configured _num_hashes of this sketch
+   */
   uint8_t get_num_hashes() const;
 
   /**
-  * @return configured _num_buckets of this sketch
-  */
+   * @return configured _num_buckets of this sketch
+   */
   uint32_t get_num_buckets() const;
 
   /**
-  * @return configured seed of this sketch
-  */
+   * @return configured seed of this sketch
+   */
   uint64_t get_seed()  const;
 
   /**
@@ -54,20 +54,20 @@ public:
    double get_relative_error() const;
 
   /**
-  * @return _total_weight : typename W
-  * The total weight currently inserted into the stream.
-  */
+   * @return _total_weight : typename W
+   * The total weight currently inserted into the stream.
+   */
   W get_total_weight() const;
 
   /*
- * @param relative_error : double -- the desired accuracy within which 
estimates should lie.
- * For example, when relative_error = 0.05, then the returned frequency 
estimates satisfy the
- * `relative_error` guarantee that never overestimates the weights but may 
underestimate the weights
- * by 5% of the total weight in the sketch.
- * @return number_of_buckets : the number of hash buckets at every level of the
- * sketch required in order to obtain the specified relative error.
- * [1] - Section 3 ``Data Structure'', page 6.
- */
+   * @param relative_error : double -- the desired accuracy within which 
estimates should lie.
+   * For example, when relative_error = 0.05, then the returned frequency 
estimates satisfy the
+   * `relative_error` guarantee that never overestimates the weights but may 
underestimate the weights
+   * by 5% of the total weight in the sketch.
+   * @return number_of_buckets : the number of hash buckets at every level of 
the
+   * sketch required in order to obtain the specified relative error.
+   * [1] - Section 3 ``Data Structure'', page 6.
+   */
   static uint32_t suggest_num_buckets(double relative_error) ;
 
   /*
@@ -77,7 +77,7 @@ public:
    * order to achieve the specified confidence of the sketch.
    * confidence = 1 - delta, with delta denoting the sketch failure 
probability in the literature.
    * [1] - Section 3 ``Data Structure'', page 6.
-  */
+   */
   static uint8_t suggest_num_hashes(double confidence) ;
 
   /**
@@ -88,7 +88,15 @@ public:
    */
   W get_estimate(uint64_t item) const ;
 
-   /**
+  /**
+   * Specific get_estimate function for int64_t type
+   * see generic get_estimate function
+   * @param item : uint64_t type.
+   * @return an estimate of the item's frequency.
+   */
+  W get_estimate(int64_t item) const ;
+
+  /**
    * Specific get_estimate function for std::string type
    * see generic get_estimate function
    * @param item : std::string type
@@ -97,51 +105,53 @@ public:
   W get_estimate(const std::string& item) const;
 
   /**
-  * This is the generic estimate query function for any of the given datatypes.
-  * Query the sketch for the estimate of a given item.
-  * @param item : pointer to the data item to be query from the sketch.
-  * @param size : size_t
-  * @return the estimated frequency of the item denoted f_est satisfying
-  * f_true - relative_error*_total_weight <= f_est <= f_true
-  */
-   W get_estimate(const void* item, size_t size) const ;
+   * This is the generic estimate query function for any of the given 
datatypes.
+   * Query the sketch for the estimate of a given item.
+   * @param item : pointer to the data item to be query from the sketch.
+   * @param size : size_t
+   * @return the estimated frequency of the item denoted f_est satisfying
+   * f_true - relative_error*_total_weight <= f_est <= f_true
+   */
+  W get_estimate(const void* item, size_t size) const ;
 
   /**
-  * Query the sketch for the upper bound of a given item.
-  * @param item : uint64_t or std::string to query
-  * @return the upper bound on the true frequency of the item
-  * f_true <= f_est + relative_error*_total_weight
-  */
-   W get_upper_bound(const void* item, size_t size) const;
-   W get_upper_bound(uint64_t) const ;
-   W get_upper_bound(const std::string& item) const;
+   * Query the sketch for the upper bound of a given item.
+   * @param item : uint64_t or std::string to query
+   * @return the upper bound on the true frequency of the item
+   * f_true <= f_est + relative_error*_total_weight
+   */
+  W get_upper_bound(const void* item, size_t size) const;
+  W get_upper_bound(int64_t) const ;
+  W get_upper_bound(uint64_t) const ;
+  W get_upper_bound(const std::string& item) const;
 
   /**
-  * Query the sketch for the lower bound of a given item.
-  * @param item : uint64_t or std::string to query
-  * @return the lower bound for the query result, f_est, on the true 
frequency, f_est of the item
-  * f_true - relative_error*_total_weight <= f_est
-  */
+   * Query the sketch for the lower bound of a given item.
+   * @param item : uint64_t or std::string to query
+   * @return the lower bound for the query result, f_est, on the true 
frequency, f_est of the item
+   * f_true - relative_error*_total_weight <= f_est
+   */
   W get_lower_bound(const void* item, size_t size) const ;
+  W get_lower_bound(int64_t) const ;
   W get_lower_bound(uint64_t) const ;
   W get_lower_bound(const std::string& item) const ;
 
   /*
-  * Update this sketch with given data of any type.
-  * This is a "universal" update that covers all cases above,
-  * but may produce different hashes.
-  * @param item pointer to the data item to be inserted into the sketch.
-  * @param size of the data in bytes
-  * @return vector of uint64_t which each represent the index to which `value' 
must update in the sketch
-  */
+   * Update this sketch with given data of any type.
+   * This is a "universal" update that covers all cases above,
+   * but may produce different hashes.
+   * @param item pointer to the data item to be inserted into the sketch.
+   * @param size of the data in bytes
+   * @return vector of uint64_t which each represent the index to which 
`value' must update in the sketch
+   */
   void update(const void* item, size_t size, W weight) ;
 
   /**
-  * Update this sketch with a given uint64_t item.
-  * @param item : uint64_t to update the sketch with
-  * @param weight : arithmetic type
-  *  void function which inserts an item of type uint64_t into the sketch
-  */
+   * Update this sketch with a given uint64_t item.
+   * @param item : uint64_t to update the sketch with
+   * @param weight : arithmetic type
+   *  void function which inserts an item of type uint64_t into the sketch
+   */
   void update(uint64_t item, W weight) ;
   void update(uint64_t item) ;
   void update(int64_t item, W weight) ;
@@ -157,8 +167,8 @@ public:
   void update(const std::string& item) ;
 
   /*
-  * merges a separate count_min_sketch into this count_min_sketch.
-  */
+   * merges a separate count_min_sketch into this count_min_sketch.
+   */
   void merge(const count_min_sketch<W> &other_sketch) ;
 
   /**
@@ -169,6 +179,12 @@ public:
    */
   bool is_empty() const ;
 
+  /**
+   * @brief Returns a string describing the sketch
+   * @return A string with a human-readable description of the sketch
+   */
+  string<std::allocator<char>> to_string() const;
+
   // Iterators
   using const_iterator = typename std::vector<W>::const_iterator ;
   const_iterator begin() const;
@@ -234,21 +250,21 @@ public:
   vector_bytes serialize(unsigned header_size_bytes = 0) const;
 
   /**
- * This method deserializes a sketch from a given stream.
- * @param is input stream
- * @param seed the seed for the hash function that was used to create the 
sketch
- * @return an instance of a sketch
- */
+  * This method deserializes a sketch from a given stream.
+  * @param is input stream
+  * @param seed the seed for the hash function that was used to create the 
sketch
+  * @return an instance of a sketch
+  */
   //static count_min_sketch deserialize(std::istream& is, uint64_t 
seed=DEFAULT_SEED) const;
   static count_min_sketch deserialize(std::istream& is, uint64_t seed) ;
 
   /**
- * This method deserializes a sketch from a given array of bytes.
- * @param bytes pointer to the array of bytes
- * @param size the size of the array
- * @param seed the seed for the hash function that was used to create the 
sketch
- * @return an instance of the sketch
- */
+  * This method deserializes a sketch from a given array of bytes.
+  * @param bytes pointer to the array of bytes
+  * @param size the size of the array
+  * @param seed the seed for the hash function that was used to create the 
sketch
+  * @return an instance of the sketch
+  */
   static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t 
seed=DEFAULT_SEED);
 
 private:
diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp
index 2641446..d7ad1db 100644
--- a/count/include/count_min_impl.hpp
+++ b/count/include/count_min_impl.hpp
@@ -5,8 +5,9 @@
 #include "count_min.hpp"
 #include "memory_operations.hpp"
 
+#include <iomanip>
 #include <random>
-
+#include <sstream>
 
 namespace datasketches {
 
@@ -122,6 +123,9 @@ std::vector<uint64_t> count_min_sketch<W>::get_hashes(const 
void* item, size_t s
 template<typename W>
 W count_min_sketch<W>::get_estimate(uint64_t item) const {return 
get_estimate(&item, sizeof(item));}
 
+template<typename W>
+W count_min_sketch<W>::get_estimate(int64_t item) const {return 
get_estimate(&item, sizeof(item));}
+
 template<typename W>
 W count_min_sketch<W>::get_estimate(const std::string& item) const {
   if (item.empty()) return 0 ; // Empty strings are not inserted into the 
sketch.
@@ -152,6 +156,16 @@ void count_min_sketch<W>::update(uint64_t item) {
   update(&item, sizeof(item), 1);
 }
 
+template<typename W>
+void count_min_sketch<W>::update(int64_t item, W weight) {
+  update(&item, sizeof(item), weight);
+}
+
+template<typename W>
+void count_min_sketch<W>::update(int64_t item) {
+  update(&item, sizeof(item), 1);
+}
+
 template<typename W>
 void count_min_sketch<W>::update(const std::string& item, W weight) {
   if (item.empty()) return;
@@ -181,6 +195,9 @@ void count_min_sketch<W>::update(const void* item, size_t 
size, W weight) {
 template<typename W>
 W count_min_sketch<W>::get_upper_bound(uint64_t item) const {return 
get_upper_bound(&item, sizeof(item));}
 
+template<typename W>
+W count_min_sketch<W>::get_upper_bound(int64_t item) const {return 
get_upper_bound(&item, sizeof(item));}
+
 template<typename W>
 W count_min_sketch<W>::get_upper_bound(const std::string& item) const {
   if (item.empty()) return 0 ; // Empty strings are not inserted into the 
sketch.
@@ -192,10 +209,12 @@ W count_min_sketch<W>::get_upper_bound(const void* item, 
size_t size) const {
   return get_estimate(item, size) + get_relative_error()*get_total_weight() ;
 }
 
-
 template<typename W>
 W count_min_sketch<W>::get_lower_bound(uint64_t item) const {return 
get_lower_bound(&item, sizeof(item));}
 
+template<typename W>
+W count_min_sketch<W>::get_lower_bound(int64_t item) const {return 
get_lower_bound(&item, sizeof(item));}
+
 template<typename W>
 W count_min_sketch<W>::get_lower_bound(const std::string& item) const {
   if (item.empty()) return 0 ; // Empty strings are not inserted into the 
sketch.
@@ -302,6 +321,8 @@ count_min_sketch<W> 
count_min_sketch<W>::deserialize(std::istream& is, uint64_t
   const auto flags_byte = read<uint8_t>(is) ;
   read<uint32_t>(is) ; // 4 unused bytes
 
+  check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
+
   // Sketch parameters
   const auto nbuckets = read<uint32_t>(is) ;
   const auto nhashes = read<uint8_t>(is);
@@ -391,7 +412,9 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(const 
void* bytes, size_t s
   uint8_t flags_byte ;
   ptr += copy_from_mem(ptr, flags_byte) ;
   ptr += sizeof(uint32_t);
-  
+
+  check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
+
   // Second 8 bytes are the sketch parameters with a final, unused byte.
   uint32_t nbuckets ;
   uint8_t nhashes ;
@@ -406,7 +429,6 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(const 
void* bytes, size_t s
                                 + std::to_string(compute_seed_hash(seed)));
   }
   count_min_sketch<W> c(nhashes, nbuckets, seed) ;
-  check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
   const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
   if (is_empty) return c ; // sketch is empty, no need to read further.
 
@@ -416,7 +438,7 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(const 
void* bytes, size_t s
   c._total_weight += weight ;
 
   // All remaining bytes are the sketch table entries.
-  for (auto i = 0; i<c._num_buckets*c._num_hashes ; ++i){
+  for (size_t i = 0; i<c._num_buckets*c._num_hashes ; ++i){
     ptr += copy_from_mem(ptr, c._sketch_array[i]) ;
   }
   return c;
@@ -427,6 +449,30 @@ bool count_min_sketch<W>::is_empty() const {
   return _total_weight == 0;
 }
 
+template<typename W>
+string<std::allocator<char>> count_min_sketch<W>::to_string() const {
+  // count the number of used entries in the sketch
+  uint64_t num_nonzero = 0;
+  for (auto entry : _sketch_array) {
+    if (entry != static_cast<W>(0.0))
+      ++num_nonzero;
+  }
+
+  // Using a temporary stream for implementation here does not comply with 
AllocatorAwareContainer requirements.
+  // The stream does not support passing an allocator instance, and 
alternatives are complicated.
+  std::ostringstream os;
+  os << "### KLL sketch summary:" << std::endl;
+  os << "   num hashes     : " << static_cast<uint32_t>(_num_hashes) << 
std::endl;
+  os << "   num buckets    : " << _num_buckets << std::endl;
+  os << "   capacity bins  : " << _sketch_array.size() << std::endl;
+  os << "   filled bins    : " << num_nonzero << std::endl;
+  os << "   pct filled     : " << std::setprecision(3) << (num_nonzero * 
100.0) / _sketch_array.size() << "%" << std::endl;
+  os << "### End sketch summary" << std::endl;
+
+  //return string<A>(os.str().c_str(), allocator_);
+  return string<std::allocator<char>>(os.str().c_str());
+}
+
 template<typename W>
 void count_min_sketch<W>::check_header_validity(uint8_t preamble_longs, 
uint8_t serial_version,  uint8_t family_id, uint8_t flags_byte) {
   const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index bc85092..586dd4e 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -45,6 +45,7 @@ target_link_libraries(python
     sampling
     req
     quantiles
+    count
     pybind11::module
 )
 
@@ -76,6 +77,7 @@ target_sources(python
     src/req_wrapper.cpp
     src/quantiles_wrapper.cpp
     src/ks_wrapper.cpp
+    src/count_wrapper.cpp
     src/vector_of_kll.cpp
     src/py_serde.cpp
 )
diff --git a/python/src/count_wrapper.cpp b/python/src/count_wrapper.cpp
new file mode 100644
index 0000000..08880e1
--- /dev/null
+++ b/python/src/count_wrapper.cpp
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <pybind11/pybind11.h>
+
+#include "count_min.hpp"
+#include "common_defs.hpp"
+
+namespace py = pybind11;
+
+template<typename W>
+void bind_count_min_sketch(py::module &m, const char* name) {
+  using namespace datasketches;
+
+  py::class_<count_min_sketch<W>>(m, name)
+    .def(py::init<uint8_t, uint32_t, uint64_t>(), py::arg("num_hashes"), 
py::arg("num_buckets"), py::arg("seed")=DEFAULT_SEED)
+    .def(py::init<const count_min_sketch<W>&>())
+    .def_static("suggest_num_buckets", 
&count_min_sketch<W>::suggest_num_buckets, py::arg("relative_error"),
+                "Suggests the number of buckets needed to achieve an accuracy 
within the provided "
+                "relative_error. For example, when relative_error = 0.05, the 
returned frequency estimates "
+                "satisfy the 'relative_error' guarantee that never 
overestimates the weights by may "
+                "underestimate hte weights by 5% of the total weight in the 
sketch. "
+                "Returns the number of hash buckets at every level of the 
sketch required in order to obtain "
+                "the specified relative error.")
+    .def_static("suggest_num_hashes", 
&count_min_sketch<W>::suggest_num_hashes, py::arg("confidence"),
+                "Suggests the number of hashes needed to achieve the provided 
confidence. For example, "
+                "with 95% confidence, frequency estimates satisfy the 
'relative_error' guarantee. "
+                "Returns the number of hash functions that are required in 
order to achieve the specified "
+                "confidence of the sketch. confidence = 1 - delta, with delta 
denoting the sketch failure probability.")
+    .def("__str__", &count_min_sketch<W>::to_string,
+         "Produces a string summary of the sketch")
+    .def("to_string", &count_min_sketch<W>::to_string,
+         "Produces a string summary of the sketch")
+    .def("is_empty", &count_min_sketch<W>::is_empty,
+         "Returns True if the sketch has seen no items, otherwise False")
+    .def("get_num_hashes", &count_min_sketch<W>::get_num_hashes,
+         "Returns the configured number of hashes for the sketch")
+    .def("get_num_buckets", &count_min_sketch<W>::get_num_buckets,
+         "Returns the configured number of buckets for the sketch")
+    .def("get_seed", &count_min_sketch<W>::get_seed,
+         "Returns the base hash seed for the sketch")
+    .def("get_relative_error", &count_min_sketch<W>::get_relative_error,
+         "Returns the maximum permissible error for any frequency estimate 
query")
+    .def("get_total_weight", &count_min_sketch<W>::get_total_weight,
+         "Returns the total weight currently inserted into the stream")
+    .def("update", static_cast<void 
(count_min_sketch<W>::*)(int64_t)>(&count_min_sketch<W>::update), 
py::arg("item"),
+         "Updates the sketch with the given 64-bit integer value")
+    .def("update", static_cast<void (count_min_sketch<W>::*)(const 
std::string&)>(&count_min_sketch<W>::update), py::arg("item"),
+         "Updates the sketch with the given string")
+    .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(int64_t) 
const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
+         "Returns an estimate of the frequency of the provided 64-bit integer 
value")
+    .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(const 
std::string&) const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
+         "Returns an estimate of the frequency of the provided string")
+    .def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) 
const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"),
+         "Returns an upper bound on the estimate for the given 64-bit integer 
value")
+    .def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(const 
std::string&) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"),
+         "Returns an upper bound on the estimate for the provided string")
+    .def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) 
const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"),
+         "Returns an lower bound on the estimate for the given 64-bit integer 
value")
+    .def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(const 
std::string&) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"),
+         "Returns an lower bound on the estimate for the provided string")
+    .def("merge", &count_min_sketch<W>::merge, py::arg("other"),
+         "Merges the provided other sketch into this one")
+    .def(
+        "serialize",
+        [](const count_min_sketch<W>& sk) {
+          auto bytes = sk.serialize();
+          return py::bytes(reinterpret_cast<const char*>(bytes.data()), 
bytes.size());
+        },
+        "Serializes the sketch into a bytes object"
+    )
+    .def_static(
+        "deserialize",
+        [](const std::string& bytes) { return 
count_min_sketch<W>::deserialize(bytes.data(), bytes.size()); },
+        py::arg("bytes"),
+        "Reads a bytes object and returns the corresponding count_min_sketch"
+    );
+}
+
+void init_count_min(py::module &m) {
+  bind_count_min_sketch<double>(m, "count_min_sketch");
+}
+
diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp
index b34eea7..9f93896 100644
--- a/python/src/datasketches.cpp
+++ b/python/src/datasketches.cpp
@@ -30,6 +30,7 @@ void init_theta(py::module& m);
 void init_vo(py::module& m);
 void init_req(py::module& m);
 void init_quantiles(py::module& m);
+void init_count_min(py::module& m);
 void init_vector_of_kll(py::module& m);
 
 // supporting objects
@@ -45,6 +46,7 @@ PYBIND11_MODULE(_datasketches, m) {
   init_vo(m);
   init_req(m);
   init_quantiles(m);
+  init_count_min(m);
   init_vector_of_kll(m);
 
   init_kolmogorov_smirnov(m);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to