[datasketches-cpp] 02/04: add basic count min python tests

jmalkin Thu, 06 Apr 2023 00:29:44 -0700

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch count_min_python
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


commit 68bb1e0fceb0ef962aacdbea6e46d85d7749c3a5
Author: Jon Malkin <[email protected]>
AuthorDate: Wed Apr 5 19:14:58 2023 -0700

    add basic count min python tests
---
 count/include/count_min.hpp      |  2 +-
 count/include/count_min_impl.hpp |  6 +--
 python/src/count_wrapper.cpp     |  4 +-
 python/tests/count_min_test.py   | 85 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp
index 993a60e..c4bd752 100644
--- a/count/include/count_min.hpp
+++ b/count/include/count_min.hpp
@@ -183,7 +183,7 @@ public:
    * @brief Returns a string describing the sketch
    * @return A string with a human-readable description of the sketch
    */
-  string<std::allocator<char>> to_string() const;
+  string<std::allocator<W>> to_string() const;
 
   // Iterators
   using const_iterator = typename std::vector<W>::const_iterator ;
diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp
index d7ad1db..8a1551f 100644
--- a/count/include/count_min_impl.hpp
+++ b/count/include/count_min_impl.hpp
@@ -450,7 +450,7 @@ bool count_min_sketch<W>::is_empty() const {
 }
 
 template<typename W>
-string<std::allocator<char>> count_min_sketch<W>::to_string() const {
+string<std::allocator<W>> count_min_sketch<W>::to_string() const {
   // count the number of used entries in the sketch
   uint64_t num_nonzero = 0;
   for (auto entry : _sketch_array) {
@@ -461,7 +461,7 @@ string<std::allocator<char>> 
count_min_sketch<W>::to_string() const {
   // Using a temporary stream for implementation here does not comply with 
AllocatorAwareContainer requirements.
   // The stream does not support passing an allocator instance, and 
alternatives are complicated.
   std::ostringstream os;
-  os << "### KLL sketch summary:" << std::endl;
+  os << "### Count Min sketch summary:" << std::endl;
   os << "   num hashes     : " << static_cast<uint32_t>(_num_hashes) << 
std::endl;
   os << "   num buckets    : " << _num_buckets << std::endl;
   os << "   capacity bins  : " << _sketch_array.size() << std::endl;
@@ -470,7 +470,7 @@ string<std::allocator<char>> 
count_min_sketch<W>::to_string() const {
   os << "### End sketch summary" << std::endl;
 
   //return string<A>(os.str().c_str(), allocator_);
-  return string<std::allocator<char>>(os.str().c_str());
+  return string<std::allocator<W>>(os.str().c_str());
 }
 
 template<typename W>
diff --git a/python/src/count_wrapper.cpp b/python/src/count_wrapper.cpp
index 08880e1..8f6ab58 100644
--- a/python/src/count_wrapper.cpp
+++ b/python/src/count_wrapper.cpp
@@ -59,9 +59,9 @@ void bind_count_min_sketch(py::module &m, const char* name) {
          "Returns the maximum permissible error for any frequency estimate 
query")
     .def("get_total_weight", &count_min_sketch<W>::get_total_weight,
          "Returns the total weight currently inserted into the stream")
-    .def("update", static_cast<void 
(count_min_sketch<W>::*)(int64_t)>(&count_min_sketch<W>::update), 
py::arg("item"),
+    .def("update", static_cast<void (count_min_sketch<W>::*)(int64_t, 
W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0,
          "Updates the sketch with the given 64-bit integer value")
-    .def("update", static_cast<void (count_min_sketch<W>::*)(const 
std::string&)>(&count_min_sketch<W>::update), py::arg("item"),
+    .def("update", static_cast<void (count_min_sketch<W>::*)(const 
std::string&, W)>(&count_min_sketch<W>::update), py::arg("item"), 
py::arg("weight")=1.0,
          "Updates the sketch with the given string")
     .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(int64_t) 
const>(&count_min_sketch<W>::get_estimate), py::arg("item"),
          "Returns an estimate of the frequency of the provided 64-bit integer 
value")
diff --git a/python/tests/count_min_test.py b/python/tests/count_min_test.py
new file mode 100644
index 0000000..3bd1eff
--- /dev/null
+++ b/python/tests/count_min_test.py
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+  
+import unittest
+from datasketches import count_min_sketch
+
+class CountMinTest(unittest.TestCase):
+  def test_count_min_example(self):
+    # we'll define target confidence and relative error and use the built-in
+    # methods to determine how many hashes and buckets to use
+    confidence = 0.95
+    num_hashes = count_min_sketch.suggest_num_hashes(confidence)
+    relative_error = 0.01
+    num_buckets = count_min_sketch.suggest_num_buckets(relative_error)
+
+    # now we can create a few empty sketches
+    cm = count_min_sketch(num_hashes, num_buckets)
+    cm2 = count_min_sketch(num_hashes, num_buckets)
+    self.assertTrue(cm.is_empty())
+
+    # we'll use a moderate number of distinct items with
+    # increasing weights, with each item's weight being
+    # equal to its value
+    n = 1000
+    total_wt = 0
+    for i in range(1, n+1):
+      cm.update(i, i)
+      total_wt += i
+    self.assertFalse(cm.is_empty())
+    self.assertEqual(cm.get_total_weight(), total_wt)
+
+    # querying the items, each of them should
+    # have a non-zero count.  the estimate should
+    # be at least i with appropriately behaved bounds.
+    for i in range(1, n+1):
+      val = cm.get_estimate(i)
+      self.assertGreaterEqual(val, i)
+      self.assertGreaterEqual(val, cm.get_lower_bound(i))
+      self.assertGreater(cm.get_upper_bound(i), val)
+
+    # values not in the sketch should have lower estimates, but
+    # are not guaranteed to be zero and will succeed
+    self.assertIsNotNone(cm.get_estimate("not in set"))
+
+    # we can create another sketch with partial overlap
+    # and merge them
+    for i in range(int(n / 2), int(3 * n / 2)):
+      cm2.update(i, i)
+    cm.merge(cm2)
+
+    # and the estimated weight for the meerged values should now be
+    # at least 2x the value
+    self.assertGreaterEqual(cm.get_estimate(n), 2 * n)
+
+    # finally, serialize and reconstruct
+    cm_bytes = cm.serialize()
+    new_cm = count_min_sketch.deserialize(cm_bytes)
+
+    # and now interrogate the sketch
+    self.assertFalse(new_cm.is_empty())
+    self.assertEqual(new_cm.get_num_hashes(), cm.get_num_hashes())
+    self.assertEqual(new_cm.get_num_buckets(), cm.get_num_buckets())
+    self.assertEqual(new_cm.get_total_weight(), cm.get_total_weight())
+    
+    # we can also iterate through values in and out of the sketch to ensure
+    # the estimates match
+    for i in range(0, 2 * n):
+      self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i))
+
+if __name__ == '__main__':
+    unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[datasketches-cpp] 02/04: add basic count min python tests

Reply via email to