This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch count_min_python in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 68bb1e0fceb0ef962aacdbea6e46d85d7749c3a5 Author: Jon Malkin <[email protected]> AuthorDate: Wed Apr 5 19:14:58 2023 -0700 add basic count min python tests --- count/include/count_min.hpp | 2 +- count/include/count_min_impl.hpp | 6 +-- python/src/count_wrapper.cpp | 4 +- python/tests/count_min_test.py | 85 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 91 insertions(+), 6 deletions(-) diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp index 993a60e..c4bd752 100644 --- a/count/include/count_min.hpp +++ b/count/include/count_min.hpp @@ -183,7 +183,7 @@ public: * @brief Returns a string describing the sketch * @return A string with a human-readable description of the sketch */ - string<std::allocator<char>> to_string() const; + string<std::allocator<W>> to_string() const; // Iterators using const_iterator = typename std::vector<W>::const_iterator ; diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index d7ad1db..8a1551f 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -450,7 +450,7 @@ bool count_min_sketch<W>::is_empty() const { } template<typename W> -string<std::allocator<char>> count_min_sketch<W>::to_string() const { +string<std::allocator<W>> count_min_sketch<W>::to_string() const { // count the number of used entries in the sketch uint64_t num_nonzero = 0; for (auto entry : _sketch_array) { @@ -461,7 +461,7 @@ string<std::allocator<char>> count_min_sketch<W>::to_string() const { // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. // The stream does not support passing an allocator instance, and alternatives are complicated. std::ostringstream os; - os << "### KLL sketch summary:" << std::endl; + os << "### Count Min sketch summary:" << std::endl; os << " num hashes : " << static_cast<uint32_t>(_num_hashes) << std::endl; os << " num buckets : " << _num_buckets << std::endl; os << " capacity bins : " << _sketch_array.size() << std::endl; @@ -470,7 +470,7 @@ string<std::allocator<char>> count_min_sketch<W>::to_string() const { os << "### End sketch summary" << std::endl; //return string<A>(os.str().c_str(), allocator_); - return string<std::allocator<char>>(os.str().c_str()); + return string<std::allocator<W>>(os.str().c_str()); } template<typename W> diff --git a/python/src/count_wrapper.cpp b/python/src/count_wrapper.cpp index 08880e1..8f6ab58 100644 --- a/python/src/count_wrapper.cpp +++ b/python/src/count_wrapper.cpp @@ -59,9 +59,9 @@ void bind_count_min_sketch(py::module &m, const char* name) { "Returns the maximum permissible error for any frequency estimate query") .def("get_total_weight", &count_min_sketch<W>::get_total_weight, "Returns the total weight currently inserted into the stream") - .def("update", static_cast<void (count_min_sketch<W>::*)(int64_t)>(&count_min_sketch<W>::update), py::arg("item"), + .def("update", static_cast<void (count_min_sketch<W>::*)(int64_t, W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0, "Updates the sketch with the given 64-bit integer value") - .def("update", static_cast<void (count_min_sketch<W>::*)(const std::string&)>(&count_min_sketch<W>::update), py::arg("item"), + .def("update", static_cast<void (count_min_sketch<W>::*)(const std::string&, W)>(&count_min_sketch<W>::update), py::arg("item"), py::arg("weight")=1.0, "Updates the sketch with the given string") .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_estimate), py::arg("item"), "Returns an estimate of the frequency of the provided 64-bit integer value") diff --git a/python/tests/count_min_test.py b/python/tests/count_min_test.py new file mode 100644 index 0000000..3bd1eff --- /dev/null +++ b/python/tests/count_min_test.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest +from datasketches import count_min_sketch + +class CountMinTest(unittest.TestCase): + def test_count_min_example(self): + # we'll define target confidence and relative error and use the built-in + # methods to determine how many hashes and buckets to use + confidence = 0.95 + num_hashes = count_min_sketch.suggest_num_hashes(confidence) + relative_error = 0.01 + num_buckets = count_min_sketch.suggest_num_buckets(relative_error) + + # now we can create a few empty sketches + cm = count_min_sketch(num_hashes, num_buckets) + cm2 = count_min_sketch(num_hashes, num_buckets) + self.assertTrue(cm.is_empty()) + + # we'll use a moderate number of distinct items with + # increasing weights, with each item's weight being + # equal to its value + n = 1000 + total_wt = 0 + for i in range(1, n+1): + cm.update(i, i) + total_wt += i + self.assertFalse(cm.is_empty()) + self.assertEqual(cm.get_total_weight(), total_wt) + + # querying the items, each of them should + # have a non-zero count. the estimate should + # be at least i with appropriately behaved bounds. + for i in range(1, n+1): + val = cm.get_estimate(i) + self.assertGreaterEqual(val, i) + self.assertGreaterEqual(val, cm.get_lower_bound(i)) + self.assertGreater(cm.get_upper_bound(i), val) + + # values not in the sketch should have lower estimates, but + # are not guaranteed to be zero and will succeed + self.assertIsNotNone(cm.get_estimate("not in set")) + + # we can create another sketch with partial overlap + # and merge them + for i in range(int(n / 2), int(3 * n / 2)): + cm2.update(i, i) + cm.merge(cm2) + + # and the estimated weight for the meerged values should now be + # at least 2x the value + self.assertGreaterEqual(cm.get_estimate(n), 2 * n) + + # finally, serialize and reconstruct + cm_bytes = cm.serialize() + new_cm = count_min_sketch.deserialize(cm_bytes) + + # and now interrogate the sketch + self.assertFalse(new_cm.is_empty()) + self.assertEqual(new_cm.get_num_hashes(), cm.get_num_hashes()) + self.assertEqual(new_cm.get_num_buckets(), cm.get_num_buckets()) + self.assertEqual(new_cm.get_total_weight(), cm.get_total_weight()) + + # we can also iterate through values in and out of the sketch to ensure + # the estimates match + for i in range(0, 2 * n): + self.assertEqual(cm.get_estimate(i), new_cm.get_estimate(i)) + +if __name__ == '__main__': + unittest.main() --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
