This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch count_min_python in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit e904bd01d7f92e1d7e801b6569b76ed56a754448 Author: Jon Malkin <[email protected]> AuthorDate: Wed Apr 5 18:04:54 2023 -0700 Clean up count min a bit, define to_string method and python wrapper --- count/include/count_min.hpp | 162 +++++++++++++++++++++------------------ count/include/count_min_impl.hpp | 56 ++++++++++++-- python/CMakeLists.txt | 2 + python/src/count_wrapper.cpp | 99 ++++++++++++++++++++++++ python/src/datasketches.cpp | 2 + 5 files changed, 243 insertions(+), 78 deletions(-) diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp index 4e9a402..993a60e 100644 --- a/count/include/count_min.hpp +++ b/count/include/count_min.hpp @@ -21,29 +21,29 @@ public: using vector_bytes = std::vector<uint8_t>; /** - * Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`. - * @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array - * @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array - * @param seed for hash function - * - * The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash. - * Only update and estimate methods are added for uint64_t and string types. - */ + * Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`. + * @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array + * @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array + * @param seed for hash function + * + * The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash. + * Only update and estimate methods are added for uint64_t and string types. + */ count_min_sketch(uint8_t num_hashes, uint32_t num_buckets, uint64_t seed = DEFAULT_SEED) ; /** - * @return configured _num_hashes of this sketch - */ + * @return configured _num_hashes of this sketch + */ uint8_t get_num_hashes() const; /** - * @return configured _num_buckets of this sketch - */ + * @return configured _num_buckets of this sketch + */ uint32_t get_num_buckets() const; /** - * @return configured seed of this sketch - */ + * @return configured seed of this sketch + */ uint64_t get_seed() const; /** @@ -54,20 +54,20 @@ public: double get_relative_error() const; /** - * @return _total_weight : typename W - * The total weight currently inserted into the stream. - */ + * @return _total_weight : typename W + * The total weight currently inserted into the stream. + */ W get_total_weight() const; /* - * @param relative_error : double -- the desired accuracy within which estimates should lie. - * For example, when relative_error = 0.05, then the returned frequency estimates satisfy the - * `relative_error` guarantee that never overestimates the weights but may underestimate the weights - * by 5% of the total weight in the sketch. - * @return number_of_buckets : the number of hash buckets at every level of the - * sketch required in order to obtain the specified relative error. - * [1] - Section 3 ``Data Structure'', page 6. - */ + * @param relative_error : double -- the desired accuracy within which estimates should lie. + * For example, when relative_error = 0.05, then the returned frequency estimates satisfy the + * `relative_error` guarantee that never overestimates the weights but may underestimate the weights + * by 5% of the total weight in the sketch. + * @return number_of_buckets : the number of hash buckets at every level of the + * sketch required in order to obtain the specified relative error. + * [1] - Section 3 ``Data Structure'', page 6. + */ static uint32_t suggest_num_buckets(double relative_error) ; /* @@ -77,7 +77,7 @@ public: * order to achieve the specified confidence of the sketch. * confidence = 1 - delta, with delta denoting the sketch failure probability in the literature. * [1] - Section 3 ``Data Structure'', page 6. - */ + */ static uint8_t suggest_num_hashes(double confidence) ; /** @@ -88,7 +88,15 @@ public: */ W get_estimate(uint64_t item) const ; - /** + /** + * Specific get_estimate function for int64_t type + * see generic get_estimate function + * @param item : uint64_t type. + * @return an estimate of the item's frequency. + */ + W get_estimate(int64_t item) const ; + + /** * Specific get_estimate function for std::string type * see generic get_estimate function * @param item : std::string type @@ -97,51 +105,53 @@ public: W get_estimate(const std::string& item) const; /** - * This is the generic estimate query function for any of the given datatypes. - * Query the sketch for the estimate of a given item. - * @param item : pointer to the data item to be query from the sketch. - * @param size : size_t - * @return the estimated frequency of the item denoted f_est satisfying - * f_true - relative_error*_total_weight <= f_est <= f_true - */ - W get_estimate(const void* item, size_t size) const ; + * This is the generic estimate query function for any of the given datatypes. + * Query the sketch for the estimate of a given item. + * @param item : pointer to the data item to be query from the sketch. + * @param size : size_t + * @return the estimated frequency of the item denoted f_est satisfying + * f_true - relative_error*_total_weight <= f_est <= f_true + */ + W get_estimate(const void* item, size_t size) const ; /** - * Query the sketch for the upper bound of a given item. - * @param item : uint64_t or std::string to query - * @return the upper bound on the true frequency of the item - * f_true <= f_est + relative_error*_total_weight - */ - W get_upper_bound(const void* item, size_t size) const; - W get_upper_bound(uint64_t) const ; - W get_upper_bound(const std::string& item) const; + * Query the sketch for the upper bound of a given item. + * @param item : uint64_t or std::string to query + * @return the upper bound on the true frequency of the item + * f_true <= f_est + relative_error*_total_weight + */ + W get_upper_bound(const void* item, size_t size) const; + W get_upper_bound(int64_t) const ; + W get_upper_bound(uint64_t) const ; + W get_upper_bound(const std::string& item) const; /** - * Query the sketch for the lower bound of a given item. - * @param item : uint64_t or std::string to query - * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item - * f_true - relative_error*_total_weight <= f_est - */ + * Query the sketch for the lower bound of a given item. + * @param item : uint64_t or std::string to query + * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item + * f_true - relative_error*_total_weight <= f_est + */ W get_lower_bound(const void* item, size_t size) const ; + W get_lower_bound(int64_t) const ; W get_lower_bound(uint64_t) const ; W get_lower_bound(const std::string& item) const ; /* - * Update this sketch with given data of any type. - * This is a "universal" update that covers all cases above, - * but may produce different hashes. - * @param item pointer to the data item to be inserted into the sketch. - * @param size of the data in bytes - * @return vector of uint64_t which each represent the index to which `value' must update in the sketch - */ + * Update this sketch with given data of any type. + * This is a "universal" update that covers all cases above, + * but may produce different hashes. + * @param item pointer to the data item to be inserted into the sketch. + * @param size of the data in bytes + * @return vector of uint64_t which each represent the index to which `value' must update in the sketch + */ void update(const void* item, size_t size, W weight) ; /** - * Update this sketch with a given uint64_t item. - * @param item : uint64_t to update the sketch with - * @param weight : arithmetic type - * void function which inserts an item of type uint64_t into the sketch - */ + * Update this sketch with a given uint64_t item. + * @param item : uint64_t to update the sketch with + * @param weight : arithmetic type + * void function which inserts an item of type uint64_t into the sketch + */ void update(uint64_t item, W weight) ; void update(uint64_t item) ; void update(int64_t item, W weight) ; @@ -157,8 +167,8 @@ public: void update(const std::string& item) ; /* - * merges a separate count_min_sketch into this count_min_sketch. - */ + * merges a separate count_min_sketch into this count_min_sketch. + */ void merge(const count_min_sketch<W> &other_sketch) ; /** @@ -169,6 +179,12 @@ public: */ bool is_empty() const ; + /** + * @brief Returns a string describing the sketch + * @return A string with a human-readable description of the sketch + */ + string<std::allocator<char>> to_string() const; + // Iterators using const_iterator = typename std::vector<W>::const_iterator ; const_iterator begin() const; @@ -234,21 +250,21 @@ public: vector_bytes serialize(unsigned header_size_bytes = 0) const; /** - * This method deserializes a sketch from a given stream. - * @param is input stream - * @param seed the seed for the hash function that was used to create the sketch - * @return an instance of a sketch - */ + * This method deserializes a sketch from a given stream. + * @param is input stream + * @param seed the seed for the hash function that was used to create the sketch + * @return an instance of a sketch + */ //static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED) const; static count_min_sketch deserialize(std::istream& is, uint64_t seed) ; /** - * This method deserializes a sketch from a given array of bytes. - * @param bytes pointer to the array of bytes - * @param size the size of the array - * @param seed the seed for the hash function that was used to create the sketch - * @return an instance of the sketch - */ + * This method deserializes a sketch from a given array of bytes. + * @param bytes pointer to the array of bytes + * @param size the size of the array + * @param seed the seed for the hash function that was used to create the sketch + * @return an instance of the sketch + */ static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED); private: diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index 2641446..d7ad1db 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -5,8 +5,9 @@ #include "count_min.hpp" #include "memory_operations.hpp" +#include <iomanip> #include <random> - +#include <sstream> namespace datasketches { @@ -122,6 +123,9 @@ std::vector<uint64_t> count_min_sketch<W>::get_hashes(const void* item, size_t s template<typename W> W count_min_sketch<W>::get_estimate(uint64_t item) const {return get_estimate(&item, sizeof(item));} +template<typename W> +W count_min_sketch<W>::get_estimate(int64_t item) const {return get_estimate(&item, sizeof(item));} + template<typename W> W count_min_sketch<W>::get_estimate(const std::string& item) const { if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch. @@ -152,6 +156,16 @@ void count_min_sketch<W>::update(uint64_t item) { update(&item, sizeof(item), 1); } +template<typename W> +void count_min_sketch<W>::update(int64_t item, W weight) { + update(&item, sizeof(item), weight); +} + +template<typename W> +void count_min_sketch<W>::update(int64_t item) { + update(&item, sizeof(item), 1); +} + template<typename W> void count_min_sketch<W>::update(const std::string& item, W weight) { if (item.empty()) return; @@ -181,6 +195,9 @@ void count_min_sketch<W>::update(const void* item, size_t size, W weight) { template<typename W> W count_min_sketch<W>::get_upper_bound(uint64_t item) const {return get_upper_bound(&item, sizeof(item));} +template<typename W> +W count_min_sketch<W>::get_upper_bound(int64_t item) const {return get_upper_bound(&item, sizeof(item));} + template<typename W> W count_min_sketch<W>::get_upper_bound(const std::string& item) const { if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch. @@ -192,10 +209,12 @@ W count_min_sketch<W>::get_upper_bound(const void* item, size_t size) const { return get_estimate(item, size) + get_relative_error()*get_total_weight() ; } - template<typename W> W count_min_sketch<W>::get_lower_bound(uint64_t item) const {return get_lower_bound(&item, sizeof(item));} +template<typename W> +W count_min_sketch<W>::get_lower_bound(int64_t item) const {return get_lower_bound(&item, sizeof(item));} + template<typename W> W count_min_sketch<W>::get_lower_bound(const std::string& item) const { if (item.empty()) return 0 ; // Empty strings are not inserted into the sketch. @@ -302,6 +321,8 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(std::istream& is, uint64_t const auto flags_byte = read<uint8_t>(is) ; read<uint32_t>(is) ; // 4 unused bytes + check_header_validity(preamble_longs, serial_version, family_id, flags_byte); + // Sketch parameters const auto nbuckets = read<uint32_t>(is) ; const auto nhashes = read<uint8_t>(is); @@ -391,7 +412,9 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(const void* bytes, size_t s uint8_t flags_byte ; ptr += copy_from_mem(ptr, flags_byte) ; ptr += sizeof(uint32_t); - + + check_header_validity(preamble_longs, serial_version, family_id, flags_byte); + // Second 8 bytes are the sketch parameters with a final, unused byte. uint32_t nbuckets ; uint8_t nhashes ; @@ -406,7 +429,6 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(const void* bytes, size_t s + std::to_string(compute_seed_hash(seed))); } count_min_sketch<W> c(nhashes, nbuckets, seed) ; - check_header_validity(preamble_longs, serial_version, family_id, flags_byte); const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0; if (is_empty) return c ; // sketch is empty, no need to read further. @@ -416,7 +438,7 @@ count_min_sketch<W> count_min_sketch<W>::deserialize(const void* bytes, size_t s c._total_weight += weight ; // All remaining bytes are the sketch table entries. - for (auto i = 0; i<c._num_buckets*c._num_hashes ; ++i){ + for (size_t i = 0; i<c._num_buckets*c._num_hashes ; ++i){ ptr += copy_from_mem(ptr, c._sketch_array[i]) ; } return c; @@ -427,6 +449,30 @@ bool count_min_sketch<W>::is_empty() const { return _total_weight == 0; } +template<typename W> +string<std::allocator<char>> count_min_sketch<W>::to_string() const { + // count the number of used entries in the sketch + uint64_t num_nonzero = 0; + for (auto entry : _sketch_array) { + if (entry != static_cast<W>(0.0)) + ++num_nonzero; + } + + // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements. + // The stream does not support passing an allocator instance, and alternatives are complicated. + std::ostringstream os; + os << "### KLL sketch summary:" << std::endl; + os << " num hashes : " << static_cast<uint32_t>(_num_hashes) << std::endl; + os << " num buckets : " << _num_buckets << std::endl; + os << " capacity bins : " << _sketch_array.size() << std::endl; + os << " filled bins : " << num_nonzero << std::endl; + os << " pct filled : " << std::setprecision(3) << (num_nonzero * 100.0) / _sketch_array.size() << "%" << std::endl; + os << "### End sketch summary" << std::endl; + + //return string<A>(os.str().c_str(), allocator_); + return string<std::allocator<char>>(os.str().c_str()); +} + template<typename W> void count_min_sketch<W>::check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte) { const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index bc85092..586dd4e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,6 +45,7 @@ target_link_libraries(python sampling req quantiles + count pybind11::module ) @@ -76,6 +77,7 @@ target_sources(python src/req_wrapper.cpp src/quantiles_wrapper.cpp src/ks_wrapper.cpp + src/count_wrapper.cpp src/vector_of_kll.cpp src/py_serde.cpp ) diff --git a/python/src/count_wrapper.cpp b/python/src/count_wrapper.cpp new file mode 100644 index 0000000..08880e1 --- /dev/null +++ b/python/src/count_wrapper.cpp @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <pybind11/pybind11.h> + +#include "count_min.hpp" +#include "common_defs.hpp" + +namespace py = pybind11; + +template<typename W> +void bind_count_min_sketch(py::module &m, const char* name) { + using namespace datasketches; + + py::class_<count_min_sketch<W>>(m, name) + .def(py::init<uint8_t, uint32_t, uint64_t>(), py::arg("num_hashes"), py::arg("num_buckets"), py::arg("seed")=DEFAULT_SEED) + .def(py::init<const count_min_sketch<W>&>()) + .def_static("suggest_num_buckets", &count_min_sketch<W>::suggest_num_buckets, py::arg("relative_error"), + "Suggests the number of buckets needed to achieve an accuracy within the provided " + "relative_error. For example, when relative_error = 0.05, the returned frequency estimates " + "satisfy the 'relative_error' guarantee that never overestimates the weights by may " + "underestimate hte weights by 5% of the total weight in the sketch. " + "Returns the number of hash buckets at every level of the sketch required in order to obtain " + "the specified relative error.") + .def_static("suggest_num_hashes", &count_min_sketch<W>::suggest_num_hashes, py::arg("confidence"), + "Suggests the number of hashes needed to achieve the provided confidence. For example, " + "with 95% confidence, frequency estimates satisfy the 'relative_error' guarantee. " + "Returns the number of hash functions that are required in order to achieve the specified " + "confidence of the sketch. confidence = 1 - delta, with delta denoting the sketch failure probability.") + .def("__str__", &count_min_sketch<W>::to_string, + "Produces a string summary of the sketch") + .def("to_string", &count_min_sketch<W>::to_string, + "Produces a string summary of the sketch") + .def("is_empty", &count_min_sketch<W>::is_empty, + "Returns True if the sketch has seen no items, otherwise False") + .def("get_num_hashes", &count_min_sketch<W>::get_num_hashes, + "Returns the configured number of hashes for the sketch") + .def("get_num_buckets", &count_min_sketch<W>::get_num_buckets, + "Returns the configured number of buckets for the sketch") + .def("get_seed", &count_min_sketch<W>::get_seed, + "Returns the base hash seed for the sketch") + .def("get_relative_error", &count_min_sketch<W>::get_relative_error, + "Returns the maximum permissible error for any frequency estimate query") + .def("get_total_weight", &count_min_sketch<W>::get_total_weight, + "Returns the total weight currently inserted into the stream") + .def("update", static_cast<void (count_min_sketch<W>::*)(int64_t)>(&count_min_sketch<W>::update), py::arg("item"), + "Updates the sketch with the given 64-bit integer value") + .def("update", static_cast<void (count_min_sketch<W>::*)(const std::string&)>(&count_min_sketch<W>::update), py::arg("item"), + "Updates the sketch with the given string") + .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_estimate), py::arg("item"), + "Returns an estimate of the frequency of the provided 64-bit integer value") + .def("get_estimate", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_estimate), py::arg("item"), + "Returns an estimate of the frequency of the provided string") + .def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"), + "Returns an upper bound on the estimate for the given 64-bit integer value") + .def("get_upper_bound", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_upper_bound), py::arg("item"), + "Returns an upper bound on the estimate for the provided string") + .def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(int64_t) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"), + "Returns an lower bound on the estimate for the given 64-bit integer value") + .def("get_lower_bound", static_cast<W (count_min_sketch<W>::*)(const std::string&) const>(&count_min_sketch<W>::get_lower_bound), py::arg("item"), + "Returns an lower bound on the estimate for the provided string") + .def("merge", &count_min_sketch<W>::merge, py::arg("other"), + "Merges the provided other sketch into this one") + .def( + "serialize", + [](const count_min_sketch<W>& sk) { + auto bytes = sk.serialize(); + return py::bytes(reinterpret_cast<const char*>(bytes.data()), bytes.size()); + }, + "Serializes the sketch into a bytes object" + ) + .def_static( + "deserialize", + [](const std::string& bytes) { return count_min_sketch<W>::deserialize(bytes.data(), bytes.size()); }, + py::arg("bytes"), + "Reads a bytes object and returns the corresponding count_min_sketch" + ); +} + +void init_count_min(py::module &m) { + bind_count_min_sketch<double>(m, "count_min_sketch"); +} + diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp index b34eea7..9f93896 100644 --- a/python/src/datasketches.cpp +++ b/python/src/datasketches.cpp @@ -30,6 +30,7 @@ void init_theta(py::module& m); void init_vo(py::module& m); void init_req(py::module& m); void init_quantiles(py::module& m); +void init_count_min(py::module& m); void init_vector_of_kll(py::module& m); // supporting objects @@ -45,6 +46,7 @@ PYBIND11_MODULE(_datasketches, m) { init_vo(m); init_req(m); init_quantiles(m); + init_count_min(m); init_vector_of_kll(m); init_kolmogorov_smirnov(m); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
