This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch doxygen in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit ef01d71b973eeb97c267ec5bab935079dfd10d05 Author: AlexanderSaydakov <[email protected]> AuthorDate: Wed Jun 28 21:38:52 2023 -0700 documentation fixes --- common/include/common_defs.hpp | 1 + common/include/kolmogorov_smirnov.hpp | 15 +- common/include/quantiles_sorted_view.hpp | 92 ++++++++++++ common/include/serde.hpp | 87 ++++++++--- count/include/count_min.hpp | 166 ++++++++++++++------- count/include/count_min_impl.hpp | 16 -- cpc/include/cpc_sketch.hpp | 66 ++++---- cpc/include/cpc_union.hpp | 37 ++++- density/include/density_sketch.hpp | 38 +++-- fi/include/frequent_items_sketch.hpp | 22 ++- hll/include/HllSketch-internal.hpp | 4 +- hll/include/hll.hpp | 110 ++++++++------ kll/include/kll_sketch.hpp | 38 ++++- quantiles/include/quantiles_sketch.hpp | 80 ++++++++-- req/include/req_common.hpp | 10 +- req/include/req_sketch.hpp | 99 +++++++++++- sampling/include/var_opt_sketch.hpp | 92 ++++++++---- sampling/include/var_opt_sketch_impl.hpp | 6 +- sampling/include/var_opt_union.hpp | 14 +- theta/include/bounds_on_ratios_in_sampled_sets.hpp | 1 + .../bounds_on_ratios_in_theta_sketched_sets.hpp | 13 +- theta/include/theta_a_not_b.hpp | 9 ++ theta/include/theta_intersection.hpp | 6 +- theta/include/theta_intersection_impl.hpp | 6 +- theta/include/theta_jaccard_similarity.hpp | 3 +- theta/include/theta_jaccard_similarity_base.hpp | 1 + theta/include/theta_sketch.hpp | 56 ++++++- theta/include/theta_union.hpp | 5 + theta/include/theta_union_impl.hpp | 6 +- theta/include/theta_update_sketch_base.hpp | 3 +- tuple/include/array_of_doubles_sketch.hpp | 13 +- tuple/include/tuple_intersection.hpp | 10 ++ tuple/include/tuple_jaccard_similarity.hpp | 1 + tuple/include/tuple_sketch.hpp | 83 ++++++++--- tuple/include/tuple_union.hpp | 8 + tuple/test/tuple_sketch_test.cpp | 2 +- 36 files changed, 904 insertions(+), 315 deletions(-) diff --git a/common/include/common_defs.hpp b/common/include/common_defs.hpp index fc80fe0..280ad7a 100644 --- a/common/include/common_defs.hpp +++ b/common/include/common_defs.hpp @@ -28,6 +28,7 @@ #include <chrono> #include <thread> +/// DataSketches namespace namespace datasketches { static const uint64_t DEFAULT_SEED = 9001; diff --git a/common/include/kolmogorov_smirnov.hpp b/common/include/kolmogorov_smirnov.hpp index d00853d..90f226c 100644 --- a/common/include/kolmogorov_smirnov.hpp +++ b/common/include/kolmogorov_smirnov.hpp @@ -22,13 +22,16 @@ namespace datasketches { +/** + * Kolmogorov-Smirnov test for KLL or Quantiles sketches + */ class kolmogorov_smirnov { public: /** * Computes the raw delta area between two quantile sketches for the Kolmogorov-Smirnov Test. * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T. - * @param sketch1 KLL sketch 1 - * @param sketch2 KLL sketch 2 + * @param sketch1 sketch 1 + * @param sketch2 sketch 2 * @return the raw delta between two KLL quantile sketches */ template<typename Sketch> @@ -39,8 +42,8 @@ public: * Adjusts the computed threshold by the error epsilons of the two given sketches. * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a> * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T. - * @param sketch1 KLL sketch 1 - * @param sketch2 KLL sketch 2 + * @param sketch1 sketch 1 + * @param sketch2 sketch 2 * @param p Target p-value. Typically .001 to .1, e.g., .05. * @return the adjusted threshold to be compared with the raw delta */ @@ -52,8 +55,8 @@ public: * Will work for a type-matched pair of KLL or Quantiles sketches of the same parameterized type T. * Note: if the given sketches have insufficient data or if the sketch sizes are too small, * this will return false. - * @param sketch1 KLL sketch 1 - * @param sketch2 KLL sketch 2 + * @param sketch1 sketch 1 + * @param sketch2 sketch 2 * @param p Target p-value. Typically .001 to .1, e.g., .05. * @return Boolean indicating whether we can reject the null hypothesis (that the sketches * reflect the same underlying distribution) using the provided p-value. diff --git a/common/include/quantiles_sorted_view.hpp b/common/include/quantiles_sorted_view.hpp index e965cb1..465f186 100755 --- a/common/include/quantiles_sorted_view.hpp +++ b/common/include/quantiles_sorted_view.hpp @@ -27,6 +27,9 @@ namespace datasketches { +/** + * Sorted view for quantiles sketches (REQ, KLL and Quantiles) + */ template< typename T, typename Comparator, // strict weak ordering function (see C++ named requirements: Compare) @@ -34,30 +37,119 @@ template< > class quantiles_sorted_view { public: + /// Entry type using Entry = typename std::conditional<std::is_arithmetic<T>::value, std::pair<T, uint64_t>, std::pair<const T*, uint64_t>>::type; using AllocEntry = typename std::allocator_traits<Allocator>::template rebind_alloc<Entry>; using Container = std::vector<Entry, AllocEntry>; + /// @private quantiles_sorted_view(uint32_t num, const Comparator& comparator, const Allocator& allocator); + /// @private template<typename Iterator> void add(Iterator begin, Iterator end, uint64_t weight); + /// @private void convert_to_cummulative(); class const_iterator; + + /** + * Iterator pointing to the first entry in the view. + * If the view is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first entry + */ const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end entry in the view. + * The past-the-end entry is the hypothetical entry that would follow the last entry. + * It does not point to any entry, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end entry + */ const_iterator end() const; + /// @return size of the view size_t size() const; + /** + * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive. + * + * <p>If the view is empty this throws std::runtime_error. + * + * @param item to be ranked + * @param inclusive if true the weight of the given item is included into the rank. + * Otherwise the rank equals the sum of the weights of all items that are less than the given item + * according to the Comparator. + * + * @return an approximate rank of the given item + */ double get_rank(const T& item, bool inclusive = true) const; + /** + * Quantile return type. + * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types) + */ using quantile_return_type = typename std::conditional<std::is_arithmetic<T>::value, T, const T&>::type; + + /** + * Returns an item from the sketch that is the best approximation to an item + * from the original stream with the given rank. + * + * <p>If the view is empty this throws std::runtime_error. + * + * @param rank of an item in the hypothetical sorted stream. + * @param inclusive if true, the given rank is considered inclusive (includes weight of an item) + * + * @return approximate quantile associated with the given rank + */ quantile_return_type get_quantile(double rank, bool inclusive = true) const; using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>; + + /** + * Returns an approximation to the Cumulative Distribution Function (CDF), which is the + * cumulative analog of the PMF, of the input stream given a set of split points (items). + * + * <p>If the view is empty this throws std::runtime_error. + * + * @param split_points an array of <i>m</i> unique, monotonically increasing items + * that divide the input domain into <i>m+1</i> consecutive disjoint intervals. + * + * @param size the number of split points in the array + * + * @param inclusive if true the rank of an item includes its own weight, and therefore + * if the sketch contains items equal to a slit point, then in CDF such items are + * included into the interval to the left of split point. Otherwise they are included into + * the interval to the right of split point. + * + * @return an array of m+1 doubles, which are a consecutive approximation to the CDF + * of the input stream given the split_points. The value at array position j of the returned + * CDF array is the sum of the returned values in positions 0 through j of the returned PMF + * array. This can be viewed as array of ranks of the given split points plus one more value + * that is always 1. + */ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const; + + /** + * Returns an approximation to the Probability Mass Function (PMF) of the input stream + * given a set of split points (items). + * + * <p>If the view is empty this throws std::runtime_error. + * + * @param split_points an array of <i>m</i> unique, monotonically increasing items + * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins). + * + * @param size the number of split points in the array + * + * @param inclusive if true the rank of an item includes its own weight, and therefore + * if the sketch contains items equal to a slit point, then in PMF such items are + * included into the interval to the left of split point. Otherwise they are included into the interval + * to the right of split point. + * + * @return an array of m+1 doubles each of which is an approximation + * to the fraction of the input stream items (the mass) that fall into one of those intervals. + */ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const; private: diff --git a/common/include/serde.hpp b/common/include/serde.hpp index 9b3349b..5cc55a8 100644 --- a/common/include/serde.hpp +++ b/common/include/serde.hpp @@ -30,23 +30,56 @@ namespace datasketches { -// serialize and deserialize +/// Interface for serializing and deserializing items template<typename T, typename Enable = void> struct serde { - // stream serialization + /** + * Stream serialization + * @param os output stream + * @param items pointer to array of items + * @param num number of items + */ void serialize(std::ostream& os, const T* items, unsigned num) const; + + /** + * Stream deserialization + * @param is input stream + * @param items pointer to array of items + * @param num number of items + */ void deserialize(std::istream& is, T* items, unsigned num) const; // items allocated but not initialized - // raw bytes serialization - size_t size_of_item(const T& item) const; + /** + * Raw bytes serialization + * @param ptr pointer to output buffer + * @param capacity size of the buffer in bytes + * @param items pointer to array of items + * @param num number of items + */ size_t serialize(void* ptr, size_t capacity, const T* items, unsigned num) const; - size_t deserialize(const void* ptr, size_t capacity, T* items, unsigned num) const; // items allocated but not initialized + + /** + * Raw bytes deserialization + * @param ptr pointer to input buffer + * @param capacity size of the buffer in bytes + * @param items pointer to array of items (items in the array are allocated but not initialized) + * @param num number of items + */ + size_t deserialize(const void* ptr, size_t capacity, T* items, unsigned num) const; + + /** + * Size of the given item + * @param item to be sized + * @return size of the given item in bytes + */ + size_t size_of_item(const T& item) const; }; -// serde for all fixed-size arithmetic types (int and float of different sizes) -// in particular, kll_sketch<int64_t> should produce sketches binary-compatible -// with LongsSketch and ItemsSketch<Long> with ArrayOfLongsSerDe in Java +/// serde for all fixed-size arithmetic types (int and float of different sizes). +/// in particular, kll_sketch<int64_t> should produce sketches binary-compatible +/// with LongsSketch and ItemsSketch<Long> with ArrayOfLongsSerDe in Java template<typename T> struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> { + /// @copydoc serde::serialize void serialize(std::ostream& os, const T* items, unsigned num) const { bool failure = false; try { @@ -58,6 +91,7 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> { throw std::runtime_error("error writing to std::ostream with " + std::to_string(num) + " items"); } } + void deserialize(std::istream& is, T* items, unsigned num) const { bool failure = false; try { @@ -70,30 +104,37 @@ struct serde<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> { } } - size_t size_of_item(const T&) const { - return sizeof(T); - } + /// @copydoc serde::serialize(void*,size_t,const T*,unsigned) const size_t serialize(void* ptr, size_t capacity, const T* items, unsigned num) const { const size_t bytes_written = sizeof(T) * num; check_memory_size(bytes_written, capacity); memcpy(ptr, items, bytes_written); return bytes_written; } + + /// @copydoc serde::deserialize(const void*,size_t,T*,unsigned) const size_t deserialize(const void* ptr, size_t capacity, T* items, unsigned num) const { const size_t bytes_read = sizeof(T) * num; check_memory_size(bytes_read, capacity); memcpy(items, ptr, bytes_read); return bytes_read; } + + /// @copydoc serde::size_of_item + size_t size_of_item(const T& item) const { + unused(item); + return sizeof(T); + } }; -// serde for std::string items -// This should produce sketches binary-compatible with -// ItemsSketch<String> with ArrayOfStringsSerDe in Java. -// The length of each string is stored as a 32-bit integer (historically), -// which may be too wasteful. Treat this as an example. +/// serde for std::string items. +/// This should produce sketches binary-compatible with +/// ItemsSketch<String> with ArrayOfStringsSerDe in Java. +/// The length of each string is stored as a 32-bit integer (historically), +/// which may be too wasteful. Treat this as an example. template<> struct serde<std::string> { + /// @copydoc serde::serialize void serialize(std::ostream& os, const std::string* items, unsigned num) const { unsigned i = 0; bool failure = false; @@ -110,6 +151,8 @@ struct serde<std::string> { throw std::runtime_error("error writing to std::ostream at item " + std::to_string(i)); } } + + /// @copydoc serde::deserialize void deserialize(std::istream& is, std::string* items, unsigned num) const { unsigned i = 0; bool failure = false; @@ -137,9 +180,8 @@ struct serde<std::string> { throw std::runtime_error("error reading from std::istream at item " + std::to_string(i)); } } - size_t size_of_item(const std::string& item) const { - return sizeof(uint32_t) + item.size(); - } + + /// @copydoc serde::serialize(void*,size_t,const T*,unsigned) const size_t serialize(void* ptr, size_t capacity, const std::string* items, unsigned num) const { size_t bytes_written = 0; for (unsigned i = 0; i < num; ++i) { @@ -154,6 +196,8 @@ struct serde<std::string> { } return bytes_written; } + + /// @copydoc serde::deserialize(const void*,size_t,T*,unsigned) const size_t deserialize(const void* ptr, size_t capacity, std::string* items, unsigned num) const { size_t bytes_read = 0; unsigned i = 0; @@ -189,6 +233,11 @@ struct serde<std::string> { return bytes_read; } + + /// @copydoc serde::size_of_item + size_t size_of_item(const std::string& item) const { + return sizeof(uint32_t) + item.size(); + } }; } /* namespace datasketches */ diff --git a/count/include/count_min.hpp b/count/include/count_min.hpp index b394b75..7884653 100644 --- a/count/include/count_min.hpp +++ b/count/include/count_min.hpp @@ -25,26 +25,27 @@ namespace datasketches { - /* - * C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan. - * [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf - * The template type W is the type of the vector that contains the weights of the objects inserted into the sketch, - * not the type of the input items themselves. - * @author Charlie Dickens - */ - +/** + * C++ implementation of the CountMin sketch data structure of Cormode and Muthukrishnan. + * [1] - http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf + * The template type W is the type of the vector that contains the weights of the objects inserted into the sketch, + * not the type of the input items themselves. + * @author Charlie Dickens + */ template <typename W, typename Allocator = std::allocator<W>> class count_min_sketch{ static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected"); public: using allocator_type = Allocator; + using const_iterator = typename std::vector<W, Allocator>::const_iterator; /** * Creates an instance of the sketch given parameters _num_hashes, _num_buckets and hash seed, `seed`. * @param num_hashes : number of hash functions in the sketch. Equivalently the number of rows in the array * @param num_buckets : number of buckets that hash functions map into. Equivalently the number of columns in the array * @param seed for hash function + * @param allocator to use by this instance * * The items inserted into the sketch can be arbitrary type, so long as they are hashable via murmurhash. * Only update and estimate methods are added for uint64_t and string types. @@ -79,21 +80,23 @@ public: */ W get_total_weight() const; - /* - * @param relative_error : double -- the desired accuracy within which estimates should lie. + /** + * Suggests the number of buckets required to achieve the given relative error + * @param relative_error the desired accuracy within which estimates should lie. * For example, when relative_error = 0.05, then the returned frequency estimates satisfy the * `relative_error` guarantee that never overestimates the weights but may underestimate the weights * by 5% of the total weight in the sketch. - * @return number_of_buckets : the number of hash buckets at every level of the + * @return the number of hash buckets at every level of the * sketch required in order to obtain the specified relative error. * [1] - Section 3 ``Data Structure'', page 6. */ static uint32_t suggest_num_buckets(double relative_error); - /* - * @param confidence : double -- the desired confidence with which estimates should be correct. + /** + * Suggests the number of hash functions required to achieve the given confidence + * @param confidence the desired confidence with which estimates should be correct. * For example, with 95% confidence, frequency estimates satisfy the `relative_error` guarantee. - * @return number_of_hashes : the number of hash functions that are required in + * @return the number of hash functions that are required in * order to achieve the specified confidence of the sketch. * confidence = 1 - delta, with delta denoting the sketch failure probability in the literature. * [1] - Section 3 ``Data Structure'', page 6. @@ -111,7 +114,7 @@ public: /** * Specific get_estimate function for int64_t type * see generic get_estimate function - * @param item : uint64_t type. + * @param item : int64_t type. * @return an estimate of the item's frequency. */ W get_estimate(int64_t item) const; @@ -127,8 +130,8 @@ public: /** * This is the generic estimate query function for any of the given datatypes. * Query the sketch for the estimate of a given item. - * @param item : pointer to the data item to be query from the sketch. - * @param size : size_t + * @param item pointer to the data item to be query from the sketch. + * @param size size of the item in bytes * @return the estimated frequency of the item denoted f_est satisfying * f_true - relative_error*_total_weight <= f_est <= f_true */ @@ -136,60 +139,106 @@ public: /** * Query the sketch for the upper bound of a given item. - * @param item : uint64_t or std::string to query + * @param item to query + * @param size of the item in bytes * @return the upper bound on the true frequency of the item * f_true <= f_est + relative_error*_total_weight */ W get_upper_bound(const void* item, size_t size) const; - W get_upper_bound(int64_t) const; - W get_upper_bound(uint64_t) const; + + /** + * Query the sketch for the upper bound of a given item. + * @param item to query + * @return the upper bound on the true frequency of the item + * f_true <= f_est + relative_error*_total_weight + */ + W get_upper_bound(int64_t item) const; + + /** + * Query the sketch for the upper bound of a given item. + * @param item to query + * @return the upper bound on the true frequency of the item + * f_true <= f_est + relative_error*_total_weight + */ + W get_upper_bound(uint64_t item) const; + + /** + * Query the sketch for the upper bound of a given item. + * @param item to query + * @return the upper bound on the true frequency of the item + * f_true <= f_est + relative_error*_total_weight + */ W get_upper_bound(const std::string& item) const; /** * Query the sketch for the lower bound of a given item. - * @param item : uint64_t or std::string to query + * @param item to query + * @param size of the item in bytes * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item * f_true - relative_error*_total_weight <= f_est */ W get_lower_bound(const void* item, size_t size) const; - W get_lower_bound(int64_t) const; - W get_lower_bound(uint64_t) const; + + /** + * Query the sketch for the lower bound of a given item. + * @param item to query + * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item + * f_true - relative_error*_total_weight <= f_est + */ + W get_lower_bound(int64_t item) const; + + /** + * Query the sketch for the lower bound of a given item. + * @param item to query + * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item + * f_true - relative_error*_total_weight <= f_est + */ + W get_lower_bound(uint64_t item) const; + + /** + * Query the sketch for the lower bound of a given item. + * @param item to query + * @return the lower bound for the query result, f_est, on the true frequency, f_est of the item + * f_true - relative_error*_total_weight <= f_est + */ W get_lower_bound(const std::string& item) const; - /* + /** * Update this sketch with given data of any type. - * This is a "universal" update that covers all cases above, - * but may produce different hashes. + * This is a "universal" update that covers all cases, + * but may produce different hashes compared to specialized update methods. * @param item pointer to the data item to be inserted into the sketch. * @param size of the data in bytes - * @return vector of uint64_t which each represent the index to which `value' must update in the sketch + * @param weight arithmetic type */ void update(const void* item, size_t size, W weight); /** - * Update this sketch with a given uint64_t item. - * @param item : uint64_t to update the sketch with - * @param weight : arithmetic type - * void function which inserts an item of type uint64_t into the sketch + * Update this sketch with a given item. + * @param item to update the sketch with + * @param weight arithmetic type */ - void update(uint64_t item, W weight); - void update(uint64_t item); - void update(int64_t item, W weight); - void update(int64_t item); + void update(uint64_t item, W weight = 1); + + /** + * Update this sketch with a given item. + * @param item to update the sketch with + * @param weight arithmetic type + */ + void update(int64_t item, W weight = 1); /** * Update this sketch with a given string. - * @param item : string to update the sketch with - * @param weight : arithmetic type - * void function which inserts an item of type std::string into the sketch + * @param item string to update the sketch with + * @param weight arithmetic type */ - void update(const std::string& item, W weight); - void update(const std::string& item); + void update(const std::string& item, W weight = 1); - /* - * merges a separate count_min_sketch into this count_min_sketch. + /** + * Merges another count_min_sketch into this count_min_sketch. + * @param other_sketch */ - void merge(const count_min_sketch &other_sketch); + void merge(const count_min_sketch& other_sketch); /** * Returns true if this sketch is empty. @@ -205,15 +254,23 @@ public: */ string<Allocator> to_string() const; - // Iterators - using const_iterator = typename std::vector<W, Allocator>::const_iterator; + /** + * Iterator pointing to the first item in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first item in the sketch + */ const_iterator begin() const; - const_iterator end() const; /** - * This method serializes the sketch into a given stream in a binary form - * @param os output stream - * The byte output has the following structure + * Iterator pointing to the past-the-end item in the sketch. + * The past-the-end item is the hypothetical item that would follow the last item. + * It does not point to any item, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end item in the sketch + */ + const_iterator end() const; + + /* + * The serialized sketch binary form has the following structure * Byte 0: * 1 - if and only if the sketch is empty * 0 - otherwise @@ -254,8 +311,6 @@ public: ||---------------------------- sketch entries ---------------------------| ... - * - * */ @@ -266,7 +321,8 @@ public: size_t get_serialized_size_bytes() const; /** - * This method serializes a binary image of the sketch to an output stream. + * This method serializes the sketch into a given stream in a binary form + * @param os output stream */ void serialize(std::ostream& os) const; @@ -287,6 +343,7 @@ public: * This method deserializes a sketch from a given stream. * @param is input stream * @param seed the seed for the hash function that was used to create the sketch + * @param allocator instance of an Allocator * @return an instance of a sketch */ static count_min_sketch deserialize(std::istream& is, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator()); @@ -296,12 +353,12 @@ public: * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch + * @param allocator instance of an Allocator * @return an instance of the sketch */ static count_min_sketch deserialize(const void* bytes, size_t size, uint64_t seed=DEFAULT_SEED, const Allocator& allocator = Allocator()); /** - * Returns the allocator for this sketch. * @return allocator */ allocator_type get_allocator() const; @@ -331,9 +388,6 @@ private: */ static void check_header_validity(uint8_t preamble_longs, uint8_t serial_version, uint8_t family_id, uint8_t flags_byte); - - - /* * Obtain the hash values when inserting an item into the sketch. * @param item pointer to the data item to be inserted into the sketch. diff --git a/count/include/count_min_impl.hpp b/count/include/count_min_impl.hpp index 568debc..45376e7 100644 --- a/count/include/count_min_impl.hpp +++ b/count/include/count_min_impl.hpp @@ -169,33 +169,17 @@ void count_min_sketch<W,A>::update(uint64_t item, W weight) { update(&item, sizeof(item), weight); } -template<typename W, typename A> -void count_min_sketch<W,A>::update(uint64_t item) { - update(&item, sizeof(item), 1); -} - template<typename W, typename A> void count_min_sketch<W,A>::update(int64_t item, W weight) { update(&item, sizeof(item), weight); } -template<typename W, typename A> -void count_min_sketch<W,A>::update(int64_t item) { - update(&item, sizeof(item), 1); -} - template<typename W, typename A> void count_min_sketch<W,A>::update(const std::string& item, W weight) { if (item.empty()) return; update(item.c_str(), item.length(), weight); } -template<typename W, typename A> -void count_min_sketch<W,A>::update(const std::string& item) { - if (item.empty()) return; - update(item.c_str(), item.length(), 1); -} - template<typename W, typename A> void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) { /* diff --git a/cpc/include/cpc_sketch.hpp b/cpc/include/cpc_sketch.hpp index c000ac9..164c33a 100644 --- a/cpc/include/cpc_sketch.hpp +++ b/cpc/include/cpc_sketch.hpp @@ -33,58 +33,56 @@ namespace datasketches { -/* - * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch - * - * This is a very compact (in serialized form) distinct counting sketch. - * The theory is described in the following paper: - * https://arxiv.org/abs/1708.06839 - * - * author Kevin Lang - * author Alexander Saydakov - */ - // forward-declarations template<typename A> class cpc_sketch_alloc; template<typename A> class cpc_union_alloc; -// alias with default allocator for convenience +/// CPC sketch alias with default allocator using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>; -// allocation and initialization of global decompression (decoding) tables -// call this before anything else if you want to control the initialization time -// for instance, to have this happen outside of a transaction context -// otherwise initialization happens on the first use (serialization or deserialization) -// it is safe to call more than once assuming no race conditions -// this is not thread safe! neither is the rest of the library +/** + * Allocation and initialization of global decompression (decoding) tables. + * Call this before anything else if you want to control the initialization time. + * For instance, to have this happen outside of a transaction context. + * Otherwise initialization happens on the first use (serialization or deserialization). + * It is safe to call more than once assuming no race conditions. + * This is not thread safe! Neither is the rest of the library. + */ template<typename A> void cpc_init(); +/** + * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch + * + * This is a very compact (in serialized form) distinct counting sketch. + * The theory is described in the following paper: + * https://arxiv.org/abs/1708.06839 + * + * @author Kevin Lang + * @author Alexander Saydakov + */ template<typename A> class cpc_sketch_alloc { public: + using allocator_type = A; + /** * Creates an instance of the sketch given the lg_k parameter and hash seed. * @param lg_k base 2 logarithm of the number of bins in the sketch * @param seed for hash function + * @param allocator instance of an allocator */ explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); - using allocator_type = A; + /// @return allocator A get_allocator() const; - /** - * @return configured lg_k of this sketch - */ + /// @return configured lg_k of this sketch uint8_t get_lg_k() const; - /** - * @return true if this sketch represents an empty set - */ + /// @return true if this sketch represents an empty set bool is_empty() const; - /** - * @return estimate of the distinct count of the input stream - */ + /// @return estimate of the distinct count of the input stream double get_estimate() const; /** @@ -189,13 +187,14 @@ public: * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected - * @param data pointer to the data - * @param length of the data in bytes + * @param value pointer to the data + * @param size of the data in bytes */ void update(const void* value, size_t size); /** * Returns a human-readable summary of this sketch + * @return a human-readable summary of this sketch */ string<A> to_string() const; @@ -215,6 +214,7 @@ public: * It is an uninitialized space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch + * @return serialized sketch as a vector of bytes */ vector_bytes serialize(unsigned header_size_bytes = 0) const; @@ -222,6 +222,7 @@ public: * This method deserializes a sketch from a given stream. * @param is input stream * @param seed the seed for the hash function that was used to create the sketch + * @param allocator instance of an Allocator * @return an instance of a sketch */ static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); @@ -231,6 +232,7 @@ public: * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch + * @param allocator instance of an Allocator * @return an instance of the sketch */ static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); @@ -246,10 +248,10 @@ public: */ static size_t get_max_serialized_size_bytes(uint8_t lg_k); - // for internal use + /// @private for internal use uint32_t get_num_coupons() const; - // for debugging + /// @private for debugging // this should catch some forms of corruption during serialization-deserialization bool validate() const; diff --git a/cpc/include/cpc_union.hpp b/cpc/include/cpc_union.hpp index 54fbed5..f380fb7 100644 --- a/cpc/include/cpc_union.hpp +++ b/cpc/include/cpc_union.hpp @@ -27,16 +27,15 @@ namespace datasketches { -/* +/// CPC union alias with default allocator +using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>; + +/** * High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union * * author Kevin Lang * author Alexander Saydakov */ - -// alias with default allocator for convenience -using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>; - template<typename A> class cpc_union_alloc { public: @@ -44,14 +43,36 @@ public: * Creates an instance of the union given the lg_k parameter and hash seed. * @param lg_k base 2 logarithm of the number of bins in the sketch * @param seed for hash function + * @param allocator instance of an allocator */ explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A()); + /** + * Copy constructor + * @param other union to be copied + */ cpc_union_alloc(const cpc_union_alloc<A>& other); + + /** + * Move constructor + * @param other union to be moved + */ cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept; + ~cpc_union_alloc(); + /** + * Copy assignment + * @param other union to be copied + * @return reference to this union + */ cpc_union_alloc<A>& operator=(const cpc_union_alloc<A>& other); + + /** + * Move assignment + * @param other union to be moved + * @return reference to this union + */ cpc_union_alloc<A>& operator=(cpc_union_alloc<A>&& other) noexcept; /** @@ -73,9 +94,9 @@ public: cpc_sketch_alloc<A> get_result() const; private: - typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> AllocU8; - typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64; - typedef typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>> AllocCpc; + using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; + using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>; + using AllocCpc = typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>>; uint8_t lg_k; uint64_t seed; diff --git a/density/include/density_sketch.hpp b/density/include/density_sketch.hpp index 734f477..e5674c6 100755 --- a/density/include/density_sketch.hpp +++ b/density/include/density_sketch.hpp @@ -28,15 +28,6 @@ #include "common_defs.hpp" -/* - * Based on the following paper: - * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning" - * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf - * - * Inspired by the following implementation: - * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py - */ - namespace datasketches { template<typename T> @@ -46,6 +37,18 @@ struct gaussian_kernel { } }; +/** + * Density sketch. + * + * Builds a coreset from the given set of input points. Provides density estimate at a given point. + * + * Based on the following paper: + * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning" + * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf + * + * Inspired by the following implementation: + * https://github.com/edoliberty/streaming-quantiles/blob/f688c8161a25582457b0a09deb4630a81406293b/gde.py + */ template< typename T, typename Kernel = gaussian_kernel<T>, @@ -118,6 +121,10 @@ public: template<typename FwdSketch> void merge(FwdSketch&& other); + /** + * Density estimate at a given point + * @return density estimate at a given point + */ T get_estimate(const std::vector<T>& point) const; /** @@ -172,7 +179,20 @@ public: string<Allocator> to_string(bool print_levels = false, bool print_items = false) const; class const_iterator; + + /** + * Iterator pointing to the first item in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first item in the sketch + */ const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end item in the sketch. + * The past-the-end item is the hypothetical item that would follow the last item. + * It does not point to any item, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end item in the sketch + */ const_iterator end() const; private: diff --git a/fi/include/frequent_items_sketch.hpp b/fi/include/frequent_items_sketch.hpp index 02a0854..beec930 100644 --- a/fi/include/frequent_items_sketch.hpp +++ b/fi/include/frequent_items_sketch.hpp @@ -32,15 +32,15 @@ namespace datasketches { -/* +enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES }; + +/** + * Frequent Items sketch. + * * Based on Java implementation here: * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java - * author Alexander Saydakov + * @author Alexander Saydakov */ - -enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES }; - -// type W for weight must be an arithmetic type (integral or floating point) template< typename T, typename W = uint64_t, @@ -49,6 +49,7 @@ template< typename A = std::allocator<T> > class frequent_items_sketch { + static_assert(std::is_arithmetic<W>::value, "Arithmetic type expected"); public: static const uint8_t LG_MIN_MAP_SIZE = 3; @@ -194,7 +195,7 @@ public: * There may be items omitted from the set with true frequencies greater than the * threshold (false negatives).</p> * - * @param error_type determines whether no false positives or no false negatives are desired. + * @param err_type determines whether no false positives or no false negatives are desired. * @return an array of frequent items */ vector_row get_frequent_items(frequent_items_error_type err_type) const; @@ -217,7 +218,7 @@ public: * There may be items omitted from the set with true frequencies greater than the * threshold (false negatives).</p> * - * @param error_type determines whether no false positives or no false negatives are desired. + * @param err_type determines whether no false positives or no false negatives are desired. * @param threshold to include items in the result list * @return an array of frequent items */ @@ -318,14 +319,19 @@ private: class items_deleter; }; +/// Row in the output from #get_frequent_items template<typename T, typename W, typename H, typename E, typename A> class frequent_items_sketch<T, W, H, E, A>::row { public: row(const T* item, W weight, W offset): item(item), weight(weight), offset(offset) {} + /// @return item const T& get_item() const { return *item; } + /// @return frequency (weight) estimate W get_estimate() const { return weight + offset; } + /// @return estimate lower bound W get_lower_bound() const { return weight; } + /// @return estimate upper bound W get_upper_bound() const { return weight + offset; } private: const T* item; diff --git a/hll/include/HllSketch-internal.hpp b/hll/include/HllSketch-internal.hpp index f3b99c1..dfd7e43 100644 --- a/hll/include/HllSketch-internal.hpp +++ b/hll/include/HllSketch-internal.hpp @@ -94,14 +94,14 @@ hll_sketch_alloc<A>::hll_sketch_alloc(HllSketchImpl<A>* that) : {} template<typename A> -hll_sketch_alloc<A> hll_sketch_alloc<A>::operator=(const hll_sketch_alloc<A>& other) { +hll_sketch_alloc<A>& hll_sketch_alloc<A>::operator=(const hll_sketch_alloc<A>& other) { sketch_impl->get_deleter()(sketch_impl); sketch_impl = other.sketch_impl->copy(); return *this; } template<typename A> -hll_sketch_alloc<A> hll_sketch_alloc<A>::operator=(hll_sketch_alloc<A>&& other) { +hll_sketch_alloc<A>& hll_sketch_alloc<A>::operator=(hll_sketch_alloc<A>&& other) { std::swap(sketch_impl, other.sketch_impl); return *this; } diff --git a/hll/include/hll.hpp b/hll/include/hll.hpp index 63b9f4f..1417c82 100644 --- a/hll/include/hll.hpp +++ b/hll/include/hll.hpp @@ -29,40 +29,6 @@ #include <vector> namespace datasketches { - - /** - * This is a high performance implementation of Phillipe Flajolet’s HLL sketch but with - * significantly improved error behavior. If the ONLY use case for sketching is counting - * uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for - * storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to - * 16 times smaller than the Theta sketch family for the same accuracy. - * - * <p>This implementation offers three different types of HLL sketch, each with different - * trade-offs with accuracy, space and performance. These types are specified with the - * {@link TgtHllType} parameter. - * - * <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error - * distribution as a function of <i>n</i>, the number of unique values fed to the sketch. - * The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>, - * where <i>K</i> is the number of buckets or slots for the sketch. - * - * <p>During warmup, when the sketch has only received a small number of unique items - * (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator - * algorithms with significantly better accuracy. - * - * <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object - * created by the user, the sketch will perform all of its updates and internal phase transitions - * in that object, which can actually reside either on-heap or off-heap based on how it is - * configured. In large systems that must update and merge many millions of sketches, having the - * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches - * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection - * delays. - * - * author Jon Malkin - * author Lee Rhodes - * author Kevin Lang - */ - /** * Specifies the target type of HLL sketch to be created. It is a target in that the actual @@ -109,6 +75,39 @@ class hll_union_alloc; template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>; +/** + * This is a high performance implementation of Phillipe Flajolet's HLL sketch but with + * significantly improved error behavior. If the ONLY use case for sketching is counting + * uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for + * storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to + * 16 times smaller than the Theta sketch family for the same accuracy. + * + * <p>This implementation offers three different types of HLL sketch, each with different + * trade-offs with accuracy, space and performance. These types are specified with the + * {@link target_hll_type} parameter. + * + * <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error + * distribution as a function of <i>n</i>, the number of unique values fed to the sketch. + * The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>, + * where <i>K</i> is the number of buckets or slots for the sketch. + * + * <p>During warmup, when the sketch has only received a small number of unique items + * (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator + * algorithms with significantly better accuracy. + * + * <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object + * created by the user, the sketch will perform all of its updates and internal phase transitions + * in that object, which can actually reside either on-heap or off-heap based on how it is + * configured. In large systems that must update and merge many millions of sketches, having the + * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches + * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection + * delays. + * + * author Jon Malkin + * author Lee Rhodes + * author Kevin Lang + */ + template<typename A = std::allocator<uint8_t> > class hll_sketch_alloc final { public: @@ -119,27 +118,33 @@ class hll_sketch_alloc final { * @param start_full_size Indicates whether to start in HLL mode, * keeping memory use constant (if HLL_6 or HLL_8) at the cost of * starting out using much more memory + * @param allocator instance of an Allocator */ explicit hll_sketch_alloc(uint8_t lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false, const A& allocator = A()); /** * Copy constructor + * @param that sketch to be copied */ hll_sketch_alloc(const hll_sketch_alloc<A>& that); /** * Copy constructor to a new target type + * @param that sketch to be copied + * @param tgt_type target_hll_type */ hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type); /** * Move constructor + * @param that sketch to be moved */ hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept; /** * Reconstructs a sketch from a serialized image on a stream. * @param is An input stream with a binary image of a sketch + * @param allocator instance of an Allocator */ static hll_sketch_alloc deserialize(std::istream& is, const A& allocator = A()); @@ -147,17 +152,26 @@ class hll_sketch_alloc final { * Reconstructs a sketch from a serialized image in a byte array. * @param bytes An input array with a binary image of a sketch * @param len Length of the input array, in bytes + * @param allocator instance of an Allocator */ static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A()); //! Class destructor virtual ~hll_sketch_alloc(); - //! Copy assignment operator - hll_sketch_alloc operator=(const hll_sketch_alloc<A>& other); + /** + * Copy assignment operator + * @param other sketch to be copied + * @return reference to this sketch + */ + hll_sketch_alloc& operator=(const hll_sketch_alloc<A>& other); - //! Move assignment operator - hll_sketch_alloc operator=(hll_sketch_alloc<A>&& other); + /** + * Move assignment operator + * @param other sketch to be moved + * @return reference to this sketch + */ + hll_sketch_alloc& operator=(hll_sketch_alloc<A>&& other); /** * Resets the sketch to an empty state in coupon collection mode. @@ -165,18 +179,20 @@ class hll_sketch_alloc final { */ void reset(); - typedef vector_u8<A> vector_bytes; // alias for users + using vector_bytes = vector_u8<A>; // alias for users /** * Serializes the sketch to a byte array, compacting data structures * where feasible to eliminate unused storage in the serialized image. * @param header_size_bytes Allows for PostgreSQL integration + * @return serialized sketch in binary form */ vector_bytes serialize_compact(unsigned header_size_bytes = 0) const; /** * Serializes the sketch to a byte array, retaining all internal * data structures in their current form. + * @return serialized sketch in binary form */ vector_bytes serialize_updatable() const; @@ -413,8 +429,8 @@ class hll_sketch_alloc final { * <p>Although the API for this union operator parallels many of the methods of the * <i>HllSketch</i>, the behavior of the union operator has some fundamental differences. * - * <p>First, the user cannot specify the #tgt_hll_type as an input parameter. - * Instead, it is specified for the sketch returned with #get_result(tgt_hll_tyope). + * <p>First, the user cannot specify the #target_hll_type as an input parameter. + * Instead, it is specified for the sketch returned with #get_result. * * <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can * change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen. @@ -423,7 +439,6 @@ class hll_sketch_alloc final { * author Lee Rhodes * author Kevin Lang */ - template<typename A = std::allocator<uint8_t> > class hll_union_alloc { public: @@ -431,6 +446,7 @@ class hll_union_alloc { * Construct an hll_union operator with the given maximum log2 of k. * @param lg_max_k The maximum size, in log2, of k. The value must * be between 7 and 21, inclusive. + * @param allocator instance of an Allocator */ explicit hll_union_alloc(uint8_t lg_max_k, const A& allocator = A()); @@ -495,7 +511,7 @@ class hll_union_alloc { /** * Returns the result of this union operator with the specified - * #tgt_hll_type. + * #target_hll_type. * @param tgt_type The tgt_hll_type enum value of the desired result (Default: HLL_4) * @return The result of this union with the specified tgt_hll_type */ @@ -629,11 +645,11 @@ class hll_union_alloc { hll_sketch_alloc<A> gadget_; }; -/// convenience alias for hll_sketch with default allocator -typedef hll_sketch_alloc<> hll_sketch; +/// HLL sketch alias with default allocator +using hll_sketch = hll_sketch_alloc<std::allocator<uint8_t>>; -/// convenience alias for hll_union with default allocator -typedef hll_union_alloc<> hll_union; +/// HLL union alias with default allocator +using hll_union = hll_union_alloc<std::allocator<uint8_t>>; } // namespace datasketches diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp index 560f097..e9fc73e 100644 --- a/kll/include/kll_sketch.hpp +++ b/kll/include/kll_sketch.hpp @@ -29,7 +29,13 @@ namespace datasketches { -/* +/// KLL sketch constants +namespace kll_constants { + /// default value of parameter K + const uint16_t DEFAULT_K = 200; +} + +/** * Implementation of a very compact quantiles sketch with lazy compaction scheme * and nearly optimal accuracy per retained item. * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>. @@ -146,10 +152,6 @@ namespace datasketches { * author Lee Rhodes */ -namespace kll_constants { - const uint16_t DEFAULT_K = 200; -} - template < typename T, typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare) @@ -160,6 +162,13 @@ class kll_sketch { using value_type = T; using comparator = C; using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>; + using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double; + + /** + * Quantile return type. + * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types) + */ + using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type; static const uint8_t DEFAULT_M = 8; static const uint16_t MIN_K = DEFAULT_M; @@ -262,7 +271,6 @@ class kll_sketch { * * @return approximate quantile associated with the given rank */ - using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type; quantile_return_type get_quantile(double rank, bool inclusive = true) const; /** @@ -339,7 +347,6 @@ class kll_sketch { * @return an array of m+1 doubles each of which is an approximation * to the fraction of the input stream items (the mass) that fall into one of those intervals. */ - using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double; vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const; /** @@ -489,9 +496,26 @@ class kll_sketch { string<A> to_string(bool print_levels = false, bool print_items = false) const; class const_iterator; + + /** + * Iterator pointing to the first item in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first item in the sketch + */ const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end item in the sketch. + * The past-the-end item is the hypothetical item that would follow the last item. + * It does not point to any item, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end item in the sketch + */ const_iterator end() const; + /** + * Gets the sorted view of this sketch + * @return the sorted view of this sketch + */ quantiles_sorted_view<T, C, A> get_sorted_view() const; private: diff --git a/quantiles/include/quantiles_sketch.hpp b/quantiles/include/quantiles_sketch.hpp index 469dc72..6f728db 100644 --- a/quantiles/include/quantiles_sketch.hpp +++ b/quantiles/include/quantiles_sketch.hpp @@ -30,6 +30,16 @@ namespace datasketches { +/// Constants for Quantiles sketch +namespace quantiles_constants { + /// default value of parameter K + const uint16_t DEFAULT_K = 128; + /// minimum value of parameter K + const uint16_t MIN_K = 2; + /// maximum value of parameter K + const uint16_t MAX_K = 1 << 15; +} + /** * This is a stochastic streaming sketch that enables near-real time analysis of the * approximate distribution from a very large stream in a single pass. @@ -136,13 +146,6 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error: * @author Alexander Saydakov * @author Jon Malkin */ - -namespace quantiles_constants { - const uint16_t DEFAULT_K = 128; - const uint16_t MIN_K = 2; - const uint16_t MAX_K = 1 << 15; -} - template <typename T, typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare) typename Allocator = std::allocator<T>> @@ -151,13 +154,43 @@ public: using value_type = T; using allocator_type = Allocator; using comparator = Comparator; + using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type; + using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double; + /** + * Constructor + * @param k + * @param comparator + * @param allocator + */ explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); + + /** + * Copy constructor + * @param other sketch to be copied + */ quantiles_sketch(const quantiles_sketch& other); + + /** Move constructor + * @param other sketch to be moved + */ quantiles_sketch(quantiles_sketch&& other) noexcept; + ~quantiles_sketch(); + + /** + * Copy assignment + * @param other sketch to be copied + * @return reference to this sketch + */ quantiles_sketch& operator=(const quantiles_sketch& other); + + /** + * Move assignment + * @param other sketch to be moved + * @return reference to this sketch + */ quantiles_sketch& operator=(quantiles_sketch&& other) noexcept; /** @@ -247,10 +280,11 @@ public: * If the sketch is empty this throws std::runtime_error. * * @param rank the specified normalized rank in the hypothetical sorted stream. - * + * @param inclusive if true the weight of the given item is included into the rank. + * Otherwise the rank equals the sum of the weights of all items that are less than the given item + * according to the Comparator. * @return the approximation to the item at the given rank */ - using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type; quantile_return_type get_quantile(double rank, bool inclusive = true) const; /** @@ -264,7 +298,9 @@ public: * @param ranks given array of normalized ranks in the hypothetical sorted stream. * These ranks must be in the interval [0.0, 1.0], inclusive. * @param size the number of ranks in the array - * + * @param inclusive if true the weight of the given item is included into the rank. + * Otherwise the rank equals the sum of the weights of all items that are less than the given item + * according to the Comparator. * @return array of approximations to items associated with given ranks in the same order as given ranks * in the input array. * @@ -282,7 +318,9 @@ public: * This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]). * A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to * get_quantiles([0, 0.5, 1]), etc. - * + * @param inclusive if true the weight of the given item is included into the rank. + * Otherwise the rank equals the sum of the weights of all items that are less than the given item + * according to the Comparator. * @return array of approximations to items associated with the given number of evenly-spaced normalized ranks. * * Deprecated. Will be removed in the next major version. Use get_quantile() instead. @@ -300,7 +338,7 @@ public: * @param item to be ranked * @param inclusive if true the weight of the given item is included into the rank. * Otherwise the rank equals the sum of the weights of all items that are less than the given item - * according to the comparator C. + * according to the Comparator. * @return an approximate normalized rank of the given item */ double get_rank(const T& item, bool inclusive = true) const; @@ -327,7 +365,6 @@ public: * @return an array of m+1 doubles each of which is an approximation * to the fraction of the input stream items (the mass) that fall into one of those intervals. */ - using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double; vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const; /** @@ -451,9 +488,26 @@ public: string<Allocator> to_string(bool print_levels = false, bool print_items = false) const; class const_iterator; + + /** + * Iterator pointing to the first item in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first item in the sketch + */ const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end item in the sketch. + * The past-the-end item is the hypothetical item that would follow the last item. + * It does not point to any item, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end item in the sketch + */ const_iterator end() const; + /** + * Gets the sorted view of this sketch + * @return the sorted view of this sketch + */ quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const; private: diff --git a/req/include/req_common.hpp b/req/include/req_common.hpp index 1ea4b41..aaead87 100755 --- a/req/include/req_common.hpp +++ b/req/include/req_common.hpp @@ -27,10 +27,14 @@ namespace datasketches { +/// REQ sketch constants namespace req_constants { - static const uint16_t MIN_K = 4; - static const uint8_t INIT_NUM_SECTIONS = 3; - static const unsigned MULTIPLIER = 2; + /// minimum value of parameter K + const uint16_t MIN_K = 4; + /// initial number of sections + const uint8_t INIT_NUM_SECTIONS = 3; + /// multiplier for nominal capacity + const unsigned MULTIPLIER = 2; } } /* namespace datasketches */ diff --git a/req/include/req_sketch.hpp b/req/include/req_sketch.hpp index 29658bf..99b4524 100755 --- a/req/include/req_sketch.hpp +++ b/req/include/req_sketch.hpp @@ -28,6 +28,48 @@ namespace datasketches { +/** + * Relative Error Quantiles Sketch. + * This is an implementation based on the paper + * "Relative Error Streaming Quantiles" by Graham Cormode, Zohar Karnin, Edo Liberty, + * Justin Thaler, Pavel Veselý, and loosely derived from a Python prototype written by Pavel Veselý. + * + * <p>Reference: https://arxiv.org/abs/2004.01668</p> + * + * <p>This implementation differs from the algorithm described in the paper in the following:</p> + * + * <ul> + * <li>The algorithm requires no upper bound on the stream length. + * Instead, each relative-compactor counts the number of compaction operations performed + * so far (via variable state). Initially, the relative-compactor starts with INIT_NUMBER_OF_SECTIONS. + * Each time the number of compactions (variable state) exceeds 2^{numSections - 1}, we double + * numSections. Note that after merging the sketch with another one variable state may not correspond + * to the number of compactions performed at a particular level, however, since the state variable + * never exceeds the number of compactions, the guarantees of the sketch remain valid.</li> + * + * <li>The size of each section (variable k and section_size in the code and parameter k in + * the paper) is initialized with a number set by the user via variable k. + * When the number of sections doubles, we decrease section_size by a factor of sqrt(2). + * This is applied at each level separately. Thus, when we double the number of sections, the + * nominal compactor size increases by a factor of approx. sqrt(2) (+/- rounding).</li> + * + * <li>The merge operation here does not perform "special compactions", which are used in the paper + * to allow for a tight mathematical analysis of the sketch.</li> + * </ul> + * + * <p>This implementation provides a number of capabilities not discussed in the paper or provided + * in the Python prototype.</p> + * + * <ul><li>The Python prototype only implemented high accuracy for low ranks. This implementation + * provides the user with the ability to choose either high rank accuracy or low rank accuracy at + * the time of sketch construction.</li> + * <li>The Python prototype only implemented a comparison criterion of "INCLUSIVE". This implementation + * allows the user to use both the "INCLUSIVE" criterion and the "EXCLUSIVE" criterion.</li> + * <li>This implementation provides extensive debug visibility into the operation of the sketch with + * two levels of detail output. This is not only useful for debugging, but is a powerful tool to + * help users understand how the sketch works.</li> + * </ul> + */ template< typename T, typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare) @@ -39,6 +81,13 @@ public: using comparator = Comparator; using Compactor = req_compactor<T, Comparator, Allocator>; using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>; + using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double; + + /** + * Quantile return type. + * This is to return quantiles either by value (for arithmetic types) or by const reference (for all other types) + */ + using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type; /** * Constructor @@ -46,19 +95,41 @@ public: * Value of 12 roughly corresponds to 1% relative error guarantee at 95% confidence. * @param hra if true, the default, the high ranks are prioritized for better * accuracy. Otherwise the low ranks are prioritized for better accuracy. - * @param comparator to use by this instance - * @param allocator to use by this instance + * @param comparator instance to use by this sketch instance + * @param allocator instance to use by this sketch instance */ explicit req_sketch(uint16_t k, bool hra = true, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); - ~req_sketch(); + /** + * Copy constructor + * @param other sketch to be copied + */ req_sketch(const req_sketch& other); + + /** + * Move constructor + * @param other sketch to be moved + */ req_sketch(req_sketch&& other) noexcept; + + ~req_sketch(); + + /** + * Copy assignment + * @param other sketch to be copied + * @return reference to this sketch + */ req_sketch& operator=(const req_sketch& other); + + /** + * Move assignment + * @param other sketch to be moved + * @return reference to this sketch + */ req_sketch& operator=(req_sketch&& other); - /* + /** * Type converting constructor. * @param other sketch of a different type * @param comparator instance of a Comparator @@ -177,7 +248,6 @@ public: * @return an array of m+1 doubles each of which is an approximation * to the fraction of the input stream items (the mass) that fall into one of those intervals. */ - using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double; vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const; /** @@ -214,7 +284,6 @@ public: * * @return approximate quantile associated with the given rank */ - using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type; quantile_return_type get_quantile(double rank, bool inclusive = true) const; /** @@ -223,6 +292,7 @@ public: * * @param ranks given array of normalized ranks. * @param size the number of ranks in the array. + * @param inclusive if true, the given rank is considered inclusive (includes weight of an item) * * @return array of quantiles that correspond to the given array of normalized ranks * @@ -333,9 +403,26 @@ public: string<Allocator> to_string(bool print_levels = false, bool print_items = false) const; class const_iterator; + + /** + * Iterator pointing to the first item in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first item in the sketch + */ const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end item in the sketch. + * The past-the-end item is the hypothetical item that would follow the last item. + * It does not point to any item, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end item in the sketch + */ const_iterator end() const; + /** + * Gets the sorted view of this sketch + * @return the sorted view of this sketch + */ quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const; private: diff --git a/sampling/include/var_opt_sketch.hpp b/sampling/include/var_opt_sketch.hpp index 10a7019..40dac80 100644 --- a/sampling/include/var_opt_sketch.hpp +++ b/sampling/include/var_opt_sketch.hpp @@ -26,22 +26,12 @@ #include <iterator> #include <vector> - -/** - * This sketch samples data from a stream of items, designed for optimal (minimum) variance when - * querying the sketch to estimate subset sums of items matchng a provided predicate. Variance - * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for - * subset sum estimation. - * - * author Kevin Lang - * author Jon Malkin - */ namespace datasketches { template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>; -/** +/* * A struct to hold the result of subset sum queries */ struct subset_summary { @@ -53,11 +43,23 @@ struct subset_summary { template <typename T, typename A> class var_opt_union; // forward declaration +/// VarOpt sketch constants namespace var_opt_constants { - const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8; - const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2; + /// default resize factor + const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8; + /// maximum value of parameter K + const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2; } +/** + * This sketch samples data from a stream of items. Designed for optimal (minimum) variance when + * querying the sketch to estimate subset sums of items matching a provided predicate. Variance + * optimal (varopt) sampling is related to reservoir sampling, with improved error bounds for + * subset sum estimation. + * + * author Kevin Lang + * author Jon Malkin + */ template< typename T, typename A = std::allocator<T> @@ -68,15 +70,42 @@ class var_opt_sketch { static const resize_factor DEFAULT_RESIZE_FACTOR = var_opt_constants::DEFAULT_RESIZE_FACTOR; static const uint32_t MAX_K = var_opt_constants::MAX_K; + /** + * Constructor + * @param k sketch size + * @param rf resize factor + * @param allocator instance of an allocator + */ explicit var_opt_sketch(uint32_t k, resize_factor rf = var_opt_constants::DEFAULT_RESIZE_FACTOR, const A& allocator = A()); + + /** + * Copy constructor + * @param other sketch to be copied + */ var_opt_sketch(const var_opt_sketch& other); + + /** + * Move constructor + * @param other sketch to be moved + */ var_opt_sketch(var_opt_sketch&& other) noexcept; ~var_opt_sketch(); + /** + * Copy assignment + * @param other sketch to be copied + * @return reference to this sketch + */ var_opt_sketch& operator=(const var_opt_sketch& other); + + /** + * Move assignment + * @param other sketch to be moved + * @return reference to this sketch + */ var_opt_sketch& operator=(var_opt_sketch&& other); /** @@ -85,7 +114,7 @@ class var_opt_sketch { * @param item an item from a stream of items * @param weight the weight of the item */ - void update(const T& item, double weight=1.0); + void update(const T& item, double weight = 1.0); /** * Updates this sketch with the given data item with the given weight. @@ -93,7 +122,7 @@ class var_opt_sketch { * @param item an item from a stream of items * @param weight the weight of the item */ - void update(T&& item, double weight=1.0); + void update(T&& item, double weight = 1.0); /** * Returns the configured maximum sample size. @@ -117,7 +146,7 @@ class var_opt_sketch { * Computes an estimated subset sum from the entire stream for objects matching a given * predicate. Provides a lower bound, estimate, and upper bound using a target of 2 standard * deviations. This is technically a heuristic method and tries to err on the conservative side. - * @param P a predicate function + * @param predicate a predicate function * @return a subset_summary item with estimate, upper and lower bounds, * and total sketch weight */ @@ -138,7 +167,7 @@ class var_opt_sketch { /** * Computes size needed to serialize the current state of the sketch. * This version is for fixed-size arithmetic types (integral and floating point). - * @param instance of a SerDe + * @param sd instance of a SerDe * @return size in bytes needed to serialize this sketch */ template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0> @@ -147,7 +176,7 @@ class var_opt_sketch { /** * Computes size needed to serialize the current state of the sketch. * This version is for all other types and can be expensive since every item needs to be looked at. - * @param instance of a SerDe + * @param sd instance of a SerDe * @return size in bytes needed to serialize this sketch */ template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0> @@ -155,7 +184,7 @@ class var_opt_sketch { // This is a convenience alias for users // The type returned by the following serialize method - typedef vector_u8<A> vector_bytes; + using vector_bytes = vector_u8<A>; /** * This method serializes the sketch as a vector of bytes. @@ -163,7 +192,7 @@ class var_opt_sketch { * It is a blank space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch - * @param instance of a SerDe + * @param sd instance of a SerDe */ template<typename SerDe = serde<T>> vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const; @@ -171,7 +200,7 @@ class var_opt_sketch { /** * This method serializes the sketch into a given stream in a binary form * @param os output stream - * @param instance of a SerDe + * @param sd instance of a SerDe */ template<typename SerDe = serde<T>> void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; @@ -179,8 +208,8 @@ class var_opt_sketch { /** * This method deserializes a sketch from a given stream. * @param is input stream - * @param instance of a SerDe - * @param instance of an Allocator + * @param sd instance of a SerDe + * @param allocator instance of an allocator * @return an instance of a sketch */ template<typename SerDe = serde<T>> @@ -190,8 +219,8 @@ class var_opt_sketch { * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array - * @param instance of a SerDe - * @param instance of an Allocator + * @param sd instance of a SerDe + * @param allocator instance of an allocator * @return an instance of a sketch */ template<typename SerDe = serde<T>> @@ -213,7 +242,20 @@ class var_opt_sketch { string<A> items_to_string() const; class const_iterator; + + /** + * Iterator pointing to the first item in the sketch. + * If the sketch is empty, the returned iterator must not be dereferenced or incremented. + * @return iterator pointing to the first item in the sketch + */ const_iterator begin() const; + + /** + * Iterator pointing to the past-the-end item in the sketch. + * The past-the-end item is the hypothetical item that would follow the last item. + * It does not point to any item, and must not be dereferenced or incremented. + * @return iterator pointing to the past-the-end item in the sketch + */ const_iterator end() const; private: diff --git a/sampling/include/var_opt_sketch_impl.hpp b/sampling/include/var_opt_sketch_impl.hpp index 8ce66a5..7bf4095 100644 --- a/sampling/include/var_opt_sketch_impl.hpp +++ b/sampling/include/var_opt_sketch_impl.hpp @@ -36,7 +36,7 @@ namespace datasketches { -/** +/* * Implementation code for the VarOpt sketch. * * author Kevin Lang @@ -895,7 +895,7 @@ void var_opt_sketch<T, A>::update_heavy_r_eq1(O&& item, double weight, bool mark grow_candidate_set(weights_[m_slot] + total_wt_r_, 2); } -/** +/* * Decreases sketch's value of k by 1, updating stored values as needed. * * <p>Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by @@ -1685,7 +1685,7 @@ bool var_opt_sketch<T, A>::iterator::get_mark() const { return sk_->marks_ == nullptr ? false : sk_->marks_[idx_]; } -/** +/* * Checks if target sampling allocation is more than 50% of max sampling size. * If so, returns max sampling size, otherwise passes through target size. */ diff --git a/sampling/include/var_opt_union.hpp b/sampling/include/var_opt_union.hpp index 3d00735..f91377e 100644 --- a/sampling/include/var_opt_union.hpp +++ b/sampling/include/var_opt_union.hpp @@ -91,7 +91,7 @@ public: /** * Computes size needed to serialize the current state of the union. * This version is for all other types and can be expensive since every item needs to be looked at. - * @param instance of a SerDe + * @param sd instance of a SerDe * @return size in bytes needed to serialize this sketch */ template<typename SerDe = serde<T>> @@ -108,7 +108,7 @@ public: * It is a blank space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch - * @param instance of a SerDe + * @param sd instance of a SerDe */ template<typename SerDe = serde<T>> vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const; @@ -117,7 +117,7 @@ public: * NOTE: This method may be deprecated in a future version. * This method serializes the sketch into a given stream in a binary form * @param os output stream - * @param instance of a SerDe + * @param sd instance of a SerDe */ template<typename SerDe = serde<T>> void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; @@ -126,8 +126,8 @@ public: * NOTE: This method may be deprecated in a future version. * This method deserializes a union from a given stream. * @param is input stream - * @param instance of a SerDe - * @param instance of an Allocator + * @param sd instance of a SerDe + * @param allocator instance of an Allocator * @return an instance of a union */ template<typename SerDe = serde<T>> @@ -138,8 +138,8 @@ public: * This method deserializes a union from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array - * @param instance of a SerDe - * @param instance of an Allocator + * @param sd instance of a SerDe + * @param allocator instance of an Allocator * @return an instance of a union */ template<typename SerDe = serde<T>> diff --git a/theta/include/bounds_on_ratios_in_sampled_sets.hpp b/theta/include/bounds_on_ratios_in_sampled_sets.hpp index aa44b1f..413d71a 100644 --- a/theta/include/bounds_on_ratios_in_sampled_sets.hpp +++ b/theta/include/bounds_on_ratios_in_sampled_sets.hpp @@ -29,6 +29,7 @@ namespace datasketches { /** + * Bounds on ratios in sampled sets. * This class is used to compute the bounds on the estimate of the ratio <i>|B| / |A|</i>, where: * <ul> * <li><i>|A|</i> is the unknown size of a set <i>A</i> of unique identifiers.</li> diff --git a/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp b/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp index 1779ec1..b62f92c 100644 --- a/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +++ b/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp @@ -28,6 +28,7 @@ namespace datasketches { /** + * Bounds on ratios in Theta sketched sets. * This is to compute the bounds on the estimate of the ratio <i>B / A</i>, where: * <ul> * <li><i>A</i> is a Theta Sketch of population <i>PopA</i>.</li> @@ -50,8 +51,8 @@ class bounds_on_ratios_in_theta_sketched_sets { public: /** * Gets the approximate lower bound for B over A based on a 95% confidence interval - * @param sketchA the sketch A - * @param sketchB the sketch B + * @param sketch_a the sketch A + * @param sketch_b the sketch B * @return the approximate lower bound for B over A */ template<typename SketchA, typename SketchB> @@ -72,8 +73,8 @@ public: /** * Gets the approximate upper bound for B over A based on a 95% confidence interval - * @param sketchA the sketch A - * @param sketchB the sketch B + * @param sketch_a the sketch A + * @param sketch_b the sketch B * @return the approximate upper bound for B over A */ template<typename SketchA, typename SketchB> @@ -94,8 +95,8 @@ public: /** * Gets the estimate for B over A - * @param sketchA the sketch A - * @param sketchB the sketch B + * @param sketch_a the sketch A + * @param sketch_b the sketch B * @return the estimate for B over A */ template<typename SketchA, typename SketchB> diff --git a/theta/include/theta_a_not_b.hpp b/theta/include/theta_a_not_b.hpp index 4beef60..0ca5c7e 100644 --- a/theta/include/theta_a_not_b.hpp +++ b/theta/include/theta_a_not_b.hpp @@ -25,6 +25,10 @@ namespace datasketches { +/** + * Theta A-not-B (set difference). + * Computes set difference of Theta sketches. + */ template<typename Allocator = std::allocator<uint64_t>> class theta_a_not_b_alloc { public: @@ -33,6 +37,11 @@ public: using CompactSketch = compact_theta_sketch_alloc<Allocator>; using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>; + /** + * Constructor + * @param seed for the hash function that was used to create the sketch + * @param allocator to use for allocating and deallocating memory + */ explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator()); /** diff --git a/theta/include/theta_intersection.hpp b/theta/include/theta_intersection.hpp index 19397e5..93a4a7c 100644 --- a/theta/include/theta_intersection.hpp +++ b/theta/include/theta_intersection.hpp @@ -25,6 +25,10 @@ namespace datasketches { +/** + * Theta intersection. + * Computes intersection of Theta sketches. + */ template<typename Allocator = std::allocator<uint64_t>> class theta_intersection_alloc { public: @@ -41,7 +45,7 @@ public: }; using State = theta_intersection_base<Entry, ExtractKey, nop_policy, Sketch, CompactSketch, Allocator>; - /* + /** * Constructor * @param seed for the hash function that was used to create the sketch * @param allocator to use for allocating and deallocating memory diff --git a/theta/include/theta_intersection_impl.hpp b/theta/include/theta_intersection_impl.hpp index 5f0575f..bc3d822 100644 --- a/theta/include/theta_intersection_impl.hpp +++ b/theta/include/theta_intersection_impl.hpp @@ -28,9 +28,9 @@ state_(seed, nop_policy(), allocator) {} template<typename A> -template<typename SS> -void theta_intersection_alloc<A>::update(SS&& sketch) { - state_.update(std::forward<SS>(sketch)); +template<typename FwdSketch> +void theta_intersection_alloc<A>::update(FwdSketch&& sketch) { + state_.update(std::forward<FwdSketch>(sketch)); } template<typename A> diff --git a/theta/include/theta_jaccard_similarity.hpp b/theta/include/theta_jaccard_similarity.hpp index 417ed54..126b78d 100644 --- a/theta/include/theta_jaccard_similarity.hpp +++ b/theta/include/theta_jaccard_similarity.hpp @@ -26,10 +26,11 @@ namespace datasketches { +/// Theta Jaccard similarity alias template<typename Allocator = std::allocator<uint64_t>> using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>; -// alias with default allocator for convenience +/// Theta Jaccard similarity alias with default allocator using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>; } /* namespace datasketches */ diff --git a/theta/include/theta_jaccard_similarity_base.hpp b/theta/include/theta_jaccard_similarity_base.hpp index 5e8dcf5..48d8676 100644 --- a/theta/include/theta_jaccard_similarity_base.hpp +++ b/theta/include/theta_jaccard_similarity_base.hpp @@ -30,6 +30,7 @@ namespace datasketches { +/// Base class for Jaccard similarity template<typename Union, typename Intersection, typename ExtractKey> class jaccard_similarity_base { public: diff --git a/theta/include/theta_sketch.hpp b/theta/include/theta_sketch.hpp index 4bc1c2e..4a022c5 100644 --- a/theta/include/theta_sketch.hpp +++ b/theta/include/theta_sketch.hpp @@ -25,6 +25,7 @@ namespace datasketches { +/// Abstract base class for Theta sketch template<typename Allocator = std::allocator<uint64_t>> class base_theta_sketch_alloc { public: @@ -106,6 +107,7 @@ protected: virtual void print_items(std::ostringstream& os) const = 0; }; +/// Base class for the Theta Sketch, a generalization of the Kth Minimum Value (KMV) sketch. template<typename Allocator = std::allocator<uint64_t>> class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> { public: @@ -149,6 +151,11 @@ protected: // forward declaration template<typename A> class compact_theta_sketch_alloc; +/** + * Update Theta sketch. + * The purpose of this class is to build a Theta sketch from input data via the update() methods. + * There is no constructor. Use builder instead. + */ template<typename Allocator = std::allocator<uint64_t>> class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> { public: @@ -307,8 +314,10 @@ private: virtual void print_specifics(std::ostringstream& os) const; }; -// compact sketch - +/** + * Compact Theta sketch. + * This is an immutable form of the Theta sketch, the form that can be serialized and deserialized. + */ template<typename Allocator = std::allocator<uint64_t>> class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> { public: @@ -327,8 +336,15 @@ public: // - as a result of a set operation // - by deserializing a previously serialized compact sketch + /** + * Copy constructor. + * Constructs a compact sketch from any other type of Theta sketch + * @param other sketch to be constructed from + * @param ordered if true make the resulting sketch ordered + */ template<typename Other> compact_theta_sketch_alloc(const Other& other, bool ordered); + compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default; compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default; virtual ~compact_theta_sketch_alloc() = default; @@ -385,6 +401,7 @@ public: * This method deserializes a sketch from a given stream. * @param is input stream * @param seed the seed for the hash function that was used to create the sketch + * @param allocator instance of an Allocator * @return an instance of the sketch */ static compact_theta_sketch_alloc deserialize(std::istream& is, @@ -395,12 +412,13 @@ public: * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch + * @param allocator instance of an Allocator * @return an instance of the sketch */ static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator()); - // for internal use + /// @private constructor for internal use compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries); private: @@ -425,18 +443,26 @@ private: virtual void print_specifics(std::ostringstream& os) const; }; +/// Update Theta sketch builder template<typename Allocator> class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> { public: + /** + * Constructor + * @param allocator + */ builder(const Allocator& allocator = Allocator()); + /// @return instance of Update Theta sketch update_theta_sketch_alloc build() const; }; -// This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization. -// It does not take the ownership of the buffer. - +/** + * Wrapped Compact Theta sketch. + * This is to wrap a buffer containing a serialized compact sketch and use it in a set operation avoiding some cost of deserialization. + * It does not take the ownership of the buffer. + */ template<typename Allocator = std::allocator<uint64_t>> -class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> { +class wrapped_compact_theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> { public: class const_iterator; @@ -447,7 +473,17 @@ public: uint32_t get_num_retained() const; uint16_t get_seed_hash() const; + /** + * Const iterator over hash values in this sketch. + * @return begin iterator + */ const_iterator begin() const; + + /** + * Const iterator pointing past the valid range. + * Not to be incremented or dereferenced. + * @return end iterator + */ const_iterator end() const; /** @@ -455,6 +491,7 @@ public: * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch + * @param dump_on_error if true prints hex dump of the input * @return an instance of the sketch */ static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false); @@ -499,10 +536,13 @@ private: uint64_t buffer_[8]; }; -// aliases with default allocator for convenience +/// Theta sketch alias with default allocator using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>; +/// Update Theta sketch alias with default allocator using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>; +/// Compact Theta sketch alias with default allocator using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>; +/// Wrapped Compact Theta sketch alias with default allocator using wrapped_compact_theta_sketch = wrapped_compact_theta_sketch_alloc<std::allocator<uint64_t>>; } /* namespace datasketches */ diff --git a/theta/include/theta_union.hpp b/theta/include/theta_union.hpp index d90c53a..ca59629 100644 --- a/theta/include/theta_union.hpp +++ b/theta/include/theta_union.hpp @@ -26,6 +26,10 @@ namespace datasketches { +/** + * Theta Union. + * Computes union of Theta sketches. There is no constructor. Use builder instead. + */ template<typename Allocator = std::allocator<uint64_t>> class theta_union_alloc { public: @@ -72,6 +76,7 @@ private: theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator); }; +/// Theta union builder template<typename A> class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> { public: diff --git a/theta/include/theta_union_impl.hpp b/theta/include/theta_union_impl.hpp index 8618618..310d58a 100644 --- a/theta/include/theta_union_impl.hpp +++ b/theta/include/theta_union_impl.hpp @@ -28,9 +28,9 @@ state_(lg_cur_size, lg_nom_size, rf, p, theta, seed, nop_policy(), allocator) {} template<typename A> -template<typename SS> -void theta_union_alloc<A>::update(SS&& sketch) { - state_.update(std::forward<SS>(sketch)); +template<typename FwdSketch> +void theta_union_alloc<A>::update(FwdSketch&& sketch) { + state_.update(std::forward<FwdSketch>(sketch)); } template<typename A> diff --git a/theta/include/theta_update_sketch_base.hpp b/theta/include/theta_update_sketch_base.hpp index a8babf2..9101363 100644 --- a/theta/include/theta_update_sketch_base.hpp +++ b/theta/include/theta_update_sketch_base.hpp @@ -91,13 +91,14 @@ struct theta_update_sketch_base { static void consolidate_non_empty(Entry* entries, size_t size, size_t num); }; -// builder +/// Theta base builder template<typename Derived, typename Allocator> class theta_base_builder { public: /** * Creates and instance of the builder with default parameters. + * @param allocator instance of an Allocator to pass to created sketches */ theta_base_builder(const Allocator& allocator); diff --git a/tuple/include/array_of_doubles_sketch.hpp b/tuple/include/array_of_doubles_sketch.hpp index 4c468a3..3f50227 100644 --- a/tuple/include/array_of_doubles_sketch.hpp +++ b/tuple/include/array_of_doubles_sketch.hpp @@ -109,6 +109,10 @@ template<typename A> class compact_array_of_doubles_sketch_alloc; template<typename A> using AllocAOD = typename std::allocator_traits<A>::template rebind_alloc<aod<A>>; +/** + * Update Array of doubles sketch. + * There is no constructor. Use builder instead. + */ template<typename A = std::allocator<double>> class update_array_of_doubles_sketch_alloc: public update_tuple_sketch<aod<A>, aod<A>, array_of_doubles_update_policy<A>, AllocAOD<A>> { public: @@ -118,6 +122,8 @@ public: class builder; compact_array_of_doubles_sketch_alloc<A> compact(bool ordered = true) const; + + /// @return number of double values in array uint8_t get_num_values() const; private: @@ -126,9 +132,10 @@ private: uint64_t seed, const array_of_doubles_update_policy<A>& policy, const A& allocator); }; -// alias with the default allocator for convenience +/// alias with the default allocator for convenience using update_array_of_doubles_sketch = update_array_of_doubles_sketch_alloc<>; +/// Update Array of doubles sketch builder template<typename A> class update_array_of_doubles_sketch_alloc<A>::builder: public tuple_base_builder<builder, array_of_doubles_update_policy<A>, A> { public: @@ -136,6 +143,7 @@ public: update_array_of_doubles_sketch_alloc<A> build() const; }; +/// Compact Array of doubles sketch template<typename A = std::allocator<double>> class compact_array_of_doubles_sketch_alloc: public compact_tuple_sketch<aod<A>, AllocAOD<A>> { public: @@ -153,6 +161,7 @@ public: template<typename Sketch> compact_array_of_doubles_sketch_alloc(const Sketch& other, bool ordered = true); + /// @return number of double values in array uint8_t get_num_values() const; void serialize(std::ostream& os) const; @@ -169,7 +178,7 @@ private: uint8_t num_values_; }; -// alias with the default allocator for convenience +/// alias with the default allocator for convenience using compact_array_of_doubles_sketch = compact_array_of_doubles_sketch_alloc<>; } /* namespace datasketches */ diff --git a/tuple/include/tuple_intersection.hpp b/tuple/include/tuple_intersection.hpp index 966ea9f..4d9df08 100644 --- a/tuple/include/tuple_intersection.hpp +++ b/tuple/include/tuple_intersection.hpp @@ -38,6 +38,10 @@ struct example_intersection_policy { }; */ +/** + * Tuple intersection. + * Computes intersection of Tuple sketches. + */ template< typename Summary, typename Policy, @@ -67,6 +71,12 @@ public: using State = theta_intersection_base<Entry, ExtractKey, internal_policy, Sketch, CompactSketch, AllocEntry>; + /** + * Constructor + * @param seed for the hash function that was used to create the sketch + * @param policy user-defined way of combining Summary during intersection + * @param allocator to use for allocating and deallocating memory + */ explicit tuple_intersection(uint64_t seed = DEFAULT_SEED, const Policy& policy = Policy(), const Allocator& allocator = Allocator()); /** diff --git a/tuple/include/tuple_jaccard_similarity.hpp b/tuple/include/tuple_jaccard_similarity.hpp index 0a6633c..9d4cea5 100644 --- a/tuple/include/tuple_jaccard_similarity.hpp +++ b/tuple/include/tuple_jaccard_similarity.hpp @@ -26,6 +26,7 @@ namespace datasketches { +/// Tuple Jaccard similarity alias template< typename Summary, typename IntersectionPolicy, diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp index 70571df..eeb776f 100644 --- a/tuple/include/tuple_sketch.hpp +++ b/tuple/include/tuple_sketch.hpp @@ -43,6 +43,10 @@ struct pair_extract_key { } }; +/** + * Base class for Tuple sketch. + * This is an extension of Theta sketch that allows keeping arbitrary Summary associated with each retained key. + */ template< typename Summary, typename Allocator = std::allocator<Summary> @@ -199,6 +203,11 @@ struct default_update_policy { } }; +/** + * Update Tuple sketch. + * The purpose of this class is to build a Tuple sketch from input data via the update() methods. + * There is no constructor. Use builder instead. + */ template< typename Summary, typename Update = Summary, @@ -244,21 +253,24 @@ public: /** * Update this sketch with a given string. - * @param value string to update the sketch with + * @param key string to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(const std::string& key, FwdUpdate&& value); /** * Update this sketch with a given unsigned 64-bit integer. - * @param value uint64_t to update the sketch with + * @param key uint64_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(uint64_t key, FwdUpdate&& value); /** * Update this sketch with a given signed 64-bit integer. - * @param value int64_t to update the sketch with + * @param key int64_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(int64_t key, FwdUpdate&& value); @@ -266,7 +278,8 @@ public: /** * Update this sketch with a given unsigned 32-bit integer. * For compatibility with Java implementation. - * @param value uint32_t to update the sketch with + * @param key uint32_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(uint32_t key, FwdUpdate&& value); @@ -274,7 +287,8 @@ public: /** * Update this sketch with a given signed 32-bit integer. * For compatibility with Java implementation. - * @param value int32_t to update the sketch with + * @param key int32_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(int32_t key, FwdUpdate&& value); @@ -282,7 +296,8 @@ public: /** * Update this sketch with a given unsigned 16-bit integer. * For compatibility with Java implementation. - * @param value uint16_t to update the sketch with + * @param key uint16_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(uint16_t key, FwdUpdate&& value); @@ -290,7 +305,8 @@ public: /** * Update this sketch with a given signed 16-bit integer. * For compatibility with Java implementation. - * @param value int16_t to update the sketch with + * @param key int16_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(int16_t key, FwdUpdate&& value); @@ -298,7 +314,8 @@ public: /** * Update this sketch with a given unsigned 8-bit integer. * For compatibility with Java implementation. - * @param value uint8_t to update the sketch with + * @param key uint8_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(uint8_t key, FwdUpdate&& value); @@ -306,7 +323,8 @@ public: /** * Update this sketch with a given signed 8-bit integer. * For compatibility with Java implementation. - * @param value int8_t to update the sketch with + * @param key int8_t to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(int8_t key, FwdUpdate&& value); @@ -314,7 +332,8 @@ public: /** * Update this sketch with a given double-precision floating point value. * For compatibility with Java implementation. - * @param value double to update the sketch with + * @param key double to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(double key, FwdUpdate&& value); @@ -322,7 +341,8 @@ public: /** * Update this sketch with a given floating point value. * For compatibility with Java implementation. - * @param value float to update the sketch with + * @param key float to update the sketch with + * @param value to update the sketch with */ template<typename FwdUpdate> inline void update(float key, FwdUpdate&& value); @@ -337,8 +357,9 @@ public: * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected - * @param data pointer to the data + * @param key pointer to the data * @param length of the data in bytes + * @param value to update the sketch with */ template<typename FwdUpdate> void update(const void* key, size_t length, FwdUpdate&& value); @@ -375,8 +396,10 @@ protected: virtual void print_specifics(std::ostringstream& os) const; }; -// compact sketch - +/** + * Compact Tuple sketch. + * This is an immutable form of the Tuple sketch, the form that can be serialized and deserialized. + */ template< typename Summary, typename Allocator = std::allocator<Summary> @@ -406,13 +429,26 @@ public: // - as a result of a set operation // - by deserializing a previously serialized compact sketch + /** + * Copy constructor. + * Constructs a compact sketch from another sketch (either update or compact) + * @param other sketch to be copied + * @param ordered if true make the resulting sketch ordered + */ compact_tuple_sketch(const Base& other, bool ordered); + compact_tuple_sketch(const compact_tuple_sketch&) = default; compact_tuple_sketch(compact_tuple_sketch&&) noexcept; virtual ~compact_tuple_sketch() = default; compact_tuple_sketch& operator=(const compact_tuple_sketch&) = default; compact_tuple_sketch& operator=(compact_tuple_sketch&&) = default; + /** + * Constructor from Theta sketch + * @param other Theta sketch to be constructed from + * @param summary Summary instance to be associated with each entry + * @param ordered if true make the resulting sketch ordered + */ compact_tuple_sketch(const theta_sketch_alloc<AllocU64>& other, const Summary& summary, bool ordered = true); virtual Allocator get_allocator() const; @@ -425,7 +461,7 @@ public: /** * This method serializes the sketch into a given stream in a binary form * @param os output stream - * @param instance of a SerDe + * @param sd instance of a SerDe */ template<typename SerDe = serde<Summary>> void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; @@ -436,7 +472,7 @@ public: * It is a blank space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch - * @param instance of a SerDe + * @param sd instance of a SerDe * @return serialized sketch as a vector of bytes */ template<typename SerDe = serde<Summary>> @@ -451,8 +487,8 @@ public: * This method deserializes a sketch from a given stream. * @param is input stream * @param seed the seed for the hash function that was used to create the sketch - * @param instance of a SerDe - * @param instance of an Allocator + * @param sd instance of a SerDe + * @param allocator instance of an Allocator * @return an instance of a sketch */ template<typename SerDe = serde<Summary>> @@ -464,8 +500,8 @@ public: * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch - * @param instance of a SerDe - * @param instance of an Allocator + * @param sd instance of a SerDe + * @param allocator instance of an Allocator * @return an instance of the sketch */ template<typename SerDe = serde<Summary>> @@ -522,8 +558,7 @@ protected: }; -// builder - +// base builder template<typename Derived, typename Policy, typename Allocator> class tuple_base_builder: public theta_base_builder<Derived, Allocator> { public: @@ -533,11 +568,15 @@ protected: Policy policy_; }; +/// Update Tuple sketch builder template<typename S, typename U, typename P, typename A> class update_tuple_sketch<S, U, P, A>::builder: public tuple_base_builder<builder, P, A> { public: /** + * Constructor * Creates and instance of the builder with default parameters. + * @param policy user-defined way of creating and updating Summary + * @param allocator instance of an Allocator to pass to created sketches */ builder(const P& policy = P(), const A& allocator = A()); diff --git a/tuple/include/tuple_union.hpp b/tuple/include/tuple_union.hpp index 1c518da..381b550 100644 --- a/tuple/include/tuple_union.hpp +++ b/tuple/include/tuple_union.hpp @@ -33,6 +33,10 @@ struct default_union_policy { } }; +/** + * Tuple Union. + * Computes union of Tuple sketches. There is no constructor. Use builder instead. + */ template< typename Summary, typename Policy = default_union_policy<Summary>, @@ -92,11 +96,15 @@ protected: tuple_union(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator); }; +/// Tuple union builder template<typename S, typename P, typename A> class tuple_union<S, P, A>::builder: public tuple_base_builder<builder, P, A> { public: /** + * Constructor. * Creates and instance of the builder with default parameters. + * @param policy + * @param allocator */ builder(const P& policy = P(), const A& allocator = A()); diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp index 352f953..b5316e8 100644 --- a/tuple/test/tuple_sketch_test.cpp +++ b/tuple/test/tuple_sketch_test.cpp @@ -86,7 +86,7 @@ TEST_CASE("tuple sketch float: empty", "[tuple_sketch]") { REQUIRE(update_sketch.compact(false).is_ordered()); } -TEST_CASE("tuple sketch: single item", "[theta_sketch]") { +TEST_CASE("tuple sketch: single item", "[tuple_sketch]") { auto update_sketch = update_tuple_sketch<float>::builder().build(); update_sketch.update(1, 1.0f); REQUIRE_FALSE(update_sketch.is_empty()); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
