This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch kll_converting_constructor in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit a92a26f7f78453afec6f90b02566dc014b45e3f7 Author: AlexanderSaydakov <[email protected]> AuthorDate: Thu Jun 2 17:51:26 2022 -0700 converting constructor --- kll/include/kll_sketch.hpp | 40 ++++++++++++++++----- kll/include/kll_sketch_impl.hpp | 73 +++++++++++++++++++++++++++++++++++--- kll/test/kll_sketch_test.cpp | 77 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 172 insertions(+), 18 deletions(-) diff --git a/kll/include/kll_sketch.hpp b/kll/include/kll_sketch.hpp index fcf8ebb..8616623 100644 --- a/kll/include/kll_sketch.hpp +++ b/kll/include/kll_sketch.hpp @@ -182,6 +182,14 @@ class kll_sketch { kll_sketch& operator=(const kll_sketch& other); kll_sketch& operator=(kll_sketch&& other); + /* + * Type converting constructor. + * @param other sketch of a different type + * @param allocator instance of an Allocator + */ + template<typename TT, typename CC, typename AA> + explicit kll_sketch(const kll_sketch<TT, CC, AA> & other, const A& allocator = A()); + /** * Updates this sketch with the given data item. * @param value an item from a stream of items @@ -208,6 +216,13 @@ class kll_sketch { */ uint16_t get_k() const; + /** + * Returns min_k, which is an internal parameter for error estimation + * after merging sketches with different k. + * @return parameter min_k + */ + uint16_t get_min_k() const; + /** * Returns the length of the input stream. * @return stream length @@ -220,6 +235,13 @@ class kll_sketch { */ uint32_t get_num_retained() const; + /** + * Returns the current capacity in items allocated by the sketch. + * For internal use. + * @return the capacity + */ + uint32_t get_capacity() const; + /** * Returns true if this sketch is in estimation mode. * @return estimation mode flag @@ -390,7 +412,7 @@ class kll_sketch { /** * Computes size needed to serialize the current state of the sketch. * This version is for fixed-size arithmetic types (integral and floating point). - * @param instance of a SerDe + * @param serde instance of a SerDe * @return size in bytes needed to serialize this sketch */ template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0> @@ -399,7 +421,7 @@ class kll_sketch { /** * Computes size needed to serialize the current state of the sketch. * This version is for all other types and can be expensive since every item needs to be looked at. - * @param instance of a SerDe + * @param serde instance of a SerDe * @return size in bytes needed to serialize this sketch */ template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0> @@ -459,7 +481,7 @@ class kll_sketch { /** * This method deserializes a sketch from a given stream. * @param is input stream - * @param instance of an Allocator + * @param allocator instance of an Allocator * @return an instance of a sketch * * Deprecated, to be removed in the next major version @@ -469,8 +491,8 @@ class kll_sketch { /** * This method deserializes a sketch from a given stream. * @param is input stream - * @param instance of a SerDe - * @param instance of an Allocator + * @param serge instance of a SerDe + * @param allocator instance of an Allocator * @return an instance of a sketch */ template<typename SerDe = S> @@ -480,7 +502,7 @@ class kll_sketch { * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array - * @param instance of an Allocator + * @param allocator instance of an Allocator * @return an instance of a sketch * * Deprecated, to be removed in the next major version @@ -491,8 +513,8 @@ class kll_sketch { * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array - * @param instance of a SerDe - * @param instance of an Allocator + * @param serde instance of a SerDe + * @param allocator instance of an Allocator * @return an instance of a sketch */ template<typename SerDe = S> @@ -606,6 +628,8 @@ class kll_sketch { static void check_serial_version(uint8_t serial_version); static void check_family_id(uint8_t family_id); + void check_sorting() const; + // implementations for floating point types template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> static const TT& get_invalid_value() { diff --git a/kll/include/kll_sketch_impl.hpp b/kll/include/kll_sketch_impl.hpp index 1302ac9..211a926 100644 --- a/kll/include/kll_sketch_impl.hpp +++ b/kll/include/kll_sketch_impl.hpp @@ -26,6 +26,7 @@ #include <stdexcept> #include "conditional_forward.hpp" +#include "count_zeros.hpp" #include "memory_operations.hpp" #include "kll_helper.hpp" @@ -69,7 +70,7 @@ max_value_(nullptr), is_level_zero_sorted_(other.is_level_zero_sorted_) { items_ = allocator_.allocate(items_size_); - std::copy(other.items_ + levels_[0], other.items_ + levels_[num_levels_], items_ + levels_[0]); + for (auto i = levels_[0]; i < levels_[num_levels_]; ++i) new (&items_[i]) T(other.items_[i]); if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_); if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_); } @@ -147,6 +148,50 @@ kll_sketch<T, C, S, A>::~kll_sketch() { } } +template<typename T, typename C, typename S, typename A> +template<typename TT, typename CC, typename AA> +kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch<TT, CC, AA>& other, const A& allocator): +allocator_(allocator), +k_(other.get_k()), +m_(DEFAULT_M), +min_k_(other.get_min_k()), +n_(other.get_n()), +num_levels_(1), +levels_(2, 0, allocator), +items_(nullptr), +items_size_(other.get_capacity()), +min_value_(nullptr), +max_value_(nullptr), +is_level_zero_sorted_(false) +{ + static_assert( + std::is_constructible<T, TT>::value, + "Type converting constructor requires new type to be constructible from existing type" + ); + items_ = allocator_.allocate(items_size_); + levels_[0] = items_size_ - other.get_num_retained(); + levels_[1] = items_size_; + + if (!other.is_empty()) { + min_value_ = new (allocator_.allocate(1)) T(other.get_min_value()); + max_value_ = new (allocator_.allocate(1)) T(other.get_max_value()); + size_t index = levels_[0]; + for (auto pair: other) { + new (&items_[index]) T(pair.first); + const uint8_t level = count_trailing_zeros_in_u64(pair.second); + if (level == num_levels_) { + ++num_levels_; + levels_.resize(num_levels_ + 1); + levels_[level] = index; + levels_[num_levels_] = items_size_; + } + ++index; + } + } + check_sorting(); + assert_correct_total_weight(); +} + template<typename T, typename C, typename S, typename A> template<typename FwdT> void kll_sketch<T, C, S, A>::update(FwdT&& value) { @@ -210,6 +255,11 @@ uint16_t kll_sketch<T, C, S, A>::get_k() const { return k_; } +template<typename T, typename C, typename S, typename A> +uint16_t kll_sketch<T, C, S, A>::get_min_k() const { + return min_k_; +} + template<typename T, typename C, typename S, typename A> uint64_t kll_sketch<T, C, S, A>::get_n() const { return n_; @@ -220,6 +270,11 @@ uint32_t kll_sketch<T, C, S, A>::get_num_retained() const { return levels_[num_levels_] - levels_[0]; } +template<typename T, typename C, typename S, typename A> +uint32_t kll_sketch<T, C, S, A>::get_capacity() const { + return items_size_; +} + template<typename T, typename C, typename S, typename A> bool kll_sketch<T, C, S, A>::is_estimation_mode() const { return num_levels_ > 1; @@ -780,17 +835,27 @@ void kll_sketch<T, C, S, A>::sort_level_zero() { } } +template<typename T, typename C, typename S, typename A> +void kll_sketch<T, C, S, A>::check_sorting() const { + // not checking level 0 + for (uint8_t level = 1; level < num_levels_; ++level) { + const auto from = items_ + levels_[level]; + const auto to = items_ + levels_[level + 1]; + if (!std::is_sorted(from, to, C())) { + throw std::logic_error("levels must be sorted"); + } + } +} + template<typename T, typename C, typename S, typename A> template<bool inclusive> quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const { const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_); - uint8_t level = 0; - while (level < num_levels_) { + for (uint8_t level = 0; level < num_levels_; ++level) { const auto from = items_ + levels_[level]; const auto to = items_ + levels_[level + 1]; // exclusive view.add(from, to, 1 << level); - ++level; } if (cumulative) view.template convert_to_cummulative<inclusive>(); return view; diff --git a/kll/test/kll_sketch_test.cpp b/kll/test/kll_sketch_test.cpp index a8bd040..acf69df 100644 --- a/kll/test/kll_sketch_test.cpp +++ b/kll/test/kll_sketch_test.cpp @@ -39,9 +39,9 @@ static std::string testBinaryInputPath = "test/"; #endif // typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator -typedef kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>> kll_float_sketch; +using kll_float_sketch = kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>>; // let std::string use the default allocator for simplicity, otherwise we need to define "less" and "serde" -typedef kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>> kll_string_sketch; +using kll_string_sketch = kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>>; TEST_CASE("kll sketch", "[kll_sketch]") { @@ -75,7 +75,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") { (void) it; // to suppress "unused" warning FAIL("should be no iterations over an empty sketch"); } -} + } SECTION("get bad quantile") { kll_float_sketch sketch(200, 0); @@ -835,10 +835,75 @@ TEST_CASE("kll sketch", "[kll_sketch]") { REQUIRE((*it).second == 3); } } - // cleanup - if (test_allocator_total_bytes != 0) { - REQUIRE(test_allocator_total_bytes == 0); + + SECTION("type conversion: empty") { + kll_sketch<double> kll_double; + kll_sketch<float> kll_float(kll_double); + REQUIRE(kll_float.is_empty()); + REQUIRE(kll_float.get_k() == kll_double.get_k()); + REQUIRE(kll_float.get_n() == 0); + REQUIRE(kll_float.get_num_retained() == 0); + } + + SECTION("type conversion: over k") { + kll_sketch<double> kll_double; + for (int i = 0; i < 1000; ++i) kll_double.update(static_cast<double>(i)); + kll_sketch<float> kll_float(kll_double); + REQUIRE(!kll_float.is_empty()); + REQUIRE(kll_float.get_k() == kll_double.get_k()); + REQUIRE(kll_float.get_n() == kll_double.get_n()); + REQUIRE(kll_float.get_num_retained() == kll_double.get_num_retained()); + + auto sv_float = kll_float.get_sorted_view(false); + auto sv_double = kll_double.get_sorted_view(false); + auto sv_float_it = sv_float.begin(); + auto sv_double_it = sv_double.begin(); + while (sv_float_it != sv_float.end()) { + REQUIRE(sv_double_it != sv_double.end()); + auto float_pair = *sv_float_it; + auto double_pair = *sv_double_it; + REQUIRE(float_pair.first == Approx(double_pair.first).margin(0.01)); + REQUIRE(float_pair.second == double_pair.second); + ++sv_float_it; + ++sv_double_it; + } + REQUIRE(sv_double_it == sv_double.end()); + } + + class A { + int val; + public: + A(int val): val(val) {} + int get_val() const { return val; } + }; + + struct less_A { + bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); } + }; + + class B { + int val; + public: + explicit B(const A& a): val(a.get_val()) {} + int get_val() const { return val; } + }; + + struct less_B { + bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); } + }; + + SECTION("type conversion: custom types") { + kll_sketch<A, less_A> sa; + sa.update(1); + sa.update(2); + sa.update(3); + + kll_sketch<B, less_B> sb(sa); + REQUIRE(sb.get_n() == 3); } + + // cleanup + REQUIRE(test_allocator_total_bytes == 0); } } /* namespace datasketches */ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
