This is an automated email from the ASF dual-hosted git repository.

charlie pushed a commit to branch engagement-test
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


The following commit(s) were added to refs/heads/engagement-test by this push:
     new b18e0ab  Added the engagement tests from java, updated with new 
lower/upper bound fcts
b18e0ab is described below

commit b18e0aba0340dde2a726c0279b5ca3dcff6d663b
Author: cdickens <[email protected]>
AuthorDate: Wed Aug 10 16:41:57 2022 +0100

    Added the engagement tests from java, updated with new lower/upper bound 
fcts
---
 tuple/test/CMakeLists.txt      |   3 +-
 tuple/test/engagement_test.cpp | 338 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 340 insertions(+), 1 deletion(-)

diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index af01c34..ee08a54 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-add_executable(tuple_test)
+add_executable(tuple_test engagement_test.cpp)
 
 target_link_libraries(tuple_test tuple common_test)
 
@@ -45,4 +45,5 @@ target_sources(tuple_test
     tuple_a_not_b_test.cpp
     tuple_jaccard_similarity_test.cpp
     array_of_doubles_sketch_test.cpp
+    engagement_test.cpp
 )
diff --git a/tuple/test/engagement_test.cpp b/tuple/test/engagement_test.cpp
new file mode 100644
index 0000000..f837c4d
--- /dev/null
+++ b/tuple/test/engagement_test.cpp
@@ -0,0 +1,338 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include <iomanip>
+#include <set>
+#include <catch2/catch.hpp>
+#include <tuple_sketch.hpp>
+#include <tuple_union.hpp>
+#include <stdexcept>
+
+template<typename T>
+class max_value_policy {
+public:
+    max_value_policy(const T& initial_value): initial_value(initial_value) {}
+    T create() const { return initial_value; }
+    void update(T& summary, const T& update) const { summary = 
std::max(summary, update); }
+private:
+    T initial_value;
+};
+
+using max_float_update_tuple_sketch = datasketches::update_tuple_sketch<float, 
float, max_value_policy<float>>;
+
+template<typename T>
+class always_one_policy {
+public:
+    always_one_policy(): initial_value(1) {}
+    T create() const { return 1; }
+    void update(T& summary, const T& update) const { }
+private:
+    T initial_value;
+};
+using always_one_tuple_sketch = datasketches::update_tuple_sketch<int, int, 
always_one_policy<int>> ;
+
+template<typename T>
+class update_sum_value_policy {
+public:
+    //update_sum_value_policy(const T& initial_value): initial_value(0) {}
+    //T create() const { return initial_value; }
+    update_sum_value_policy(): initial_value(0) {}
+    T create() const { return initial_value; }
+    void update(T& summary, const T& update) const { summary += update; }
+private:
+    T initial_value;
+};
+using sum_update_tuple_sketch = datasketches::update_tuple_sketch<int, int, 
update_sum_value_policy<int>>;
+
+
+// This policy is only for a union sketch, not an update sketch.
+// Need another class if the same type of update is gonna work on an update 
sketch
+// with the create and update methods.
+
+template<typename Summary>
+struct union_sum_value_policy {
+    void operator()(Summary& summary, const Summary& other) const {
+        summary += other;
+    }
+};
+
+using sum_union_tuple_sketch = datasketches::tuple_union<int,  
union_sum_value_policy<int>> ;
+
+
+class EngagementTest{
+public:
+    int num_std_dev = 2 ;
+    void test_always_one_update(){
+        std::cout << "########## Testing ALWAYS ONE policy ##########" << 
std::endl ;
+        int lgK = 8 ;
+        // Here is where the IntegerSketch should go
+        std::vector<datasketches::update_tuple_sketch<int, int, 
always_one_policy<int>>> sketch_array ;
+
+        auto always_one_sketch = 
always_one_tuple_sketch::builder(always_one_policy<int>()).set_lg_k(lgK).build()
 ;
+
+        always_one_sketch.update(1, 1);
+        always_one_sketch.update(1, 2);
+        always_one_sketch.update(2, 1);
+        always_one_sketch.update(3, 3);
+        always_one_sketch.update(3, 7);
+
+        int num_retained = 0;
+        int sum = 0;
+        for (const auto& entry: always_one_sketch) {
+            sum += entry.second;
+            ++num_retained;
+        }
+        REQUIRE(num_retained == 3);
+        REQUIRE(sum == 3); // we only keep 1 for every stored key.
+    }
+
+    void test_sum_update_policy(){
+        std::cout << "########## Testing SUM policy on UPDATE SKETCH 
##########" << std::endl ;
+        int lgK = 8 ;
+        auto sum_sketch = 
sum_update_tuple_sketch::builder().set_lg_k(lgK).build() ;
+
+        sum_sketch.update(1, 1);
+        sum_sketch.update(1, 2);
+        sum_sketch.update(2, 1);
+        sum_sketch.update(3, 3);
+        sum_sketch.update(3, 7);
+        int num_retained = 0;
+        int sum = 0;
+        for (const auto& entry: sum_sketch) {
+            sum += entry.second;
+            ++num_retained;
+        }
+        REQUIRE(num_retained == 3);
+        REQUIRE(sum == 14); // (1+2) + 1 + (3 + 7) = 14
+    }
+
+    void test_sum_union_policy(){
+        // Union two update sketches using the sum policy
+        auto sketch1 = sum_update_tuple_sketch::builder().build() ;
+        auto sketch2 = sum_update_tuple_sketch::builder().build() ;
+
+        sketch1.update(1, 1);
+        sketch1.update(2, 1);
+        sketch1.update(3, 3);
+        // std::cout << "********** sketch 1 UPDATE SUMMARY **********" << 
std::endl ;
+        // std::cout << sketch1.to_string(true) << std::endl ;
+
+        sketch2.update(1, 2);
+        sketch2.update(2, 1);
+        sketch2.update(3, 7);
+        // std::cout << "********** sketch 2 UPDATE SUMMARY **********" << 
std::endl ;
+        // std::cout << sketch2.to_string(true) << std::endl ;
+
+        auto union_sketch = sum_union_tuple_sketch::builder().build() ;
+        union_sketch.update(sketch1) ;
+        union_sketch.update(sketch2) ;
+        auto union_result = union_sketch.get_result() ;
+
+        int num_retained = 0;
+        int sum = 0;
+        for (const auto& entry: union_result) {
+            sum += entry.second;
+            ++num_retained;
+        }
+        REQUIRE(num_retained == 3);
+        REQUIRE(sum == 15); // 1:(1+2) + 2:(1+1) + 3:(3+7) = 15
+
+        // std::cout << "********** UNION SUMMARY **********" << std::endl ;
+        // std::cout << union_result.to_string(true) << std::endl ;
+
+    }
+
+    void compute_engagement_histogram(){
+        std::cout << "########## Testing ENGAGEMENT ##########" << std::endl ;
+        int lgK = 8 ;
+        //int K = 1 << lgK ;
+        const int days = 30 ;
+        int v = 0 ;
+        std::set<int> set_array[days];
+        std::vector<datasketches::update_tuple_sketch<int, int, 
always_one_policy<int>>> sketch_array ;
+
+
+        for(int i=0; i<days ; i++){
+
+            auto builder = 
always_one_tuple_sketch::builder(always_one_policy<int>()) ;
+            builder.set_lg_k(lgK) ;
+            auto sketch = builder.build() ;
+            sketch_array.push_back(sketch);
+        }
+        REQUIRE(sketch_array.size() == days) ;
+        std::cout << "Size of vector: " << sketch_array.size() << std::endl ;
+
+
+        for(int i=0; i<=days; i++){
+            int32_t num_ids = get_num_ids(days, i) ;
+            int32_t num_days = get_num_days(days, i) ;
+
+            //std::cout << i << "\t" << num_ids << "\t" << num_days << 
std::endl ;
+            // TO DO: Continue from here and figure out what to do with the 
tuple sketches.
+            int my_v = v++ ;
+            for(int d=0 ; d<num_days; d++){
+                for(int id = 0; id < num_ids; id++){
+                    //std::cout << id << " " << my_v << " " << my_v + id << " 
" << num_ids << std::endl ;
+                    set_array[d].insert(my_v + id) ;
+                    sketch_array[d].update(my_v + id, 1) ;
+                    //std::cout << "d: " << d << " id: " << id << " my_v + id: 
" << my_v + id << std::endl ;
+                    // sk_arr[d].update(my_v + id, 1) ; // update the day d 
sketch with the key my_v + id
+                }
+            }
+            v += num_ids ;
+        }
+        union_ops(lgK, sketch_array) ;
+    }
+private:
+    int32_t get_num_ids(int total_days, int index){
+        /*
+         * Generates power law distributed synthetic data
+         */
+        double d = total_days ;
+        double i = index ;
+        return int(round(exp(i * log(d) / d))) ;
+    }
+
+    int32_t get_num_days(int total_days, int index){
+        double d = total_days ;
+        double i = index ;
+        return int(round(exp( (d-i) * log(d) / d ))) ;
+    }
+
+    int8_t round_double_to_int(double x){
+        return int(std::round(x)) ;
+    }
+
+
+    //void union_ops(int lgk, 
std::vector<datasketches::update_tuple_sketch<int>> sketches){
+    void union_ops(int lgk, std::vector<datasketches::update_tuple_sketch<int, 
int, always_one_policy<int>>> sketches){
+        int num_sketches = sketches.size() ;
+        //auto u =  
datasketches::tuple_union<int>::builder().set_lg_k(lgk).build() ;
+        auto u = sum_union_tuple_sketch::builder().set_lg_k(lgk).build() ;
+
+
+        for(auto sk:sketches){
+            u.update(sk) ;
+        }
+        auto union_result = u.get_result() ;
+        std::cout << "Union type: " << typeid(union_result).name() << 
std::endl ;
+        // std::cout << "The estimate is: " << res.get_estimate() << std::endl 
; // agrees with 271.9156532434 from java.
+        std::vector<uint64_t> num_days_arr(num_sketches+1) ;
+
+        int num_retained = 0 ;
+        int total_sum = 0 ;
+
+        for (const auto& entry: union_result) {
+            std::cout << "First: " << entry.first << "\tSecond: " << 
entry.second << std::endl ;
+            int num_days_visited = entry.second ;
+            num_retained++ ;
+            total_sum += entry.second ;
+            num_days_arr[num_days_visited]++;
+        }
+        std::cout << "Num retained items: " << num_retained << std::endl ; // 
This agrees with Java.
+        std::cout << "Sum(retained items): " << total_sum << std::endl ; // 
This agrees with Java.
+
+        for(int i = 1; i<num_sketches+1; i++){
+            std::cout<< "i = " << i << "\tnum_days_arr[i] = " << 
num_days_arr[i] << std::endl ;
+        }
+
+        // *********************** WE AGREE UP TO HERE ***********************
+        // For pretty printing
+        int sum_visits = 0;
+        double theta = union_result.get_theta();
+        std::cout <<"\t\tEngagement Histogram.\t\t\t\n" ;
+        std::cout << "Number of Unique Visitors by Number of Days Visited" << 
std::endl ;
+        std::cout << "---------------------------------------------------" << 
std::endl ;
+
+        std::cout << std::setw(12) << "Days Visited"
+                  << std::setw(12) << "Estimate"
+                  << std::setw(12) << "LB"
+                  << std::setw(12) << "UB"
+                  << std:: endl ;
+
+        for (uint64_t i = 0; i < num_days_arr.size(); i++) {
+            int visitors_at_days_visited = num_days_arr[i] ;
+            if(visitors_at_days_visited == 0){ continue; }
+            sum_visits += visitors_at_days_visited * i ;
+            // RUNS UP TO HERE 10/08/2022
+
+            double est_visitors_at_days_visited = visitors_at_days_visited / 
theta ;
+            double lower_bound_at_days_visited =  
union_result.get_lower_bound(num_std_dev, visitors_at_days_visited);
+            double upper_bound_at_days_visited =  
union_result.get_upper_bound(num_std_dev, visitors_at_days_visited);
+
+            std::cout << std::setw(12) << i
+                      << std::setw(12) << est_visitors_at_days_visited
+                      << std::setw(12) << lower_bound_at_days_visited
+                      << std::setw(12) << upper_bound_at_days_visited
+                      << std:: endl ;
+
+    }
+
+        // PRINT A SUMMARY TABLE FOR VISITORS AND VISITS
+        std::cout << std::endl << std::endl ;
+        std::cout << std::setw(12) << "Totals"
+                  << std::setw(12) << "Estimate"
+                  << std::setw(12) << "LB"
+                  << std::setw(12) << "UB"
+                  << std:: endl ;
+        std::cout << "---------------------------------------------------" << 
std::endl ;
+
+        const double total_visitors = union_result.get_estimate() ;
+        const double lb_visitors = union_result.get_lower_bound(num_std_dev) ;
+        const double ub_visitors = union_result.get_upper_bound(num_std_dev) ;
+
+
+        std::cout << std::setw(12) << "Visitors"
+                  << std::setw(12) << total_visitors
+                  << std::setw(12) << lb_visitors
+                  << std::setw(12) << ub_visitors
+                  << std:: endl ;
+
+        //The total number of visits, however, is a scaled metric and takes 
advantage of the fact that
+        //the retained entries in the sketch is a uniform random sample of all 
unique visitors, and
+        //the the rest of the unique users will likely behave in the same way.
+        const double est_visits = sum_visits / theta;
+        const double lb_visits = est_visits * lb_visitors / total_visitors;
+        const double ub_visits = est_visits * ub_visitors / total_visitors;
+
+
+        std::cout << std::setw(12) << "Visits"
+                  << std::setw(12) << est_visits
+                  << std::setw(12) << lb_visits
+                  << std::setw(12) << ub_visits
+                  << std:: endl ;
+    }
+
+};
+
+namespace datasketches {
+
+    TEST_CASE("engagement", "[engagement]") {
+    EngagementTest E ;
+    E.test_always_one_update() ;
+    E.test_sum_update_policy() ;
+    E.test_sum_union_policy() ;
+    E.compute_engagement_histogram() ;
+}
+
+
+} /* namespace datasketches */
+
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to