This is an automated email from the ASF dual-hosted git repository. charlie pushed a commit to branch intersections in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git
commit c50930148d5c8d8d59dadcb523620f5a2fd6d0c6 Author: charlied <[email protected]> AuthorDate: Mon Jul 3 16:35:08 2023 +0100 Added jaccard estimation script --- cpp/CMakeLists.txt | 13 +- cpp/src/main.cpp | 6 + cpp/src/theta_sketch_jaccard_profile.cpp | 246 +++++++++++++++++++++++++++++++ cpp/src/theta_sketch_jaccard_profile.hpp | 61 ++++++++ 4 files changed, 325 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5c53da1..f88951b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -15,9 +15,14 @@ # specific language governing permissions and limitations # under the License. +cmake_minimum_required(VERSION 3.16.0) +#include_directories(/Library/Developer/CommandLineTools/usr/include/c++/v1) add_executable(characterization) -find_package(DataSketches 3.2 REQUIRED) +list(APPEND CMAKE_PREFIX_PATH "/tmp/install/DataSketches/include/DataSketches") +list(APPEND CMAKE_PREFIX_PATH "/tmp/install/DataSketches/lib/DataSketches") +find_package(DataSketches 4.0.1 REQUIRED) +#find_package(DataSketches 3.2 REQUIRED) target_link_libraries(characterization PUBLIC ${DATASKETCHES_LIB}) target_include_directories(characterization PUBLIC ${DATASKETCHES_INCLUDE_DIR}) @@ -98,4 +103,10 @@ target_sources(characterization src/tuple_union_timing_profile.hpp src/zipf_distribution.cpp src/zipf_distribution.hpp + + # Companion sketch files + src/theta_sketch_jaccard_profile.cpp + src/theta_sketch_jaccard_profile.hpp + src/companion_sketch_size_profile.cpp + src/companion_sketch_size_profile.hpp ) diff --git a/cpp/src/main.cpp b/cpp/src/main.cpp index 3b09818..c6a7fdb 100644 --- a/cpp/src/main.cpp +++ b/cpp/src/main.cpp @@ -63,6 +63,9 @@ #include "req_sketch_timing_profile.hpp" #include "req_merge_timing_profile.hpp" +#include "theta_sketch_jaccard_profile.hpp" +#include "companion_sketch_size_profile.hpp" + using namespace datasketches; typedef std::unique_ptr<job_profile> job_profile_ptr; @@ -102,6 +105,9 @@ int main(int argc, char **argv) { job_profile::add("hll-cross-lang", job_profile_ptr(new hll_cross_language_profile())); + // Companion sketch files + job_profile::add("theta-jaccard-estimation", job_profile_ptr(new theta_sketch_jaccard_profile())); + if (argc == 2) { datasketches::job_profile& profile = datasketches::job_profile::instance(argv[1]); profile.run(); diff --git a/cpp/src/theta_sketch_jaccard_profile.cpp b/cpp/src/theta_sketch_jaccard_profile.cpp new file mode 100644 index 0000000..2e27e80 --- /dev/null +++ b/cpp/src/theta_sketch_jaccard_profile.cpp @@ -0,0 +1,246 @@ +// +// Created by Charlie Dickens on 28/06/2023. +// +#include <cmath> +#include <theta_sketch.hpp> +#include <theta_jaccard_similarity.hpp> +#include <theta_intersection.hpp> +#include "theta_sketch_jaccard_profile.hpp" +#include <fstream> + +namespace datasketches { + +accuracy_stats_double::accuracy_stats_double(size_t k, double true_value, size_t n, size_t union_size, size_t intersection_size): + true_value(true_value), + stream_cardinality(n), + union_size(union_size), + intersection_size(intersection_size), + sum_est(0), + sum_rel_err(0), + sum_sq_rel_err(0), + count(0), + rel_err_distribution(k), + intersection_error_distribution(k) +{} + +void accuracy_stats_double::update(double jaccard_estimate, double intersection_estimate) { + sum_est += jaccard_estimate; + const double relative_error = jaccard_estimate / true_value - 1.0; + sum_rel_err += relative_error; + sum_sq_rel_err += relative_error * relative_error; + rel_err_distribution.update(relative_error); + + //Intersection updates + const double intersection_relative_error = intersection_estimate / intersection_size - 1.0; + intersection_error_distribution.update(relative_error); + count++; +} + +double accuracy_stats_double::get_true_value() const { + return true_value; +} + +double accuracy_stats_double::get_mean_est() const { + return sum_est / count; +} + +double accuracy_stats_double::get_mean_rel_err() const { + return sum_rel_err / count; +} + +double accuracy_stats_double::get_rms_rel_err() const { + return sqrt(sum_sq_rel_err / count); +} + +size_t accuracy_stats_double::get_count() const { + return count; +} + +size_t accuracy_stats_double::get_stream_cardinality() const { + return stream_cardinality; +} + +size_t accuracy_stats_double::get_union_size() const { + return union_size; +} + +size_t accuracy_stats_double::get_intersection_size() const { + return intersection_size; +} + +std::vector<double> accuracy_stats_double::get_quantiles( + const double* fractions, size_t size) const { + return rel_err_distribution.get_quantiles(fractions, size); +} + +std::vector<double> accuracy_stats_double::get_intersection_quantiles( + const double* fractions, size_t size) const { + return intersection_error_distribution.get_quantiles(fractions, size); +} + +void theta_sketch_jaccard_profile::run(){ + + // Generic setup + const size_t num_trials = 128 ; + const size_t lg_stream_length = 23 ; + const size_t stream_length = 1<<lg_stream_length ; + const size_t lg_min_overlap_length = 0 ; + const size_t lg_max_overlap_length = lg_stream_length ; + const size_t overlaps_ppo = 3; + double jaccard_index ; + const size_t quantiles_k = 10000; + + // Dataset setup + uint64_t counter = 35538947; + const uint64_t golden64 = 0x9e3779b97f4a7c13ULL; // the golden ratio + std::vector<uint64_t> intersection_sizes ; + + // Sketch setup + const size_t lg_k = 14; + const size_t num_test_points = count_points(lg_min_overlap_length, lg_max_overlap_length, overlaps_ppo);// intersection_sizes.size() ; // + size_t num_intersection = 1 << lg_min_overlap_length; // intersection_sizes.size() ; // + + // Initialize the statistics objects: -1 is to avoid going into next pwer of 2 + for (size_t i=0; i < num_test_points-1; i++) { + num_intersection = pwr_2_law_next(overlaps_ppo, num_intersection); + std::cout << num_test_points << " " << num_intersection << std::endl; + intersection_sizes.push_back(num_intersection) ; + + size_t union_size = 2 * stream_length - intersection_sizes[i]; // |A| + |B| - |A n B| and |A| == |B| == stream_length + jaccard_index = (double) intersection_sizes[i] / union_size; + jaccard_stats.push_back(accuracy_stats_double(quantiles_k, jaccard_index, stream_length, union_size, intersection_sizes[i])); + } + + // Final num_overlap goes into next octave so ignore. + for (size_t i=0; i < num_test_points-1; i++){ + std::cout << "Test point " << i << " out of " << num_test_points << "." << std::endl; + std::cout << "Intersection size:\t" << intersection_sizes[i] << std::endl; + + for (int trial=0; trial < num_trials; trial++){ + auto sk_a = update_theta_sketch::builder().set_lg_k(lg_k).build(); + auto sk_b = update_theta_sketch::builder().set_lg_k(lg_k).build(); + theta_intersection sk_intersection; + + // Add the intersecting points to both sketches. + for (size_t intersector = 0; intersector<intersection_sizes[i] ; intersector++) { + sk_a.update(counter); + sk_b.update(counter); + counter += golden64 ; + } + + // Add remaining number of distinct items to each sketch separately + size_t items_remaining = stream_length - intersection_sizes[i] ; + for (size_t distinct_item=0; distinct_item<items_remaining; distinct_item++){ + // Increment counter twice so each sketch sees a different item + sk_a.update(counter) ; + counter += golden64 ; + sk_b.update(counter) ; + counter += golden64 ; + } + + // trim the sketches to exactly lgk items + sk_a.trim() ; + sk_b.trim() ; + + + auto jaccard_estimate = theta_jaccard_similarity::jaccard(sk_a, sk_b); + sk_intersection.update(sk_a); + sk_intersection.update(sk_b); + compact_theta_sketch intersection_result = sk_intersection.get_result(); + jaccard_stats[i].update( jaccard_estimate[1], intersection_result.get_estimate()) ; + } // End single trial +} // End iteration space + +print_stats(); +write_stats() ; +}// End function + +void theta_sketch_jaccard_profile::print_stats() const { + std::cout << std::setw(12) << "n" + << std::setw(12) << "union" + << std::setw(12) << "intersection" + << std::setw(12) << "jaccard" + << std::setw(12) << "trials" + << std::setw(12) << "mean" + << std::setw(12) << "mean relative error" + << std::setw(12) << "min" + << std::setw(12) << "m3sd" + << std::setw(12) << "m2sd" + << std::setw(12) << "m1sd" + << std::setw(12) << "median" + << std::setw(12) << "p1sd" + << std::setw(12) << "p2sd" + << std::setw(12) << "p3sd" + << std::setw(12) << "max" + << std::setw(12) << "inter_min" + << std::setw(12) << "inter_m3sd" + << std::setw(12) << "inter_m2sd" + << std::setw(12) << "inter_m1sd" + << std::setw(12) << "inter_median" + << std::setw(12) << "inter_p1sd" + << std::setw(12) << "inter_p2sd" + << std::setw(12) << "inter_p3sd" + << std::setw(12) << "inter_max" + << std::endl; + + for (const auto &stat: jaccard_stats) { + std::cout << std::setw(12) << stat.get_stream_cardinality() << "\t"; + std::cout << std::setw(12) << stat.get_union_size() << "\t"; + std::cout << std::setw(12) << stat.get_intersection_size() << "\t"; + std::cout << std::setw(12) << stat.get_true_value() << "\t"; + std::cout << std::setw(12) << stat.get_count() << "\t"; // This is the number of trials completed. + std::cout << std::setw(12) << stat.get_mean_est() << "\t"; + std::cout << std::setw(12) << stat.get_mean_rel_err() << "\t"; + const auto quants = stat.get_quantiles(FRACTIONS, FRACT_LEN); + for (size_t i = 0; i < FRACT_LEN; i++) { + const double quantile = quants[i]; + std::cout << std::setw(12) << quantile << "\t"; + } + const auto intersection_quants = stat.get_intersection_quantiles(FRACTIONS, FRACT_LEN); + for (size_t i = 0; i < FRACT_LEN; i++) { + const double quantile = intersection_quants[i]; + std::cout << std::setw(12) << quantile << "\t"; + if (i != FRACT_LEN - 1) std::cout << "\t"; + } + std::cout << std::endl; + } +} // End print_stats() + +void theta_sketch_jaccard_profile::write_stats() const { + std::ofstream out_file("jaccard_accuracy_theta.tsv"); + std::string columns[] = {"n", "union", "intersection", "jaccard", "trials", "mean estimate", "mean relative error", + "min", "m3sd", "m2sd", "m1sd", "median", "p1sd", "p2sd", "p3sd", "max", + "inter_min", "inter_m3sd", "inter_m2sd", "inter_m1sd", "inter_median", + "inter_p1sd", "inter_p2sd", "inter_p3sd", "inter_max"} ; + size_t arr_length(0) ; + for (const std::string &s : columns) arr_length++ ; + for (size_t i=0; i < arr_length; ++i){ + const std::string col_header = columns[i] ; + out_file << col_header ; + if (i != arr_length - 1) out_file << "\t"; + } + + out_file << std::endl; + for (const auto &stat: jaccard_stats) { + out_file << stat.get_stream_cardinality() << "\t"; + out_file << stat.get_union_size() << "\t"; + out_file << stat.get_intersection_size() << "\t"; + out_file << stat.get_true_value() << "\t"; + out_file << stat.get_count() << "\t"; // This is the number of trials completed. + out_file << stat.get_mean_est() << "\t"; + out_file << stat.get_mean_rel_err() << "\t"; + const auto quants = stat.get_quantiles(FRACTIONS, FRACT_LEN); + for (size_t i = 0; i < FRACT_LEN; i++) { + const double quantile = quants[i]; + out_file << quantile << "\t" ; + } + const auto intersection_quants = stat.get_intersection_quantiles(FRACTIONS, FRACT_LEN); + for (size_t i = 0; i < FRACT_LEN; i++) { + const double quantile = intersection_quants[i]; + out_file << quantile; + if (i != FRACT_LEN - 1) out_file << "\t"; + } + out_file << std::endl; + } +}// End write_stats() +} // End namespace \ No newline at end of file diff --git a/cpp/src/theta_sketch_jaccard_profile.hpp b/cpp/src/theta_sketch_jaccard_profile.hpp new file mode 100644 index 0000000..701b913 --- /dev/null +++ b/cpp/src/theta_sketch_jaccard_profile.hpp @@ -0,0 +1,61 @@ +// +// Created by Charlie Dickens on 28/06/2023. +// + +#ifndef THETA_SKETCH_JACCARD_ESTIMATION_HPP +#define THETA_SKETCH_JACCARD_ESTIMATION_HPP + +#include <kll_sketch.hpp> +#include <vector> +#include "jaccard_accuracy_profile.hpp" +#include "distinct_count_accuracy_profile.hpp" // To pull in the accuracy_stats object + + +namespace datasketches { + +class accuracy_stats_double { + public: + accuracy_stats_double(size_t k, double true_value, size_t n, size_t union_size, size_t intersection_size); + void update(double jaccard_estimate, double intersection_estimate); + double get_true_value() const; + double get_mean_est() const; + double get_mean_rel_err() const; + double get_rms_rel_err() const; + size_t get_count() const; + size_t get_stream_cardinality() const; + size_t get_union_size() const ; + size_t get_intersection_size() const ; + std::vector<double> get_quantiles(const double* fractions, size_t size) const; + std::vector<double> get_intersection_quantiles(const double* fractions, size_t size) const; + void print_kll() const ; + kll_sketch<double> rel_err_distribution; + kll_sketch<double> intersection_error_distribution; + +private: + double true_value; // Jaccard index --> What we want to estimate + size_t stream_cardinality ; + size_t union_size ; + size_t intersection_size ; + double sum_est; + double sum_rel_err; + double sum_sq_rel_err; + size_t count; // number of trials completed -- kept for consistency. + + }; + + //class theta_sketch_jaccard_profile: public jaccard_accuracy_profile { + class theta_sketch_jaccard_profile: public job_profile { + public: + void run(); + protected: + //std::vector<accuracy_stats> jaccard_stats ; + std::vector<accuracy_stats_double> jaccard_stats ; + private: + void print_stats() const; + void write_stats() const; + }; + +} + + +#endif //THETA_SKETCH_JACCARD_ESTIMATION_HPP --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
