This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch jaccard in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit b82b89785f865d6f6a7db2257da9171f04b0ceca Author: AlexanderSaydakov <[email protected]> AuthorDate: Mon Aug 19 18:48:21 2024 -0700 jaccard similarity --- theta_sketch.cpp | 26 +++++++++++++---------- theta_sketch_jaccard_similarity.sqlx | 40 ++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/theta_sketch.cpp b/theta_sketch.cpp index f62aa1f..c714fc1 100644 --- a/theta_sketch.cpp +++ b/theta_sketch.cpp @@ -24,6 +24,7 @@ #include <theta_union.hpp> #include <theta_intersection.hpp> #include <theta_a_not_b.hpp> +#include <theta_jaccard_similarity.hpp> #include "base64.hpp" @@ -37,12 +38,11 @@ using datasketches::theta_a_not_b; const emscripten::val Uint8Array = emscripten::val::global("Uint8Array"); EMSCRIPTEN_BINDINGS(theta_sketch) { - emscripten::register_vector<uint8_t>("VectorBytes"); + emscripten::register_vector<double>("VectorDouble"); emscripten::function("getExceptionMessage", emscripten::optional_override([](intptr_t ptr) { return std::string(reinterpret_cast<std::exception*>(ptr)->what()); })); - using vector_bytes = compact_theta_sketch::vector_bytes; emscripten::class_<update_theta_sketch>("update_theta_sketch") .constructor(emscripten::optional_override([](uint8_t lg_k) { @@ -81,9 +81,6 @@ EMSCRIPTEN_BINDINGS(theta_sketch) { .constructor(emscripten::optional_override([](intptr_t bytes, size_t size, uint64_t seed) { return new compact_theta_sketch(compact_theta_sketch::deserialize(reinterpret_cast<void*>(bytes), size, seed)); })) - .constructor(emscripten::optional_override([](compact_theta_sketch::vector_bytes& bytes, uint64_t seed) { - return new compact_theta_sketch(compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed)); - })) .class_function("deserializeFromB64", emscripten::optional_override([](const std::string& b64, uint64_t seed) { std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); b64_decode(b64.data(), b64.size(), bytes.data()); @@ -92,9 +89,6 @@ EMSCRIPTEN_BINDINGS(theta_sketch) { .class_function("deserializeFromBinary", emscripten::optional_override([](const std::string& bytes, uint64_t seed) { return new compact_theta_sketch(compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed)); }), emscripten::allow_raw_pointers()) - .class_function("deserializeFromVectorBytes", emscripten::optional_override([](const vector_bytes& bytes, uint64_t seed) { - return new compact_theta_sketch(compact_theta_sketch::deserialize(bytes.data(), bytes.size(), seed)); - }), emscripten::allow_raw_pointers()) .function("getEstimate", emscripten::optional_override([](const compact_theta_sketch& self) { return self.get_estimate(); })) @@ -147,9 +141,6 @@ EMSCRIPTEN_BINDINGS(theta_sketch) { .function("updateWithBytes", emscripten::optional_override([](theta_union& self, const std::string& bytes, uint64_t seed) { self.update(wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed)); }), emscripten::allow_raw_pointers()) - .function("updateWithVectorBytes", emscripten::optional_override([](theta_union& self, const vector_bytes& bytes, uint64_t seed) { - self.update(wrapped_compact_theta_sketch::wrap(bytes.data(), bytes.size(), seed)); - }), emscripten::allow_raw_pointers()) .function("updateWithB64", emscripten::optional_override([](theta_union& self, const std::string& b64, uint64_t seed) { std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); b64_decode(b64.data(), b64.size(), bytes.data()); @@ -264,4 +255,17 @@ EMSCRIPTEN_BINDINGS(theta_sketch) { return std::string(b64.data(), b64.size()); })) ; + + emscripten::function("thetaJaccardSimilarity", emscripten::optional_override([](const std::string& sketch1_b64, const std::string& sketch2_b64, uint64_t seed) { + std::vector<char> bytes1(b64_dec_len(sketch1_b64.data(), sketch1_b64.size())); + b64_decode(sketch1_b64.data(), sketch1_b64.size(), bytes1.data()); + std::vector<char> bytes2(b64_dec_len(sketch2_b64.data(), sketch2_b64.size())); + b64_decode(sketch2_b64.data(), sketch2_b64.size(), bytes2.data()); + const auto arr = datasketches::theta_jaccard_similarity::jaccard( + wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size(), seed), + wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size(), seed), + seed + ); + return std::vector<double>{arr[0], arr[1], arr[2]}; + })); } diff --git a/theta_sketch_jaccard_similarity.sqlx b/theta_sketch_jaccard_similarity.sqlx new file mode 100644 index 0000000..2458475 --- /dev/null +++ b/theta_sketch_jaccard_similarity.sqlx @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +CREATE OR REPLACE FUNCTION ${self()}(sketch1 BYTES, sketch2 BYTES, seed INT64) +RETURNS ARRAY<FLOAT64> +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/theta_sketch.js"], + description = '''Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index +J(A,B) = (A ^ B)/(A U B) is used to measure how similar the two sketches are to each other. +If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are disjoint. +A Jaccard of .95 means the overlap between the two sets is 95% of the union of the two sets. +Param sketch1: the first sketch as bytes. +Param sketch2: the second sketch as bytes. +Param seed: This is used to confirm that the given sketches were configured with the correct seed. +Returns an array of 3 floating-point values {LowerBound, Estimate, UpperBound} of the Jaccard index. +For more details: https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html''' +) AS R""" +const default_seed = BigInt(9001); +try { + const jaccard = Module.thetaJaccardSimilarity(sketch1, sketch2, seed == null ? default_seed : BigInt(seed)); + return [jaccard.get(0), jaccard.get(1), jaccard.get(2)]; +} catch (e) { + throw new Error(Module.getExceptionMessage(e)); +} +"""; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
