This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch jaccard
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git

commit b82b89785f865d6f6a7db2257da9171f04b0ceca
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Mon Aug 19 18:48:21 2024 -0700

    jaccard similarity
---
 theta_sketch.cpp                     | 26 +++++++++++++----------
 theta_sketch_jaccard_similarity.sqlx | 40 ++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/theta_sketch.cpp b/theta_sketch.cpp
index f62aa1f..c714fc1 100644
--- a/theta_sketch.cpp
+++ b/theta_sketch.cpp
@@ -24,6 +24,7 @@
 #include <theta_union.hpp>
 #include <theta_intersection.hpp>
 #include <theta_a_not_b.hpp>
+#include <theta_jaccard_similarity.hpp>
 
 #include "base64.hpp"
 
@@ -37,12 +38,11 @@ using datasketches::theta_a_not_b;
 const emscripten::val Uint8Array = emscripten::val::global("Uint8Array");
 
 EMSCRIPTEN_BINDINGS(theta_sketch) {
-  emscripten::register_vector<uint8_t>("VectorBytes");
+  emscripten::register_vector<double>("VectorDouble");
 
   emscripten::function("getExceptionMessage", 
emscripten::optional_override([](intptr_t ptr) {
     return std::string(reinterpret_cast<std::exception*>(ptr)->what());
   }));
-  using vector_bytes = compact_theta_sketch::vector_bytes;
 
   emscripten::class_<update_theta_sketch>("update_theta_sketch")
     .constructor(emscripten::optional_override([](uint8_t lg_k) {
@@ -81,9 +81,6 @@ EMSCRIPTEN_BINDINGS(theta_sketch) {
     .constructor(emscripten::optional_override([](intptr_t bytes, size_t size, 
uint64_t seed) {
       return new 
compact_theta_sketch(compact_theta_sketch::deserialize(reinterpret_cast<void*>(bytes),
 size, seed));
     }))
-    
.constructor(emscripten::optional_override([](compact_theta_sketch::vector_bytes&
 bytes, uint64_t seed) {
-      return new 
compact_theta_sketch(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size(), seed));
-    }))
     .class_function("deserializeFromB64", 
emscripten::optional_override([](const std::string& b64, uint64_t seed) {
       std::vector<char> bytes(b64_dec_len(b64.data(), b64.size()));
       b64_decode(b64.data(), b64.size(), bytes.data());
@@ -92,9 +89,6 @@ EMSCRIPTEN_BINDINGS(theta_sketch) {
     .class_function("deserializeFromBinary", 
emscripten::optional_override([](const std::string& bytes, uint64_t seed) {
       return new 
compact_theta_sketch(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size(), seed));
     }), emscripten::allow_raw_pointers())
-    .class_function("deserializeFromVectorBytes", 
emscripten::optional_override([](const vector_bytes& bytes, uint64_t seed) {
-      return new 
compact_theta_sketch(compact_theta_sketch::deserialize(bytes.data(), 
bytes.size(), seed));
-    }), emscripten::allow_raw_pointers())
     .function("getEstimate", emscripten::optional_override([](const 
compact_theta_sketch& self) {
       return self.get_estimate();
     }))
@@ -147,9 +141,6 @@ EMSCRIPTEN_BINDINGS(theta_sketch) {
     .function("updateWithBytes", emscripten::optional_override([](theta_union& 
self, const std::string& bytes, uint64_t seed) {
       self.update(wrapped_compact_theta_sketch::wrap(bytes.data(), 
bytes.size(), seed));
     }), emscripten::allow_raw_pointers())
-    .function("updateWithVectorBytes", 
emscripten::optional_override([](theta_union& self, const vector_bytes& bytes, 
uint64_t seed) {
-      self.update(wrapped_compact_theta_sketch::wrap(bytes.data(), 
bytes.size(), seed));
-    }), emscripten::allow_raw_pointers())
     .function("updateWithB64", emscripten::optional_override([](theta_union& 
self, const std::string& b64, uint64_t seed) {
       std::vector<char> bytes(b64_dec_len(b64.data(), b64.size()));
       b64_decode(b64.data(), b64.size(), bytes.data());
@@ -264,4 +255,17 @@ EMSCRIPTEN_BINDINGS(theta_sketch) {
       return std::string(b64.data(), b64.size());
     }))
     ;
+
+  emscripten::function("thetaJaccardSimilarity", 
emscripten::optional_override([](const std::string& sketch1_b64, const 
std::string& sketch2_b64, uint64_t seed) {
+    std::vector<char> bytes1(b64_dec_len(sketch1_b64.data(), 
sketch1_b64.size()));
+    b64_decode(sketch1_b64.data(), sketch1_b64.size(), bytes1.data());
+    std::vector<char> bytes2(b64_dec_len(sketch2_b64.data(), 
sketch2_b64.size()));
+    b64_decode(sketch2_b64.data(), sketch2_b64.size(), bytes2.data());
+    const auto arr = datasketches::theta_jaccard_similarity::jaccard(
+      wrapped_compact_theta_sketch::wrap(bytes1.data(), bytes1.size(), seed),
+      wrapped_compact_theta_sketch::wrap(bytes2.data(), bytes2.size(), seed),
+      seed
+    );
+    return std::vector<double>{arr[0], arr[1], arr[2]};
+  }));
 }
diff --git a/theta_sketch_jaccard_similarity.sqlx 
b/theta_sketch_jaccard_similarity.sqlx
new file mode 100644
index 0000000..2458475
--- /dev/null
+++ b/theta_sketch_jaccard_similarity.sqlx
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+CREATE OR REPLACE FUNCTION ${self()}(sketch1 BYTES, sketch2 BYTES, seed INT64)
+RETURNS ARRAY<FLOAT64>
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/theta_sketch.js"],
+  description = '''Computes the Jaccard similarity index with upper and lower 
bounds. The Jaccard similarity index
+J(A,B) = (A ^ B)/(A U B) is used to measure how similar the two sketches are 
to each other.
+If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are 
disjoint.
+A Jaccard of .95 means the overlap between the two sets is 95% of the union of 
the two sets.
+Param sketch1: the first sketch as bytes.
+Param sketch2: the second sketch as bytes.
+Param seed: This is used to confirm that the given sketches were configured 
with the correct seed.
+Returns an array of 3 floating-point values {LowerBound, Estimate, UpperBound} 
of the Jaccard index.
+For more details: 
https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html'''
+) AS R"""
+const default_seed = BigInt(9001);
+try {
+  const jaccard = Module.thetaJaccardSimilarity(sketch1, sketch2, seed == null 
? default_seed : BigInt(seed));
+  return [jaccard.get(0), jaccard.get(1), jaccard.get(2)];
+} catch (e) {
+  throw new Error(Module.getExceptionMessage(e));
+}
+""";


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to