This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch python_wrapper_improvement
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


The following commit(s) were added to refs/heads/python_wrapper_improvement by 
this push:
     new 4983682  added iterator, rearranged and simplified existing code
4983682 is described below

commit 498368299ead9a9b1521cdd5526798a1b4923aec
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Thu Dec 22 22:36:58 2022 -0800

    added iterator, rearranged and simplified existing code
---
 python/src/theta_wrapper.cpp | 138 +++++++++++++++++++++----------------------
 python/tests/theta_test.py   |   8 ++-
 2 files changed, 72 insertions(+), 74 deletions(-)

diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index 86a2406..a5b1eb6 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -17,7 +17,6 @@
  * under the License.
  */
 
-#include <sstream>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
@@ -28,51 +27,8 @@
 #include "theta_jaccard_similarity.hpp"
 #include "common_defs.hpp"
 
-
 namespace py = pybind11;
 
-namespace datasketches {
-namespace python {
-
-update_theta_sketch update_theta_sketch_factory(uint8_t lg_k, double p, 
uint64_t seed) {
-  update_theta_sketch::builder builder;
-  builder.set_lg_k(lg_k);
-  builder.set_p(p);
-  builder.set_seed(seed);
-  return builder.build();
-}
-
-theta_union theta_union_factory(uint8_t lg_k, double p, uint64_t seed) {
-  theta_union::builder builder;
-  builder.set_lg_k(lg_k);
-  builder.set_p(p);
-  builder.set_seed(seed);
-  return builder.build();
-}
-
-uint16_t theta_sketch_get_seed_hash(const theta_sketch& sk) {
-  return sk.get_seed_hash();
-}
-
-py::object compact_theta_sketch_serialize(const compact_theta_sketch& sk) {
-  auto serResult = sk.serialize();
-  return py::bytes((char*)serResult.data(), serResult.size());
-}
-
-compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes, 
uint64_t seed) {
-  std::string skStr = skBytes; // implicit cast  
-  return compact_theta_sketch::deserialize(skStr.c_str(), skStr.length(), 
seed);
-}
-
-py::list theta_jaccard_sim_computation(const theta_sketch& sketch_a, const 
theta_sketch& sketch_b, uint64_t seed) {
-  return py::cast(theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed));
-}
-
-}
-}
-
-namespace dspy = datasketches::python;
-
 void init_theta(py::module &m) {
   using namespace datasketches;
 
@@ -93,17 +49,24 @@ void init_theta(py::module &m) {
          "Returns True if sketch is in estimation mode, otherwise False")
     .def("get_theta", &theta_sketch::get_theta,
          "Returns theta (effective sampling rate) as a fraction from 0 to 1")
+    .def("get_theta64", &theta_sketch::get_theta64,
+         "Returns theta as 64-bit value")
     .def("get_num_retained", &theta_sketch::get_num_retained,
          "Retunrs the number of items currently in the sketch")
-    .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash,
+    .def("get_seed_hash", &theta_sketch::get_seed_hash,
          "Returns a hash of the seed used in the sketch")
     .def("is_ordered", &theta_sketch::is_ordered,
          "Returns True if the sketch entries are sorted, otherwise False")
+    .def("__iter__", [](const theta_sketch& s) { return 
py::make_iterator(s.begin(), s.end()); })
   ;
 
   py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
-    .def(py::init(&dspy::update_theta_sketch_factory),
-         py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, 
py::arg("seed")=DEFAULT_SEED)
+    .def(
+        py::init([](uint8_t lg_k, double p, uint64_t seed) {
+          return 
update_theta_sketch::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
+        }),
+        py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, 
py::arg("seed")=DEFAULT_SEED
+    )
     .def(py::init<const update_theta_sketch&>())
     .def("update", (void (update_theta_sketch::*)(int64_t)) 
&update_theta_sketch::update, py::arg("datum"),
          "Updates the sketch with the given integral value")
@@ -118,16 +81,30 @@ void init_theta(py::module &m) {
   py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
     .def(py::init<const compact_theta_sketch&>())
     .def(py::init<const theta_sketch&, bool>())
-    .def("serialize", &dspy::compact_theta_sketch_serialize,
-        "Serializes the sketch into a bytes object")
-    .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
+    .def(
+        "serialize",
+        [](const compact_theta_sketch& sk) {
+          auto bytes = sk.serialize();
+          return py::bytes(reinterpret_cast<const char*>(bytes.data()), 
bytes.size());
+        },
+        "Serializes the sketch into a bytes object"
+    )
+    .def_static(
+        "deserialize",
+        [](const std::string& bytes, uint64_t seed) {
+          return compact_theta_sketch::deserialize(bytes.data(), bytes.size(), 
seed);
+        },
         py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
-        "Reads a bytes object and returns the corresponding 
compact_theta_sketch")        
-  ;
+        "Reads a bytes object and returns the corresponding 
compact_theta_sketch"
+    );
 
   py::class_<theta_union>(m, "theta_union")
-    .def(py::init(&dspy::theta_union_factory),
-         py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, 
py::arg("seed")=DEFAULT_SEED)
+    .def(
+        py::init([](uint8_t lg_k, double p, uint64_t seed) {
+          return 
theta_union::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
+        }),
+        py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, 
py::arg("seed")=DEFAULT_SEED
+    )
     .def("update", &theta_union::update<const theta_sketch&>, 
py::arg("sketch"),
          "Updates the union with the given sketch")
     .def("get_result", &theta_union::get_result, py::arg("ordered")=true,
@@ -147,26 +124,43 @@ void init_theta(py::module &m) {
 
   py::class_<theta_a_not_b>(m, "theta_a_not_b")
     .def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
-    .def("compute", &theta_a_not_b::compute<const theta_sketch&, const 
theta_sketch&>, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
-         "Returns a sketch with the reuslt of appying the A-not-B operation on 
the given inputs")
+    .def(
+        "compute",
+        &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>,
+        py::arg("a"), py::arg("b"), py::arg("ordered")=true,
+        "Returns a sketch with the reuslt of appying the A-not-B operation on 
the given inputs"
+    )
   ;
   
   py::class_<theta_jaccard_similarity>(m, "theta_jaccard_similarity")
-     .def_static("jaccard", &dspy::theta_jaccard_sim_computation,
-                 py::arg("sketch_a"), py::arg("sketch_b"), 
py::arg("seed")=DEFAULT_SEED,
-                 "Returns a list with {lower_bound, estimate, upper_bound} of 
the Jaccard similarity between sketches")
-     .def_static("exactly_equal", 
&theta_jaccard_similarity::exactly_equal<const theta_sketch&, const 
theta_sketch&>,
-                 py::arg("sketch_a"), py::arg("sketch_b"), 
py::arg("seed")=DEFAULT_SEED,
-                 "Returns True if sketch_a and sketch_b are equivalent, 
otherwise False")
-     .def_static("similarity_test", 
&theta_jaccard_similarity::similarity_test<const theta_sketch&, const 
theta_sketch&>,
-                 py::arg("actual"), py::arg("expected"), py::arg("threshold"), 
py::arg("seed")=DEFAULT_SEED,
-                 "Tests similarity of an actual sketch against an expected 
sketch. Computers the lower bound of the Jaccard "
-                 "index J_{LB} of the actual and expected sketches. If J_{LB} 
>= threshold, then the sketches are considered "
-                 "to be similar sith a confidence of 97.7% and returns True, 
otherwise False.")
-     .def_static("dissimilarity_test", 
&theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const 
theta_sketch&>,
-                 py::arg("actual"), py::arg("expected"), py::arg("threshold"), 
py::arg("seed")=DEFAULT_SEED,
-                 "Tests dissimilarity of an actual sketch against an expected 
sketch. Computers the lower bound of the Jaccard "
-                 "index J_{UB} of the actual and expected sketches. If J_{UB} 
<= threshold, then the sketches are considered "
-                 "to be dissimilar sith a confidence of 97.7% and returns 
True, otherwise False.")            
+    .def_static(
+        "jaccard",
+        [](const theta_sketch& sketch_a, const theta_sketch& sketch_b, 
uint64_t seed) {
+          return theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed);
+        },
+        py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
+        "Returns a list with {lower_bound, estimate, upper_bound} of the 
Jaccard similarity between sketches"
+    )
+    .def_static(
+        "exactly_equal",
+        &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const 
theta_sketch&>,
+        py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
+        "Returns True if sketch_a and sketch_b are equivalent, otherwise False"
+    )
+    .def_static(
+        "similarity_test",
+        &theta_jaccard_similarity::similarity_test<const theta_sketch&, const 
theta_sketch&>,
+        py::arg("actual"), py::arg("expected"), py::arg("threshold"), 
py::arg("seed")=DEFAULT_SEED,
+        "Tests similarity of an actual sketch against an expected sketch. 
Computers the lower bound of the Jaccard "
+        "index J_{LB} of the actual and expected sketches. If J_{LB} >= 
threshold, then the sketches are considered "
+        "to be similar sith a confidence of 97.7% and returns True, otherwise 
False.")
+    .def_static(
+        "dissimilarity_test",
+        &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, 
const theta_sketch&>,
+        py::arg("actual"), py::arg("expected"), py::arg("threshold"), 
py::arg("seed")=DEFAULT_SEED,
+        "Tests dissimilarity of an actual sketch against an expected sketch. 
Computers the lower bound of the Jaccard "
+        "index J_{UB} of the actual and expected sketches. If J_{UB} <= 
threshold, then the sketches are considered "
+        "to be dissimilar sith a confidence of 97.7% and returns True, 
otherwise False."
+    )
   ;     
 }
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
index 93e37a4..3f0f697 100644
--- a/python/tests/theta_test.py
+++ b/python/tests/theta_test.py
@@ -48,6 +48,12 @@ class ThetaTest(unittest.TestCase):
         self.assertFalse(sk.is_empty())
         self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
 
+        count = 0
+        for hash in new_sk:
+          self.assertLess(hash, new_sk.get_theta64())
+          count = count + 1
+        self.assertEqual(count, new_sk.get_num_retained())
+
     def test_theta_set_operations(self):
         k = 12      # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
@@ -77,7 +83,6 @@ class ThetaTest(unittest.TestCase):
         self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
         self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
 
-
         # INTERSECTIONS
         # create an intersection object
         intersect = theta_intersection() # no lg_k
@@ -96,7 +101,6 @@ class ThetaTest(unittest.TestCase):
         self.assertLessEqual(result.get_lower_bound(1), n / 4)
         self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
 
-
         # A NOT B
         # create an a_not_b object
         anb = theta_a_not_b() # no lg_k


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to