This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch python_wrapper_improvement
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/python_wrapper_improvement by this push:
new 4983682 added iterator, rearranged and simplified existing code
4983682 is described below
commit 498368299ead9a9b1521cdd5526798a1b4923aec
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Thu Dec 22 22:36:58 2022 -0800
added iterator, rearranged and simplified existing code
---
python/src/theta_wrapper.cpp | 138 +++++++++++++++++++++----------------------
python/tests/theta_test.py | 8 ++-
2 files changed, 72 insertions(+), 74 deletions(-)
diff --git a/python/src/theta_wrapper.cpp b/python/src/theta_wrapper.cpp
index 86a2406..a5b1eb6 100644
--- a/python/src/theta_wrapper.cpp
+++ b/python/src/theta_wrapper.cpp
@@ -17,7 +17,6 @@
* under the License.
*/
-#include <sstream>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
@@ -28,51 +27,8 @@
#include "theta_jaccard_similarity.hpp"
#include "common_defs.hpp"
-
namespace py = pybind11;
-namespace datasketches {
-namespace python {
-
-update_theta_sketch update_theta_sketch_factory(uint8_t lg_k, double p,
uint64_t seed) {
- update_theta_sketch::builder builder;
- builder.set_lg_k(lg_k);
- builder.set_p(p);
- builder.set_seed(seed);
- return builder.build();
-}
-
-theta_union theta_union_factory(uint8_t lg_k, double p, uint64_t seed) {
- theta_union::builder builder;
- builder.set_lg_k(lg_k);
- builder.set_p(p);
- builder.set_seed(seed);
- return builder.build();
-}
-
-uint16_t theta_sketch_get_seed_hash(const theta_sketch& sk) {
- return sk.get_seed_hash();
-}
-
-py::object compact_theta_sketch_serialize(const compact_theta_sketch& sk) {
- auto serResult = sk.serialize();
- return py::bytes((char*)serResult.data(), serResult.size());
-}
-
-compact_theta_sketch compact_theta_sketch_deserialize(py::bytes skBytes,
uint64_t seed) {
- std::string skStr = skBytes; // implicit cast
- return compact_theta_sketch::deserialize(skStr.c_str(), skStr.length(),
seed);
-}
-
-py::list theta_jaccard_sim_computation(const theta_sketch& sketch_a, const
theta_sketch& sketch_b, uint64_t seed) {
- return py::cast(theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed));
-}
-
-}
-}
-
-namespace dspy = datasketches::python;
-
void init_theta(py::module &m) {
using namespace datasketches;
@@ -93,17 +49,24 @@ void init_theta(py::module &m) {
"Returns True if sketch is in estimation mode, otherwise False")
.def("get_theta", &theta_sketch::get_theta,
"Returns theta (effective sampling rate) as a fraction from 0 to 1")
+ .def("get_theta64", &theta_sketch::get_theta64,
+ "Returns theta as 64-bit value")
.def("get_num_retained", &theta_sketch::get_num_retained,
"Retunrs the number of items currently in the sketch")
- .def("get_seed_hash", &dspy::theta_sketch_get_seed_hash,
+ .def("get_seed_hash", &theta_sketch::get_seed_hash,
"Returns a hash of the seed used in the sketch")
.def("is_ordered", &theta_sketch::is_ordered,
"Returns True if the sketch entries are sorted, otherwise False")
+ .def("__iter__", [](const theta_sketch& s) { return
py::make_iterator(s.begin(), s.end()); })
;
py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
- .def(py::init(&dspy::update_theta_sketch_factory),
- py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0,
py::arg("seed")=DEFAULT_SEED)
+ .def(
+ py::init([](uint8_t lg_k, double p, uint64_t seed) {
+ return
update_theta_sketch::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
+ }),
+ py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0,
py::arg("seed")=DEFAULT_SEED
+ )
.def(py::init<const update_theta_sketch&>())
.def("update", (void (update_theta_sketch::*)(int64_t))
&update_theta_sketch::update, py::arg("datum"),
"Updates the sketch with the given integral value")
@@ -118,16 +81,30 @@ void init_theta(py::module &m) {
py::class_<compact_theta_sketch, theta_sketch>(m, "compact_theta_sketch")
.def(py::init<const compact_theta_sketch&>())
.def(py::init<const theta_sketch&, bool>())
- .def("serialize", &dspy::compact_theta_sketch_serialize,
- "Serializes the sketch into a bytes object")
- .def_static("deserialize", &dspy::compact_theta_sketch_deserialize,
+ .def(
+ "serialize",
+ [](const compact_theta_sketch& sk) {
+ auto bytes = sk.serialize();
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()),
bytes.size());
+ },
+ "Serializes the sketch into a bytes object"
+ )
+ .def_static(
+ "deserialize",
+ [](const std::string& bytes, uint64_t seed) {
+ return compact_theta_sketch::deserialize(bytes.data(), bytes.size(),
seed);
+ },
py::arg("bytes"), py::arg("seed")=DEFAULT_SEED,
- "Reads a bytes object and returns the corresponding
compact_theta_sketch")
- ;
+ "Reads a bytes object and returns the corresponding
compact_theta_sketch"
+ );
py::class_<theta_union>(m, "theta_union")
- .def(py::init(&dspy::theta_union_factory),
- py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0,
py::arg("seed")=DEFAULT_SEED)
+ .def(
+ py::init([](uint8_t lg_k, double p, uint64_t seed) {
+ return
theta_union::builder().set_lg_k(lg_k).set_p(p).set_seed(seed).build();
+ }),
+ py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0,
py::arg("seed")=DEFAULT_SEED
+ )
.def("update", &theta_union::update<const theta_sketch&>,
py::arg("sketch"),
"Updates the union with the given sketch")
.def("get_result", &theta_union::get_result, py::arg("ordered")=true,
@@ -147,26 +124,43 @@ void init_theta(py::module &m) {
py::class_<theta_a_not_b>(m, "theta_a_not_b")
.def(py::init<uint64_t>(), py::arg("seed")=DEFAULT_SEED)
- .def("compute", &theta_a_not_b::compute<const theta_sketch&, const
theta_sketch&>, py::arg("a"), py::arg("b"), py::arg("ordered")=true,
- "Returns a sketch with the reuslt of appying the A-not-B operation on
the given inputs")
+ .def(
+ "compute",
+ &theta_a_not_b::compute<const theta_sketch&, const theta_sketch&>,
+ py::arg("a"), py::arg("b"), py::arg("ordered")=true,
+ "Returns a sketch with the reuslt of appying the A-not-B operation on
the given inputs"
+ )
;
py::class_<theta_jaccard_similarity>(m, "theta_jaccard_similarity")
- .def_static("jaccard", &dspy::theta_jaccard_sim_computation,
- py::arg("sketch_a"), py::arg("sketch_b"),
py::arg("seed")=DEFAULT_SEED,
- "Returns a list with {lower_bound, estimate, upper_bound} of
the Jaccard similarity between sketches")
- .def_static("exactly_equal",
&theta_jaccard_similarity::exactly_equal<const theta_sketch&, const
theta_sketch&>,
- py::arg("sketch_a"), py::arg("sketch_b"),
py::arg("seed")=DEFAULT_SEED,
- "Returns True if sketch_a and sketch_b are equivalent,
otherwise False")
- .def_static("similarity_test",
&theta_jaccard_similarity::similarity_test<const theta_sketch&, const
theta_sketch&>,
- py::arg("actual"), py::arg("expected"), py::arg("threshold"),
py::arg("seed")=DEFAULT_SEED,
- "Tests similarity of an actual sketch against an expected
sketch. Computers the lower bound of the Jaccard "
- "index J_{LB} of the actual and expected sketches. If J_{LB}
>= threshold, then the sketches are considered "
- "to be similar sith a confidence of 97.7% and returns True,
otherwise False.")
- .def_static("dissimilarity_test",
&theta_jaccard_similarity::dissimilarity_test<const theta_sketch&, const
theta_sketch&>,
- py::arg("actual"), py::arg("expected"), py::arg("threshold"),
py::arg("seed")=DEFAULT_SEED,
- "Tests dissimilarity of an actual sketch against an expected
sketch. Computers the lower bound of the Jaccard "
- "index J_{UB} of the actual and expected sketches. If J_{UB}
<= threshold, then the sketches are considered "
- "to be dissimilar sith a confidence of 97.7% and returns
True, otherwise False.")
+ .def_static(
+ "jaccard",
+ [](const theta_sketch& sketch_a, const theta_sketch& sketch_b,
uint64_t seed) {
+ return theta_jaccard_similarity::jaccard(sketch_a, sketch_b, seed);
+ },
+ py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
+ "Returns a list with {lower_bound, estimate, upper_bound} of the
Jaccard similarity between sketches"
+ )
+ .def_static(
+ "exactly_equal",
+ &theta_jaccard_similarity::exactly_equal<const theta_sketch&, const
theta_sketch&>,
+ py::arg("sketch_a"), py::arg("sketch_b"), py::arg("seed")=DEFAULT_SEED,
+ "Returns True if sketch_a and sketch_b are equivalent, otherwise False"
+ )
+ .def_static(
+ "similarity_test",
+ &theta_jaccard_similarity::similarity_test<const theta_sketch&, const
theta_sketch&>,
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"),
py::arg("seed")=DEFAULT_SEED,
+ "Tests similarity of an actual sketch against an expected sketch.
Computers the lower bound of the Jaccard "
+ "index J_{LB} of the actual and expected sketches. If J_{LB} >=
threshold, then the sketches are considered "
+ "to be similar sith a confidence of 97.7% and returns True, otherwise
False.")
+ .def_static(
+ "dissimilarity_test",
+ &theta_jaccard_similarity::dissimilarity_test<const theta_sketch&,
const theta_sketch&>,
+ py::arg("actual"), py::arg("expected"), py::arg("threshold"),
py::arg("seed")=DEFAULT_SEED,
+ "Tests dissimilarity of an actual sketch against an expected sketch.
Computers the lower bound of the Jaccard "
+ "index J_{UB} of the actual and expected sketches. If J_{UB} <=
threshold, then the sketches are considered "
+ "to be dissimilar sith a confidence of 97.7% and returns True,
otherwise False."
+ )
;
}
diff --git a/python/tests/theta_test.py b/python/tests/theta_test.py
index 93e37a4..3f0f697 100644
--- a/python/tests/theta_test.py
+++ b/python/tests/theta_test.py
@@ -48,6 +48,12 @@ class ThetaTest(unittest.TestCase):
self.assertFalse(sk.is_empty())
self.assertEqual(sk.get_estimate(), new_sk.get_estimate())
+ count = 0
+ for hash in new_sk:
+ self.assertLess(hash, new_sk.get_theta64())
+ count = count + 1
+ self.assertEqual(count, new_sk.get_num_retained())
+
def test_theta_set_operations(self):
k = 12 # 2^k = 4096 rows in the table
n = 1 << 18 # ~256k unique values
@@ -77,7 +83,6 @@ class ThetaTest(unittest.TestCase):
self.assertLessEqual(result.get_lower_bound(1), 7 * n / 4)
self.assertGreaterEqual(result.get_upper_bound(1), 7 * n / 4)
-
# INTERSECTIONS
# create an intersection object
intersect = theta_intersection() # no lg_k
@@ -96,7 +101,6 @@ class ThetaTest(unittest.TestCase):
self.assertLessEqual(result.get_lower_bound(1), n / 4)
self.assertGreaterEqual(result.get_upper_bound(1), n / 4)
-
# A NOT B
# create an a_not_b object
anb = theta_a_not_b() # no lg_k
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]