This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch py_tuple in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 4afb55d65a440943c3df199176020f0e7830077d Author: Jon Malkin <[email protected]> AuthorDate: Fri Jan 27 16:58:08 2023 -0800 Clean up compact tuple creation, add docstrings, define default seed as a named constant --- python/datasketches/TupleWrapper.py | 113 +++++++++++++++++++++++++----------- python/datasketches/__init__.py | 10 ++-- python/src/tuple_wrapper.cpp | 25 ++++---- python/tests/tuple_test.py | 12 ++-- 4 files changed, 106 insertions(+), 54 deletions(-) diff --git a/python/datasketches/TupleWrapper.py b/python/datasketches/TupleWrapper.py index cb32802..155016f 100644 --- a/python/datasketches/TupleWrapper.py +++ b/python/datasketches/TupleWrapper.py @@ -21,51 +21,62 @@ from .TuplePolicy import TuplePolicy from _datasketches import _tuple_sketch, _compact_tuple_sketch, _update_tuple_sketch from _datasketches import _tuple_union, _tuple_intersection from _datasketches import _tuple_a_not_b, _tuple_jaccard_similarity -from _datasketches import PyObjectSerDe +from _datasketches import PyObjectSerDe, theta_sketch class tuple_sketch(ABC): + """An abstract base class representing a Tuple Sketch.""" _gadget: _tuple_sketch def __str__(self, print_items:bool=False): return self._gadget.to_string(print_items) def is_empty(self): + """Returns True if the sketch is empty, otherwise False.""" return self._gadget.is_empty() def get_estimate(self): + """Returns an estimate of the distinct count of the input stream.""" return self._gadget.get_estimate() def get_upper_bound(self, num_std_devs:int): + """Returns an approximate upper bound on the estimate at standard deviations in {1, 2, 3}.""" return self._gadget.get_upper_bound(num_std_devs) def get_lower_bound(self, num_std_devs:int): + """Returns an approximate lower bound on the estimate at standard deviations in {1, 2, 3}.""" return self._gadget.get_lower_bound(num_std_devs) def is_estimation_mode(self): + """Returns True if the sketch is in estimation mode, otherwise False.""" return 
self._gadget.is_estimation_mode() def get_theta(self): + """Returns theta (the effective sampling rate) as a fraction from 0 to 1.""" return self._gadget.get_theta() def get_theta64(self): + """Returns theta as a 64-bit integer value.""" return self._gadget.get_theta64() def get_num_retained(self): + """Returns the number of items currently in the sketch.""" return self._gadget.get_num_retained() def get_seed_hash(self): + """Returns a hash of the seed used in the sketch.""" return self._gadget.get_seed_hash() def is_ordered(self): + """Returns True if the sketch entries are sorted, otherwise False.""" return self._gadget.is_ordered() def __iter__(self): return self._gadget.__iter__() - #.def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); }) - class compact_tuple_sketch(tuple_sketch): + """An instance of a Tuple Sketch that has been compacted and can no longer accept updates.""" + def __init__(self, other:tuple_sketch, ordered:bool = True): if other == None: self._gadget = None @@ -73,89 +84,123 @@ class compact_tuple_sketch(tuple_sketch): self._gadget = _compact_tuple_sketch(other, ordered) def serialize(self, serde:PyObjectSerDe): + """Serializes the sketch into a bytes object with the provided SerDe.""" return self._gadget.serialize(serde) - # TODO: define seed from constant - @staticmethod - def deserialize(data:bytes, serde:PyObjectSerDe, seed:int=9001): - cpp_sk = _compact_tuple_sketch.deserialize(data, serde, seed) - # TODO: this seems inefficinet -- is there some sort of _wrap() - # approach that might work better? 
- sk = compact_tuple_sketch(None, True) - sk._gadget = cpp_sk - return sk + @classmethod + def from_theta_sketch(cls, sketch:theta_sketch, summary, seed:int=_tuple_sketch.DEFAULT_SEED): + """Creates a compact Tuple Sketch from a Theta Sketch using a fixed summary value.""" + self = cls.__new__(cls) + self._gadget = _compact_tuple_sketch(sketch, summary, seed) + return self + + @classmethod + def deserialize(cls, data:bytes, serde:PyObjectSerDe, seed:int=_tuple_sketch.DEFAULT_SEED): + """Reads a bytes object and uses the provided SerDe to return the corresponding compact_tuple_sketch.""" + self = cls.__new__(cls) + self._gadget = _compact_tuple_sketch.deserialize(data, serde, seed) + return self class update_tuple_sketch(tuple_sketch): - # TODO: define seed from constant - def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = 9001): + """An instance of a Tuple Sketch that is available for updates. Requires a Policy object to handle Summary values.""" + + def __init__(self, policy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED): self._policy = policy self._gadget = _update_tuple_sketch(self._policy, lg_k, p, seed) - # TODO: do we need multiple update formats? - def update(self, datum, summary): - self._gadget.update(datum, summary) + def update(self, datum, value): + """Updates the sketch with the provided item and summary value.""" + self._gadget.update(datum, value) def compact(self, ordered:bool = True) -> compact_tuple_sketch: + """Returns a compacted form of the sketch, optionally sorting it.""" return self._gadget.compact(ordered) + def reset(self): + """Resets the sketch to the initial empty state.""" + self._gadget.reset() + + class tuple_union: - # TODO: define seed from constant - def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = 9001): + """An object that can merge Tuple Sketches. 
Requires a Policy object to handle merging Summaries.""" + _policy: TuplePolicy + + def __init__(self, policy:TuplePolicy, lg_k:int = 12, p:float = 1.0, seed:int = _tuple_sketch.DEFAULT_SEED): self._policy = policy self._gadget = _tuple_union(self._policy, lg_k, p, seed) def update(self, sketch:tuple_sketch): - self._gadget.update(sketch._gadget) + """Updates the union with the given sketch.""" + self._gadget.update(sketch._gadget) def get_result(self, ordered:bool = True) -> compact_tuple_sketch: - sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered) - return sk + """Returns the sketch corresponding to the union result, optionally sorted.""" + return compact_tuple_sketch(self._gadget.get_result(ordered), ordered) def reset(self): + """Resets the union to the initial empty state.""" self._gadget.reset() class tuple_intersection: - # TODO: define seed from constant - def __init__(self, policy:TuplePolicy, seed:int = 9001): + """An object that can intersect Tuple Sketches. 
Requires a Policy object to handle intersecting Summaries.""" + _policy: TuplePolicy + + def __init__(self, policy:TuplePolicy, seed:int = _tuple_sketch.DEFAULT_SEED): self._policy = policy self._gadget = _tuple_intersection(self._policy, seed) def update(self, sketch:tuple_sketch): + """Intersects the provided sketch with the current intersection state.""" self._gadget.update(sketch._gadget) def has_result(self) -> bool: + """Returns True if the intersection has a valid result, otherwise False.""" return self._gadget.has_result() def get_result(self, ordered:bool = True) -> compact_tuple_sketch: - sk = compact_tuple_sketch(self._gadget.get_result(ordered), ordered) - return sk + """Returns the sketch corresponding to the intersection result, optionally sorted.""" + return compact_tuple_sketch(self._gadget.get_result(ordered), ordered) class tuple_a_not_b: - def __init__(self, seed:int = 9001): + """An object that can perform the A-not-B operation between two sketches.""" + def __init__(self, seed:int = _tuple_sketch.DEFAULT_SEED): self._gadget = _tuple_a_not_b(seed) def compute(self, a:tuple_sketch, b:tuple_sketch, ordered:bool=True) -> compact_tuple_sketch: - sk = compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget)) - return sk + """Returns a sketch with the result of applying the A-not-B operation on the given inputs.""" + return compact_tuple_sketch(self._gadget.compute(a._gadget, b._gadget)) class tuple_jaccard_similarity: - # TODO: define seed from constant @staticmethod - def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=9001): + def jaccard(a:tuple_sketch, b:tuple_sketch, seed:int=_tuple_sketch.DEFAULT_SEED): + """Returns a list with {lower_bound, estimate, upper_bound} of the Jaccard similarity between sketches.""" return _tuple_jaccard_similarity.jaccard(a._gadget, b._gadget, seed) @staticmethod - def exactly_equal(a:tuple_sketch, b:tuple_sketch, seed:int=9001): + def exactly_equal(a:tuple_sketch, b:tuple_sketch, 
seed:int=_tuple_sketch.DEFAULT_SEED): + """Returns True if sketch_a and sketch_b are equivalent, otherwise False.""" return _tuple_jaccard_similarity.exactly_equal(a._gadget, b._gadget, seed) @staticmethod - def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001): + def similarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED): + """Tests similarity of an actual sketch against an expected sketch. + + Computes the lower bound of the Jaccard index J_{LB} of the actual and expected sketches. + If J_{LB} >= threshold, then the sketches are considered to be similar with a confidence of + 97.7% and returns True, otherwise False. + """ return _tuple_jaccard_similarity.similarity_test(actual._gadget, expected._gadget, threshold, seed) @staticmethod - def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=9001): + def dissimilarity_test(actual:tuple_sketch, expected:tuple_sketch, threshold:float, seed:int=_tuple_sketch.DEFAULT_SEED): + """Tests dissimilarity of an actual sketch against an expected sketch. + + Computes the upper bound of the Jaccard index J_{UB} of the actual and expected sketches. + If J_{UB} <= threshold, then the sketches are considered to be dissimilar with a confidence of + 97.7% and returns True, otherwise False. + """ return _tuple_jaccard_similarity.dissimilarity_test(actual._gadget, expected._gadget, threshold, seed) diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py index 71b2d2c..fb7636e 100644 --- a/python/datasketches/__init__.py +++ b/python/datasketches/__init__.py @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. 
+"""The Apache DataSketches Library for Python + +Provided under the Apache License, Version 2.0 +<http://www.apache.org/licenses/LICENSE-2.0> +""" + name = 'datasketches' from _datasketches import * @@ -27,8 +33,4 @@ from .TuplePolicy import * # the C++ object. Currently, the native python portion of # a class derived from a C++ class may be garbage collected # even though a pointer to the C++ portion remains valid. -# -# These wrappers should exactly implement the target API -# for the pybind11 interface so they can be removed if -# that issue is ever fixed. from .TupleWrapper import * diff --git a/python/src/tuple_wrapper.cpp b/python/src/tuple_wrapper.cpp index 027623b..88a9f71 100644 --- a/python/src/tuple_wrapper.cpp +++ b/python/src/tuple_wrapper.cpp @@ -47,13 +47,16 @@ void init_tuple(py::module &m) { .def("__call__", &tuple_policy::operator(), py::arg("summary"), py::arg("update")) ; - // only needed temporarily -- can remove once everything is working + // potentially useful for debugging but not needed as a permanent + // object type in the library + /* py::class_<tuple_policy_holder>(m, "TuplePolicyHolder") .def(py::init<std::shared_ptr<tuple_policy>>(), py::arg("policy")) .def("create", &tuple_policy_holder::create, "Creates a new Summary object") .def("update", &tuple_policy_holder::update, py::arg("summary"), py::arg("update"), "Updates the provided summary using the data in update") ; + */ using py_tuple_sketch = tuple_sketch<py::object>; using py_update_tuple = update_tuple_sketch<py::object, py::object, tuple_policy_holder>; @@ -89,6 +92,7 @@ void init_tuple(py::module &m) { .def("is_ordered", &py_tuple_sketch::is_ordered, "Returns True if the sketch entries are sorted, otherwise False") .def("__iter__", [](const py_tuple_sketch& s) { return py::make_iterator(s.begin(), s.end()); }) + .def_property_readonly_static("DEFAULT_SEED", [](py::object /* self */) { return DEFAULT_SEED; }); ; py::class_<py_compact_tuple, py_tuple_sketch>(m, 
"_compact_tuple_sketch") @@ -123,16 +127,17 @@ void init_tuple(py::module &m) { ) .def(py::init<const py_update_tuple&>()) .def("update", static_cast<void (py_update_tuple::*)(int64_t, py::object&)>(&py_update_tuple::update), - py::arg("datum"), py::arg("summary"), - "Updates the sketch with the given integral value") + py::arg("datum"), py::arg("value"), + "Updates the sketch with the given integral item and summary value") .def("update", static_cast<void (py_update_tuple::*)(double, py::object&)>(&py_update_tuple::update), - py::arg("datum"), py::arg("summary"), - "Updates the sketch with the given floating point value") + py::arg("datum"), py::arg("value"), + "Updates the sketch with the given floating point item and summary value") .def("update", static_cast<void (py_update_tuple::*)(const std::string&, py::object&)>(&py_update_tuple::update), - py::arg("datum"), py::arg("summary"), - "Updates the sketch with the given string") + py::arg("datum"), py::arg("value"), + "Updates the sketch with the given string item and summary value") .def("compact", &py_update_tuple::compact, py::arg("ordered")=true, "Returns a compacted form of the sketch, optionally sorting it") + .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state") ; py::class_<py_tuple_union>(m, "_tuple_union") @@ -159,7 +164,7 @@ void init_tuple(py::module &m) { }), py::arg("policy"), py::arg("seed")=DEFAULT_SEED) .def("update", &py_tuple_intersection::update<const py_tuple_sketch&>, py::arg("sketch"), - "Intersections the provided sketch with the current intersection state") + "Intersects the provided sketch with the current intersection state") .def("get_result", &py_tuple_intersection::get_result, py::arg("ordered")=true, "Returns the sketch corresponding to the intersection result") .def("has_result", &py_tuple_intersection::has_result, @@ -195,14 +200,14 @@ void init_tuple(py::module &m) { "similarity_test", &py_tuple_jaccard_similarity::similarity_test<const 
py_tuple_sketch&, const py_tuple_sketch&>, py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED, - "Tests similarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard " + "Tests similarity of an actual sketch against an expected sketch. Computes the lower bound of the Jaccard " "index J_{LB} of the actual and expected sketches. If J_{LB} >= threshold, then the sketches are considered " "to be similar sith a confidence of 97.7% and returns True, otherwise False.") .def_static( "dissimilarity_test", &py_tuple_jaccard_similarity::dissimilarity_test<const py_tuple_sketch&, const py_tuple_sketch&>, py::arg("actual"), py::arg("expected"), py::arg("threshold"), py::arg("seed")=DEFAULT_SEED, - "Tests dissimilarity of an actual sketch against an expected sketch. Computers the lower bound of the Jaccard " + "Tests dissimilarity of an actual sketch against an expected sketch. Computes the upper bound of the Jaccard " "index J_{UB} of the actual and expected sketches. If J_{UB} <= threshold, then the sketches are considered " "to be dissimilar sith a confidence of 97.7% and returns True, otherwise False." ) diff --git a/python/tests/tuple_test.py b/python/tests/tuple_test.py index 60732e7..0e94e2e 100644 --- a/python/tests/tuple_test.py +++ b/python/tests/tuple_test.py @@ -32,7 +32,7 @@ class TupleTest(unittest.TestCase): # create a sketch and inject some values -- summary is 2 so we can sum them # and know the reuslt - sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=2) + sk = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=2) # we can check that the upper and lower bounds bracket the # estimate, without needing to know the exact value. 
@@ -84,8 +84,8 @@ class TupleTest(unittest.TestCase): offset = int(3 * n / 4) # it's a float w/o cast # create a couple sketches and inject some values, with different summaries - sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=5) - sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, summary=7, offset=offset) + sk1 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=5) + sk2 = self.generate_tuple_sketch(AccumulatorPolicy(), n, k, value=7, offset=offset) # UNIONS # create a union object @@ -195,11 +195,11 @@ class TupleTest(unittest.TestCase): self.assertTrue(tuple_jaccard_similarity.similarity_test(sk1, result, 0.7)) - # Generates a basic tuple sketch using a fixed integer summary of 2 - def generate_tuple_sketch(self, policy, n, k, summary, offset=0): + # Generates a basic tuple sketch with a fixed value for each update + def generate_tuple_sketch(self, policy, n, k, value, offset=0): sk = update_tuple_sketch(policy, k) for i in range(0, n): - sk.update(i + offset, summary) + sk.update(i + offset, value) return sk if __name__ == '__main__': --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
