This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch python_wrapper_improvement
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
The following commit(s) were added to refs/heads/python_wrapper_improvement by
this push:
new 032dd38 added iterator, rearranged and simplified existing code
032dd38 is described below
commit 032dd381e70b0f51576265d0389a1e940cb92f7e
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Thu Dec 22 17:14:49 2022 -0800
added iterator, rearranged and simplified existing code
---
python/src/quantiles_wrapper.cpp | 235 +++++++++++++++++----------------------
python/tests/quantiles_test.py | 7 ++
2 files changed, 110 insertions(+), 132 deletions(-)
diff --git a/python/src/quantiles_wrapper.cpp b/python/src/quantiles_wrapper.cpp
index b91ea08..60652ba 100644
--- a/python/src/quantiles_wrapper.cpp
+++ b/python/src/quantiles_wrapper.cpp
@@ -17,95 +17,15 @@
* under the License.
*/
-#include "quantiles_sketch.hpp"
-
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <vector>
+#include <stdexcept>
-namespace py = pybind11;
-
-namespace datasketches {
-
-namespace python {
-
-template<typename T>
-quantiles_sketch<T> quantiles_sketch_deserialize(py::bytes sk_bytes) {
- std::string sk_str = sk_bytes; // implicit cast
- return quantiles_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
-}
-
-template<typename T>
-py::object quantiles_sketch_serialize(const quantiles_sketch<T>& sk) {
- auto ser_result = sk.serialize();
- return py::bytes((char*)ser_result.data(), ser_result.size());
-}
-
-// maybe possible to disambiguate the static vs method rank error calls, but
-// this is easier for now
-template<typename T>
-double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
- return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
-}
-
-template<typename T>
-py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
- std::vector<double>& ranks,
- bool inclusive) {
- size_t n_quantiles = ranks.size();
- auto result = sk.get_quantiles(ranks.data(),
static_cast<uint32_t>(n_quantiles), inclusive);
- // returning as std::vector<> would copy values to a list anyway
- py::list list(n_quantiles);
- for (size_t i = 0; i < n_quantiles; ++i) {
- list[i] = result[i];
- }
- return list;
-}
-
-template<typename T>
-py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
- std::vector<T>& split_points,
- bool inclusive) {
- size_t n_points = split_points.size();
- auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
- py::list list(n_points + 1);
- for (size_t i = 0; i <= n_points; ++i) {
- list[i] = result[i];
- }
- return list;
-}
-
-template<typename T>
-py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
- std::vector<T>& split_points,
- bool inclusive) {
- size_t n_points = split_points.size();
- auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
- py::list list(n_points + 1);
- for (size_t i = 0; i <= n_points; ++i) {
- list[i] = result[i];
- }
- return list;
-}
-
-template<typename T>
-void quantiles_sketch_update(quantiles_sketch<T>& sk, py::array_t<T,
py::array::c_style | py::array::forcecast> items) {
- if (items.ndim() != 1) {
- throw std::invalid_argument("input data must have only one dimension.
Found: "
- + std::to_string(items.ndim()));
- }
-
- auto data = items.template unchecked<1>();
- for (uint32_t i = 0; i < data.size(); ++i) {
- sk.update(data(i));
- }
-}
-
-}
-}
+#include "quantiles_sketch.hpp"
-namespace dspy = datasketches::python;
+namespace py = pybind11;
template<typename T>
void bind_quantiles_sketch(py::module &m, const char* name) {
@@ -114,12 +34,27 @@ void bind_quantiles_sketch(py::module &m, const char*
name) {
py::class_<quantiles_sketch<T>>(m, name)
.def(py::init<uint16_t>(), py::arg("k")=quantiles_constants::DEFAULT_K)
.def(py::init<const quantiles_sketch<T>&>())
- .def("update", (void (quantiles_sketch<T>::*)(const T&))
&quantiles_sketch<T>::update, py::arg("item"),
- "Updates the sketch with the given value")
- .def("update", &dspy::quantiles_sketch_update<T>, py::arg("array"),
- "Updates the sketch with the values in the given array")
+ .def(
+ "update",
+ static_cast<void (quantiles_sketch<T>::*)(const
T&)>(&quantiles_sketch<T>::update),
+ py::arg("item"),
+ "Updates the sketch with the given value"
+ )
+ .def(
+ "update",
+ [](quantiles_sketch<T>& sk, py::array_t<T, py::array::c_style |
py::array::forcecast> items) {
+ if (items.ndim() != 1) {
+ throw std::invalid_argument("input data must have only one
dimension. Found: "
+ + std::to_string(items.ndim()));
+ }
+ auto array = items.template unchecked<1>();
+ for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i));
+ },
+ py::arg("array"),
+ "Updates the sketch with the values in the given array"
+ )
.def("merge", (void (quantiles_sketch<T>::*)(const quantiles_sketch<T>&))
&quantiles_sketch<T>::merge, py::arg("sketch"),
- "Merges the provided sketch into the this one")
+ "Merges the provided sketch into this one")
.def("__str__", &quantiles_sketch<T>::to_string,
py::arg("print_levels")=false, py::arg("print_items")=false,
"Produces a string summary of the sketch")
.def("to_string", &quantiles_sketch<T>::to_string,
py::arg("print_levels")=false, py::arg("print_items")=false,
@@ -144,11 +79,17 @@ void bind_quantiles_sketch(py::module &m, const char*
name) {
"version of the input stream so far.\n"
"For quantiles_floats_sketch: if the sketch is empty this returns
nan. "
"For quantiles_ints_sketch: if the sketch is empty this throws a
RuntimeError.")
- .def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>,
py::arg("ranks"), py::arg("inclusive")=false,
- "This returns an array that could have been generated by using
get_quantile() for each "
- "normalized rank separately.\n"
- "If the sketch is empty this returns an empty vector.\n"
- "Deprecated. Will be removed in the next major version. Use
get_quantile() instead.")
+ .def(
+ "get_quantiles",
+ [](const quantiles_sketch<T>& sk, const std::vector<double>& ranks,
bool inclusive) {
+ return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
+ },
+ py::arg("ranks"), py::arg("inclusive")=false,
+ "This returns an array that could have been generated by using
get_quantile() for each "
+ "normalized rank separately.\n"
+ "If the sketch is empty this returns an empty vector.\n"
+ "Deprecated. Will be removed in the next major version. Use
get_quantile() instead."
+ )
.def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"),
py::arg("inclusive")=false,
"Returns an approximation to the normalized rank of the given value
from 0 to 1, inclusive.\n"
"The resulting approximation has a probabilistic guarantee that can
be obtained from the "
@@ -156,45 +97,75 @@ void bind_quantiles_sketch(py::module &m, const char*
name) {
"With the parameter inclusive=true the weight of the given value is
included into the rank."
"Otherwise the rank equals the sum of the weights of values less than
the given value.\n"
"If the sketch is empty this returns nan.")
- .def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>,
py::arg("split_points"), py::arg("inclusive")=false,
- "Returns an approximation to the Probability Mass Function (PMF) of
the input stream "
- "given a set of split points (values).\n"
- "The resulting approximations have a probabilistic guarantee that can
be obtained from the "
- "get_normalized_rank_error(True) function.\n"
- "If the sketch is empty this returns an empty vector.\n"
- "split_points is an array of m unique, monotonically increasing float
values "
- "that divide the real number line into m+1 consecutive disjoint
intervals.\n"
- "The definition of an 'interval' is inclusive of the left split point
(or minimum value) and "
- "exclusive of the right split point, with the exception that the last
interval will include "
- "the maximum value.\n"
- "It is not necessary to include either the min or max values in these
split points.")
- .def("get_cdf", &dspy::quantiles_sketch_get_cdf<T>,
py::arg("split_points"), py::arg("inclusive")=false,
- "Returns an approximation to the Cumulative Distribution Function
(CDF), which is the "
- "cumulative analog of the PMF, of the input stream given a set of
split points (values).\n"
- "The resulting approximations have a probabilistic guarantee that can
be obtained from the "
- "get_normalized_rank_error(True) function.\n"
- "If the sketch is empty this returns an empty vector.\n"
- "split_points is an array of m unique, monotonically increasing float
values "
- "that divide the real number line into m+1 consecutive disjoint
intervals.\n"
- "The definition of an 'interval' is inclusive of the left split point
(or minimum value) and "
- "exclusive of the right split point, with the exception that the last
interval will include "
- "the maximum value.\n"
- "It is not necessary to include either the min or max values in these
split points.")
- .def("normalized_rank_error", (double (quantiles_sketch<T>::*)(bool)
const) &quantiles_sketch<T>::get_normalized_rank_error,
- py::arg("as_pmf"),
- "Gets the normalized rank error for this sketch.\n"
- "If pmf is True, returns the 'double-sided' normalized rank error for
the get_PMF() function.\n"
- "Otherwise, it is the 'single-sided' normalized rank error for all
the other queries.\n"
- "Constants were derived as the best fit to 99 percentile empirically
measured max error in thousands of trials")
- .def_static("get_normalized_rank_error",
&dspy::quantiles_sketch_generic_normalized_rank_error<T>,
- py::arg("k"), py::arg("as_pmf"),
- "Gets the normalized rank error given parameters k and the pmf
flag.\n"
- "If pmf is True, returns the 'double-sided' normalized rank error for
the get_PMF() function.\n"
- "Otherwise, it is the 'single-sided' normalized rank error for all
the other queries.\n"
- "Constants were derived as the best fit to 99 percentile empirically
measured max error in thousands of trials")
- .def("serialize", &dspy::quantiles_sketch_serialize<T>, "Serializes the
sketch into a bytes object")
- .def_static("deserialize", &dspy::quantiles_sketch_deserialize<T>,
"Deserializes the sketch from a bytes object")
- ;
+ .def(
+ "get_pmf",
+ [](const quantiles_sketch<T>& sk, const std::vector<T>& split_points,
bool inclusive) {
+ return sk.get_PMF(split_points.data(), split_points.size(),
inclusive);
+ },
+ py::arg("split_points"), py::arg("inclusive")=false,
+ "Returns an approximation to the Probability Mass Function (PMF) of
the input stream "
+ "given a set of split points (values).\n"
+ "The resulting approximations have a probabilistic guarantee that can
be obtained from the "
+ "get_normalized_rank_error(True) function.\n"
+ "If the sketch is empty this returns an empty vector.\n"
+ "split_points is an array of m unique, monotonically increasing float
values "
+ "that divide the real number line into m+1 consecutive disjoint
intervals.\n"
+ "The definition of an 'interval' is inclusive of the left split point
(or minimum value) and "
+ "exclusive of the right split point, with the exception that the last
interval will include "
+ "the maximum value.\n"
+ "It is not necessary to include either the min or max values in these
split points."
+ )
+ .def(
+ "get_cdf",
+ [](const quantiles_sketch<T>& sk, const std::vector<T>& split_points,
bool inclusive) {
+ return sk.get_CDF(split_points.data(), split_points.size(),
inclusive);
+ },
+ py::arg("split_points"), py::arg("inclusive")=false,
+ "Returns an approximation to the Cumulative Distribution Function
(CDF), which is the "
+ "cumulative analog of the PMF, of the input stream given a set of
split points (values).\n"
+ "The resulting approximations have a probabilistic guarantee that can
be obtained from the "
+ "get_normalized_rank_error(True) function.\n"
+ "If the sketch is empty this returns an empty vector.\n"
+ "split_points is an array of m unique, monotonically increasing float
values "
+ "that divide the real number line into m+1 consecutive disjoint
intervals.\n"
+ "The definition of an 'interval' is inclusive of the left split point
(or minimum value) and "
+ "exclusive of the right split point, with the exception that the last
interval will include "
+ "the maximum value.\n"
+ "It is not necessary to include either the min or max values in these
split points."
+ )
+ .def(
+ "normalized_rank_error",
+ static_cast<double (quantiles_sketch<T>::*)(bool)
const>(&quantiles_sketch<T>::get_normalized_rank_error),
+ py::arg("as_pmf"),
+ "Gets the normalized rank error for this sketch.\n"
+ "If pmf is True, returns the 'double-sided' normalized rank error for
the get_PMF() function.\n"
+ "Otherwise, it is the 'single-sided' normalized rank error for all the
other queries.\n"
+ "Constants were derived as the best fit to 99 percentile empirically
measured max error in thousands of trials"
+ )
+ .def_static(
+ "get_normalized_rank_error",
+ [](uint16_t k, bool pmf) { return
quantiles_sketch<T>::get_normalized_rank_error(k, pmf); },
+ py::arg("k"), py::arg("as_pmf"),
+ "Gets the normalized rank error given parameters k and the pmf flag.\n"
+ "If pmf is True, returns the 'double-sided' normalized rank error for
the get_PMF() function.\n"
+ "Otherwise, it is the 'single-sided' normalized rank error for all the
other queries.\n"
+ "Constants were derived as the best fit to 99 percentile empirically
measured max error in thousands of trials"
+ )
+ .def(
+ "serialize",
+ [](const quantiles_sketch<T>& sk) {
+ auto bytes = sk.serialize();
+ return py::bytes(reinterpret_cast<const char*>(bytes.data()),
bytes.size());
+ },
+ "Serializes the sketch into a bytes object"
+ )
+ .def_static(
+ "deserialize",
+ [](const std::string& bytes) { return
quantiles_sketch<T>::deserialize(bytes.data(), bytes.size()); },
+ py::arg("bytes"),
+ "Deserializes the sketch from a bytes object"
+ )
+ .def("__iter__", [](const quantiles_sketch<T>& s) { return
py::make_iterator(s.begin(), s.end()); });
}
void init_quantiles(py::module &m) {
diff --git a/python/tests/quantiles_test.py b/python/tests/quantiles_test.py
index 7e4a40f..e2bd773 100644
--- a/python/tests/quantiles_test.py
+++ b/python/tests/quantiles_test.py
@@ -80,6 +80,13 @@ class QuantilesTest(unittest.TestCase):
unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
+ total_weight = 0
+ for tuple in quantiles:
+ item = tuple[0]
+ weight = tuple[1]
+ total_weight = total_weight + weight
+ self.assertEqual(total_weight, quantiles.get_n())
+
def test_quantiles_ints_sketch(self):
k = 128
n = 10
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]