This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch python_wrapper_improvement
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git


The following commit(s) were added to refs/heads/python_wrapper_improvement by 
this push:
     new 032dd38  added iterator, rearranged and simplified existing code
032dd38 is described below

commit 032dd381e70b0f51576265d0389a1e940cb92f7e
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Thu Dec 22 17:14:49 2022 -0800

    added iterator, rearranged and simplified existing code
---
 python/src/quantiles_wrapper.cpp | 235 +++++++++++++++++----------------------
 python/tests/quantiles_test.py   |   7 ++
 2 files changed, 110 insertions(+), 132 deletions(-)

diff --git a/python/src/quantiles_wrapper.cpp b/python/src/quantiles_wrapper.cpp
index b91ea08..60652ba 100644
--- a/python/src/quantiles_wrapper.cpp
+++ b/python/src/quantiles_wrapper.cpp
@@ -17,95 +17,15 @@
  * under the License.
  */
 
-#include "quantiles_sketch.hpp"
-
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <pybind11/numpy.h>
 #include <vector>
+#include <stdexcept>
 
-namespace py = pybind11;
-
-namespace datasketches {
-
-namespace python {
-
-template<typename T>
-quantiles_sketch<T> quantiles_sketch_deserialize(py::bytes sk_bytes) {
-  std::string sk_str = sk_bytes; // implicit cast  
-  return quantiles_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
-}
-
-template<typename T>
-py::object quantiles_sketch_serialize(const quantiles_sketch<T>& sk) {
-  auto ser_result = sk.serialize();
-  return py::bytes((char*)ser_result.data(), ser_result.size());
-}
-
-// maybe possible to disambiguate the static vs method rank error calls, but
-// this is easier for now
-template<typename T>
-double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
-  return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
-}
-
-template<typename T>
-py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
-                                        std::vector<double>& ranks,
-                                        bool inclusive) {
-  size_t n_quantiles = ranks.size();
-  auto result = sk.get_quantiles(ranks.data(), 
static_cast<uint32_t>(n_quantiles), inclusive);
-  // returning as std::vector<> would copy values to a list anyway
-  py::list list(n_quantiles);
-  for (size_t i = 0; i < n_quantiles; ++i) {
-      list[i] = result[i];
-  }
-  return list;
-}
-
-template<typename T>
-py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
-                                  std::vector<T>& split_points,
-                                  bool inclusive) {
-  size_t n_points = split_points.size();
-  auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
-  py::list list(n_points + 1);
-  for (size_t i = 0; i <= n_points; ++i) {
-    list[i] = result[i];
-  }
-  return list;
-}
-
-template<typename T>
-py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
-                                  std::vector<T>& split_points,
-                                  bool inclusive) {
-  size_t n_points = split_points.size();
-  auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
-  py::list list(n_points + 1);
-  for (size_t i = 0; i <= n_points; ++i) {
-    list[i] = result[i];
-  }
-  return list;
-}
-
-template<typename T>
-void quantiles_sketch_update(quantiles_sketch<T>& sk, py::array_t<T, 
py::array::c_style | py::array::forcecast> items) {
-  if (items.ndim() != 1) {
-    throw std::invalid_argument("input data must have only one dimension. 
Found: "
-          + std::to_string(items.ndim()));
-  }
-  
-  auto data = items.template unchecked<1>();
-  for (uint32_t i = 0; i < data.size(); ++i) {
-    sk.update(data(i));
-  }
-}
-
-}
-}
+#include "quantiles_sketch.hpp"
 
-namespace dspy = datasketches::python;
+namespace py = pybind11;
 
 template<typename T>
 void bind_quantiles_sketch(py::module &m, const char* name) {
@@ -114,12 +34,27 @@ void bind_quantiles_sketch(py::module &m, const char* 
name) {
   py::class_<quantiles_sketch<T>>(m, name)
     .def(py::init<uint16_t>(), py::arg("k")=quantiles_constants::DEFAULT_K)
     .def(py::init<const quantiles_sketch<T>&>())
-    .def("update", (void (quantiles_sketch<T>::*)(const T&)) 
&quantiles_sketch<T>::update, py::arg("item"),
-         "Updates the sketch with the given value")
-    .def("update", &dspy::quantiles_sketch_update<T>, py::arg("array"),
-         "Updates the sketch with the values in the given array")
+    .def(
+        "update",
+        static_cast<void (quantiles_sketch<T>::*)(const 
T&)>(&quantiles_sketch<T>::update),
+        py::arg("item"),
+        "Updates the sketch with the given value"
+    )
+    .def(
+        "update",
+        [](quantiles_sketch<T>& sk, py::array_t<T, py::array::c_style | 
py::array::forcecast> items) {
+          if (items.ndim() != 1) {
+            throw std::invalid_argument("input data must have only one 
dimension. Found: "
+              + std::to_string(items.ndim()));
+          }
+          auto array = items.template unchecked<1>();
+          for (uint32_t i = 0; i < array.size(); ++i) sk.update(array(i));
+        },
+        py::arg("array"),
+        "Updates the sketch with the values in the given array"
+    )
     .def("merge", (void (quantiles_sketch<T>::*)(const quantiles_sketch<T>&)) 
&quantiles_sketch<T>::merge, py::arg("sketch"),
-         "Merges the provided sketch into the this one")
+         "Merges the provided sketch into this one")
     .def("__str__", &quantiles_sketch<T>::to_string, 
py::arg("print_levels")=false, py::arg("print_items")=false,
          "Produces a string summary of the sketch")
     .def("to_string", &quantiles_sketch<T>::to_string, 
py::arg("print_levels")=false, py::arg("print_items")=false,
@@ -144,11 +79,17 @@ void bind_quantiles_sketch(py::module &m, const char* 
name) {
          "version of the input stream so far.\n"
          "For quantiles_floats_sketch: if the sketch is empty this returns 
nan. "
          "For quantiles_ints_sketch: if the sketch is empty this throws a 
RuntimeError.")
-    .def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, 
py::arg("ranks"), py::arg("inclusive")=false,
-         "This returns an array that could have been generated by using 
get_quantile() for each "
-         "normalized rank separately.\n"
-         "If the sketch is empty this returns an empty vector.\n"
-         "Deprecated. Will be removed in the next major version. Use 
get_quantile() instead.")
+    .def(
+        "get_quantiles",
+        [](const quantiles_sketch<T>& sk, const std::vector<double>& ranks, 
bool inclusive) {
+          return sk.get_quantiles(ranks.data(), ranks.size(), inclusive);
+        },
+        py::arg("ranks"), py::arg("inclusive")=false,
+        "This returns an array that could have been generated by using 
get_quantile() for each "
+        "normalized rank separately.\n"
+        "If the sketch is empty this returns an empty vector.\n"
+        "Deprecated. Will be removed in the next major version. Use 
get_quantile() instead."
+    )
     .def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"), 
py::arg("inclusive")=false,
          "Returns an approximation to the normalized rank of the given value 
from 0 to 1, inclusive.\n"
          "The resulting approximation has a probabilistic guarantee that can 
be obtained from the "
@@ -156,45 +97,75 @@ void bind_quantiles_sketch(py::module &m, const char* 
name) {
          "With the parameter inclusive=true the weight of the given value is 
included into the rank."
          "Otherwise the rank equals the sum of the weights of values less than 
the given value.\n"
          "If the sketch is empty this returns nan.")
-    .def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, 
py::arg("split_points"), py::arg("inclusive")=false,
-         "Returns an approximation to the Probability Mass Function (PMF) of 
the input stream "
-         "given a set of split points (values).\n"
-         "The resulting approximations have a probabilistic guarantee that can 
be obtained from the "
-         "get_normalized_rank_error(True) function.\n"
-         "If the sketch is empty this returns an empty vector.\n"
-         "split_points is an array of m unique, monotonically increasing float 
values "
-         "that divide the real number line into m+1 consecutive disjoint 
intervals.\n"
-         "The definition of an 'interval' is inclusive of the left split point 
(or minimum value) and "
-         "exclusive of the right split point, with the exception that the last 
interval will include "
-         "the maximum value.\n"
-         "It is not necessary to include either the min or max values in these 
split points.")
-    .def("get_cdf", &dspy::quantiles_sketch_get_cdf<T>, 
py::arg("split_points"), py::arg("inclusive")=false,
-         "Returns an approximation to the Cumulative Distribution Function 
(CDF), which is the "
-         "cumulative analog of the PMF, of the input stream given a set of 
split points (values).\n"
-         "The resulting approximations have a probabilistic guarantee that can 
be obtained from the "
-         "get_normalized_rank_error(True) function.\n"
-         "If the sketch is empty this returns an empty vector.\n"
-         "split_points is an array of m unique, monotonically increasing float 
values "
-         "that divide the real number line into m+1 consecutive disjoint 
intervals.\n"
-         "The definition of an 'interval' is inclusive of the left split point 
(or minimum value) and "
-         "exclusive of the right split point, with the exception that the last 
interval will include "
-         "the maximum value.\n"
-         "It is not necessary to include either the min or max values in these 
split points.")
-    .def("normalized_rank_error", (double (quantiles_sketch<T>::*)(bool) 
const) &quantiles_sketch<T>::get_normalized_rank_error,
-         py::arg("as_pmf"),
-         "Gets the normalized rank error for this sketch.\n"
-         "If pmf is True, returns the 'double-sided' normalized rank error for 
the get_PMF() function.\n"
-         "Otherwise, it is the 'single-sided' normalized rank error for all 
the other queries.\n"
-         "Constants were derived as the best fit to 99 percentile empirically 
measured max error in thousands of trials")
-    .def_static("get_normalized_rank_error", 
&dspy::quantiles_sketch_generic_normalized_rank_error<T>,
-         py::arg("k"), py::arg("as_pmf"),
-         "Gets the normalized rank error given parameters k and the pmf 
flag.\n"
-         "If pmf is True, returns the 'double-sided' normalized rank error for 
the get_PMF() function.\n"
-         "Otherwise, it is the 'single-sided' normalized rank error for all 
the other queries.\n"
-         "Constants were derived as the best fit to 99 percentile empirically 
measured max error in thousands of trials")
-    .def("serialize", &dspy::quantiles_sketch_serialize<T>, "Serializes the 
sketch into a bytes object")
-    .def_static("deserialize", &dspy::quantiles_sketch_deserialize<T>, 
"Deserializes the sketch from a bytes object")
-    ;
+    .def(
+        "get_pmf",
+        [](const quantiles_sketch<T>& sk, const std::vector<T>& split_points, 
bool inclusive) {
+          return sk.get_PMF(split_points.data(), split_points.size(), 
inclusive);
+        },
+        py::arg("split_points"), py::arg("inclusive")=false,
+        "Returns an approximation to the Probability Mass Function (PMF) of 
the input stream "
+        "given a set of split points (values).\n"
+        "The resulting approximations have a probabilistic guarantee that can 
be obtained from the "
+        "get_normalized_rank_error(True) function.\n"
+        "If the sketch is empty this returns an empty vector.\n"
+        "split_points is an array of m unique, monotonically increasing float 
values "
+        "that divide the real number line into m+1 consecutive disjoint 
intervals.\n"
+        "The definition of an 'interval' is inclusive of the left split point 
(or minimum value) and "
+        "exclusive of the right split point, with the exception that the last 
interval will include "
+        "the maximum value.\n"
+        "It is not necessary to include either the min or max values in these 
split points."
+    )
+    .def(
+        "get_cdf",
+        [](const quantiles_sketch<T>& sk, const std::vector<T>& split_points, 
bool inclusive) {
+          return sk.get_CDF(split_points.data(), split_points.size(), 
inclusive);
+        },
+        py::arg("split_points"), py::arg("inclusive")=false,
+        "Returns an approximation to the Cumulative Distribution Function 
(CDF), which is the "
+        "cumulative analog of the PMF, of the input stream given a set of 
split points (values).\n"
+        "The resulting approximations have a probabilistic guarantee that can 
be obtained from the "
+        "get_normalized_rank_error(True) function.\n"
+        "If the sketch is empty this returns an empty vector.\n"
+        "split_points is an array of m unique, monotonically increasing float 
values "
+        "that divide the real number line into m+1 consecutive disjoint 
intervals.\n"
+        "The definition of an 'interval' is inclusive of the left split point 
(or minimum value) and "
+        "exclusive of the right split point, with the exception that the last 
interval will include "
+        "the maximum value.\n"
+        "It is not necessary to include either the min or max values in these 
split points."
+    )
+    .def(
+        "normalized_rank_error",
+        static_cast<double (quantiles_sketch<T>::*)(bool) 
const>(&quantiles_sketch<T>::get_normalized_rank_error),
+        py::arg("as_pmf"),
+        "Gets the normalized rank error for this sketch.\n"
+        "If pmf is True, returns the 'double-sided' normalized rank error for 
the get_PMF() function.\n"
+        "Otherwise, it is the 'single-sided' normalized rank error for all the 
other queries.\n"
+        "Constants were derived as the best fit to 99 percentile empirically 
measured max error in thousands of trials"
+    )
+    .def_static(
+        "get_normalized_rank_error",
+        [](uint16_t k, bool pmf) { return 
quantiles_sketch<T>::get_normalized_rank_error(k, pmf); },
+        py::arg("k"), py::arg("as_pmf"),
+        "Gets the normalized rank error given parameters k and the pmf flag.\n"
+        "If pmf is True, returns the 'double-sided' normalized rank error for 
the get_PMF() function.\n"
+        "Otherwise, it is the 'single-sided' normalized rank error for all the 
other queries.\n"
+        "Constants were derived as the best fit to 99 percentile empirically 
measured max error in thousands of trials"
+    )
+    .def(
+        "serialize",
+        [](const quantiles_sketch<T>& sk) {
+          auto bytes = sk.serialize();
+          return py::bytes(reinterpret_cast<const char*>(bytes.data()), 
bytes.size());
+        },
+        "Serializes the sketch into a bytes object"
+    )
+    .def_static(
+        "deserialize",
+        [](const std::string& bytes) { return 
quantiles_sketch<T>::deserialize(bytes.data(), bytes.size()); },
+        py::arg("bytes"),
+        "Deserializes the sketch from a bytes object"
+    )
+    .def("__iter__", [](const quantiles_sketch<T>& s) { return 
py::make_iterator(s.begin(), s.end()); });
 }
 
 void init_quantiles(py::module &m) {
diff --git a/python/tests/quantiles_test.py b/python/tests/quantiles_test.py
index 7e4a40f..e2bd773 100644
--- a/python/tests/quantiles_test.py
+++ b/python/tests/quantiles_test.py
@@ -80,6 +80,13 @@ class QuantilesTest(unittest.TestCase):
       unif_quantiles.update(np.random.uniform(10, 20, size=n-1))
       self.assertTrue(ks_test(quantiles, unif_quantiles, 0.001))
 
+      total_weight = 0
+      for tuple in quantiles:
+        item = tuple[0]
+        weight = tuple[1]
+        total_weight = total_weight + weight
+      self.assertEqual(total_weight, quantiles.get_n())
+
     def test_quantiles_ints_sketch(self):
         k = 128
         n = 10


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to