(datasketches-python) 01/02: Add .filter() to tuple and add tdigest wrapper. Specify numpy < 2.0 everywhere for now.

jmalkin Sun, 04 Aug 2024 19:20:22 -0700

This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch cpp_version_bump
in repository https://gitbox.apache.org/repos/asf/datasketches-python.git


commit c4ca0bfb6c22921a3997cdfb2985760cdd071431
Author: Jon Malkin <[email protected]>
AuthorDate: Sun Aug 4 19:18:56 2024 -0700

    Add .filter() to tuple and add tdigest wrapper. Specify numpy < 2.0 
everywhere for now.
---
 src/tdigest_wrapper.cpp |  83 +++++++++++++++++++++++++++++++++++++
 tests/tdigest_test.py   | 106 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+)

diff --git a/src/tdigest_wrapper.cpp b/src/tdigest_wrapper.cpp
new file mode 100644
index 0000000..1691daa
--- /dev/null
+++ b/src/tdigest_wrapper.cpp
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <vector>
+#include <stdexcept>
+
+#include <nanobind/nanobind.h>
+#include <nanobind/make_iterator.h>
+#include <nanobind/stl/string.h>
+//#include <nanobind/stl/vector.h>
+#include <nanobind/ndarray.h>
+
+#include "tdigest.hpp"
+#include "quantile_conditional.hpp"
+
+namespace nb = nanobind;
+
+template<typename T>
+void bind_tdigest(nb::module_ &m, const char* name) {
+  using namespace datasketches;
+
+  auto tdigest_class = nb::class_<tdigest<T>>(m, name)
+    .def(nb::init<uint16_t>(), nb::arg("k")=tdigest<T>::DEFAULT_K,
+         "Creates a classic quantiles sketch instance with the given value of 
k.\n\n"
+         ":param k: Controls the size/accuracy trade-off of the sketch. 
Default is 128.\n"
+         ":type k: int, optional"
+    )
+    .def("__copy__", [](const tdigest<T>& sk) { return tdigest<T>(sk); })
+    .def("update", (void(tdigest<T>::*)(T)) &tdigest<T>::update, 
nb::arg("item"),
+        "Updates the sketch with the given value")
+    .def("merge", (void(tdigest<T>::*)(tdigest<T>&)) &tdigest<T>::merge, 
nb::arg("sketch"),
+         "Merges the provided sketch into this one")
+    .def("__str__", [](const tdigest<T>& sk) { return sk.to_string(); },
+         "Produces a string summary of the sketch")
+    .def("to_string", &tdigest<T>::to_string, nb::arg("print_centroids")=false,
+         "Produces a string summary of the sketch")
+    .def("is_empty", &tdigest<T>::is_empty,
+         "Returns True if the sketch is empty, otherwise False")
+    .def_prop_ro("k", &tdigest<T>::get_k,
+         "The configured parameter k")
+    .def("get_total_weight", &tdigest<T>::get_total_weight,
+         "The total weight processed by the sketch")
+    .def("compress", &tdigest<T>::compress,
+         "Process buffered values and merge centroids, if necesssary")
+    .def("get_min_value", &tdigest<T>::get_min_value,
+         "Returns the minimum value from the stream. If empty, throws a 
RuntimeError")
+    .def("get_max_value", &tdigest<T>::get_max_value,
+         "Returns the maximum value from the stream. If empty, throws a 
RuntimeError")
+    .def("get_rank", &tdigest<T>::get_rank, nb::arg("value"),
+         "Computes the approximate normalized rank of the given value")
+    .def("get_quantile", &tdigest<T>::get_quantile, nb::arg("rank"),
+         "Returns an approximation to the data value "
+         "associated with the given rank in a hypothetical sorted "
+         "version of the input stream so far.\n")
+    .def("get_serialized_size_bytes", &tdigest<T>::get_serialized_size_bytes,
+         nb::arg("with_buffer")=false,
+         "Returns the size of the serialized sketch, in bytes")
+    ;
+
+    add_serialization<T>(tdigest_class);
+    add_vector_update<T>(tdigest_class);
+}
+
+void init_tdigest(nb::module_ &m) {
+  bind_tdigest<float>(m, "tdigest_float");
+  bind_tdigest<double>(m, "tdigest_double");
+}
\ No newline at end of file
diff --git a/tests/tdigest_test.py b/tests/tdigest_test.py
new file mode 100644
index 0000000..3b66a8c
--- /dev/null
+++ b/tests/tdigest_test.py
@@ -0,0 +1,106 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import unittest
+from datasketches import tdigest_float, tdigest_double
+import numpy as np
+
+class TdigestTest(unittest.TestCase):
+    def test_tdigest_double_example(self):
+      n = 2 ** 20
+
+      # create a tdigest and inject ~1 million N(0,1) points, both using a 
vector
+      # update as well as a single value
+      td = tdigest_double()
+      td.update(np.random.normal(size=n-1))
+      td.update(0.0)
+
+      # 0 should be near the median
+      self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.06)
+
+      # the median should be near 0
+      self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.06)
+
+      # note that with t-digest, while it typically performs quite well in 
practice,
+      # we do not have any sort of theoretical guarantees on the error bounds
+      # or even an estimate of what bounds we may expect.
+
+      # we also track the min/max independently from the rest of the data
+      # which lets us know the full observed data range
+      self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01))
+      self.assertLessEqual(0.0, td.get_rank(td.get_min_value()))
+      self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value()))
+
+      # and a few basic queries about the sketch
+      self.assertFalse(td.is_empty())
+      self.assertEqual(td.get_total_weight(), n)
+
+      # we can define a new tdiget with a different distribution, then merge 
them
+      td2 = tdigest_double()
+      td2.update(np.random.normal(loc=2.0, size=n))
+      td.merge(td2)
+
+      # the new median should be near 1.0, and 1.0 should be near the median 
although
+      # the error distribution is not well-characterized so we allow generous 
margins
+      self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2)
+      self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2)
+      self.assertEqual(td.get_total_weight(), 2 * n)
+
+      # finally, can serialize and deserialize the sketch
+      td_bytes = td.serialize()
+      new_td = tdigest_double.deserialize(td_bytes)
+      self.assertEqual(td.get_total_weight(), new_td.get_total_weight())
+      self.assertEqual(td.get_min_value(), new_td.get_min_value())
+      self.assertEqual(td.get_max_value(), new_td.get_max_value())
+      self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7))
+      self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0))
+
+
+    # the same tests as above, but with tdigest_float
+    def test_tdigest_float_example(self):
+      n = 2 ** 20
+      td = tdigest_float()
+      td.update(np.random.normal(size=n-1))
+      td.update(0.0)
+
+      self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.06)
+      self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.06)
+
+      self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01))
+      self.assertLessEqual(0.0, td.get_rank(td.get_min_value()))
+      self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99))
+      self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value()))
+
+      self.assertFalse(td.is_empty())
+      self.assertEqual(td.get_total_weight(), n)
+
+      td2 = tdigest_float()
+      td2.update(np.random.normal(loc=2.0, size=n))
+      td.merge(td2)
+
+      self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2)
+      self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2)
+      self.assertEqual(td.get_total_weight(), 2 * n)
+
+      td_bytes = td.serialize()
+      new_td = tdigest_float.deserialize(td_bytes)
+      self.assertEqual(td.get_total_weight(), new_td.get_total_weight())
+      self.assertEqual(td.get_min_value(), new_td.get_min_value())
+      self.assertEqual(td.get_max_value(), new_td.get_max_value())
+      self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7))
+      self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-python) 01/02: Add .filter() to tuple and add tdigest wrapper. Specify numpy < 2.0 everywhere for now.

Reply via email to