This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch cpp_version_bump in repository https://gitbox.apache.org/repos/asf/datasketches-python.git
commit c4ca0bfb6c22921a3997cdfb2985760cdd071431 Author: Jon Malkin <[email protected]> AuthorDate: Sun Aug 4 19:18:56 2024 -0700 Add .filter() to tuple and add tdigest wrapper. Specify numpy < 2.0 everywhere for now. --- src/tdigest_wrapper.cpp | 83 +++++++++++++++++++++++++++++++++++++ tests/tdigest_test.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+) diff --git a/src/tdigest_wrapper.cpp b/src/tdigest_wrapper.cpp new file mode 100644 index 0000000..1691daa --- /dev/null +++ b/src/tdigest_wrapper.cpp @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <vector> +#include <stdexcept> + +#include <nanobind/nanobind.h> +#include <nanobind/make_iterator.h> +#include <nanobind/stl/string.h> +//#include <nanobind/stl/vector.h> +#include <nanobind/ndarray.h> + +#include "tdigest.hpp" +#include "quantile_conditional.hpp" + +namespace nb = nanobind; + +template<typename T> +void bind_tdigest(nb::module_ &m, const char* name) { + using namespace datasketches; + + auto tdigest_class = nb::class_<tdigest<T>>(m, name) + .def(nb::init<uint16_t>(), nb::arg("k")=tdigest<T>::DEFAULT_K, + "Creates a classic quantiles sketch instance with the given value of k.\n\n" + ":param k: Controls the size/accuracy trade-off of the sketch. Default is 128.\n" + ":type k: int, optional" + ) + .def("__copy__", [](const tdigest<T>& sk) { return tdigest<T>(sk); }) + .def("update", (void(tdigest<T>::*)(T)) &tdigest<T>::update, nb::arg("item"), + "Updates the sketch with the given value") + .def("merge", (void(tdigest<T>::*)(tdigest<T>&)) &tdigest<T>::merge, nb::arg("sketch"), + "Merges the provided sketch into this one") + .def("__str__", [](const tdigest<T>& sk) { return sk.to_string(); }, + "Produces a string summary of the sketch") + .def("to_string", &tdigest<T>::to_string, nb::arg("print_centroids")=false, + "Produces a string summary of the sketch") + .def("is_empty", &tdigest<T>::is_empty, + "Returns True if the sketch is empty, otherwise False") + .def_prop_ro("k", &tdigest<T>::get_k, + "The configured parameter k") + .def("get_total_weight", &tdigest<T>::get_total_weight, + "The total weight processed by the sketch") + .def("compress", &tdigest<T>::compress, + "Process buffered values and merge centroids, if necesssary") + .def("get_min_value", &tdigest<T>::get_min_value, + "Returns the minimum value from the stream. If empty, throws a RuntimeError") + .def("get_max_value", &tdigest<T>::get_max_value, + "Returns the maximum value from the stream. If empty, throws a RuntimeError") + .def("get_rank", &tdigest<T>::get_rank, nb::arg("value"), + "Computes the approximate normalized rank of the given value") + .def("get_quantile", &tdigest<T>::get_quantile, nb::arg("rank"), + "Returns an approximation to the data value " + "associated with the given rank in a hypothetical sorted " + "version of the input stream so far.\n") + .def("get_serialized_size_bytes", &tdigest<T>::get_serialized_size_bytes, + nb::arg("with_buffer")=false, + "Returns the size of the serialized sketch, in bytes") + ; + + add_serialization<T>(tdigest_class); + add_vector_update<T>(tdigest_class); +} + +void init_tdigest(nb::module_ &m) { + bind_tdigest<float>(m, "tdigest_float"); + bind_tdigest<double>(m, "tdigest_double"); +} \ No newline at end of file diff --git a/tests/tdigest_test.py b/tests/tdigest_test.py new file mode 100644 index 0000000..3b66a8c --- /dev/null +++ b/tests/tdigest_test.py @@ -0,0 +1,106 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest +from datasketches import tdigest_float, tdigest_double +import numpy as np + +class TdigestTest(unittest.TestCase): + def test_tdigest_double_example(self): + n = 2 ** 20 + + # create a tdigest and inject ~1 million N(0,1) points, both using a vector + # update as well as a single value + td = tdigest_double() + td.update(np.random.normal(size=n-1)) + td.update(0.0) + + # 0 should be near the median + self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.06) + + # the median should be near 0 + self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.06) + + # note that with t-digest, while it typically performs quite well in practice, + # we do not have any sort of theoretical guarantees on the error bounds + # or even an estimate of what bounds we may expect. + + # we also track the min/max independently from the rest of the data + # which lets us know the full observed data range + self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01)) + self.assertLessEqual(0.0, td.get_rank(td.get_min_value())) + self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99)) + self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value())) + + # and a few basic queries about the sketch + self.assertFalse(td.is_empty()) + self.assertEqual(td.get_total_weight(), n) + + # we can define a new tdiget with a different distribution, then merge them + td2 = tdigest_double() + td2.update(np.random.normal(loc=2.0, size=n)) + td.merge(td2) + + # the new median should be near 1.0, and 1.0 should be near the median although + # the error distribution is not well-characterized so we allow generous margins + self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2) + self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2) + self.assertEqual(td.get_total_weight(), 2 * n) + + # finally, can serialize and deserialize the sketch + td_bytes = td.serialize() + new_td = tdigest_double.deserialize(td_bytes) + self.assertEqual(td.get_total_weight(), new_td.get_total_weight()) + self.assertEqual(td.get_min_value(), new_td.get_min_value()) + self.assertEqual(td.get_max_value(), new_td.get_max_value()) + self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7)) + self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0)) + + + # the same tests as above, but with tdigest_float + def test_tdigest_float_example(self): + n = 2 ** 20 + td = tdigest_float() + td.update(np.random.normal(size=n-1)) + td.update(0.0) + + self.assertAlmostEqual(0.5, td.get_rank(0.0), delta=0.06) + self.assertAlmostEqual(0.0, td.get_quantile(0.5), delta=0.06) + + self.assertLessEqual(td.get_min_value(), td.get_quantile(0.01)) + self.assertLessEqual(0.0, td.get_rank(td.get_min_value())) + self.assertGreaterEqual(td.get_max_value(), td.get_quantile(0.99)) + self.assertGreaterEqual(1.0, td.get_rank(td.get_max_value())) + + self.assertFalse(td.is_empty()) + self.assertEqual(td.get_total_weight(), n) + + td2 = tdigest_float() + td2.update(np.random.normal(loc=2.0, size=n)) + td.merge(td2) + + self.assertAlmostEqual(0.5, td.get_rank(1.0), delta=0.2) + self.assertAlmostEqual(1.0, td.get_quantile(0.5), delta=0.2) + self.assertEqual(td.get_total_weight(), 2 * n) + + td_bytes = td.serialize() + new_td = tdigest_float.deserialize(td_bytes) + self.assertEqual(td.get_total_weight(), new_td.get_total_weight()) + self.assertEqual(td.get_min_value(), new_td.get_min_value()) + self.assertEqual(td.get_max_value(), new_td.get_max_value()) + self.assertEqual(td.get_quantile(0.7), new_td.get_quantile(0.7)) + self.assertEqual(td.get_rank(0.0), new_td.get_rank(0.0)) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
