This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch cpp_version_bump in repository https://gitbox.apache.org/repos/asf/datasketches-python.git
commit b6aaca7597f7606c05fd4e5d0408ddae9ebecf17 Author: Jon Malkin <[email protected]> AuthorDate: Sun Aug 4 19:19:20 2024 -0700 add missing files to previous commit --- CMakeLists.txt | 5 +++-- setup.py | 2 +- src/datasketches.cpp | 2 ++ src/tuple_wrapper.cpp | 17 +++++++++++++++++ tests/tuple_test.py | 18 ++++++++++++++++++ tox.ini | 2 +- 6 files changed, 42 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 350ebc0..4e9ba96 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -108,6 +108,7 @@ target_sources(python src/density_wrapper.cpp src/ks_wrapper.cpp src/count_wrapper.cpp + src/tdigest_wrapper.cpp src/vector_of_kll.cpp src/py_serde.cpp ) @@ -116,7 +117,7 @@ cmake_policy(SET CMP0097 NEW) include(ExternalProject) ExternalProject_Add(datasketches GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git - GIT_TAG 5.0.2 + GIT_TAG 5.1.0 GIT_SHALLOW true GIT_SUBMODULES "" INSTALL_DIR /tmp/datasketches @@ -126,7 +127,7 @@ ExternalProject_Get_property(datasketches INSTALL_DIR) set(datasketches_INSTALL_DIR ${INSTALL_DIR}) message("Source dir of datasketches = ${datasketches_INSTALL_DIR}") message("Numpy include dir(s): ${Python_NumPy_INCLUDE_DIRS}") -target_include_directories(python +target_include_directories(python PRIVATE ${datasketches_INSTALL_DIR}/include/DataSketches ${Python_NumPy_INCLUDE_DIRS} diff --git a/setup.py b/setup.py index 1467797..3ef740d 100644 --- a/setup.py +++ b/setup.py @@ -104,6 +104,6 @@ setup( # may need to add all source paths for sdist packages w/o MANIFEST.in ext_modules=[CMakeExtension('datasketches','.')], cmdclass={'build_ext': CMakeBuild}, - install_requires=['numpy'], + install_requires=['numpy < 2.0'], zip_safe=False ) diff --git a/src/datasketches.cpp b/src/datasketches.cpp index 7eb70a5..118683b 100644 --- a/src/datasketches.cpp +++ b/src/datasketches.cpp @@ -39,6 +39,7 @@ void init_req(nb::module_& m); void init_quantiles(nb::module_& m); void init_count_min(nb::module_& m); void init_density(nb::module_& m); +void init_tdigest(nb::module_& m); void init_vector_of_kll(nb::module_& m); // supporting objects @@ -70,6 +71,7 @@ NB_MODULE(_datasketches, m) { init_quantiles(m); init_count_min(m); init_density(m); + init_tdigest(m); init_vector_of_kll(m); init_kolmogorov_smirnov(m); diff --git a/src/tuple_wrapper.cpp b/src/tuple_wrapper.cpp index 6e906a7..6563e5b 100644 --- a/src/tuple_wrapper.cpp +++ b/src/tuple_wrapper.cpp @@ -23,6 +23,7 @@ #include <nanobind/make_iterator.h> #include <nanobind/intrusive/counter.h> #include <nanobind/stl/array.h> +#include <nanobind/stl/function.h> #include <nanobind/stl/string.h> #include "py_serde.hpp" @@ -133,6 +134,14 @@ void init_tuple(nb::module_ &m) { }, nb::arg("serde"), "Serializes the sketch into a bytes object" ) + .def("filter", + [](const py_compact_tuple& sk, const std::function<bool(const nb::object&)> func) { + return sk.filter(func); + }, nb::arg("predicate"), + "Produces a compact_tuple_sketch from the given sketch by applying a predicate to " + "the summary in each entry.\n\n" + ":param predicate: A function returning true or value evaluated on each tuple summary\n" + ":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`") .def_static( "deserialize", [](const nb::bytes& bytes, py_object_serde& serde, uint64_t seed) { @@ -169,6 +178,14 @@ void init_tuple(nb::module_ &m) { "Returns a compacted form of the sketch, optionally sorting it") .def("trim", &py_update_tuple::trim, "Removes retained entries in excess of the nominal size k (if any)") .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state") + .def("filter", + [](const py_update_tuple& sk, const std::function<bool(const nb::object&)> func) { + return sk.filter(func); + }, nb::arg("predicate"), + "Produces a compact_tuple_sketch from the given sketch by applying a predicate to " + "the summary in each entry.\n\n" + ":param predicate: A function returning true or value evaluated on each tuple summary\n" + ":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`") ; nb::class_<py_tuple_union>(m, "tuple_union") diff --git a/tests/tuple_test.py b/tests/tuple_test.py index fa09b0b..cf505fb 100644 --- a/tests/tuple_test.py +++ b/tests/tuple_test.py @@ -87,6 +87,24 @@ class TupleTest(unittest.TestCase): self.assertTrue(sk.is_empty()) self.assertEqual(sk.num_retained, 0) + def test_tuple_filter(self): + lgk = 12 # 2^k = 4096 rows in the table + n = 1 << 18 # ~256k unique values + + # filtering lets us apply a predicate to the sketch, producing a new + # compact sketch using the entries matching the predicate. + sk = update_tuple_sketch(AccumulatorPolicy(), lgk) + for ii in range(0, n): + sk.update(ii, ii) + + # we can filter by a predicate, whether a lambda or a defined function + # for instance, using 0.5*n will return a compact_tuple_sketch with + # approximately half the entries. + result = sk.filter(lambda x: x < (0.5 * n)) + self.assertAlmostEqual(result.get_estimate(), 0.5 * n, delta=0.01 * n) + self.assertLess(result.get_lower_bound(1), 0.5 * n) + self.assertGreater(result.get_upper_bound(1), 0.5 * n) + def test_tuple_set_operations(self): lgk = 12 # 2^k = 4096 rows in the table n = 1 << 18 # ~256k unique values diff --git a/tox.ini b/tox.ini index 64a1f68..02b0a90 100644 --- a/tox.ini +++ b/tox.ini @@ -21,6 +21,6 @@ isolated_build = true [testenv] deps = pytest - numpy + numpy < 2.0 changedir = tests commands = pytest --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
