This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch cpp_version_bump
in repository https://gitbox.apache.org/repos/asf/datasketches-python.git

commit b6aaca7597f7606c05fd4e5d0408ddae9ebecf17
Author: Jon Malkin <[email protected]>
AuthorDate: Sun Aug 4 19:19:20 2024 -0700

    add missing files to previous commit
---
 CMakeLists.txt        |  5 +++--
 setup.py              |  2 +-
 src/datasketches.cpp  |  2 ++
 src/tuple_wrapper.cpp | 17 +++++++++++++++++
 tests/tuple_test.py   | 18 ++++++++++++++++++
 tox.ini               |  2 +-
 6 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 350ebc0..4e9ba96 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -108,6 +108,7 @@ target_sources(python
     src/density_wrapper.cpp
     src/ks_wrapper.cpp
     src/count_wrapper.cpp
+    src/tdigest_wrapper.cpp
     src/vector_of_kll.cpp
     src/py_serde.cpp
 )
@@ -116,7 +117,7 @@ cmake_policy(SET CMP0097 NEW)
 include(ExternalProject)
 ExternalProject_Add(datasketches
   GIT_REPOSITORY https://github.com/apache/datasketches-cpp.git
-  GIT_TAG 5.0.2
+  GIT_TAG 5.1.0
   GIT_SHALLOW true
   GIT_SUBMODULES ""
   INSTALL_DIR /tmp/datasketches
@@ -126,7 +127,7 @@ ExternalProject_Get_property(datasketches INSTALL_DIR)
 set(datasketches_INSTALL_DIR ${INSTALL_DIR})
 message("Source dir of datasketches = ${datasketches_INSTALL_DIR}")
 message("Numpy include dir(s): ${Python_NumPy_INCLUDE_DIRS}")
-target_include_directories(python 
+target_include_directories(python
   PRIVATE
     ${datasketches_INSTALL_DIR}/include/DataSketches
     ${Python_NumPy_INCLUDE_DIRS}
diff --git a/setup.py b/setup.py
index 1467797..3ef740d 100644
--- a/setup.py
+++ b/setup.py
@@ -104,6 +104,6 @@ setup(
     # may need to add all source paths for sdist packages w/o MANIFEST.in
     ext_modules=[CMakeExtension('datasketches','.')],
     cmdclass={'build_ext': CMakeBuild},
-    install_requires=['numpy'],
+    install_requires=['numpy < 2.0'],
     zip_safe=False
 )
diff --git a/src/datasketches.cpp b/src/datasketches.cpp
index 7eb70a5..118683b 100644
--- a/src/datasketches.cpp
+++ b/src/datasketches.cpp
@@ -39,6 +39,7 @@ void init_req(nb::module_& m);
 void init_quantiles(nb::module_& m);
 void init_count_min(nb::module_& m);
 void init_density(nb::module_& m);
+void init_tdigest(nb::module_& m);
 void init_vector_of_kll(nb::module_& m);
 
 // supporting objects
@@ -70,6 +71,7 @@ NB_MODULE(_datasketches, m) {
   init_quantiles(m);
   init_count_min(m);
   init_density(m);
+  init_tdigest(m);
   init_vector_of_kll(m);
 
   init_kolmogorov_smirnov(m);
diff --git a/src/tuple_wrapper.cpp b/src/tuple_wrapper.cpp
index 6e906a7..6563e5b 100644
--- a/src/tuple_wrapper.cpp
+++ b/src/tuple_wrapper.cpp
@@ -23,6 +23,7 @@
 #include <nanobind/make_iterator.h>
 #include <nanobind/intrusive/counter.h>
 #include <nanobind/stl/array.h>
+#include <nanobind/stl/function.h>
 #include <nanobind/stl/string.h>
 
 #include "py_serde.hpp"
@@ -133,6 +134,14 @@ void init_tuple(nb::module_ &m) {
         }, nb::arg("serde"),
         "Serializes the sketch into a bytes object"
     )
+    .def("filter",
+         [](const py_compact_tuple& sk, const std::function<bool(const nb::object&)> func) {
+           return sk.filter(func);
+         }, nb::arg("predicate"),
+         "Produces a compact_tuple_sketch from the given sketch by applying a predicate to "
+         "the summary in each entry.\n\n"
+         ":param predicate: A function returning true or value evaluated on each tuple summary\n"
+         ":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`")
     .def_static(
         "deserialize",
         [](const nb::bytes& bytes, py_object_serde& serde, uint64_t seed) {
@@ -169,6 +178,14 @@ void init_tuple(nb::module_ &m) {
          "Returns a compacted form of the sketch, optionally sorting it")
     .def("trim", &py_update_tuple::trim, "Removes retained entries in excess of the nominal size k (if any)")
     .def("reset", &py_update_tuple::reset, "Resets the sketch to the initial empty state")
+    .def("filter",
+         [](const py_update_tuple& sk, const std::function<bool(const nb::object&)> func) {
+           return sk.filter(func);
+         }, nb::arg("predicate"),
+         "Produces a compact_tuple_sketch from the given sketch by applying a predicate to "
+         "the summary in each entry.\n\n"
+         ":param predicate: A function returning true or value evaluated on each tuple summary\n"
+         ":return: A compact_tuple_sketch with the selected entries\n:rtype: :class:`compact_tuple_sketch`")
   ;
 
   nb::class_<py_tuple_union>(m, "tuple_union")
diff --git a/tests/tuple_test.py b/tests/tuple_test.py
index fa09b0b..cf505fb 100644
--- a/tests/tuple_test.py
+++ b/tests/tuple_test.py
@@ -87,6 +87,24 @@ class TupleTest(unittest.TestCase):
         self.assertTrue(sk.is_empty())
         self.assertEqual(sk.num_retained, 0)
 
+    def test_tuple_filter(self):
+        lgk = 12    # 2^k = 4096 rows in the table
+        n = 1 << 18 # ~256k unique values
+
+        # filtering lets us apply a predicate to the sketch, producing a new
+        # compact sketch using the entries matching the predicate.
+        sk = update_tuple_sketch(AccumulatorPolicy(), lgk)
+        for ii in range(0, n):
+          sk.update(ii, ii)
+
+        # we can filter by a predicate, whether a lambda or a defined function
+        # for instance, using 0.5*n will return a compact_tuple_sketch with
+        # approximately half the entries.
+        result = sk.filter(lambda x: x < (0.5 * n))
+        self.assertAlmostEqual(result.get_estimate(), 0.5 * n, delta=0.01 * n)
+        self.assertLess(result.get_lower_bound(1), 0.5 * n)
+        self.assertGreater(result.get_upper_bound(1), 0.5 * n)
+
     def test_tuple_set_operations(self):
         lgk = 12    # 2^k = 4096 rows in the table
         n = 1 << 18 # ~256k unique values
diff --git a/tox.ini b/tox.ini
index 64a1f68..02b0a90 100644
--- a/tox.ini
+++ b/tox.ini
@@ -21,6 +21,6 @@ isolated_build = true
 
 [testenv]
 deps = pytest
-       numpy
+       numpy < 2.0
 changedir = tests
 commands = pytest


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to