This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch density_custom_kernel
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 669567e2a71efc440a073f664b280ac330de5605
Author: Jon <[email protected]>
AuthorDate: Thu Apr 13 00:23:50 2023 -0700

    Add custom kernel to density sketch, adjust python wrapper to support the 
same
---
 density/include/density_sketch.hpp                 | 10 ++-
 density/include/density_sketch_impl.hpp            |  7 +-
 density/test/density_sketch_test.cpp               | 33 ++++++++
 python/datasketches/DensityWrapper.py              | 75 +++++++++++++++++
 .../{__init__.py => KernelFunction.py}             | 29 ++++---
 python/datasketches/__init__.py                    |  2 +
 python/include/kernel_function.hpp                 | 98 ++++++++++++++++++++++
 python/src/datasketches.cpp                        |  6 --
 python/src/density_wrapper.cpp                     | 52 +++++++-----
 python/tests/density_test.py                       | 29 ++++++-
 10 files changed, 292 insertions(+), 49 deletions(-)

diff --git a/density/include/density_sketch.hpp 
b/density/include/density_sketch.hpp
index 8393bad..5083802 100755
--- a/density/include/density_sketch.hpp
+++ b/density/include/density_sketch.hpp
@@ -63,9 +63,10 @@ public:
    * Constructor
    * @param k controls the size and error of the sketch.
    * @param dim dimension of the input domain
+   * @param kernel to use by this instance
    * @param allocator to use by this instance
    */
-  density_sketch(uint16_t k, uint32_t dim, const Allocator& allocator = 
Allocator());
+  density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), 
const Allocator& allocator = Allocator());
 
   /**
    * Returns configured parameter K
@@ -137,6 +138,7 @@ public:
   const_iterator end() const;
 
 private:
+  Kernel kernel_;
   uint16_t k_;
   uint32_t dim_;
   uint32_t num_retained_;
@@ -148,10 +150,14 @@ private:
 };
 
 template<typename T, typename K, typename A>
-class density_sketch<T, K, A>::const_iterator: public 
std::iterator<std::input_iterator_tag, T> {
+class density_sketch<T, K, A>::const_iterator {
 public:
   using Vector = density_sketch<T, K, A>::Vector;
+  using iterator_category = std::input_iterator_tag;
   using value_type = std::pair<const Vector&, const uint64_t>;
+  using difference_type = void;
+  using pointer = return_value_holder<value_type>;
+  using reference = const value_type;
   const_iterator& operator++();
   const_iterator& operator++(int);
   bool operator==(const const_iterator& other) const;
diff --git a/density/include/density_sketch_impl.hpp 
b/density/include/density_sketch_impl.hpp
index b3c29eb..9cefe56 100755
--- a/density/include/density_sketch_impl.hpp
+++ b/density/include/density_sketch_impl.hpp
@@ -28,7 +28,8 @@
 namespace datasketches {
 
 template<typename T, typename K, typename A>
-density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const A& 
allocator):
+density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const K& 
kernel, const A& allocator):
+kernel_(kernel),
 k_(k),
 dim_(dim),
 num_retained_(0),
@@ -100,7 +101,7 @@ T density_sketch<T, K, A>::get_estimate(const 
std::vector<T>& point) const {
   T density = 0;
   for (unsigned height = 0; height < levels_.size(); ++height) {
     for (const auto& p: levels_[height]) {
-      density += (1 << height) * K()(p, point) / n_;
+      density += (1 << height) * kernel_(p, point) / n_;
     }
   }
   return density;
@@ -131,7 +132,7 @@ void density_sketch<T, K, A>::compact_level(unsigned 
height) {
   for (unsigned i = 1; i < level.size(); ++i) {
     T delta = 0;
     for (unsigned j = 0; j < i; ++j) {
-      delta += (bits[j] ? 1 : -1) * K()(level[i], level[j]);
+      delta += (bits[j] ? 1 : -1) * kernel_(level[i], level[j]);
     }
     bits[i] = delta < 0;
   }
diff --git a/density/test/density_sketch_test.cpp 
b/density/test/density_sketch_test.cpp
index 4f5927f..6440a32 100755
--- a/density/test/density_sketch_test.cpp
+++ b/density/test/density_sketch_test.cpp
@@ -73,4 +73,37 @@ TEST_CASE("density sketch: iterator", "[density_sketch]") {
   REQUIRE(count == sketch.get_num_retained());
 }
 
+// spherical kernel for testing, returns 1 for vectors within radius and 0 
otherwise
+template<typename T>
+struct spherical_kernel {
+  spherical_kernel(T radius = 1.0) : _radius_squared(radius * radius) {}
+  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
+    return std::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0, 
std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}) <= _radius_squared ? 1.0 : 
0.0;
+  }
+  private:
+    T _radius_squared;
+};
+
+TEST_CASE("custom kernel", "[density_sketch]") {
+  density_sketch<float, spherical_kernel<float>> sketch(10, 3, 
spherical_kernel<float>(0.5));
+
+  // update with (1,1,1) and test points inside and outside the kernel
+  sketch.update(std::vector<float>(3, 1.0));
+  REQUIRE(sketch.get_estimate(std::vector<float>(3, 1.001)) == 1.0);
+  REQUIRE(sketch.get_estimate(std::vector<float>(3, 2.0)) == 0.0);
+
+  // rest of test follows iterator test above
+  unsigned n = 1000;
+  for (unsigned i = 2; i <= n; ++i) sketch.update(std::vector<float>(3, i));
+  REQUIRE(sketch.get_n() == n);
+  REQUIRE(sketch.is_estimation_mode());
+  unsigned count = 0;
+  for (auto pair: sketch) {
+    ++count;
+    // just to assert something about the output
+    REQUIRE(pair.first.size() == sketch.get_dim());
+  }
+  REQUIRE(count == sketch.get_num_retained());
+}
+
 } /* namespace datasketches */
diff --git a/python/datasketches/DensityWrapper.py 
b/python/datasketches/DensityWrapper.py
new file mode 100644
index 0000000..98d8c35
--- /dev/null
+++ b/python/datasketches/DensityWrapper.py
@@ -0,0 +1,75 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import numpy as np
+
+from _datasketches import _density_sketch, KernelFunction
+from .KernelFunction import GaussianKernel
+
+class density_sketch:
+  """An instance of a Density Sketch for kernel density estimation. Requires a 
KernelFunction object."""
+
+  def __init__(self, k:int, dim:int, kernel:KernelFunction=GaussianKernel()):
+    self._kernel = kernel
+    self._gadget = _density_sketch(k, dim, self._kernel)
+
+  def update(self, point:np.array):
+    """Updates the sketch with the given point"""
+    self._gadget.update(point)
+
+  def merge(self, other:'density_sketch'):
+    """Merges the provided sketch into this one"""
+    self._gadget.merge(other._gadget)
+
+  def is_empty(self):
+    """Returns True if the sketch is empty, otherwise False"""
+    return self._gadget.is_empty()
+
+  def get_k(self):
+    """Returns the configured parameter k"""
+    return self._gadget.get_k()
+
+  def get_dim(self):
+    """Returns the configured parameter dim"""
+    return self._gadget.get_dim()
+
+  def get_n(self):
+    """Returns the length of the input stream"""
+    return self._gadget.get_n()
+
+  def get_num_retained(self):
+    """Returns the number of retained items (samples) in the sketch"""
+    return self._gadget.get_num_retained()
+
+  def is_estimation_mode(self):
+    """Returns True if the sketch is in estimation mode, otherwise False"""
+    return self._gadget.is_estimation_mode()
+
+  def get_estimate(self, point:np.array):
+    """Returns an approximate density at the given point"""
+    return self._gadget.get_estimate(point)
+
+  def __str__(self, print_levels:bool=False, print_items:bool=False):
+    """Produces a string summary of the sketch"""
+    return self._gadget.to_string(print_levels, print_items)
+
+  def to_string(self, print_levels:bool=False, print_items:bool=False):
+    """Produces a string summary of the sketch"""
+    return self._gadget.to_string(print_levels, print_items)
+
+  def __iter__(self):
+    return self._gadget.__iter__()
diff --git a/python/datasketches/__init__.py 
b/python/datasketches/KernelFunction.py
similarity index 58%
copy from python/datasketches/__init__.py
copy to python/datasketches/KernelFunction.py
index fb7636e..7603b10 100644
--- a/python/datasketches/__init__.py
+++ b/python/datasketches/KernelFunction.py
@@ -15,22 +15,21 @@
 # specific language governing permissions and limitations
 # under the License.
 
-"""The Apache DataSketches Library for Python
+import numpy as np
 
-Provided under the Apache License, Verison 2.0
-<http://www.apache.org/licenses/LICENSE-2.0>
-"""
+from _datasketches import KernelFunction
 
-name = 'datasketches'
-
-from _datasketches import *
+# This file provides an example Python Kernel Function implementation.
+#
+# Each implementation must extend the KernelFunction class
+# and define the __call__ method
 
-from .PySerDe import *
-from .TuplePolicy import *
+# Implements a basic Gaussian Kernel
+class GaussianKernel(KernelFunction):
+  def __init__(self, bandwidth: float=1.0):
+    KernelFunction.__init__(self)
+    self._bw = bandwidth
+    self._scale = -0.5 * (bandwidth ** -2)
 
-# Wrappers around the pybind11 classes for cases where we
-# need to define a python object that is persisted within
-# the C++ object. Currently, the native python portion of
-# a class derived from a C++ class may be garbage collected
-# even though a pointer to the C++ portion remains valid.
-from .TupleWrapper import *
+  def __call__(self, a: np.array, b: np.array) -> float:
+    return np.exp(self._scale * np.linalg.norm(a - b)**2)
diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py
index fb7636e..bc78d4e 100644
--- a/python/datasketches/__init__.py
+++ b/python/datasketches/__init__.py
@@ -27,6 +27,7 @@ from _datasketches import *
 
 from .PySerDe import *
 from .TuplePolicy import *
+from .KernelFunction import *
 
 # Wrappers around the pybind11 classes for cases where we
 # need to define a python object that is persisted within
@@ -34,3 +35,4 @@ from .TuplePolicy import *
 # a class derived from a C++ class may be garbage collected
 # even though a pointer to the C++ portion remains valid.
 from .TupleWrapper import *
+from .DensityWrapper import *
diff --git a/python/include/kernel_function.hpp 
b/python/include/kernel_function.hpp
new file mode 100644
index 0000000..ca41c71
--- /dev/null
+++ b/python/include/kernel_function.hpp
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+//#include <memory>
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+
+#ifndef _KERNEL_FUNCTION_HPP_
+#define _KERNEL_FUNCTION_HPP_
+
+namespace py = pybind11;
+
+namespace datasketches {
+
+/**
+ * @brief kernel_function provides the underlying base class from
+ *        which native Python kernels ultimately inherit. The actual
+ *        kernels implement KernelFunction, as shown in KernelFunction.py
+ */
+struct kernel_function {
+  virtual double operator()(py::array_t<double>& a, const py::array_t<double>& 
b) const = 0;
+  virtual ~kernel_function() = default;
+};
+
+/**
+ * @brief KernelFunction provides the "trampoline" class for pybind11
+ *        that allows for a native Python implementation of kernel
+ *        functions.
+ */
+struct KernelFunction : public kernel_function {
+  using kernel_function::kernel_function;
+
+  /**
+   * @brief Evaluates K(a,b), the kernel function for the given points a and b
+   * @param a the first vector
+   * @param b the second vector
+   * @return The function value K(a,b)
+   */
+  double operator()(py::array_t<double>& a, const py::array_t<double>& b) 
const override {
+    PYBIND11_OVERRIDE_PURE_NAME(
+      double,          // Return type
+      kernel_function, // Parent class
+      "__call__",      // Name of function in python
+      operator(),      // Name of function in C++
+      a, b             // Arguemnts
+    );
+  }
+};
+
+/* The kernel_function_holder provides a concrete class that dispatches calls
+ * from the sketch to the kernel_function. This class is needed to provide a
+ * concrete object to produce a compiled library, but library users should
+ * never need to use this directly.
+ */
+struct kernel_function_holder {
+  explicit kernel_function_holder(std::shared_ptr<kernel_function> kernel) : 
_kernel(kernel) {}
+  kernel_function_holder(const kernel_function_holder& other) : 
_kernel(other._kernel) {}
+  kernel_function_holder(kernel_function_holder&& other) : 
_kernel(std::move(other._kernel)) {}
+  kernel_function_holder& operator=(const kernel_function_holder& other) { 
_kernel = other._kernel; return *this; }
+  kernel_function_holder& operator=(kernel_function_holder&& other) { 
std::swap(_kernel, other._kernel); return *this; }
+
+  double operator()(const std::vector<double>& a, const py::array_t<double>& 
b) const {
+    py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner);
+    return _kernel->operator()(a_arr, b);
+  }
+
+  double operator()(const std::vector<double>& a, const std::vector<double>& 
b) const {
+    py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner);
+    py::array_t<double> b_arr(b.size(), b.data(), dummy_array_owner);
+    return _kernel->operator()(a_arr, b_arr);
+  }
+
+  private:
+    // a dummy object to "own" arrays when translating from std::vector to 
avoid a copy:
+    // https://github.com/pybind/pybind11/issues/323#issuecomment-575717041
+    py::str dummy_array_owner;
+    std::shared_ptr<kernel_function> _kernel;
+};
+
+}
+
+#endif // _KERNEL_FUNCTION_HPP_
\ No newline at end of file
diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp
index e10074a..e360452 100644
--- a/python/src/datasketches.cpp
+++ b/python/src/datasketches.cpp
@@ -31,11 +31,8 @@ void init_tuple(py::module& m);
 void init_vo(py::module& m);
 void init_req(py::module& m);
 void init_quantiles(py::module& m);
-<<<<<<< HEAD
 void init_count_min(py::module& m);
-=======
 void init_density(py::module& m);
->>>>>>> c354c56 (python wrapper)
 void init_vector_of_kll(py::module& m);
 
 // supporting objects
@@ -52,11 +49,8 @@ PYBIND11_MODULE(_datasketches, m) {
   init_vo(m);
   init_req(m);
   init_quantiles(m);
-<<<<<<< HEAD
   init_count_min(m);
-=======
   init_density(m);
->>>>>>> c354c56 (python wrapper)
   init_vector_of_kll(m);
 
   init_kolmogorov_smirnov(m);
diff --git a/python/src/density_wrapper.cpp b/python/src/density_wrapper.cpp
index 5db808f..48b6db5 100644
--- a/python/src/density_wrapper.cpp
+++ b/python/src/density_wrapper.cpp
@@ -22,45 +22,59 @@
 #include <pybind11/numpy.h>
 #include <vector>
 
+#include "kernel_function.hpp"
 #include "density_sketch.hpp"
 
 namespace py = pybind11;
 
-template<typename T>
+template<typename T, typename K>
 void bind_density_sketch(py::module &m, const char* name) {
   using namespace datasketches;
 
-  py::class_<density_sketch<T>>(m, name)
-    .def(py::init<uint16_t, uint32_t>(), py::arg("k"), py::arg("dim"))
-    .def("update", static_cast<void (density_sketch<T>::*)(const 
std::vector<T>&)>(&density_sketch<T>::update),
+  py::class_<density_sketch<T, K>>(m, name)
+    .def(
+        py::init([](uint16_t k, uint32_t dim, std::shared_ptr<kernel_function> 
kernel) {
+          kernel_function_holder holder(kernel);
+          return density_sketch<T, K>(k, dim, holder);
+        }),
+        py::arg("k"), py::arg("dim"), py::arg("kernel"))
+    .def("update", static_cast<void (density_sketch<T, K>::*)(const 
std::vector<T>&)>(&density_sketch<T, K>::update),
         "Updates the sketch with the given vector")
-    .def("update", static_cast<void 
(density_sketch<T>::*)(std::vector<T>&&)>(&density_sketch<T>::update),
-        "Updates the sketch with the given vector")
-    .def("merge", static_cast<void (density_sketch<T>::*)(const 
density_sketch<T>&)>(&density_sketch<T>::merge), py::arg("sketch"),
+    .def("merge", static_cast<void (density_sketch<T, K>::*)(const 
density_sketch<T, K>&)>(&density_sketch<T, K>::merge), py::arg("sketch"),
         "Merges the provided sketch into this one")
-    .def("is_empty", &density_sketch<T>::is_empty,
+    .def("is_empty", &density_sketch<T, K>::is_empty,
         "Returns True if the sketch is empty, otherwise False")
-    .def("get_k", &density_sketch<T>::get_k,
+    .def("get_k", &density_sketch<T, K>::get_k,
         "Returns the configured parameter k")
-    .def("get_dim", &density_sketch<T>::get_dim,
+    .def("get_dim", &density_sketch<T, K>::get_dim,
         "Returns the configured parameter dim")
-    .def("get_n", &density_sketch<T>::get_n,
+    .def("get_n", &density_sketch<T, K>::get_n,
         "Returns the length of the input stream")
-    .def("get_num_retained", &density_sketch<T>::get_num_retained,
+    .def("get_num_retained", &density_sketch<T, K>::get_num_retained,
         "Returns the number of retained items (samples) in the sketch")
-    .def("is_estimation_mode", &density_sketch<T>::is_estimation_mode,
+    .def("is_estimation_mode", &density_sketch<T, K>::is_estimation_mode,
         "Returns True if the sketch is in estimation mode, otherwise False")
-    .def("get_estimate", &density_sketch<T>::get_estimate, py::arg("point"),
+    .def("get_estimate", &density_sketch<T, K>::get_estimate, py::arg("point"),
         "Returns an approximate density at the given point")
-    .def("__str__", &density_sketch<T>::to_string, 
py::arg("print_levels")=false, py::arg("print_items")=false,
+    .def("__str__", &density_sketch<T, K>::to_string, 
py::arg("print_levels")=false, py::arg("print_items")=false,
         "Produces a string summary of the sketch")
-    .def("to_string", &density_sketch<T>::to_string, 
py::arg("print_levels")=false, py::arg("print_items")=false,
+    .def("to_string", &density_sketch<T, K>::to_string, 
py::arg("print_levels")=false, py::arg("print_items")=false,
         "Produces a string summary of the sketch")
-    .def("__iter__", [](const density_sketch<T>& s){ return 
py::make_iterator(s.begin(), s.end()); })
+    .def("__iter__", [](const density_sketch<T, K>& s){ return 
py::make_iterator(s.begin(), s.end()); })
     ;
 }
 
 void init_density(py::module &m) {
-  bind_density_sketch<float>(m, "density_floats_sketch");
-  bind_density_sketch<double>(m, "density_doubles_sketch");
+  using namespace datasketches;
+
+  // generic kernel function
+  py::class_<kernel_function, KernelFunction, 
std::shared_ptr<kernel_function>>(m, "KernelFunction")
+    .def(py::init())
+    .def("__call__", &kernel_function::operator(), py::arg("a"), py::arg("b"))
+    ;
+
+  // the old sketch names  can almost be defined, but the 
kernel_function_holder won't work in init()
+  //bind_density_sketch<float, gaussian_kernel<float>>(m, 
"density_floats_sketch");
+  //bind_density_sketch<double, gaussian_kernel<double>>(m, 
"density_doubles_sketch");
+  bind_density_sketch<double, kernel_function_holder>(m, "_density_sketch");
 }
diff --git a/python/tests/density_test.py b/python/tests/density_test.py
index eccb0aa..3d457ec 100644
--- a/python/tests/density_test.py
+++ b/python/tests/density_test.py
@@ -16,16 +16,23 @@
 # under the License.
 
 import unittest
-from datasketches import density_doubles_sketch
+from datasketches import density_sketch, KernelFunction
 import numpy as np
 
+class UnitSphereKernel(KernelFunction):
+  def __call__(self, a: np.array, b: np.array) -> float:
+    if np.linalg.norm(a - b) < 1.0:
+      return 1.0
+    else:
+      return 0.0
+
 class densityTest(unittest.TestCase):
   def test_density_sketch(self):
     k = 10
     dim = 3
     n = 1000
 
-    sketch = density_doubles_sketch(k, dim)
+    sketch = density_sketch(k, dim)
 
     self.assertEqual(sketch.get_k(), k)
     self.assertEqual(sketch.get_dim(), dim)
@@ -53,13 +60,27 @@ class densityTest(unittest.TestCase):
       self.assertGreaterEqual(weight, 1) 
 
   def test_density_merge(self):
-    sketch1 = density_doubles_sketch(10, 2)
+    sketch1 = density_sketch(10, 2)
     sketch1.update([0, 0])
-    sketch2 = density_doubles_sketch(10, 2)
+    sketch2 = density_sketch(10, 2)
     sketch2.update([0, 1])
     sketch1.merge(sketch2)
     self.assertEqual(sketch1.get_n(), 2)
     self.assertEqual(sketch1.get_num_retained(), 2)
 
+  def test_custom_kernel(self):
+    gaussianSketch = density_sketch(10, 2) # default kernel
+    sphericalSketch = density_sketch(10, 2, UnitSphereKernel())
+
+    p = [1, 1]
+    gaussianSketch.update(p)
+    sphericalSketch.update(p)
+
+    # Spherical kernel should return 1.0 for a nearby point, 0 farther
+    # Gaussian kernel should return something nonzero when farther away
+    self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), 1.0)
+    self.assertEqual(sphericalSketch.get_estimate([2, 2]), 0.0)
+    self.assertGreater(gaussianSketch.get_estimate([2, 2]), 0.0)
+
 if __name__ == '__main__':
     unittest.main()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to