This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch density_custom_kernel in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 669567e2a71efc440a073f664b280ac330de5605 Author: Jon <[email protected]> AuthorDate: Thu Apr 13 00:23:50 2023 -0700 Add custom kernel to density sketch, adjust python wrapper to support the same --- density/include/density_sketch.hpp | 10 ++- density/include/density_sketch_impl.hpp | 7 +- density/test/density_sketch_test.cpp | 33 ++++++++ python/datasketches/DensityWrapper.py | 75 +++++++++++++++++ .../{__init__.py => KernelFunction.py} | 29 ++++--- python/datasketches/__init__.py | 2 + python/include/kernel_function.hpp | 98 ++++++++++++++++++++++ python/src/datasketches.cpp | 6 -- python/src/density_wrapper.cpp | 52 +++++++----- python/tests/density_test.py | 29 ++++++- 10 files changed, 292 insertions(+), 49 deletions(-) diff --git a/density/include/density_sketch.hpp b/density/include/density_sketch.hpp index 8393bad..5083802 100755 --- a/density/include/density_sketch.hpp +++ b/density/include/density_sketch.hpp @@ -63,9 +63,10 @@ public: * Constructor * @param k controls the size and error of the sketch. * @param dim dimension of the input domain + * @param kernel to use by this instance * @param allocator to use by this instance */ - density_sketch(uint16_t k, uint32_t dim, const Allocator& allocator = Allocator()); + density_sketch(uint16_t k, uint32_t dim, const Kernel& kernel = Kernel(), const Allocator& allocator = Allocator()); /** * Returns configured parameter K @@ -137,6 +138,7 @@ public: const_iterator end() const; private: + Kernel kernel_; uint16_t k_; uint32_t dim_; uint32_t num_retained_; @@ -148,10 +150,14 @@ private: }; template<typename T, typename K, typename A> -class density_sketch<T, K, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> { +class density_sketch<T, K, A>::const_iterator { public: using Vector = density_sketch<T, K, A>::Vector; + using iterator_category = std::input_iterator_tag; using value_type = std::pair<const Vector&, const uint64_t>; + using difference_type = void; + using pointer = return_value_holder<value_type>; + using reference = const value_type; const_iterator& operator++(); const_iterator& operator++(int); bool operator==(const const_iterator& other) const; diff --git a/density/include/density_sketch_impl.hpp b/density/include/density_sketch_impl.hpp index b3c29eb..9cefe56 100755 --- a/density/include/density_sketch_impl.hpp +++ b/density/include/density_sketch_impl.hpp @@ -28,7 +28,8 @@ namespace datasketches { template<typename T, typename K, typename A> -density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const A& allocator): +density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const K& kernel, const A& allocator): +kernel_(kernel), k_(k), dim_(dim), num_retained_(0), @@ -100,7 +101,7 @@ T density_sketch<T, K, A>::get_estimate(const std::vector<T>& point) const { T density = 0; for (unsigned height = 0; height < levels_.size(); ++height) { for (const auto& p: levels_[height]) { - density += (1 << height) * K()(p, point) / n_; + density += (1 << height) * kernel_(p, point) / n_; } } return density; @@ -131,7 +132,7 @@ void density_sketch<T, K, A>::compact_level(unsigned height) { for (unsigned i = 1; i < level.size(); ++i) { T delta = 0; for (unsigned j = 0; j < i; ++j) { - delta += (bits[j] ? 1 : -1) * K()(level[i], level[j]); + delta += (bits[j] ? 1 : -1) * kernel_(level[i], level[j]); } bits[i] = delta < 0; } diff --git a/density/test/density_sketch_test.cpp b/density/test/density_sketch_test.cpp index 4f5927f..6440a32 100755 --- a/density/test/density_sketch_test.cpp +++ b/density/test/density_sketch_test.cpp @@ -73,4 +73,37 @@ TEST_CASE("density sketch: iterator", "[density_sketch]") { REQUIRE(count == sketch.get_num_retained()); } +// spherical kernel for testing, returns 1 for vectors within radius and 0 otherwise +template<typename T> +struct spherical_kernel { + spherical_kernel(T radius = 1.0) : _radius_squared(radius * radius) {} + T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const { + return std::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0, std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}) <= _radius_squared ? 1.0 : 0.0; + } + private: + T _radius_squared; +}; + +TEST_CASE("custom kernel", "[density_sketch]") { + density_sketch<float, spherical_kernel<float>> sketch(10, 3, spherical_kernel<float>(0.5)); + + // update with (1,1,1) and test points inside and outside the kernel + sketch.update(std::vector<float>(3, 1.0)); + REQUIRE(sketch.get_estimate(std::vector<float>(3, 1.001)) == 1.0); + REQUIRE(sketch.get_estimate(std::vector<float>(3, 2.0)) == 0.0); + + // rest of test follows iterator test above + unsigned n = 1000; + for (unsigned i = 2; i <= n; ++i) sketch.update(std::vector<float>(3, i)); + REQUIRE(sketch.get_n() == n); + REQUIRE(sketch.is_estimation_mode()); + unsigned count = 0; + for (auto pair: sketch) { + ++count; + // just to assert something about the output + REQUIRE(pair.first.size() == sketch.get_dim()); + } + REQUIRE(count == sketch.get_num_retained()); +} + } /* namespace datasketches */ diff --git a/python/datasketches/DensityWrapper.py b/python/datasketches/DensityWrapper.py new file mode 100644 index 0000000..98d8c35 --- /dev/null +++ b/python/datasketches/DensityWrapper.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np + +from _datasketches import _density_sketch, KernelFunction +from .KernelFunction import GaussianKernel + +class density_sketch: + """An instance of a Density Sketch for kernel density estimation. Requires a KernelFunction object.""" + + def __init__(self, k:int, dim:int, kernel:KernelFunction=GaussianKernel()): + self._kernel = kernel + self._gadget = _density_sketch(k, dim, self._kernel) + + def update(self, point:np.array): + """Updates the sketch with the given point""" + self._gadget.update(point) + + def merge(self, other:'density_sketch'): + """Merges the provided sketch into this one""" + self._gadget.merge(other._gadget) + + def is_empty(self): + """Returns True if the sketch is empty, otherwise False""" + return self._gadget.is_empty() + + def get_k(self): + """Returns the configured parameter k""" + return self._gadget.get_k() + + def get_dim(self): + """Returns the configured parameter dim""" + return self._gadget.get_dim() + + def get_n(self): + """Returns the length of the input stream""" + return self._gadget.get_n() + + def get_num_retained(self): + """Returns the number of retained items (samples) in the sketch""" + return self._gadget.get_num_retained() + + def is_estimation_mode(self): + """Returns True if the sketch is in estimation mode, otherwise False""" + return self._gadget.is_estimation_mode() + + def get_estimate(self, point:np.array): + """Returns an approximate density at the given point""" + return self._gadget.get_estimate(point) + + def __str__(self, print_levels:bool=False, print_items:bool=False): + """Produces a string summary of the sketch""" + return self._gadget.to_string(print_levels, print_items) + + def to_string(self, print_levels:bool=False, print_items:bool=False): + """Produces a string summary of the sketch""" + return self._gadget.to_string(print_levels, print_items) + + def __iter__(self): + return self._gadget.__iter__() diff --git a/python/datasketches/__init__.py b/python/datasketches/KernelFunction.py similarity index 58% copy from python/datasketches/__init__.py copy to python/datasketches/KernelFunction.py index fb7636e..7603b10 100644 --- a/python/datasketches/__init__.py +++ b/python/datasketches/KernelFunction.py @@ -15,22 +15,21 @@ # specific language governing permissions and limitations # under the License. -"""The Apache DataSketches Library for Python +import numpy as np -Provided under the Apache License, Verison 2.0 -<http://www.apache.org/licenses/LICENSE-2.0> -""" +from _datasketches import KernelFunction -name = 'datasketches' - -from _datasketches import * +# This file provides an example Python Kernel Function implementation. +# +# Each implementation must extend the KernelFunction class +# and define the __call__ method -from .PySerDe import * -from .TuplePolicy import * +# Implements a basic Gaussian Kernel +class GaussianKernel(KernelFunction): + def __init__(self, bandwidth: float=1.0): + KernelFunction.__init__(self) + self._bw = bandwidth + self._scale = -0.5 * (bandwidth ** -2) -# Wrappers around the pybind11 classes for cases where we -# need to define a python object that is persisted within -# the C++ object. Currently, the native python portion of -# a class derived from a C++ class may be garbage collected -# even though a pointer to the C++ portion remains valid. -from .TupleWrapper import * + def __call__(self, a: np.array, b: np.array) -> float: + return np.exp(self._scale * np.linalg.norm(a - b)**2) diff --git a/python/datasketches/__init__.py b/python/datasketches/__init__.py index fb7636e..bc78d4e 100644 --- a/python/datasketches/__init__.py +++ b/python/datasketches/__init__.py @@ -27,6 +27,7 @@ from _datasketches import * from .PySerDe import * from .TuplePolicy import * +from .KernelFunction import * # Wrappers around the pybind11 classes for cases where we # need to define a python object that is persisted within @@ -34,3 +35,4 @@ from .TuplePolicy import * # a class derived from a C++ class may be garbage collected # even though a pointer to the C++ portion remains valid. from .TupleWrapper import * +from .DensityWrapper import * diff --git a/python/include/kernel_function.hpp b/python/include/kernel_function.hpp new file mode 100644 index 0000000..ca41c71 --- /dev/null +++ b/python/include/kernel_function.hpp @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +//#include <memory> +#include <pybind11/pybind11.h> +#include <pybind11/numpy.h> + +#ifndef _KERNEL_FUNCTION_HPP_ +#define _KERNEL_FUNCTION_HPP_ + +namespace py = pybind11; + +namespace datasketches { + +/** + * @brief kernel_function provides the underlying base class from + * which native Python kernels ultimately inherit. The actual + * kernels implement KernelFunction, as shown in KernelFunction.py + */ +struct kernel_function { + virtual double operator()(py::array_t<double>& a, const py::array_t<double>& b) const = 0; + virtual ~kernel_function() = default; +}; + +/** + * @brief KernelFunction provides the "trampoline" class for pybind11 + * that allows for a native Python implementation of kernel + * functions. + */ +struct KernelFunction : public kernel_function { + using kernel_function::kernel_function; + + /** + * @brief Evaluates K(a,b), the kernel function for the given points a and b + * @param a the first vector + * @param b the second vector + * @return The function value K(a,b) + */ + double operator()(py::array_t<double>& a, const py::array_t<double>& b) const override { + PYBIND11_OVERRIDE_PURE_NAME( + double, // Return type + kernel_function, // Parent class + "__call__", // Name of function in python + operator(), // Name of function in C++ + a, b // Arguemnts + ); + } +}; + +/* The kernel_function_holder provides a concrete class that dispatches calls + * from the sketch to the kernel_function. This class is needed to provide a + * concrete object to produce a compiled library, but library users should + * never need to use this directly. + */ +struct kernel_function_holder { + explicit kernel_function_holder(std::shared_ptr<kernel_function> kernel) : _kernel(kernel) {} + kernel_function_holder(const kernel_function_holder& other) : _kernel(other._kernel) {} + kernel_function_holder(kernel_function_holder&& other) : _kernel(std::move(other._kernel)) {} + kernel_function_holder& operator=(const kernel_function_holder& other) { _kernel = other._kernel; return *this; } + kernel_function_holder& operator=(kernel_function_holder&& other) { std::swap(_kernel, other._kernel); return *this; } + + double operator()(const std::vector<double>& a, const py::array_t<double>& b) const { + py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner); + return _kernel->operator()(a_arr, b); + } + + double operator()(const std::vector<double>& a, const std::vector<double>& b) const { + py::array_t<double> a_arr(a.size(), a.data(), dummy_array_owner); + py::array_t<double> b_arr(b.size(), b.data(), dummy_array_owner); + return _kernel->operator()(a_arr, b_arr); + } + + private: + // a dummy object to "own" arrays when translating from std::vector to avoid a copy: + // https://github.com/pybind/pybind11/issues/323#issuecomment-575717041 + py::str dummy_array_owner; + std::shared_ptr<kernel_function> _kernel; +}; + +} + +#endif // _KERNEL_FUNCTION_HPP_ \ No newline at end of file diff --git a/python/src/datasketches.cpp b/python/src/datasketches.cpp index e10074a..e360452 100644 --- a/python/src/datasketches.cpp +++ b/python/src/datasketches.cpp @@ -31,11 +31,8 @@ void init_tuple(py::module& m); void init_vo(py::module& m); void init_req(py::module& m); void init_quantiles(py::module& m); -<<<<<<< HEAD void init_count_min(py::module& m); -======= void init_density(py::module& m); ->>>>>>> c354c56 (python wrapper) void init_vector_of_kll(py::module& m); // supporting objects @@ -52,11 +49,8 @@ PYBIND11_MODULE(_datasketches, m) { init_vo(m); init_req(m); init_quantiles(m); -<<<<<<< HEAD init_count_min(m); -======= init_density(m); ->>>>>>> c354c56 (python wrapper) init_vector_of_kll(m); init_kolmogorov_smirnov(m); diff --git a/python/src/density_wrapper.cpp b/python/src/density_wrapper.cpp index 5db808f..48b6db5 100644 --- a/python/src/density_wrapper.cpp +++ b/python/src/density_wrapper.cpp @@ -22,45 +22,59 @@ #include <pybind11/numpy.h> #include <vector> +#include "kernel_function.hpp" #include "density_sketch.hpp" namespace py = pybind11; -template<typename T> +template<typename T, typename K> void bind_density_sketch(py::module &m, const char* name) { using namespace datasketches; - py::class_<density_sketch<T>>(m, name) - .def(py::init<uint16_t, uint32_t>(), py::arg("k"), py::arg("dim")) - .def("update", static_cast<void (density_sketch<T>::*)(const std::vector<T>&)>(&density_sketch<T>::update), + py::class_<density_sketch<T, K>>(m, name) + .def( + py::init([](uint16_t k, uint32_t dim, std::shared_ptr<kernel_function> kernel) { + kernel_function_holder holder(kernel); + return density_sketch<T, K>(k, dim, holder); + }), + py::arg("k"), py::arg("dim"), py::arg("kernel")) + .def("update", static_cast<void (density_sketch<T, K>::*)(const std::vector<T>&)>(&density_sketch<T, K>::update), "Updates the sketch with the given vector") - .def("update", static_cast<void (density_sketch<T>::*)(std::vector<T>&&)>(&density_sketch<T>::update), - "Updates the sketch with the given vector") - .def("merge", static_cast<void (density_sketch<T>::*)(const density_sketch<T>&)>(&density_sketch<T>::merge), py::arg("sketch"), + .def("merge", static_cast<void (density_sketch<T, K>::*)(const density_sketch<T, K>&)>(&density_sketch<T, K>::merge), py::arg("sketch"), "Merges the provided sketch into this one") - .def("is_empty", &density_sketch<T>::is_empty, + .def("is_empty", &density_sketch<T, K>::is_empty, "Returns True if the sketch is empty, otherwise False") - .def("get_k", &density_sketch<T>::get_k, + .def("get_k", &density_sketch<T, K>::get_k, "Returns the configured parameter k") - .def("get_dim", &density_sketch<T>::get_dim, + .def("get_dim", &density_sketch<T, K>::get_dim, "Returns the configured parameter dim") - .def("get_n", &density_sketch<T>::get_n, + .def("get_n", &density_sketch<T, K>::get_n, "Returns the length of the input stream") - .def("get_num_retained", &density_sketch<T>::get_num_retained, + .def("get_num_retained", &density_sketch<T, K>::get_num_retained, "Returns the number of retained items (samples) in the sketch") - .def("is_estimation_mode", &density_sketch<T>::is_estimation_mode, + .def("is_estimation_mode", &density_sketch<T, K>::is_estimation_mode, "Returns True if the sketch is in estimation mode, otherwise False") - .def("get_estimate", &density_sketch<T>::get_estimate, py::arg("point"), + .def("get_estimate", &density_sketch<T, K>::get_estimate, py::arg("point"), "Returns an approximate density at the given point") - .def("__str__", &density_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, + .def("__str__", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, "Produces a string summary of the sketch") - .def("to_string", &density_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, + .def("to_string", &density_sketch<T, K>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false, "Produces a string summary of the sketch") - .def("__iter__", [](const density_sketch<T>& s){ return py::make_iterator(s.begin(), s.end()); }) + .def("__iter__", [](const density_sketch<T, K>& s){ return py::make_iterator(s.begin(), s.end()); }) ; } void init_density(py::module &m) { - bind_density_sketch<float>(m, "density_floats_sketch"); - bind_density_sketch<double>(m, "density_doubles_sketch"); + using namespace datasketches; + + // generic kernel function + py::class_<kernel_function, KernelFunction, std::shared_ptr<kernel_function>>(m, "KernelFunction") + .def(py::init()) + .def("__call__", &kernel_function::operator(), py::arg("a"), py::arg("b")) + ; + + // the old sketch names can almost be defined, but the kernel_function_holder won't work in init() + //bind_density_sketch<float, gaussian_kernel<float>>(m, "density_floats_sketch"); + //bind_density_sketch<double, gaussian_kernel<double>>(m, "density_doubles_sketch"); + bind_density_sketch<double, kernel_function_holder>(m, "_density_sketch"); } diff --git a/python/tests/density_test.py b/python/tests/density_test.py index eccb0aa..3d457ec 100644 --- a/python/tests/density_test.py +++ b/python/tests/density_test.py @@ -16,16 +16,23 @@ # under the License. import unittest -from datasketches import density_doubles_sketch +from datasketches import density_sketch, KernelFunction import numpy as np +class UnitSphereKernel(KernelFunction): + def __call__(self, a: np.array, b: np.array) -> float: + if np.linalg.norm(a - b) < 1.0: + return 1.0 + else: + return 0.0 + class densityTest(unittest.TestCase): def test_density_sketch(self): k = 10 dim = 3 n = 1000 - sketch = density_doubles_sketch(k, dim) + sketch = density_sketch(k, dim) self.assertEqual(sketch.get_k(), k) self.assertEqual(sketch.get_dim(), dim) @@ -53,13 +60,27 @@ class densityTest(unittest.TestCase): self.assertGreaterEqual(weight, 1) def test_density_merge(self): - sketch1 = density_doubles_sketch(10, 2) + sketch1 = density_sketch(10, 2) sketch1.update([0, 0]) - sketch2 = density_doubles_sketch(10, 2) + sketch2 = density_sketch(10, 2) sketch2.update([0, 1]) sketch1.merge(sketch2) self.assertEqual(sketch1.get_n(), 2) self.assertEqual(sketch1.get_num_retained(), 2) + def test_custom_kernel(self): + gaussianSketch = density_sketch(10, 2) # default kernel + sphericalSketch = density_sketch(10, 2, UnitSphereKernel()) + + p = [1, 1] + gaussianSketch.update(p) + sphericalSketch.update(p) + + # Spherical kernel should return 1.0 for a nearby point, 0 farther + # Gaussian kernel should return something nonzero when farther away + self.assertEqual(sphericalSketch.get_estimate([1.001, 1]), 1.0) + self.assertEqual(sphericalSketch.get_estimate([2, 2]), 0.0) + self.assertGreater(gaussianSketch.get_estimate([2, 2]), 0.0) + if __name__ == '__main__': unittest.main() --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
