This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch density_custom_kernel in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git
commit 5ab6c9608fba2a6d68e1cbf416b89166c261cee9 Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue Dec 13 13:36:34 2022 -0800 density sketch rebase onto master since diverging in December --- CMakeLists.txt | 1 + density/CMakeLists.txt | 42 +++++++++ density/include/density_sketch.hpp | 131 +++++++++++++++++++++++++++++ density/include/density_sketch_impl.hpp | 145 ++++++++++++++++++++++++++++++++ density/test/CMakeLists.txt | 35 ++++++++ density/test/density_sketch_test.cpp | 61 ++++++++++++++ 6 files changed, 415 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e62151..3b7be54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -116,6 +116,7 @@ add_subdirectory(tuple) add_subdirectory(req) add_subdirectory(quantiles) add_subdirectory(count) +add_subdirectory(density) if (WITH_PYTHON) add_subdirectory(python) diff --git a/density/CMakeLists.txt b/density/CMakeLists.txt new file mode 100644 index 0000000..a1f62ea --- /dev/null +++ b/density/CMakeLists.txt @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +add_library(density INTERFACE) + +add_library(${PROJECT_NAME}::DENSITY ALIAS density) + +if (BUILD_TESTS) + add_subdirectory(test) +endif() + +target_include_directories(density + INTERFACE + $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> + $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include> +) + +target_link_libraries(density INTERFACE common) +target_compile_features(density INTERFACE cxx_std_11) + +install(TARGETS density + EXPORT ${PROJECT_NAME} +) + +install(FILES + include/density_sketch.hpp + include/density_sketch_impl.hpp + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches") diff --git a/density/include/density_sketch.hpp b/density/include/density_sketch.hpp new file mode 100755 index 0000000..83a01ca --- /dev/null +++ b/density/include/density_sketch.hpp @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef DENSITY_SKETCH_HPP_ +#define DENSITY_SKETCH_HPP_ + +#include <type_traits> +#include <vector> +#include <functional> +#include <cmath> + +/* + * Based on the following paper: + * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine Learning" + * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf + */ + +namespace datasketches { + +template<typename T> +struct gaussian_kernel { + T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const { + return exp(-std::inner_product(v1.begin(), v1.end(), v2.begin(), 0, std::plus<T>(), [](T a, T b){return (a-b)*(a-b);})); + } +}; + +template< + typename T, + typename Kernel = gaussian_kernel<T>, + typename Allocator = std::allocator<T> +> +class density_sketch { + static_assert(std::is_floating_point<T>::value, "Floating point type expected"); + +public: + using Vector = std::vector<T, Allocator>; + using Level = std::vector<Vector, typename std::allocator_traits<Allocator>::template rebind_alloc<Vector>>; + using Levels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>; + + /** + * Constructor + * @param k controls the size and error of the sketch. + * @param dim dimension of the input domain + * @param allocator to use by this instance + */ + density_sketch(uint16_t k, uint32_t dim, const Allocator& allocator = Allocator()); + + /** + * Returns configured parameter K + * @return parameter K + */ + uint16_t get_k() const; + + /** + * Returns configured dimensions + * @return dimensions + */ + uint32_t get_dim() const; + + /** + * Returns true if this sketch is empty. + * @return empty flag + */ + bool is_empty() const; + + /** + * Returns the length of the input stream (number of points observed by this sketch). + * @return stream length + */ + uint64_t get_n() const; + + /** + * Returns the number of retained points in the sketch. + * @return number of retained points + */ + uint32_t get_num_retained() const; + + /** + * Updates this sketch with a given point. + * @param point given point + */ + template<typename FwdVector> + void update(FwdVector&& point); + + /** + * Merges another sketch into this one. + * @param other sketch to merge into this one + */ + template<typename FwdSketch> + void merge(FwdSketch&& other); + + T get_estimate(const std::vector<T>& point) const; + + /** + * Returns an instance of the allocator for this sketch. + * @return allocator + */ + Allocator get_allocator() const; + +private: + uint16_t k_; + uint32_t dim_; + uint32_t num_retained_; + uint64_t n_; + Levels levels_; + + void compact(); + void compact_level(unsigned height); +}; + +} /* namespace datasketches */ + +#include "density_sketch_impl.hpp" + +#endif diff --git a/density/include/density_sketch_impl.hpp b/density/include/density_sketch_impl.hpp new file mode 100755 index 0000000..895a3d6 --- /dev/null +++ b/density/include/density_sketch_impl.hpp @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef DENSITY_SKETCH_IMPL_HPP_ +#define DENSITY_SKETCH_IMPL_HPP_ + +#include <algorithm> + +#include "common_defs.hpp" +#include "conditional_forward.hpp" + +namespace datasketches { + +template<typename T, typename K, typename A> +density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const A& allocator): +k_(k), +dim_(dim), +num_retained_(0), +n_(0), +levels_(1, Level(allocator), allocator) +{} + +template<typename T, typename K, typename A> +uint16_t density_sketch<T, K, A>::get_k() const { + return k_; +} + +template<typename T, typename K, typename A> +uint32_t density_sketch<T, K, A>::get_dim() const { + return dim_; +} + +template<typename T, typename K, typename A> +bool density_sketch<T, K, A>::is_empty() const { + return num_retained_ == 0; +} + +template<typename T, typename K, typename A> +uint64_t density_sketch<T, K, A>::get_n() const { + return n_; +} + +template<typename T, typename K, typename A> +uint32_t density_sketch<T, K, A>::get_num_retained() const { + return num_retained_; +} + +template<typename T, typename K, typename A> +template<typename FwdVector> +void density_sketch<T, K, A>::update(FwdVector&& point) { + if (point.size() != dim_) throw std::invalid_argument("dimension mismatch"); + while (num_retained_ >= k_ * levels_.size()) compact(); + levels_[0].push_back(std::forward<FwdVector>(point)); + ++num_retained_; + ++n_; +} + +template<typename T, typename K, typename A> +template<typename FwdSketch> +void density_sketch<T, K, A>::merge(FwdSketch&& other) { + if (other.is_empty()) return; + if (other.dim_ != dim_) throw std::invalid_argument("dimension mismatch"); + while (levels_.size() < other.levels_.size()) levels_.push_back(Level(levels_.get_allocator())); + for (unsigned height = 0; height < other.levels_.size(); ++height) { + std::copy( + forward_begin(conditional_forward<FwdSketch>(other.levels_[height])), + forward_end(conditional_forward<FwdSketch>(other.levels_[height])), + back_inserter(levels_[height]) + ); + } + num_retained_ += other.num_retained_; + n_ += other.n_; + while (num_retained_ >= k_ * levels_.size()) compact(); +} + +template<typename T, typename K, typename A> +T density_sketch<T, K, A>::get_estimate(const std::vector<T>& point) const { + if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch"); + T density = 0; + for (unsigned height = 0; height < levels_.size(); ++height) { + for (const auto& p: levels_[height]) { + density += (1 << height) * K()(p, point) / n_; + } + } + return density; +} + +template<typename T, typename K, typename A> +A density_sketch<T, K, A>::get_allocator() const { + return levels_.get_allocator(); +} + +template<typename T, typename K, typename A> +void density_sketch<T, K, A>::compact() { + for (unsigned height = 0; height < levels_.size(); ++height) { + if (levels_[height].size() >= k_) { + if (height + 1 >= levels_.size()) levels_.push_back(Level(levels_.get_allocator())); + compact_level(height); + break; + } + } +} + +template<typename T, typename K, typename A> +void density_sketch<T, K, A>::compact_level(unsigned height) { + auto& level = levels_[height]; + std::vector<bool> bits(level.size()); + bits[0] = random_bit(); + std::random_shuffle(level.begin(), level.end()); + for (unsigned i = 1; i < level.size(); ++i) { + T delta = 0; + for (unsigned j = 0; j < i; ++j) { + delta += (bits[j] ? 1 : -1) * K()(level[i], level[j]); + } + bits[i] = delta < 0; + } + for (unsigned i = 0; i < level.size(); ++i) { + if (bits[i]) { + levels_[height + 1].push_back(std::move(level[i])); + } else { + --num_retained_; + } + } + level.clear(); +} + +} /* namespace datasketches */ + +#endif diff --git a/density/test/CMakeLists.txt b/density/test/CMakeLists.txt new file mode 100644 index 0000000..79370ec --- /dev/null +++ b/density/test/CMakeLists.txt @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +add_executable(density_test) + +target_link_libraries(density_test density common_test_lib) + +set_target_properties(density_test PROPERTIES + CXX_STANDARD 11 + CXX_STANDARD_REQUIRED YES +) + +add_test( + NAME density_test + COMMAND density_test +) + +target_sources(density_test + PRIVATE + density_sketch_test.cpp +) diff --git a/density/test/density_sketch_test.cpp b/density/test/density_sketch_test.cpp new file mode 100755 index 0000000..704d3ac --- /dev/null +++ b/density/test/density_sketch_test.cpp @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <catch2/catch.hpp> + +#include <density_sketch.hpp> + +#include <iostream> + +namespace datasketches { + +TEST_CASE("density sketch: empty", "[density_sketch]") { + density_sketch<float> sketch(10, 3); + REQUIRE(sketch.is_empty()); + REQUIRE_THROWS_AS(sketch.get_estimate({0, 0, 0}), std::runtime_error); +} + +TEST_CASE("density sketch: one item", "[density_sketch]") { + density_sketch<float> sketch(10, 3); + + // dimension mismatch + REQUIRE_THROWS_AS(sketch.update(std::vector<float>({0, 0})), std::invalid_argument); + + sketch.update(std::vector<float>({0, 0, 0})); + REQUIRE_FALSE(sketch.is_empty()); + REQUIRE(sketch.get_estimate({0, 0, 0}) == 1); + REQUIRE(sketch.get_estimate({0.01, 0.01, 0.01}) > 0.95); + REQUIRE(sketch.get_estimate({1, 1, 1}) < 0.05); +} + +TEST_CASE("density sketch: merge", "[density_sketch]") { + density_sketch<float> sketch1(10, 4); + sketch1.update(std::vector<float>({0, 0, 0, 0})); + sketch1.update(std::vector<float>({1, 2, 3, 4})); + + density_sketch<float> sketch2(10, 4); + sketch2.update(std::vector<float>({5, 6, 7, 8})); + + sketch1.merge(sketch2); + + REQUIRE(sketch1.get_n() == 3); + REQUIRE(sketch1.get_num_retained() == 3); +} + +} /* namespace datasketches */ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
