This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch density_custom_kernel
in repository https://gitbox.apache.org/repos/asf/datasketches-cpp.git

commit 5ab6c9608fba2a6d68e1cbf416b89166c261cee9
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Tue Dec 13 13:36:34 2022 -0800

    density sketch
    rebase onto master since diverging in December
---
 CMakeLists.txt                          |   1 +
 density/CMakeLists.txt                  |  42 +++++++++
 density/include/density_sketch.hpp      | 131 +++++++++++++++++++++++++++++
 density/include/density_sketch_impl.hpp | 145 ++++++++++++++++++++++++++++++++
 density/test/CMakeLists.txt             |  35 ++++++++
 density/test/density_sketch_test.cpp    |  61 ++++++++++++++
 6 files changed, 415 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e62151..3b7be54 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,7 @@ add_subdirectory(tuple)
 add_subdirectory(req)
 add_subdirectory(quantiles)
 add_subdirectory(count)
+add_subdirectory(density)
 
 if (WITH_PYTHON)
   add_subdirectory(python)
diff --git a/density/CMakeLists.txt b/density/CMakeLists.txt
new file mode 100644
index 0000000..a1f62ea
--- /dev/null
+++ b/density/CMakeLists.txt
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_library(density INTERFACE)
+
+add_library(${PROJECT_NAME}::DENSITY ALIAS density)
+
+if (BUILD_TESTS)
+  add_subdirectory(test)
+endif()
+
+target_include_directories(density
+  INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
+)
+
+target_link_libraries(density INTERFACE common)
+target_compile_features(density INTERFACE cxx_std_11)
+
+install(TARGETS density
+  EXPORT ${PROJECT_NAME}
+)
+
+install(FILES 
+               include/density_sketch.hpp
+               include/density_sketch_impl.hpp
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
diff --git a/density/include/density_sketch.hpp 
b/density/include/density_sketch.hpp
new file mode 100755
index 0000000..83a01ca
--- /dev/null
+++ b/density/include/density_sketch.hpp
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef DENSITY_SKETCH_HPP_
+#define DENSITY_SKETCH_HPP_
+
+#include <type_traits>
+#include <vector>
+#include <functional>
+#include <cmath>
+
+/*
+ * Based on the following paper:
+ * Zohar Karnin, Edo Liberty "Discrepancy, Coresets, and Sketches in Machine 
Learning"
+ * https://proceedings.mlr.press/v99/karnin19a/karnin19a.pdf
+ */
+
+namespace datasketches {
+
+template<typename T>
+struct gaussian_kernel {
+  T operator()(const std::vector<T>& v1, const std::vector<T>& v2) const {
+    return exp(-std::inner_product(v1.begin(), v1.end(), v2.begin(), 0, 
std::plus<T>(), [](T a, T b){return (a-b)*(a-b);}));
+  }
+};
+
+template<
+  typename T,
+  typename Kernel = gaussian_kernel<T>,
+  typename Allocator = std::allocator<T>
+>
+class density_sketch {
+  static_assert(std::is_floating_point<T>::value, "Floating point type 
expected");
+
+public:
+  using Vector = std::vector<T, Allocator>;
+  using Level = std::vector<Vector, typename 
std::allocator_traits<Allocator>::template rebind_alloc<Vector>>;
+  using Levels = std::vector<Level, typename 
std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
+
+  /**
+   * Constructor
+   * @param k controls the size and error of the sketch.
+   * @param dim dimension of the input domain
+   * @param allocator to use by this instance
+   */
+  density_sketch(uint16_t k, uint32_t dim, const Allocator& allocator = 
Allocator());
+
+  /**
+   * Returns configured parameter K
+   * @return parameter K
+   */
+  uint16_t get_k() const;
+
+  /**
+   * Returns configured dimensions
+   * @return dimensions
+   */
+  uint32_t get_dim() const;
+
+  /**
+   * Returns true if this sketch is empty.
+   * @return empty flag
+   */
+  bool is_empty() const;
+
+  /**
+   * Returns the length of the input stream (number of points observed by this 
sketch).
+   * @return stream length
+   */
+  uint64_t get_n() const;
+
+  /**
+   * Returns the number of retained points in the sketch.
+   * @return number of retained points
+   */
+  uint32_t get_num_retained() const;
+
+  /**
+   * Updates this sketch with a given point.
+   * @param point given point
+   */
+  template<typename FwdVector>
+  void update(FwdVector&& point);
+
+  /**
+   * Merges another sketch into this one.
+   * @param other sketch to merge into this one
+   */
+  template<typename FwdSketch>
+  void merge(FwdSketch&& other);
+
+  T get_estimate(const std::vector<T>& point) const;
+
+  /**
+   * Returns an instance of the allocator for this sketch.
+   * @return allocator
+   */
+  Allocator get_allocator() const;
+
+private:
+  uint16_t k_;
+  uint32_t dim_;
+  uint32_t num_retained_;
+  uint64_t n_;
+  Levels levels_;
+
+  void compact();
+  void compact_level(unsigned height);
+};
+
+} /* namespace datasketches */
+
+#include "density_sketch_impl.hpp"
+
+#endif
diff --git a/density/include/density_sketch_impl.hpp 
b/density/include/density_sketch_impl.hpp
new file mode 100755
index 0000000..895a3d6
--- /dev/null
+++ b/density/include/density_sketch_impl.hpp
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef DENSITY_SKETCH_IMPL_HPP_
+#define DENSITY_SKETCH_IMPL_HPP_
+
+#include <algorithm>
+
+#include "common_defs.hpp"
+#include "conditional_forward.hpp"
+
+namespace datasketches {
+
+template<typename T, typename K, typename A>
+density_sketch<T, K, A>::density_sketch(uint16_t k, uint32_t dim, const A& 
allocator):
+k_(k),
+dim_(dim),
+num_retained_(0),
+n_(0),
+levels_(1, Level(allocator), allocator)
+{}
+
+template<typename T, typename K, typename A>
+uint16_t density_sketch<T, K, A>::get_k() const {
+  return k_;
+}
+
+template<typename T, typename K, typename A>
+uint32_t density_sketch<T, K, A>::get_dim() const {
+  return dim_;
+}
+
+template<typename T, typename K, typename A>
+bool density_sketch<T, K, A>::is_empty() const {
+  return num_retained_ == 0;
+}
+
+template<typename T, typename K, typename A>
+uint64_t density_sketch<T, K, A>::get_n() const {
+  return n_;
+}
+
+template<typename T, typename K, typename A>
+uint32_t density_sketch<T, K, A>::get_num_retained() const {
+  return num_retained_;
+}
+
+template<typename T, typename K, typename A>
+template<typename FwdVector>
+void density_sketch<T, K, A>::update(FwdVector&& point) {
+  if (point.size() != dim_) throw std::invalid_argument("dimension mismatch");
+  while (num_retained_ >= k_ * levels_.size()) compact();
+  levels_[0].push_back(std::forward<FwdVector>(point));
+  ++num_retained_;
+  ++n_;
+}
+
+template<typename T, typename K, typename A>
+template<typename FwdSketch>
+void density_sketch<T, K, A>::merge(FwdSketch&& other) {
+  if (other.is_empty()) return;
+  if (other.dim_ != dim_) throw std::invalid_argument("dimension mismatch");
+  while (levels_.size() < other.levels_.size()) 
levels_.push_back(Level(levels_.get_allocator()));
+  for (unsigned height = 0; height < other.levels_.size(); ++height) {
+    std::copy(
+      forward_begin(conditional_forward<FwdSketch>(other.levels_[height])),
+      forward_end(conditional_forward<FwdSketch>(other.levels_[height])),
+      back_inserter(levels_[height])
+    );
+  }
+  num_retained_ += other.num_retained_;
+  n_ += other.n_;
+  while (num_retained_ >= k_ * levels_.size()) compact();
+}
+
+template<typename T, typename K, typename A>
+T density_sketch<T, K, A>::get_estimate(const std::vector<T>& point) const {
+  if (is_empty()) throw std::runtime_error("operation is undefined for an 
empty sketch");
+  T density = 0;
+  for (unsigned height = 0; height < levels_.size(); ++height) {
+    for (const auto& p: levels_[height]) {
+      density += (1 << height) * K()(p, point) / n_;
+    }
+  }
+  return density;
+}
+
+template<typename T, typename K, typename A>
+A density_sketch<T, K, A>::get_allocator() const {
+  return levels_.get_allocator();
+}
+
+template<typename T, typename K, typename A>
+void density_sketch<T, K, A>::compact() {
+  for (unsigned height = 0; height < levels_.size(); ++height) {
+    if (levels_[height].size() >= k_) {
+      if (height + 1 >= levels_.size()) 
levels_.push_back(Level(levels_.get_allocator()));
+      compact_level(height);
+      break;
+    }
+  }
+}
+
+template<typename T, typename K, typename A>
+void density_sketch<T, K, A>::compact_level(unsigned height) {
+  auto& level = levels_[height];
+  std::vector<bool> bits(level.size());
+  bits[0] = random_bit();
+  std::random_shuffle(level.begin(), level.end());
+  for (unsigned i = 1; i < level.size(); ++i) {
+    T delta = 0;
+    for (unsigned j = 0; j < i; ++j) {
+      delta += (bits[j] ? 1 : -1) * K()(level[i], level[j]);
+    }
+    bits[i] = delta < 0;
+  }
+  for (unsigned i = 0; i < level.size(); ++i) {
+    if (bits[i]) {
+      levels_[height + 1].push_back(std::move(level[i]));
+    } else {
+      --num_retained_;
+    }
+  }
+  level.clear();
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/density/test/CMakeLists.txt b/density/test/CMakeLists.txt
new file mode 100644
index 0000000..79370ec
--- /dev/null
+++ b/density/test/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_executable(density_test)
+
+target_link_libraries(density_test density common_test_lib)
+
+set_target_properties(density_test PROPERTIES
+  CXX_STANDARD 11
+  CXX_STANDARD_REQUIRED YES
+)
+
+add_test(
+  NAME density_test
+  COMMAND density_test
+)
+
+target_sources(density_test
+  PRIVATE
+    density_sketch_test.cpp
+)
diff --git a/density/test/density_sketch_test.cpp 
b/density/test/density_sketch_test.cpp
new file mode 100755
index 0000000..704d3ac
--- /dev/null
+++ b/density/test/density_sketch_test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <catch2/catch.hpp>
+
+#include <density_sketch.hpp>
+
+#include <iostream>
+
+namespace datasketches {
+
+TEST_CASE("density sketch: empty", "[density_sketch]") {
+  density_sketch<float> sketch(10, 3);
+  REQUIRE(sketch.is_empty());
+  REQUIRE_THROWS_AS(sketch.get_estimate({0, 0, 0}), std::runtime_error);
+}
+
+TEST_CASE("density sketch: one item", "[density_sketch]") {
+  density_sketch<float> sketch(10, 3);
+
+  // dimension mismatch
+  REQUIRE_THROWS_AS(sketch.update(std::vector<float>({0, 0})), 
std::invalid_argument);
+
+  sketch.update(std::vector<float>({0, 0, 0}));
+  REQUIRE_FALSE(sketch.is_empty());
+  REQUIRE(sketch.get_estimate({0, 0, 0}) == 1);
+  REQUIRE(sketch.get_estimate({0.01, 0.01, 0.01}) > 0.95);
+  REQUIRE(sketch.get_estimate({1, 1, 1}) < 0.05);
+}
+
+TEST_CASE("density sketch: merge", "[density_sketch]") {
+  density_sketch<float> sketch1(10, 4);
+  sketch1.update(std::vector<float>({0, 0, 0, 0}));
+  sketch1.update(std::vector<float>({1, 2, 3, 4}));
+
+  density_sketch<float> sketch2(10, 4);
+  sketch2.update(std::vector<float>({5, 6, 7, 8}));
+
+  sketch1.merge(sketch2);
+
+  REQUIRE(sketch1.get_n() == 3);
+  REQUIRE(sketch1.get_num_retained() == 3);
+}
+
+} /* namespace datasketches */


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to