This is an automated email from the ASF dual-hosted git repository. charlie pushed a commit to branch bloom-filter-bindings in repository https://gitbox.apache.org/repos/asf/datasketches-python.git
commit 65e1f1d8fb5a65f073ffb21b7e8eebf7c5d0a287 Author: c-dickens <dickens....@gmail.com> AuthorDate: Fri Aug 8 22:30:43 2025 +0100 Add initial bloom filter Python bindings and tests --- CMakeLists.txt | 1 + _datasketches.so | Bin 0 -> 1348224 bytes src/bloom_filter_wrapper.cpp | 55 ++++++++++++++++ src/datasketches.cpp | 2 + tests/bloom_filter_test.py | 145 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 203 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e74346..ddc9325 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -110,6 +110,7 @@ target_sources(python src/count_wrapper.cpp src/tdigest_wrapper.cpp src/vector_of_kll.cpp + src/bloom_filter_wrapper.cpp src/py_serde.cpp ) diff --git a/_datasketches.so b/_datasketches.so new file mode 100755 index 0000000..3bbdc3b Binary files /dev/null and b/_datasketches.so differ diff --git a/src/bloom_filter_wrapper.cpp b/src/bloom_filter_wrapper.cpp new file mode 100644 index 0000000..abcdc0a --- /dev/null +++ b/src/bloom_filter_wrapper.cpp @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <nanobind/nanobind.h> +#include <nanobind/stl/string.h> + +#include "bloom_filter.hpp" +#include "common_defs.hpp" + +namespace nb = nanobind; + +template<typename A> +void bind_bloom_filter(nb::module_ &m, const char* name) { + using namespace datasketches; + using bloom_filter_type = bloom_filter_alloc<A>; + + // Start with just one simple function + m.def("create_bloom_filter", + [](uint64_t max_distinct_items, double target_false_positive_prob) { + return bloom_filter_type::builder::create_by_accuracy(max_distinct_items, target_false_positive_prob); + }, + nb::arg("max_distinct_items"), nb::arg("target_false_positive_prob"), + "Creates a Bloom filter with optimal parameters for the given accuracy requirements"); + + // Bind the class with minimal methods + nb::class_<bloom_filter_type>(m, name) + .def("is_empty", &bloom_filter_type::is_empty, + "Returns True if the filter has seen no items, otherwise False") + .def("update", static_cast<void (bloom_filter_type::*)(const std::string&)>(&bloom_filter_type::update), + nb::arg("item"), + "Updates the filter with the given string") + .def("query", static_cast<bool (bloom_filter_type::*)(const std::string&) const>(&bloom_filter_type::query), + nb::arg("item"), + "Queries the filter for the given string"); +} + +void init_bloom_filter(nb::module_ &m) { + bind_bloom_filter<std::allocator<uint8_t>>(m, "bloom_filter"); +} \ No newline at end of file diff --git a/src/datasketches.cpp b/src/datasketches.cpp index 118683b..4aec928 100644 --- a/src/datasketches.cpp +++ b/src/datasketches.cpp @@ -41,6 +41,7 @@ void init_count_min(nb::module_& m); void init_density(nb::module_& m); void init_tdigest(nb::module_& m); void init_vector_of_kll(nb::module_& m); +void init_bloom_filter(nb::module_& m); // supporting objects void init_kolmogorov_smirnov(nb::module_& m); @@ -73,6 +74,7 @@ NB_MODULE(_datasketches, m) { init_density(m); init_tdigest(m); init_vector_of_kll(m); + init_bloom_filter(m); init_kolmogorov_smirnov(m); init_serde(m); diff --git a/tests/bloom_filter_test.py b/tests/bloom_filter_test.py new file mode 100644 index 0000000..a7ba806 --- /dev/null +++ b/tests/bloom_filter_test.py @@ -0,0 +1,145 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import unittest +from datasketches import create_bloom_filter + +class BloomFilterTest(unittest.TestCase): + def test_create_bloom_filter(self): + """Test that we can create a bloom filter with basic parameters""" + bf = create_bloom_filter(1000, 0.01) + self.assertIsNotNone(bf) + self.assertTrue(bf.is_empty()) + + def test_bloom_filter_empty_state(self): + """Test that newly created bloom filter is empty""" + bf = create_bloom_filter(100, 0.05) + self.assertTrue(bf.is_empty()) + + def test_bloom_filter_update_and_query(self): + """Test basic update and query functionality""" + bf = create_bloom_filter(1000, 0.01) + + # Initially empty + self.assertTrue(bf.is_empty()) + self.assertFalse(bf.query("test_item")) + + # Add an item + bf.update("test_item") + self.assertFalse(bf.is_empty()) + self.assertTrue(bf.query("test_item")) + + # Query for item not in filter + self.assertFalse(bf.query("other_item")) + + def test_bloom_filter_multiple_items(self): + """Test adding multiple items to the bloom filter""" + bf = create_bloom_filter(1000, 0.01) + + items = ["item1", "item2", "item3", "item4", "item5"] + + # Add all items + for item in items: + bf.update(item) + + # Check that all items are found + for item in items: + self.assertTrue(bf.query(item), f"Item {item} should be found") + + # Check that items not added are not found + non_items = ["not_item1", "not_item2", "not_item3"] + for item in non_items: + self.assertFalse(bf.query(item), f"Item {item} should not be found") + + def test_bloom_filter_false_positives(self): + """Test that bloom filter can have false positives (this is expected behavior)""" + bf = create_bloom_filter(10, 0.1) # Small filter, higher false positive rate + + # Add a few items + bf.update("item1") + bf.update("item2") + + # Check that added items are found + self.assertTrue(bf.query("item1")) + self.assertTrue(bf.query("item2")) + + # With a small filter and high false positive rate, we might get false positives + # This is expected behavior for bloom filters + # We're not testing for specific false positives, just that the filter works + + def test_bloom_filter_parameters(self): + """Test creating bloom filters with different parameters""" + # Test with different sizes and false positive rates + test_cases = [ + (100, 0.01), + (1000, 0.05), + (10000, 0.001), + (100, 0.1), + ] + + for max_items, false_positive_rate in test_cases: + with self.subTest(max_items=max_items, false_positive_rate=false_positive_rate): + bf = create_bloom_filter(max_items, false_positive_rate) + self.assertIsNotNone(bf) + self.assertTrue(bf.is_empty()) + + def test_bloom_filter_string_types(self): + """Test that bloom filter works with different string types""" + bf = create_bloom_filter(1000, 0.01) + + # Test with different string types + test_strings = [ + "simple", + "string with spaces", + "string_with_underscores", + "string-with-dashes", + "string123with456numbers", + "string.with.dots", + "string!with@special#chars$", + ] + + for test_string in test_strings: + with self.subTest(test_string=test_string): + bf.update(test_string) + self.assertTrue(bf.query(test_string)) + + # Test empty string separately - it might be ignored by the implementation + bf.update("") + # Note: Empty strings might be ignored by the bloom filter implementation + # This is common behavior, so we don't assert on the result + + def test_bloom_filter_edge_cases(self): + """Test edge cases for bloom filter""" + bf = create_bloom_filter(1000, 0.01) + + # Test with very long strings + long_string = "a" * 1000 + bf.update(long_string) + self.assertTrue(bf.query(long_string)) + + # Test with unicode strings + unicode_string = "café résumé naïve" + bf.update(unicode_string) + self.assertTrue(bf.query(unicode_string)) + + # Test with numbers as strings + number_string = "12345" + bf.update(number_string) + self.assertTrue(bf.query(number_string)) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@datasketches.apache.org For additional commands, e-mail: commits-h...@datasketches.apache.org