This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch bloom in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit c55ea437f695426acd3b1acb78f4d7d9506ae8d7 Author: jmalkin <[email protected]> AuthorDate: Mon Feb 26 22:52:49 2024 -0800 Test BloomFilterBuilder methods vs manually computed values from formulae --- .../filters/bloomfilter/BloomFilterBuilder.java | 23 ++-- .../bloomfilter/BloomFilterBuilderTest.java | 118 +++++++++++++++++++++ 2 files changed, 131 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java index 938889d8..0934338d 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java @@ -19,6 +19,8 @@ package org.apache.datasketches.filters.bloomfilter; +import java.util.concurrent.ThreadLocalRandom; + import org.apache.datasketches.common.SketchesArgumentException; public final class BloomFilterBuilder { @@ -27,10 +29,13 @@ public final class BloomFilterBuilder { * Returns the optimal number of hash functions to given target numbers of distinct items * and the BloomFliter size in bits. * @param maxDistinctItems The maximum expected number of distinct items to add to the filter - * @param numFilterBits The target size, in bits, of the Bloom Filter + * @param numFilterBits The intended size of the Bloom Filter in bits * @return The suggested number of hash functions to use with the filter */ public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) { + if (maxDistinctItems < 1 || numFilterBits < 1) { + throw new SketchesArgumentException("maxDistinctItems and numFilterBits must be strictly positive"); + } // ceil to ensure we never average worse than the target return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0))); } @@ -72,15 +77,7 @@ public final class BloomFilterBuilder { * @return A new BloomFilter configured for the given input parameters */ public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) { - if (maxDistinctItems <= 0) { - throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); - } - if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { - throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); - } - final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb); - final short numHashes = suggestNumHashes(maxDistinctItems, numBits); - return new BloomFilter(numBits, numHashes); + return newBloomFilter(maxDistinctItems, targetFalsePositiveProb, ThreadLocalRandom.current().nextLong()); } /** @@ -92,6 +89,12 @@ public final class BloomFilterBuilder { * @return A new BloomFilter configured for the given input parameters */ public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) { + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb); final short numHashes = suggestNumHashes(maxDistinctItems, numBits); return new BloomFilter(numBits, numHashes, seed); diff --git a/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java new file mode 100644 index 00000000..d3a08f85 --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilderTest.java @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.bloomfilter; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.fail; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.testng.annotations.Test; + +public class BloomFilterBuilderTest { + + @Test + public void testSuggestHashesFromSizes() { + // invalid distinct items + try { + BloomFilterBuilder.suggestNumHashes(0, 32768); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // invalid filter size + try { + BloomFilterBuilder.suggestNumHashes(10_000, -1); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // manually computed values based on formula + assertEquals(BloomFilterBuilder.suggestNumHashes(100, 1L << 16), 455); + assertEquals(BloomFilterBuilder.suggestNumHashes(10_000, 1L << 12), 1); + assertEquals(BloomFilterBuilder.suggestNumHashes(1_000_000_000, Integer.MAX_VALUE * 4L), 6); + assertEquals(BloomFilterBuilder.suggestNumHashes(1_500_000, 1L << 24), 8); + } + + @Test + public void testSuggestHashesFromProbability() { + // negative probability + try { + BloomFilterBuilder.suggestNumHashes(-0.5); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // probability > 1 + try { + BloomFilterBuilder.suggestNumHashes(2.5); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // manually computed values based on formula + assertEquals(BloomFilterBuilder.suggestNumHashes(0.333), 2); + assertEquals(BloomFilterBuilder.suggestNumHashes(0.01), 7); + assertEquals(BloomFilterBuilder.suggestNumHashes(1e-12), 40); + } + + @Test + public void testSuggestNumFilterBits() { + // non-positive number distincts + try { + BloomFilterBuilder.suggestNumFilterBits(0, 0.01); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // non-positive probability + try { + BloomFilterBuilder.suggestNumFilterBits(2500, 0.0); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // probability > 1 + try { + BloomFilterBuilder.suggestNumFilterBits(1000, 2.5); + fail(); + } catch (final SketchesArgumentException e) { + // expected + } + + // manually computed values based on formula + assertEquals(BloomFilterBuilder.suggestNumFilterBits(250_000, 0.01), 2396265); + BloomFilter bf = BloomFilterBuilder.newBloomFilter(250_000, 0.01); + assertEquals(bf.getCapacity(), 2396288); // next smallest multiple of 64 + assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(250_000, 2396288)); + + assertEquals(BloomFilterBuilder.suggestNumFilterBits(5_000_000, 1e-4), 95850584); + final long seed = 19805243; + bf = BloomFilterBuilder.newBloomFilter(5_000_000, 1e-4, seed); + assertEquals(bf.getCapacity(), 95850624); // next smallest multiple of 64 + assertEquals(bf.getNumHashes(), BloomFilterBuilder.suggestNumHashes(5_000_000, 95850624)); + assertEquals(bf.getSeed(), seed); + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
