This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch bloom in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit 533cb81c67dc88644917a18411e50c31c926002c Author: jmalkin <[email protected]> AuthorDate: Mon Feb 26 18:36:52 2024 -0800 finish javadocs for builder class --- .../filters/bloomfilter/BloomFilterBuilder.java | 52 +++++++++++++++++++++- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java index 46d48a26..938889d8 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java @@ -19,31 +19,79 @@ package org.apache.datasketches.filters.bloomfilter; +import org.apache.datasketches.common.SketchesArgumentException; + public final class BloomFilterBuilder { + /** + * Returns the optimal number of hash functions to given target numbers of distinct items + * and the BloomFliter size in bits. + * @param maxDistinctItems The maximum expected number of distinct items to add to the filter + * @param numFilterBits The target size, in bits, of the Bloom Filter + * @return The suggested number of hash functions to use with the filter + */ public static short suggestNumHashes(final long maxDistinctItems, final long numFilterBits) { // ceil to ensure we never average worse than the target return (short) Math.max(1, (int) Math.ceil((double) numFilterBits / maxDistinctItems * Math.log(2.0))); } + /** + * Returns the optimal number of hash functions to achieve a target false positive probability. + * @param targetFalsePositiveProb A desired false positive probability per item + * @return The suggested number of hash functions to use with the filter. + */ public static short suggestNumHashes(final double targetFalsePositiveProb) { + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } // ceil to ensure we never average worse than the target return (short) Math.ceil((- Math.log(targetFalsePositiveProb) / Math.log(2))); } + /** + * Returns the optimal number of bits to use in a Bloom Filter given a target number of distinct + * items and a target false positive probability. + * @param maxDistinctItems The maximum expected number of distinct items to add to the filter + * @param targetFalsePositiveProb A desired false positive probability per item + * @return The suggested number of bits to use with the filter + */ public static long suggestNumFilterBits(final long maxDistinctItems, final double targetFalsePositiveProb) { + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } return (long) Math.round(-maxDistinctItems * Math.log(targetFalsePositiveProb) / (Math.log(2) * Math.log(2))); } + /** + * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs. + * @param maxDistinctItems The maximum expected number of distinct items to add to the filter + * @param targetFalsePositiveProb A desired false positive probability per item + * @return A new BloomFilter configured for the given input parameters + */ public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb) { - // TODO validate inputs + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb); final short numHashes = suggestNumHashes(maxDistinctItems, numBits); return new BloomFilter(numBits, numHashes); } + /** + * Creates a new BloomFilter with an optimal number of bits and hash functions for the given inputs, + * using the provided base seed for the hash function. + * @param maxDistinctItems The maximum expected number of distinct items to add to the filter + * @param targetFalsePositiveProb A desired false positive probability per item + * @param seed A base hash seed + * @return A new BloomFilter configured for the given input parameters + */ public static BloomFilter newBloomFilter(final long maxDistinctItems, final double targetFalsePositiveProb, final long seed) { - // TODO validate inputs final long numBits = suggestNumFilterBits(maxDistinctItems, targetFalsePositiveProb); final short numHashes = suggestNumHashes(maxDistinctItems, numBits); return new BloomFilter(numBits, numHashes, seed); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
