kinow commented on a change in pull request #258: URL: https://github.com/apache/commons-collections/pull/258#discussion_r730216326
########## File path: src/test/java/org/apache/commons/collections4/bloomfilter/BitMapTest.java ########## @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import org.junit.Test; + +public class BitMapTest { + + @Test + public void checkPositiveTest() { + BitMap.checkPositive(0); + BitMap.checkPositive(0); + try { + BitMap.checkPositive(-1); + + } catch (IndexOutOfBoundsException expected) { + // do nothing + } Review comment: assertThrows :point_up: ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java ########## @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter.hasher; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.Objects; +import java.util.function.IntConsumer; +import org.apache.commons.collections4.bloomfilter.IndexProducer; +import org.apache.commons.collections4.bloomfilter.Shape; + +/** + * A collection of Hashers. Useful when the generation of a Bloom filter depends upon + * multiple items. Hashers for each item are added to the HasherCollection and then + * the collection is used wherever a Hasher can be used in the API. + * + * @since 4.5 + */ +public class HasherCollection implements Hasher { + + /** + * The list of hashers to be used to generate the indices. + */ + private final List<Hasher> hashers; + + /** + * Constructs an empty HasherCollection. + */ + public HasherCollection() { + this.hashers = new ArrayList<>(); + } + + /** + * Constructs a HasherCollection from a collection of Hasher objects. + * + * @param hashers A collections of Hashers to build the indices with. + */ + public HasherCollection(final Collection<Hasher> hashers) { + Objects.requireNonNull( hashers, "hashers"); + this.hashers = new ArrayList<>(hashers); + } + + /** + * Constructor. + * + * @param function the function to use. + * @param buffers the byte buffers that will be hashed. + */ + public HasherCollection(Hasher... hashers) { + this( Arrays.asList(hashers)); + } + + /** + * Adds a hasher to the collection. + * @param hasher The hasher to add. + */ + public void add(Hasher hasher) { + Objects.requireNonNull( hasher, "hasher"); + hashers.add(hasher); + } + + /** + * Add all the Hashers in a collection to this HasherCollection. + * @param hashers The hashers to add. + */ + public void add(Collection<Hasher> hashers) { + Objects.requireNonNull( hashers, "hashers"); + hashers.addAll(hashers); + } + + @Override + public IndexProducer indices(final Shape shape) { + Objects.requireNonNull( shape, "shape"); + return new IndexProducer() { + @Override + public void forEachIndex(IntConsumer consumer) { + for (Hasher hasher : hashers) { + hasher.indices( shape ).forEachIndex(consumer); + } + } + }; + } + + @Override + public int size() { + int i = 0; + for (Hasher h : hashers ) + { Review comment: normally the `{` stays in the same line of the `for` in Commons Collections. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java ########## @@ -169,33 +116,63 @@ */ boolean remove(Hasher hasher); + /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be incremented - * by their corresponding counts in the {@code other} filter. + * Adds the specified BitCountProducer to this Bloom filter. Specifically + * all counts for the indexes identified by the {@code other} will be incremented + * by their corresponding values in the {@code other}. * - * <p>This method will return true if the filter is valid after the operation. + * <p>This method will return true if the filter is valid after the operation.</p> * - * @param other the other counting Bloom filter + * @param other the BitCountProducer to add. * @return true if the addition was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter * @see #isValid() */ - boolean add(CountingBloomFilter other); + boolean add(BitCountProducer other); /** - * Adds the specified counting Bloom filter to this Bloom filter. Specifically - * all counts for the indexes identified by the {@code other} filter will be decremented - * by their corresponding counts in the {@code other} filter. + * Adds the specified BitCountProducer to this Bloom filter. Specifically + * all counts for the indexes identified by the {@code other} will be decremented + * by their corresponding values in the {@code other}. * - * <p>This method will return true if the filter is valid after the operation. + * <p>This method will return true if the filter is valid after the operation.</p> * - * @param other the other counting Bloom filter + * @param other the BitCountProducer to subtract. * @return true if the subtraction was successful and the state is valid - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter * @see #isValid() */ - boolean subtract(CountingBloomFilter other); + boolean subtract(BitCountProducer other); + + /** + * Merges the specified Bloom filter into this Bloom filter to produce a new CountingBloomFilter. + * Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes that are enabled in the {@code other} filter will be incremented + * by one in the new filter. + * + * <p>Note: the validity of the resulting filter is not guaranteed. When in doubt {@code isValid()} + * should be called on the new filter.</p> + * + * @param other the other Bloom filter + * @return A new CountingBloomFilter instance. + */ + @Override + CountingBloomFilter merge(BloomFilter other); + + /** + * Merges the specified hasher with this Bloom filter to create a new CountingBloomFilter. + * Specifically the new Bloom filter will contain all the counts of this filter and in addition + * all bit indexes specified by the {@code hasher} will be incremented + * by one in the new filter. + * + * For HasherCollections each SimpleHasher will be considered a single item and increment Review comment: New `<p>` for this paragraph? ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java ########## @@ -120,7 +120,23 @@ public void forEachBitMap(LongConsumer consumer) { if (cardinality() == 0) { return; } - BitMapProducer.fromIndexProducer( this, shape).forEachBitMap(consumer); + // because our indices are always in order we can + // shorten the time necessary to create the longs for the + // consumer + long bitMap =0; + int idx=0; + for (int i : indices) { + if (BitMap.getLongIndex(i) != idx) { + consumer.accept( bitMap ); + bitMap = 0; + idx = BitMap.getLongIndex(i); + } + bitMap |= BitMap.getLongBit(i); + } + if (bitMap != 0) + { Review comment: this `{` can be in the same line of `if` as in the rest of the code. ########## File path: src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java ########## @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + + +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. + */ +public class ShapeFactoryTest { + + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + + /** + * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void badNumberOfItemsTest() { + try { + Shape.Factory.fromNM(0, 24); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK(0, 24, 5); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(0, 0.02 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of bits is less than 1 an exception is thrown + */ + @Test + public void badNumberOfBitsTest() { + try { + Shape.Factory.fromNM( 5, 0 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK( 5, 0, 7 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK( 0.035, 0, 7 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. + */ + @Test + public void badNumberOfHashFunctionsTest() { + try { + Shape.Factory.fromNMK(5, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK(0.35, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown + */ + @Test + public void badProbabilityTest() { + try { + Shape.Factory.fromNMK( 4000, 8, 1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(10, 0.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP( 10, 1.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP( 10, Double.NaN); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + } + + + /** + * Tests that when the number of items, number of bits and number of hash functions is passed the values are + * calculated correctly. + */ + @Test + public void fromNMK_test() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24&k=4 + */ + final Shape filterConfig = Shape.Factory.fromNMK( 5, 24, 4); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(4, filterConfig.getNumberOfHashFunctions()); + assertEquals(0.102194782, filterConfig.getProbability(5 ), 0.000001); + } + + /** + * Tests that the number of items and number of bits is passed the other values are calculated correctly. + */ + @Test + public void fromNM_Test() { + /* + * values from https://hur.st/bloomfilter/?n=5&m=24 + */ + final Shape filterConfig = Shape.Factory.fromNM(5, 24); + + assertEquals(24, filterConfig.getNumberOfBits()); + assertEquals(3, filterConfig.getNumberOfHashFunctions()); + assertEquals(0.100375138, filterConfig.getProbability(5), 0.000001); + } + + + + + /** + * Tests that if calculated number of bits is greater than Integer.MAX_VALUE an IllegalArgumentException is thrown. + */ + @Test + public void numberOfBitsOverflowTest() { + try { + Shape.Factory.fromNP(Integer.MAX_VALUE, 0.1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } Review comment: I think there's a new syntax in JUnit 5, `assertThrows(IllegalArgumentException.class, () => { Shape.Fctory ... })` I think? ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/hasher/package-info.java ########## @@ -16,8 +16,24 @@ */ /** - * Provides classes and interfaces to define the shape of a Bloom filter and the conversion - * of generic bytes to a hash of bit indexes to be used with a Bloom filter. + * + * With the exception of the HasherCollection, a Hasher represents an item of arbitrary + * byte size as multiple byte representations of fixed size (multiple hashes). The hashers + * are be used to create indices for a Bloom filter.</p> + * + * <p>Hashers create @{code IndexProducer} instances for hashed items based + * on a @{code Shape}.</p> + * + * <p>The method used to generate the multiple hashes is dependent upon the Hasher + * implementation. The SimpleHasher uses a combinatorial strategy to create the + * multiple hashes from a single starting hash.</p> + * + * <p>Note that the process of generating hashes and mapping them to a Bloom + * filter shape may create duplicate indexes. The Hasher implementation is required to + * remove all duplicate values for a single item. Thus tge hasher may generate fewer Review comment: s/tge/the (probably worth checking if another javadoc has a similar typo; I think I read another javadoc similar to this one, not sure if copy+paste wasn't sued) ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/package-info.java ########## @@ -71,48 +78,18 @@ * <h3> * Shape</h3> * <p> - * Describes the Bloom filter using the - * standard number of bits, number of hash functions and number of items along with a - * description of the HashFunction. It is this description that has caused the most issues - * of late. </p> - * <h3> - * Hasher</h3> - * <p> - * converts byte buffers into an iterator if int based - * on a Shape. There are 2 implementations of Hasher provided <ul> - * <li> - * Dynamic - calls - * the HashFunction for each value required in the Bloom filter.</li> - * <li> - * Static - based - * on a pre-calculated list of Bloom filter index values. It is also limited to generating - * values for a specific Shape.</li> - * </ul> + * The Shape describes the Bloom filter using the number of bits and the number of hash functions</p> * * <h3> - * Hash Functions</h3> - * <p> - * Hash - * functions generate individual index values for the filter from a byte buffer. There are - * four implementations provided. </p> - * <h3> - * HashFunctionIdentity</h3> + * Hasher</h3> * <p> - * The - * HashFunctionIdentity is the base interface for the HashFunction. It tracks three (3) - * properties: <ul> - * <li> - * The Hashing algorithm</li> - * <li> - * Whether the contents of the - * resulting hash buffer are read as signed or unsigned values.</li> - * <li> - * Whether the hash - * function uses an iterative or cyclic method. In traditional iterative methods this is - * done by calling the selected hash function with a different seed for each hash - * required. The second method described by Adam Kirsch and Micheal Mitzenmacher[1] has - * become more common and is used in applications like Cassandra[2].</li> - * </ul> + * A Hasher converts bytes into an series of integers based on a Shape. With the exception of the HasherCollecton, + * Each hasher represents one item being added to the Bloom filter. The HasherCollection represents the + * number of items as the sum of the number of items represented by Hashers in the collection.</p> + * <p>The SimpleHasher uses a combinatorial generation technique to create the integers. It is easily + * initialized by using a standard @{code MessageDigest} or other Hash function to hash the item to insert and Review comment: I think it's `{@code MessageDigest}`? At least I never tried the `@` out of brackets (doing the review on GitHub UI, sorry, no IDE to confirm at the moment). ########## File path: src/test/java/org/apache/commons/collections4/bloomfilter/ShapeFactoryTest.java ########## @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + + +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. + */ +public class ShapeFactoryTest { + + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + + /** + * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void badNumberOfItemsTest() { + try { + Shape.Factory.fromNM(0, 24); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK(0, 24, 5); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(0, 0.02 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of bits is less than 1 an exception is thrown + */ + @Test + public void badNumberOfBitsTest() { + try { + Shape.Factory.fromNM( 5, 0 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNMK( 5, 0, 7 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK( 0.035, 0, 7 ); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the number of hash functions is less than 1 an exception is thrown. + */ + @Test + public void badNumberOfHashFunctionsTest() { + try { + Shape.Factory.fromNMK(5, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromPMK(0.35, 26, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + /** + * Tests that if the calculated probability is greater than or equal to 1 an IllegalArgumentException is thrown + */ + @Test + public void badProbabilityTest() { + try { + Shape.Factory.fromNMK( 4000, 8, 1); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + try { + Shape.Factory.fromNP(10, 0.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP( 10, 1.0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } + try { + Shape.Factory.fromNP( 10, Double.NaN); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // do nothing. + } Review comment: See comment below about `assertThrows` from JUnit 5. That can simplify these test methods :point_up: ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java ########## @@ -16,44 +16,55 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; /** * The interface that describes a Bloom filter. * @since 4.5 */ -public interface BloomFilter { +public interface BloomFilter extends IndexProducer, BitMapProducer { - // Query Operations + /** + * Return the Bloom filter data as a BitMap array. + * @param filter the filter to get the data from. + * @return An array of BitMap long. + */ + public static long[] asBitMapArray( BloomFilter filter ) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(filter.getShape()); + filter.forEachBitMap( builder ); + return builder.getArray(); + } /** - * Gets the shape of this filter. - * - * @return the shape of this filter + * Return the Bloom filter data as an array of indices for the enabled bits. + * @param filter the Filter to get the data from. + * @return An array of indices for enabled bits in the Bloom filter. */ - Shape getShape(); + public static int[] asIndexArray( BloomFilter filter ) { + List<Integer> lst = new ArrayList<Integer>(); + filter.forEachIndex( lst::add ); + return lst.stream().mapToInt( Integer::intValue ).toArray(); + } + + + // Query Operations /** - * Gets an array of little-endian long values representing the bits of this filter. - * - * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} is the - * number of bits in the filter and {@code ceil} is the ceiling function. - * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit - * index is enabled. - * - * @return the {@code long[]} representation of this filter + * This method is used to determine the best method for matching. For `sparse` implementations the `getIndices()` Review comment: Not sure if it changed in Javadoc, but normally \`sparse\` would be written as `{@code sparse}` (ditto for \`getIndices()\`). ########## File path: src/test/java/org/apache/commons/collections4/bloomfilter/ShapeTest.java ########## @@ -0,0 +1,453 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.fail; + + +import org.junit.jupiter.api.Test; + +/** + * Tests the {@link Shape} class. + */ +public class ShapeTest { + + + /* + * values from https://hur.st/bloomfilter/?n=5&p=.1&m=&k= + * + * n = 5 + * + * p = 0.100375138 (1 in 10) + * + * m = 24 (3B) + * + * k = 3 + */ + + private final Shape shape = new Shape(3, 24 ); + + /** + * Tests that if the number of bits less than 1 an IllegalArgumentException is thrown. + */ + @Test + public void constructor_items_bits_BadNumberOfBitsTest() { + try { + new Shape( 5, 0); + fail("Should have thrown IllegalArgumentException"); + } catch (final IllegalArgumentException expected) { + // expected + } + } + + // /** + // * Tests that if the number of hash functions is less than 1 an IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_bits_BadNumberOfHashFunctionsTest() { + // try { + // new Shape( 16, 8); + // + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } + + // /** + // * Tests that if the number of items less than 1 an IllegalArgumentException is thrown. + // */ + // @Test + // public void constructor_items_bits_BadNumberOfItemsTest() { + // try { + // new Shape(testFunction, 0, 24); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } + + // /** + // * Tests that if the number of bits is less than 1 an exception is thrown + // */ + // @Test + // public void constructor_items_bits_hash_BadNumberOfBitsTest() { + // try { + // new Shape(testFunction, 5, 0, 1); + // fail("Should have thrown IllegalArgumentException"); + // } catch (final IllegalArgumentException expected) { + // // expected + // } + // } Review comment: Is it going to be used in the future? If not I think we can remove it (since it's a WIP PR it might be still under development... :+1: ) ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java ########## @@ -62,56 +73,131 @@ * effectively {@code (this AND other) == other}. * * @param other the other Bloom filter - * @return true if this filter is enabled for all enabled bits in the other filter - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return true if all enabled bits in the other filter are enabled in this filter. */ - boolean contains(BloomFilter other); + default boolean contains(BloomFilter other) { + Objects.requireNonNull( other, "other"); + return isSparse() ? contains( (IndexProducer) other) : + contains( (BitMapProducer) other ); + } /** - * Returns {@code true} if this filter contains the specified decomposed Bloom filter. + * Returns {@code true} if this filter contains the bits specified in the hasher. * Specifically this returns {@code true} if this filter is enabled for all bit indexes - * identified by the {@code hasher}. Using the bit representations this is + * identified by the {@code hasher}. Using the BitMap representations this is * effectively {@code (this AND hasher) == hasher}. * * @param hasher the hasher to provide the indexes * @return true if this filter is enabled for all bits specified by the hasher * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of * this filter */ - boolean contains(Hasher hasher); + default boolean contains(Hasher hasher) { + Objects.requireNonNull( hasher, "Hasher"); + Shape shape = getShape(); + return contains( hasher.indices(shape)); + } - // Modification Operations + /** + * Returns {@code true} if this filter contains the indices specified IndexProducer. + * Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code IndexProducer}. + * + * @param indexProducer the IndexProducer to provide the indexes + * @return true if this filter is enabled for all bits specified by the IndexProducer + */ + boolean contains(IndexProducer indexProducer); /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes - * that are enabled in the {@code other} filter will be enabled in this filter. + * Returns {@code true} if this filter contains the bits specified in the BitMaps produced by the + * bitMapProducer. + * + * @param bitMapProducer the the {@code BitMapProducer} to provide the BitMaps. + * @return true if this filter is enabled for all bits specified by the BitMaps + */ + boolean contains(BitMapProducer bitMapProducer); + + /** + * Merges the specified Bloom filter with this Bloom filter creating a new Bloom filter. + * Specifically all bit indexes that are enabled in the {@code other} and in @code this} filter will be + * enabled in the resulting filter. + * + * @param other the other Bloom filter + * @return The new Bloom filter. + */ + default BloomFilter merge(BloomFilter other) { + Objects.requireNonNull( other, "other"); + Shape shape = getShape(); + BloomFilter result = BitMap.isSparse( (cardinality() + other.cardinality()), getShape() ) ? + new SparseBloomFilter(shape) : + new SimpleBloomFilter(shape); Review comment: Wrong indentation? :point_up: not matching the other `?` statement alignment. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/exceptions/package-info.java ########## @@ -14,11 +14,4 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -/** - * Provides implementations of the Bloom filter - * {@link org.apache.commons.collections4.bloomfilter.hasher.HashFunction HashFunction} interface. - * - * @since 4.5 - */ -package org.apache.commons.collections4.bloomfilter.hasher.function; +package org.apache.commons.collections4.bloomfilter.exceptions; Review comment: Missing newline. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java ########## @@ -16,44 +16,55 @@ */ package org.apache.commons.collections4.bloomfilter; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; /** * The interface that describes a Bloom filter. * @since 4.5 */ -public interface BloomFilter { +public interface BloomFilter extends IndexProducer, BitMapProducer { - // Query Operations + /** + * Return the Bloom filter data as a BitMap array. + * @param filter the filter to get the data from. + * @return An array of BitMap long. + */ + public static long[] asBitMapArray( BloomFilter filter ) { + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(filter.getShape()); + filter.forEachBitMap( builder ); + return builder.getArray(); + } /** - * Gets the shape of this filter. - * - * @return the shape of this filter + * Return the Bloom filter data as an array of indices for the enabled bits. + * @param filter the Filter to get the data from. + * @return An array of indices for enabled bits in the Bloom filter. */ - Shape getShape(); + public static int[] asIndexArray( BloomFilter filter ) { + List<Integer> lst = new ArrayList<Integer>(); + filter.forEachIndex( lst::add ); + return lst.stream().mapToInt( Integer::intValue ).toArray(); + } + + + // Query Operations /** - * Gets an array of little-endian long values representing the bits of this filter. - * - * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} is the - * number of bits in the filter and {@code ceil} is the ceiling function. - * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit - * index is enabled. - * - * @return the {@code long[]} representation of this filter + * This method is used to determine the best method for matching. For `sparse` implementations the `getIndices()` + * method is more efficient. Implementers should determine if it is easier for the implementation to return am array of Review comment: /am/an ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/ArrayCountingBloomFilter.java ########## @@ -356,9 +239,31 @@ private void add(final int idx, final int addend) { * @param idx the index * @param subtrahend the amount to subtract */ - private void subtract(final int idx, final int subtrahend) { + protected void subtract(final int idx, final int subtrahend) { final int updated = counts[idx] - subtrahend; state |= updated; counts[idx] = updated; } + + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + try { + indexProducer.forEachIndex( idx -> {if ( this.counts[idx] == 0 ) { throw new NoMatchException(); }} ); Review comment: Maybe instead of `throw new NoMatchException()`, just `return false` here? ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java ########## @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; + +import org.apache.commons.collections4.bloomfilter.exceptions.NoMatchException; +import org.apache.commons.collections4.bloomfilter.hasher.Hasher; + +/** + * A bloom filter using an array of BitMaps to track enabled bits. This is a standard + * implementation and should work well for most Bloom filters. + * @since 4.5 + */ +public class SimpleBloomFilter implements BloomFilter { + + /** + * The array of BitMap longs that defines this Bloom filter. + */ + private long[] bitMap; + + /** + * The Shape of this Bloom filter + */ + private final Shape shape; + + /** + * The cardinality of this Bloom filter. + */ + private int cardinality; + + /** + * Constructs an empty BitSetBloomFilter. + * + */ + public SimpleBloomFilter(Shape shape) { + Objects.requireNonNull( shape, "shape"); + this.shape = shape; + this.bitMap = new long[0]; + this.cardinality = 0; + } + + /** + * Constructor. + * @param shape The shape for the filter. + * @param hasher the Hasher to initialize the filter with. + */ + public SimpleBloomFilter(final Shape shape, Hasher hasher) { + Objects.requireNonNull( shape, "shape"); + Objects.requireNonNull( hasher, "hasher"); + this.shape = shape; + + BitMapProducer producer = BitMapProducer.fromIndexProducer( hasher.indices(shape), shape); + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape); + producer.forEachBitMap( builder ); + this.bitMap = builder.getArray(); + this.cardinality = 0; + forEachBitMap( w -> this.cardinality += Long.bitCount(w)); + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + Objects.requireNonNull( other, "other"); + BitMapProducer.ArrayBuilder builder = new BitMapProducer.ArrayBuilder(shape, this.bitMap); + other.forEachBitMap( builder ); + this.bitMap = builder.getArray(); + this.cardinality = 0; + forEachBitMap( w -> this.cardinality += Long.bitCount(w)); + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean isSparse() { + return false; + } + + @Override + public int cardinality() { + return this.cardinality; + } + + @Override + public void forEachIndex(IntConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + IndexProducer.fromBitMapProducer(this).forEachIndex(consumer); + } + + @Override + public void forEachBitMap(LongConsumer consumer) { + Objects.requireNonNull( consumer, "consumer"); + for ( long l : bitMap ) { + consumer.accept(l); + } + } + + @Override + public boolean contains(IndexProducer indexProducer) { + return contains( BitMapProducer.fromIndexProducer(indexProducer, shape)); + } + + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + LongConsumer consumer = new LongConsumer() { + int i=0; + @Override + public void accept(long w) { + if ((bitMap[i++] & w) != w) + { throw new NoMatchException(); Review comment: Move the `{` to the same line as the `if`, and fix the `throw` indentation? ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/CountingBloomFilter.java ########## @@ -155,10 +99,13 @@ boolean remove(BloomFilter other); /** - * Removes the specified decomposed Bloom filter from this Bloom filter. Specifically + * Removes the specified hasher from the Bloom filter from this Bloom filter. Specifically * all counts for the <em>distinct</em> indexes identified by the {@code hasher} will be * decremented by 1. If the {@code hasher} contains duplicate bit indexes these are ignored. * + * For HasherCollections each SimpleHasher will be considered a single item and decremented Review comment: Add `<p>` for this paragraph? ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/Shape.java ########## @@ -0,0 +1,455 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; + +/** + * The definition of a Bloom filter shape. + * + * <p> This class contains the values for the filter configuration and is used to + * convert a Hasher into a BloomFilter as well as verify that two Bloom filters are + * compatible. (i.e. can be compared or merged)</p> + * + * <h2>Interrelatedness of values</h2> + * + * <dl> <dt>Number of Items ({@code n})</dt> + * <dd>{@code n = ceil(m / (-k / ln(1 - exp(ln(p) / k))))}</dd> <dt>Probability of + * False Positives ({@code p})</dt> <dd>{@code p = pow(1 - exp(-k / (m / n)), k)}</dd> <dt>Number + * of Bits ({@code m})</dt> + * <dd>{@code m = ceil((n * ln(p)) / ln(1 / pow(2, ln(2))))}</dd> <dt>Number of + * Functions ({@code k})</dt> <dd>{@code k = round((m / n) * ln(2))}</dd> </dl> + * + * @see <a href="http://hur.st/bloomfilter?n=3&p=1.0E-5">Bloom Filter calculator</a> + * @see <a href="https://en.wikipedia.org/wiki/Bloom_filter">Bloom filter + * [Wikipedia]</a> + * @since 4.5 + */ +public final class Shape { + + /** + * Number of hash functions to create a filter ({@code k}). + */ + private final int numberOfHashFunctions; + + /** + * Number of bits in the filter ({@code m}). + */ + private final int numberOfBits; + + /** + * Constructs a filter configuration with the specified number of items ({@code n}) and + * bits ({@code m}). + * + * <p>The optimal number of hash functions ({@code k}) is computed. + * <pre>k = round((m / n) * ln(2))</pre> + * + * <p>The false-positive probability is computed using the number of items, bits and hash + * functions. An exception is raised if this is greater than or equal to 1 (i.e. the + * shape is invalid for use as a Bloom filter). + * + * @param numberOfHashFunctions Number of hash functions to use for each item placed in the filter. + * @param numberOfBits The number of bits in the filter + * @throws IllegalArgumentException if {@code numberOfHashFunctions < 1} or {@code numberOfBits < 1} + */ + public Shape(final int numberOfHashFunctions, final int numberOfBits) { + this.numberOfBits = checkNumberOfBits(numberOfBits); + this.numberOfHashFunctions = checkNumberOfHashFunctions(numberOfHashFunctions); + } + + /** + * Check number of bits is strictly positive. + * + * @param numberOfBits the number of bits + * @return the number of bits + * @throws IllegalArgumentException if the number of bits is {@code < 1} + */ + private static int checkNumberOfBits(final int numberOfBits) { + if (numberOfBits < 1) { + throw new IllegalArgumentException("Number of bits must be greater than 0: " + numberOfBits); + } + return numberOfBits; + } + + /** + * Check number of hash functions is strictly positive + * + * @param numberOfHashFunctions the number of hash functions + * @return the number of hash functions + * @throws IllegalArgumentException if the number of hash functions is {@code < 1} + */ + private static int checkNumberOfHashFunctions(final int numberOfHashFunctions) { + if (numberOfHashFunctions < 1) { + throw new IllegalArgumentException("Number of hash functions must be greater than 0: " + numberOfHashFunctions); + } + return numberOfHashFunctions; + } + + @Override + public boolean equals(final Object o) { + if (o instanceof Shape) { + final Shape other = (Shape) o; + return numberOfBits == other.numberOfBits && + numberOfHashFunctions == other.numberOfHashFunctions; + } + return false; + } + + @Override + public int hashCode() { + return Objects.hash(numberOfBits, numberOfHashFunctions); + } + + /** + * Gets the number of bits in the Bloom filter. + * This is also known as {@code m}. + * + * @return the number of bits in the Bloom filter ({@code m}). + */ + public int getNumberOfBits() { + return numberOfBits; + } + + + /** + * Gets the number of hash functions used to construct the filter. + * This is also known as {@code k}. + * + * @return the number of hash functions used to construct the filter ({@code k}). + */ + public int getNumberOfHashFunctions() { + return numberOfHashFunctions; + } + + + /** + * Calculates the probability of false positives ({@code p}) given + * numberOfItems ({@code n}), numberOfBits ({@code m}) and numberOfHashFunctions ({@code k}). + * <pre>p = pow(1 - exp(-k / (m / n)), k)</pre> + * + * <p>This is the probability that a Bloom filter will return true for the presence of an item + * when it does not contain the item. + * + * <p>The probability assumes that the Bloom filter is filled with the expected number of + * items. If the filter contains fewer items then the actual probability will be lower. + * Thus this returns the worst-case false positive probability for a filter that has not + * exceeded its expected number of items. + * + * @param numberOfItems the number of items hashed into the Bloom filter. + * @return the probability of false positives. + * @see #getNumberOfItems() + */ + public double getProbability(int numberOfItems) { + if (numberOfItems < 0) { + throw new IllegalArgumentException("Number of items must be greater than or equal to 0: " + numberOfItems); + } + if (numberOfItems == 0) { + return 0; + } + return Math.pow(1.0 - Math.exp(-1.0 * numberOfHashFunctions * numberOfItems / numberOfBits), + numberOfHashFunctions); + } + + @Override + public String toString() { + return String.format("Shape[ m=%s k=%s ]", + numberOfBits, numberOfHashFunctions); + } + + /** + * Estimate the number of items in a Bloom filter with this shape and the specified number of bits enabled. + * @param hammingValue the number of enabled bits. + * @return An estimate of the number of items in the Bloom filter. + */ + public double estimateN( int hammingValue ) { + double c = hammingValue; + double m = numberOfBits; + double k = numberOfHashFunctions; + return -(m / k) * Math.log(1.0 - (c / m)); + } + + /** + * The factory to assist in the creation of proper Shapes. + * + * In the methods of this factory the `fraom` names are appended with the standard variable Review comment: add a `<p>`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
