aherbert commented on a change in pull request #258:
URL:
https://github.com/apache/commons-collections/pull/258#discussion_r800391877
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
##########
@@ -16,14 +16,123 @@
*/
package org.apache.commons.collections4.bloomfilter;
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import java.util.function.LongBinaryOperator;
+import java.util.function.LongPredicate;
+import java.util.function.LongUnaryOperator;
/**
* Implementations of set operations on Bloom filters.
*
+ * @since 4.5
*/
public final class SetOperations {
+ /**
+ * A helper class that calculates cardinality as the cardinality of the
result of an operation on a two bit map arrays.
+ *
+ * <p>The first array is build in the constructor. The second array is
processed as a LongConsumer. Whenever there are
+ * two values the op2 operation is used. Whenever the one array is longer
than the other the op1 operation is used on the
+ * bit maps that do not have matching entries.</p>
+ *
+ * <p>The calculated cardinalities are summed to return the cardinality of
the operation.</p>
+ *
+ */
+ private static class CardCounter implements LongPredicate {
+ /**
+ * The calculated cardinality.
+ */
+ private int cardinality = 0;
+ /**
+ * The index into the array of bit maps.
+ */
+ private int idx = 0;
+ /**
+ * The array of bit maps.
+ */
+ private long[] bitMaps;
+ /**
+ * The operator to execute for 2 bit maps.
+ */
+ private LongBinaryOperator op2;
+ /**
+ * The operator to execute for a single bit map.
+ */
+ private LongUnaryOperator op1;
+
+ /**
+ * Constructor.
+ * @param producer The the producer for the initial bit maps.
+ * @param The shape of the Bloom filter
Review comment:
`@param shape`
##########
File path:
src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests the {@link SimpleHasher}.
+ */
+public class SimpleHasherTest extends AbstractHasherTest {
+
+ @Override
+ protected Hasher createHasher() {
+ return new SimpleHasher(1, 1);
+ }
+
+ @Override
+ protected Hasher createEmptyHasher() {
+ return NullHasher.INSTANCE;
+ }
+
+ private void assertConstructorBuffer(Shape shape, byte[] buffer, Integer[]
expected) {
+ SimpleHasher hasher = new SimpleHasher(buffer);
+ List<Integer> lst = new ArrayList<>();
+ IndexProducer producer = hasher.indices(shape);
+ producer.forEachIndex(lst::add);
+ assertEquals(expected.length, lst.size());
+ for (int i = 0; i < expected.length; i++) {
+ assertEquals(expected[i], lst.get(i));
+ }
+ }
+
+ @Test
+ public void testConstructor() {
+
+ Shape shape = Shape.fromKM(5, 10);
+ assertConstructorBuffer(shape, new byte[] { 1, 1 }, new Integer[] { 1,
2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 1 }, new Integer[] { 0, 1,
2, 3, 4 });
+ assertConstructorBuffer(shape, new byte[] { 1, 0, 1 }, new Integer[] {
1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 1, 0, 1 }, new
Integer[] { 1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1 },
+ new Integer[] { 1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5,
5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 },
+ new Integer[] { 1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5,
0, 0, 0, 0, 0, 0, 0, 1, 5, 5 },
+ new Integer[] { 1, 2, 3, 4, 5 });
+
+ // test empty buffer
+ assertThrows(IllegalArgumentException.class, () -> new
SimpleHasher(new byte[0]));
+ }
+
+ @Test
+ public void testMod() {
+
+ long dividend;
+ int divisor;
+
+ dividend = 4133050040864586807L;
+ divisor = 1110442806;
+ assertEquals(SimpleHasher.mod(dividend, divisor), (int)
Long.remainderUnsigned(dividend, divisor),
+ String.format("failure with dividend=%s and divisor=%s.",
dividend, divisor));
+
+ Random r = new Random();
+ for (int i = 0; i < 10000; i++) {
+ dividend = r.nextLong();
+ divisor = Math.abs(r.nextInt());
+ assertEquals(SimpleHasher.mod(dividend, divisor), (int)
Long.remainderUnsigned(dividend, divisor),
Review comment:
Reverse expected and actual
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java
##########
@@ -16,138 +16,239 @@
*/
package org.apache.commons.collections4.bloomfilter;
+import java.util.Objects;
import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
-import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher;
/**
* The interface that describes a Bloom filter.
+ * <p>
+ * <em>See implementation notes for BitMapProducer and IndexProducer.</em>
+ * </p>
+ * @see BitMapProducer
+ * @see IndexProducer
* @since 4.5
*/
-public interface BloomFilter {
-
- // Query Operations
+public interface BloomFilter extends IndexProducer, BitMapProducer {
/**
- * Gets the shape of this filter.
- *
- * @return the shape of this filter
+ * Creates a new instance of the BloomFilter with the same properties as
the current one.
+ * @return a copy of this BloomFilter
*/
- Shape getShape();
+ BloomFilter copy();
+
+ // Query Operations
/**
- * Gets an array of little-endian long values representing the bits of
this filter.
+ * This method is used to determine the best method for matching.
*
- * <p>The returned array will have length {@code ceil(m / 64)} where
{@code m} is the
- * number of bits in the filter and {@code ceil} is the ceiling function.
- * Bits 0-63 are in the first long. A value of 1 at a bit position
indicates the bit
- * index is enabled.
+ * <p>For `sparse` implementations
+ * the {@code forEachIndex(IntConsumer consumer)} method is more
efficient. For non `sparse` implementations
+ * the {@code forEachBitMap(LongConsumer consumer)} is more efficient.
Implementers should determine if it is easier
+ * for the implementation to produce indexes of bit map blocks.</p>
*
- * @return the {@code long[]} representation of this filter
+ * @return {@code true} if the implementation is sparse {@code false}
otherwise.
+ * @see BitMap
*/
- long[] getBits();
+ boolean isSparse();
/**
- * Creates a StaticHasher that contains the indexes of the bits that are
on in this
- * filter.
- *
- * @return a StaticHasher for that produces this Bloom filter
+ * Gets the shape that was used when the filter was built.
+ * @return The shape the filter was built with.
*/
- StaticHasher getHasher();
+ Shape getShape();
/**
- * Returns {@code true} if this filter contains the specified filter.
Specifically this
+ * Returns {@code true} if this filter contains the specified filter.
+ *
+ * <p>Specifically this
* returns {@code true} if this filter is enabled for all bits that are
enabled in the
* {@code other} filter. Using the bit representations this is
- * effectively {@code (this AND other) == other}.
+ * effectively {@code (this AND other) == other}.</p>
*
* @param other the other Bloom filter
- * @return true if this filter is enabled for all enabled bits in the
other filter
- * @throws IllegalArgumentException if the shape of the other filter does
not match
- * the shape of this filter
+ * @return true if all enabled bits in the other filter are enabled in
this filter.
*/
- boolean contains(BloomFilter other);
+ default boolean contains(BloomFilter other) {
+ Objects.requireNonNull(other, "other");
+ return isSparse() ? contains((IndexProducer) other) :
contains((BitMapProducer) other);
+ }
/**
- * Returns {@code true} if this filter contains the specified decomposed
Bloom filter.
- * Specifically this returns {@code true} if this filter is enabled for
all bit indexes
- * identified by the {@code hasher}. Using the bit representations this is
- * effectively {@code (this AND hasher) == hasher}.
+ * Returns {@code true} if this filter contains the bits specified in the
hasher.
+ *
+ * <p>Specifically this returns {@code true} if this filter is enabled for
all bit indexes
+ * identified by the {@code hasher}. Using the bit map representations
this is
+ * effectively {@code (this AND hasher) == hasher}.</p>
*
* @param hasher the hasher to provide the indexes
* @return true if this filter is enabled for all bits specified by the
hasher
- * @throws IllegalArgumentException if the hasher cannot generate indices
for the shape of
- * this filter
*/
- boolean contains(Hasher hasher);
+ default boolean contains(Hasher hasher) {
+ Objects.requireNonNull(hasher, "Hasher");
+ Shape shape = getShape();
+ return contains(hasher.indices(shape));
+ }
- // Modification Operations
+ /**
+ * Returns {@code true} if this filter contains the indices specified
IndexProducer.
+ *
+ * <p>Specifically this returns {@code true} if this filter is enabled for
all bit indexes
+ * identified by the {@code IndexProducer}.</p>
+ *
+ * @param indexProducer the IndexProducer to provide the indexes
+ * @return {@code true} if this filter is enabled for all bits specified
by the IndexProducer
+ */
+ boolean contains(IndexProducer indexProducer);
/**
- * Merges the specified Bloom filter into this Bloom filter. Specifically
all bit indexes
- * that are enabled in the {@code other} filter will be enabled in this
filter.
+ * Returns {@code true} if this filter contains the bits specified in the
bit maps produced by the
+ * bitMapProducer.
*
- * <p>Note: This method should return {@code true} even if no additional
bit indexes were
- * enabled. A {@code false} result indicates that this filter is not
ensured to contain
- * the {@code other} Bloom filter.
+ * @param bitMapProducer the the {@code BitMapProducer} to provide the bit
maps.
+ * @return {@code true} if this filter is enabled for all bits specified
by the bit maps
+ */
+ default boolean contains(BitMapProducer bitMapProducer) {
+ return bitMapProducer.forEachBitMap(this.makePredicate((x, y) -> (x &
y) == y));
+ }
+
+ // update operations
+
+ /**
+ * Merges the specified Bloom filter with this Bloom filter creating a new
Bloom filter.
+ *
+ * <p>Specifically all bit indexes that are enabled in the {@code other}
and in @code this} filter will be
+ * enabled in the resulting filter.</p>
*
* @param other the other Bloom filter
+ * @return The new Bloom filter.
+ */
+ default BloomFilter merge(BloomFilter other) {
+ Objects.requireNonNull(other, "other");
+ BloomFilter result = copy();
+ result.mergeInPlace(other);
+ return result;
+ }
+
+ /**
+ * Merges the specified Hasher with this Bloom filter and returns a new
Bloom filter.
+ *
+ * <p>Specifically all bit indexes that are identified by the {@code
hasher} and in {@code this} Bloom filter
+ * be enabled in the resulting filter.</p>
+ *
+ * @param hasher the hasher to provide the indices
+ * @return the new Bloom filter.
+ */
+ default BloomFilter merge(Hasher hasher) {
+ Objects.requireNonNull(hasher, "hasher");
+ BloomFilter result = copy();
+ result.mergeInPlace(hasher);
+ return result;
+ }
+
+ /**
+ * Merges the specified Bloom filter into this Bloom filter.
+ *
+ * <p>Specifically all
+ * bit indexes that are identified by the {@code other} will be enabled in
this filter.</p>
+ *
+ * <p><em>Note: This method should return {@code true} even if no
additional bit indexes were
+ * enabled. A {@code false} result indicates that this filter may or may
not contain
+ * the {@code other} Bloom filter.</em> This state may occur in complex
Bloom filter implementations like
+ * counting Bloom filters.</p>
+ *
+ * @param other The bloom filter to merge into this one.
* @return true if the merge was successful
- * @throws IllegalArgumentException if the shape of the other filter does
not match
- * the shape of this filter
*/
- boolean merge(BloomFilter other);
+ boolean mergeInPlace(BloomFilter other);
/**
- * Merges the specified decomposed Bloom filter into this Bloom filter.
Specifically all
+ * Merges the specified hasher into this Bloom filter. Specifically all
* bit indexes that are identified by the {@code hasher} will be enabled
in this filter.
*
- * <p>Note: This method should return {@code true} even if no additional
bit indexes were
- * enabled. A {@code false} result indicates that this filter is not
ensured to contain
- * the specified decomposed Bloom filter.
+ * <p><em>Note: This method should return {@code true} even if no
additional bit indexes were
+ * enabled. A {@code false} result indicates that this filter may or may
not contain
+ * the {@code other} Bloom filter.</em> This state may occur in complex
Bloom filter implementations like
+ * counting Bloom filters.</p>
*
- * @param hasher the hasher to provide the indexes
+ * @param hasher The hasher to merge.
* @return true if the merge was successful
- * @throws IllegalArgumentException if the hasher cannot generate indices
for the shape of
- * this filter
*/
- boolean merge(Hasher hasher);
+ default boolean mergeInPlace(Hasher hasher) {
+ Objects.requireNonNull(hasher, "hasher");
+ Shape shape = getShape();
+ BloomFilter result = shape.isSparse((hasher.size() *
shape.getNumberOfHashFunctions()) + cardinality())
Review comment:
The current cardinality of the present filter is not relevant. This may
create a SimpleBloomFilter when the number of hash functions is small if the
current cardinality is close to saturation.
##########
File path:
src/test/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasherTest.java
##########
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests the {@link SimpleHasher}.
+ */
+public class SimpleHasherTest extends AbstractHasherTest {
+
+ @Override
+ protected Hasher createHasher() {
+ return new SimpleHasher(1, 1);
+ }
+
+ @Override
+ protected Hasher createEmptyHasher() {
+ return NullHasher.INSTANCE;
+ }
+
+ private void assertConstructorBuffer(Shape shape, byte[] buffer, Integer[]
expected) {
+ SimpleHasher hasher = new SimpleHasher(buffer);
+ List<Integer> lst = new ArrayList<>();
+ IndexProducer producer = hasher.indices(shape);
+ producer.forEachIndex(lst::add);
+ assertEquals(expected.length, lst.size());
+ for (int i = 0; i < expected.length; i++) {
+ assertEquals(expected[i], lst.get(i));
+ }
+ }
+
+ @Test
+ public void testConstructor() {
+
+ Shape shape = Shape.fromKM(5, 10);
+ assertConstructorBuffer(shape, new byte[] { 1, 1 }, new Integer[] { 1,
2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 1 }, new Integer[] { 0, 1,
2, 3, 4 });
+ assertConstructorBuffer(shape, new byte[] { 1, 0, 1 }, new Integer[] {
1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 1, 0, 1 }, new
Integer[] { 1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 1 },
+ new Integer[] { 1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5,
5, 0, 0, 0, 0, 0, 0, 0, 1, 5, 5 },
+ new Integer[] { 1, 2, 3, 4, 5 });
+ assertConstructorBuffer(shape, new byte[] { 0, 0, 0, 0, 0, 0, 0, 1, 5,
0, 0, 0, 0, 0, 0, 0, 1, 5, 5 },
+ new Integer[] { 1, 2, 3, 4, 5 });
+
+ // test empty buffer
+ assertThrows(IllegalArgumentException.class, () -> new
SimpleHasher(new byte[0]));
+ }
+
+ @Test
+ public void testMod() {
+
+ long dividend;
+ int divisor;
+
+ dividend = 4133050040864586807L;
+ divisor = 1110442806;
+ assertEquals(SimpleHasher.mod(dividend, divisor), (int)
Long.remainderUnsigned(dividend, divisor),
Review comment:
Reverse expected and actual
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.hasher.filter.Filter;
+
+/**
+ * A Hasher that implements combinatorial hashing as as described by
+ * <a
href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch
amd Mitzenmacher</a>.
+ * <p>
+ * Common use for this hasher is to generate a byte array as the output of a
hashing
+ * or MessageDigest algorithm.</p>
+ *
+ * @since 4.5
+ */
+public final class SimpleHasher implements Hasher {
+ /**
+ * A default increment used when the requested increment is zero. This is
the same
+ * default increment used in Java's SplittableRandom random number
generator. It is the
+ * fractional representation of the golden ratio (0.618...) with a base of
2^64.
+ */
+ public static final long DEFAULT_INCREMENT = 0x9e3779b97f4a7c15L;
+
+ /**
+ * This mask is used to obtain the value of an int as if it were unsigned.
+ */
+ private static final long LONG_MASK = 0xffffffffL;
+
+ /**
+ * The initial hash value.
+ */
+ private final long initial;
+
+ /**
+ * The value to increment the hash value by.
+ */
+ private final long increment;
+
+ /**
+ * Convert bytes to long.
+ * @param byteArray the byte array to extract the values from.
+ * @param offset the offset to start extraction from.
+ * @param len the length of the extraction, may be longer than 8.
+ * @return
+ */
+ private static long toLong(byte[] byteArray, int offset, int len) {
+ long val = 0;
+ len = Math.min(len, Long.BYTES);
+ for (int i = 0; i < len; i++) {
+ val <<= 8;
+ val |= (byteArray[offset + i] & 0x00FF);
+ }
+ return val;
+ }
+
+ /**
+ * Constructs the SimpleHasher from a byte array.
+ * <p>The byte array is split in 2 and each half is interpreted as a long
value.
+ * Excess bytes are ignored. This simplifies the conversion from a Digest
or hasher algorithm output
+ * to the two values used by the SimpleHasher.</p>
+ * <p><em>If the second long is zero the DEFAULT_INCREMENT is used
instead.</em></p>
+ * @param buffer the buffer to extract the longs from.
+ * @throws IllegalArgumentException is buffer length is zero.
+ */
+ public SimpleHasher(byte[] buffer) {
+ if (buffer.length == 0) {
+ throw new IllegalArgumentException("buffer length must be greater
than 0");
+ }
+ int segment = buffer.length / 2;
+ this.initial = toLong(buffer, 0, segment);
+ long possibleIncrement = toLong(buffer, segment, buffer.length -
segment);
+ this.increment = possibleIncrement == 0 ? DEFAULT_INCREMENT :
possibleIncrement;
+ }
+
+ /**
+ * Constructs the SimpleHasher from 2 longs. The long values will be
interpreted as unsigned values.
+ * <p><em>If the increment is zero the DEFAULT_INCREMENT is used
instead.</em></p>
+ * @param initial The initial value for the hasher.
+ * @param increment The value to increment the hash by on each iteration.
+ */
+ public SimpleHasher(long initial, long increment) {
+ this.initial = initial;
+ this.increment = increment == 0 ? DEFAULT_INCREMENT : increment;
+ }
+
+ /**
+ * This method divides a long < 0 as an unsigned long and returns the
remainder.
+ *
+ * @param dividend the number to divide.
+ * @param divisor the divisor
+ * @return the remainder
+ */
+ private static int remainder(long dividend, int divisor) {
+ long longDivisor = divisor & LONG_MASK;
+ long remainder;
+ long quotient;
+ if (divisor == 1) {
+ return 0;
+ }
+
+ // Approximate the quotient and remainder
+ quotient = (dividend >>> 1) / (longDivisor >>> 1);
+ remainder = dividend - quotient * longDivisor;
+
+ // Correct the approximation
+ while (remainder < 0) {
+ remainder += longDivisor;
+ quotient--;
+ }
+ while (remainder >= longDivisor) {
+ remainder -= longDivisor;
+ quotient++;
+ }
+ return (int) remainder;
+ }
+
+ /**
+ * Calculates the modulus of an unsigned long value and an integer divisor.
+ * @param dividend the unsigned long value to divide.
+ * @param divisor the divisor
+ * @return the remainder.
+ */
+ static int mod(long dividend, int divisor) {
+ if (dividend >= 0) {
+ return (int) (dividend % divisor);
+ }
+
+ int highValue = (int) (((dividend & (LONG_MASK << Integer.SIZE)) >>
Integer.SIZE) & LONG_MASK);
+ int lowValue = (int) (dividend & LONG_MASK);
+ // the truncated value of integer division
+ int trunk = 0;
+
+ long divisorLong = divisor & LONG_MASK;
+ // Normalize the divisor
+ int shift = Integer.numberOfLeadingZeros(divisor);
+ int rem = highValue;
+ long remLong = rem & LONG_MASK;
+ if (remLong >= divisorLong) {
+ trunk = (int) (remLong / divisorLong);
+ rem = (int) (remLong - (trunk * divisorLong));
+ remLong = rem & LONG_MASK;
+ }
+ long dividendEstimate = (remLong << 32) | (lowValue & LONG_MASK);
+ if (dividendEstimate >= 0) {
+ trunk = (int) (dividendEstimate / divisorLong);
+ rem = (int) (dividendEstimate - trunk * divisorLong);
+ } else {
+ rem = remainder(dividendEstimate, divisor);
+ }
+ remLong = rem & LONG_MASK;
+
+ // revert normalization of the divisor
+ return (shift > 0) ? rem % divisor : rem;
+ }
+
+ /**
+ * Gets an IndexProducer that produces indices based on the shape.
+ * The iterator will not return the same value multiple
+ * times. Values will be returned in ascending order.
Review comment:
Note: The values may not be in ascending order as the modulus can wrap.
I would drop this javadoc as it is wrong on this point and also mentions an
iterator which is not used as part of the method. The default javadoc should
suffice here.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/Filter.java
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.BitMap;
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * A convenience class for Hasher implementations to filter out duplicate
indices.
+ *
+ * <p><em>If the index is negative the behavior is not defined.</em></p>
+ *
+ * <p>This is conceptually a unique filter implemented as a {@code
Predicate<int>}.</p>
+ * @since 4.5
+ */
+public final class Filter implements IntPredicate {
+ private IndexTracker tracker;
+ private int size;
+ private IntPredicate consumer;
+
+ /**
+ * Creates an instance optimized for the specified shape.
+ * @param shape The shape that is being generated.
+ * @param consumer The consumer to accept the values.
+ * @return a Filter optimized for the shape.
+ */
+ public Filter(Shape shape, IntPredicate consumer) {
+ this.size = shape.getNumberOfBits();
+ this.consumer = consumer;
+ if (BitMap.numberOfBitMaps(shape.getNumberOfBits()) * Long.BYTES <
shape.getNumberOfHashFunctions()
Review comment:
Optimising based purely on storage size is not ideal as performance for
a large number of hash functions could be poor. However in the worst case the
number of longs is 2^31 / 2^6 = 2^25 longs. This is about 2^28 bytes = 256 MiB.
So a maximum size shape could have an OutOfMemoryError on limited systems but
this is unlikely.
It would be interesting to investigate this using a JMH performance test.
Perhaps when the size is small, e.g. nbits < 2^16 => memory == 2^16 / 2^6 * 2^3
= 2^13 = 8 KiB then the bitmap version should always be used.
I think only a tiny number of hash functions (e.g. 5 to 10) may be as
performant as the bitmap array. Only very large arrays are likely to have
issues with memory. This does require some more work to decide on the best
implementation.
##########
File path:
src/test/java/org/apache/commons/collections4/bloomfilter/hasher/filter/FilterTest.java
##########
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.lang.reflect.Field;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests the Filter class.
+ */
+public class FilterTest {
+
+ @Test
+ public void testFiltering() {
+ Set<Integer> tracker = new HashSet<Integer>();
+ Shape shape = Shape.fromKM(3, 12);
+ List<Integer> consumer = new ArrayList<Integer>();
+ Filter filter = new Filter(shape, consumer::add, (i) -> {
Review comment:
I don't think this constructor is necessary. The filter should be tested
by targeting the functionality of creation for a given shape and then only
allowing the first occurrence of an index to be passed to the consumer.
Add this to the pom.xml:
```xml
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
<version>${commons.junit.version}</version>
<scope>test</scope>
</dependency>
```
Then do a test of the functionality:
```Java
@ParameterizedTest
@CsvSource({
"1, 64",
"2, 64",
"3, 64",
"7, 357",
"7, 17",
})
void testFilter(int k, int m) {
Shape shape = Shape.fromKM(k, m);
BitSet used = new BitSet(m);
for (int n = 0; n < 10; n++) {
used.clear();
List<Integer> consumer = new ArrayList<>();
Filter filter = new Filter(shape, consumer::add);
// Make random indices; these may be duplicates
long seed = ThreadLocalRandom.current().nextLong();
SplittableRandom rng = new SplittableRandom(seed);
for (int i = Math.min(k, m / 2); i-- > 0;) {
int bit = rng.nextInt(m);
// duplicates should not alter the list size
int newSize = consumer.size() + (used.get(bit) ? 0 : 1);
Assertions.assertTrue(filter.test(bit));
Assertions.assertEquals(newSize, consumer.size(),
() -> String.format("Bad filter. Seed=%d, bit=%d", seed,
bit));
used.set(bit);
}
// The list should have unique entries
Assertions.assertArrayEquals(used.stream().toArray(),
consumer.stream().mapToInt(i -> (int) i).sorted().toArray());
final int size = consumer.size();
// Second observations do not change the list size
used.stream().forEach(bit -> {
Assertions.assertTrue(filter.test(bit));
Assertions.assertEquals(size, consumer.size(),
() -> String.format("Bad filter. Seed=%d, bit=%d", seed,
bit));
});
}
}
```
Note: You do not have a test to check the `IntPredicate` returns false when
the consumer returns false.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/Filter.java
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.BitMap;
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * A convenience class for Hasher implementations to filter out duplicate
indices.
+ *
+ * <p><em>If the index is negative the behavior is not defined.</em></p>
+ *
+ * <p>This is conceptually a unique filter implemented as a {@code
Predicate<int>}.</p>
+ * @since 4.5
+ */
+public final class Filter implements IntPredicate {
+ private IndexTracker tracker;
+ private int size;
+ private IntPredicate consumer;
+
+ /**
+ * Creates an instance optimized for the specified shape.
+ * @param shape The shape that is being generated.
+ * @param consumer The consumer to accept the values.
+ * @return a Filter optimized for the shape.
+ */
+ public Filter(Shape shape, IntPredicate consumer) {
+ this.size = shape.getNumberOfBits();
+ this.consumer = consumer;
+ if (BitMap.numberOfBitMaps(shape.getNumberOfBits()) * Long.BYTES <
shape.getNumberOfHashFunctions()
+ * Integer.BYTES) {
+ this.tracker = new BitMapTracker(shape);
+ } else {
+ this.tracker = new ArrayTracker(shape);
+ }
+ }
+
+ /**
+ * Creates an instance of Filter with the specified IndexTracker.
+ *
+ * @param shape The shape that is being generated.
+ * @param consumer The consumer to accept the values
+ * @param tracker The index tracker to use.
+ */
+ public Filter(Shape shape, IntPredicate consumer, IndexTracker tracker) {
Review comment:
This need not be public. It is only used for testing and then the test
does not require it. The test should target the functionality of only allowing
the first occurrence of an index to be passed to the consumer.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/IndexTracker.java
##########
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+/**
+ * A functional interface that defines the seen function for the Hasher Filter.
+ * @since 4.5
+ */
+@FunctionalInterface
+interface IndexTracker {
Review comment:
This would be more conventional to change the name to `IndexSet`,
`IntegerSet`, or `IntSet` and update the `seen` method to `add`. The `add`
method for a set (or collection) returns `true` if the set has been modified
which is the opposite to the case here so downstream logic would have to be
inverted.
This interface and the implementing classes should be implementation details
moved to inner classes of the Filter class. No need to expose this as a public
API. Very similar functionality can be written using BitSet, TreeSet or HashSet
(the later two require inefficient boxing of primitives); or by using BitMap
and a `long[]`. The point being that it is not a uniquely useful class beyond
the functionality exposed by the Filter class.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java
##########
@@ -16,138 +16,239 @@
*/
package org.apache.commons.collections4.bloomfilter;
+import java.util.Objects;
import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
-import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher;
/**
* The interface that describes a Bloom filter.
+ * <p>
+ * <em>See implementation notes for BitMapProducer and IndexProducer.</em>
+ * </p>
+ * @see BitMapProducer
+ * @see IndexProducer
* @since 4.5
*/
-public interface BloomFilter {
-
- // Query Operations
+public interface BloomFilter extends IndexProducer, BitMapProducer {
/**
- * Gets the shape of this filter.
- *
- * @return the shape of this filter
+ * Creates a new instance of the BloomFilter with the same properties as
the current one.
+ * @return a copy of this BloomFilter
*/
- Shape getShape();
+ BloomFilter copy();
+
+ // Query Operations
/**
- * Gets an array of little-endian long values representing the bits of
this filter.
+ * This method is used to determine the best method for matching.
*
- * <p>The returned array will have length {@code ceil(m / 64)} where
{@code m} is the
- * number of bits in the filter and {@code ceil} is the ceiling function.
- * Bits 0-63 are in the first long. A value of 1 at a bit position
indicates the bit
- * index is enabled.
+ * <p>For `sparse` implementations
+ * the {@code forEachIndex(IntConsumer consumer)} method is more
efficient. For non `sparse` implementations
+ * the {@code forEachBitMap(LongConsumer consumer)} is more efficient.
Implementers should determine if it is easier
+ * for the implementation to produce indexes of bit map blocks.</p>
*
- * @return the {@code long[]} representation of this filter
+ * @return {@code true} if the implementation is sparse {@code false}
otherwise.
+ * @see BitMap
*/
- long[] getBits();
+ boolean isSparse();
/**
- * Creates a StaticHasher that contains the indexes of the bits that are
on in this
- * filter.
- *
- * @return a StaticHasher for that produces this Bloom filter
+ * Gets the shape that was used when the filter was built.
+ * @return The shape the filter was built with.
*/
- StaticHasher getHasher();
+ Shape getShape();
/**
- * Returns {@code true} if this filter contains the specified filter.
Specifically this
+ * Returns {@code true} if this filter contains the specified filter.
+ *
+ * <p>Specifically this
* returns {@code true} if this filter is enabled for all bits that are
enabled in the
* {@code other} filter. Using the bit representations this is
- * effectively {@code (this AND other) == other}.
+ * effectively {@code (this AND other) == other}.</p>
*
* @param other the other Bloom filter
- * @return true if this filter is enabled for all enabled bits in the
other filter
- * @throws IllegalArgumentException if the shape of the other filter does
not match
- * the shape of this filter
+ * @return true if all enabled bits in the other filter are enabled in
this filter.
*/
- boolean contains(BloomFilter other);
+ default boolean contains(BloomFilter other) {
+ Objects.requireNonNull(other, "other");
+ return isSparse() ? contains((IndexProducer) other) :
contains((BitMapProducer) other);
+ }
/**
- * Returns {@code true} if this filter contains the specified decomposed
Bloom filter.
- * Specifically this returns {@code true} if this filter is enabled for
all bit indexes
- * identified by the {@code hasher}. Using the bit representations this is
- * effectively {@code (this AND hasher) == hasher}.
+ * Returns {@code true} if this filter contains the bits specified in the
hasher.
+ *
+ * <p>Specifically this returns {@code true} if this filter is enabled for
all bit indexes
+ * identified by the {@code hasher}. Using the bit map representations
this is
+ * effectively {@code (this AND hasher) == hasher}.</p>
*
* @param hasher the hasher to provide the indexes
* @return true if this filter is enabled for all bits specified by the
hasher
- * @throws IllegalArgumentException if the hasher cannot generate indices
for the shape of
- * this filter
*/
- boolean contains(Hasher hasher);
+ default boolean contains(Hasher hasher) {
+ Objects.requireNonNull(hasher, "Hasher");
+ Shape shape = getShape();
+ return contains(hasher.indices(shape));
+ }
- // Modification Operations
+ /**
+ * Returns {@code true} if this filter contains the indices specified
IndexProducer.
+ *
+ * <p>Specifically this returns {@code true} if this filter is enabled for
all bit indexes
+ * identified by the {@code IndexProducer}.</p>
+ *
+ * @param indexProducer the IndexProducer to provide the indexes
+ * @return {@code true} if this filter is enabled for all bits specified
by the IndexProducer
+ */
+ boolean contains(IndexProducer indexProducer);
/**
- * Merges the specified Bloom filter into this Bloom filter. Specifically
all bit indexes
- * that are enabled in the {@code other} filter will be enabled in this
filter.
+ * Returns {@code true} if this filter contains the bits specified in the
bit maps produced by the
+ * bitMapProducer.
*
- * <p>Note: This method should return {@code true} even if no additional
bit indexes were
- * enabled. A {@code false} result indicates that this filter is not
ensured to contain
- * the {@code other} Bloom filter.
+ * @param bitMapProducer the the {@code BitMapProducer} to provide the bit
maps.
+ * @return {@code true} if this filter is enabled for all bits specified
by the bit maps
+ */
+ default boolean contains(BitMapProducer bitMapProducer) {
+ return bitMapProducer.forEachBitMap(this.makePredicate((x, y) -> (x &
y) == y));
+ }
+
+ // update operations
+
+ /**
+ * Merges the specified Bloom filter with this Bloom filter creating a new
Bloom filter.
+ *
+ * <p>Specifically all bit indexes that are enabled in the {@code other}
and in @code this} filter will be
+ * enabled in the resulting filter.</p>
*
* @param other the other Bloom filter
+ * @return The new Bloom filter.
+ */
+ default BloomFilter merge(BloomFilter other) {
+ Objects.requireNonNull(other, "other");
+ BloomFilter result = copy();
+ result.mergeInPlace(other);
+ return result;
+ }
+
+ /**
+ * Merges the specified Hasher with this Bloom filter and returns a new
Bloom filter.
+ *
+ * <p>Specifically all bit indexes that are identified by the {@code
hasher} and in {@code this} Bloom filter
+ * be enabled in the resulting filter.</p>
+ *
+ * @param hasher the hasher to provide the indices
+ * @return the new Bloom filter.
+ */
+ default BloomFilter merge(Hasher hasher) {
+ Objects.requireNonNull(hasher, "hasher");
+ BloomFilter result = copy();
+ result.mergeInPlace(hasher);
+ return result;
+ }
+
+ /**
+ * Merges the specified Bloom filter into this Bloom filter.
+ *
+ * <p>Specifically all
+ * bit indexes that are identified by the {@code other} will be enabled in
this filter.</p>
+ *
+ * <p><em>Note: This method should return {@code true} even if no
additional bit indexes were
+ * enabled. A {@code false} result indicates that this filter may or may
not contain
+ * the {@code other} Bloom filter.</em> This state may occur in complex
Bloom filter implementations like
+ * counting Bloom filters.</p>
+ *
+ * @param other The bloom filter to merge into this one.
* @return true if the merge was successful
- * @throws IllegalArgumentException if the shape of the other filter does
not match
- * the shape of this filter
*/
- boolean merge(BloomFilter other);
+ boolean mergeInPlace(BloomFilter other);
/**
- * Merges the specified decomposed Bloom filter into this Bloom filter.
Specifically all
+ * Merges the specified hasher into this Bloom filter. Specifically all
* bit indexes that are identified by the {@code hasher} will be enabled
in this filter.
*
- * <p>Note: This method should return {@code true} even if no additional
bit indexes were
- * enabled. A {@code false} result indicates that this filter is not
ensured to contain
- * the specified decomposed Bloom filter.
+ * <p><em>Note: This method should return {@code true} even if no
additional bit indexes were
+ * enabled. A {@code false} result indicates that this filter may or may
not contain
+ * the {@code other} Bloom filter.</em> This state may occur in complex
Bloom filter implementations like
+ * counting Bloom filters.</p>
*
- * @param hasher the hasher to provide the indexes
+ * @param hasher The hasher to merge.
* @return true if the merge was successful
- * @throws IllegalArgumentException if the hasher cannot generate indices
for the shape of
- * this filter
*/
- boolean merge(Hasher hasher);
+ default boolean mergeInPlace(Hasher hasher) {
+ Objects.requireNonNull(hasher, "hasher");
+ Shape shape = getShape();
+ BloomFilter result = shape.isSparse((hasher.size() *
shape.getNumberOfHashFunctions()) + cardinality())
+ ? new SparseBloomFilter(shape, hasher)
+ : new SimpleBloomFilter(shape, hasher);
+ return mergeInPlace(result);
+ }
// Counting Operations
+ /**
+ * Determines if the bloom filter is "full".
+ *
+ * <p>Full is defined as having no unset bits.</p>
+ *
+ * @return {@code true} if the filter is full, {@code false} otherwise.
+ */
+ default boolean isFull() {
Review comment:
Q. Is `full` the best term here? What is used in the literature, e.g.
isSaturated?
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/ArrayTracker.java
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * An IndexTracker implementation that uses an array of integers to track
whether or not a
+ * number has been seen. Suitable for Shapes that have few hash functions.
+ * @since 4.5
+ */
+public class ArrayTracker implements IndexTracker {
+ private int[] seenAry;
+ private int idx;
Review comment:
It would be more conventional to name this `size`
##########
File path:
src/test/java/org/apache/commons/collections4/bloomfilter/AbstractIndexProducerTest.java
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.util.function.IntPredicate;
+
+import org.junit.jupiter.api.Test;
+
+public abstract class AbstractIndexProducerTest {
+
+ public static final IntPredicate TRUE_PREDICATE = new IntPredicate() {
+
+ @Override
+ public boolean test(int arg0) {
+ return true;
+ }
+ };
+
+ public static final IntPredicate FALSE_PREDICATE = new IntPredicate() {
+
+ @Override
+ public boolean test(int arg0) {
+ return false;
+ }
+ };
+
+ /**
+ * Creates a producer with some data.
+ * @return a producer with some data
+ */
+ protected abstract IndexProducer createProducer();
+
+ /**
+ * Creates an producer without data.
+ * @return a producer that has no data.
+ */
+ protected abstract IndexProducer createEmptyProducer();
+
+ @Test
+ public final void testForEachIndex() {
+
+ IndexProducer populated = createProducer();
+ IndexProducer empty = createEmptyProducer();
+ assertFalse(populated.forEachIndex(FALSE_PREDICATE), "non-empty should
be false");
+
+ assertTrue(empty.forEachIndex(FALSE_PREDICATE), "empty should be
true");
+
+ assertTrue(populated.forEachIndex(TRUE_PREDICATE), "non-empty should
be true");
+ assertTrue(empty.forEachIndex(TRUE_PREDICATE), "empty should be true");
+ }
+
+ @Test
+ public final void testAsIndexArray() {
+ int ary[] = createEmptyProducer().asIndexArray();
+ assertEquals(0, ary.length);
+
+ IndexProducer producer = createProducer();
+ final int tary[] = producer.asIndexArray();
+ assertTrue(producer.forEachIndex(new IntPredicate() {
Review comment:
This test enforces the asIndexArray returns the same order as the
forEachIndex method. This is not mandated by the interface javadoc and prevents
implementations optimising the function. I noted this when I updated the
default implementation for `asIndexArray` to return a sorted list of unique
indices. The test should be rewritten to ensure the indices are the same but
not the order, or the documentation changed to state the order must be the same.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/ArrayTracker.java
##########
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * An IndexTracker implementation that uses an array of integers to track
whether or not a
+ * number has been seen. Suitable for Shapes that have few hash functions.
+ * @since 4.5
+ */
+public class ArrayTracker implements IndexTracker {
+ private int[] seenAry;
+ private int idx;
+
+ /**
+ * Constructs the tracker based on the shape.
+ * @param shape the shape to build the tracker for.
+ */
+ public ArrayTracker(Shape shape) {
+ seenAry = new int[shape.getNumberOfHashFunctions()];
+ idx = 0;
Review comment:
This will be initialised to zero by default.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/HasherCollection.java
##########
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * A collection of Hashers. Useful when the generation of a Bloom filter
depends upon
+ * multiple items.
+ * <p>
+ * Hashers for each item are added to the HasherCollection and then
+ * the collection is used wherever a Hasher can be used in the API.
+ * </p>
+ * @since 4.5
+ */
+public class HasherCollection implements Hasher {
+
+ /**
+ * The list of hashers to be used to generate the indices.
+ */
+ private final List<Hasher> hashers;
+
+ /**
+ * Constructs an empty HasherCollection.
+ */
+ public HasherCollection() {
+ this.hashers = new ArrayList<>();
+ }
+
+ /**
+ * Constructs a HasherCollection from a collection of Hasher objects.
+ *
+ * @param hashers A collections of Hashers to build the indices with.
+ */
+ public HasherCollection(final Collection<Hasher> hashers) {
+ Objects.requireNonNull(hashers, "hashers");
+ this.hashers = new ArrayList<>(hashers);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param hashers A list of Hashers to initialize the collection with.
+ */
+ public HasherCollection(Hasher... hashers) {
+ this(Arrays.asList(hashers));
+ }
+
+ /**
+ * Adds a hasher to the collection.
+ * @param hasher The hasher to add.
+ */
+ public void add(Hasher hasher) {
+ Objects.requireNonNull(hasher, "hasher");
+ hashers.add(hasher);
+ }
+
+ /**
+ * Add all the Hashers in a collection to this HasherCollection.
+ * @param hashers The hashers to add.
+ */
+ public void add(Collection<Hasher> hashers) {
+ Objects.requireNonNull(hashers, "hashers");
+ this.hashers.addAll(hashers);
+ }
+
+ @Override
+ public IndexProducer indices(final Shape shape) {
+ Objects.requireNonNull(shape, "shape");
+ return new IndexProducer() {
+ @Override
+ public boolean forEachIndex(IntPredicate consumer) {
+ for (Hasher hasher : hashers) {
+ if (!hasher.indices(shape).forEachIndex(consumer)) {
Review comment:
This will fail to satisfy the `indices` contract to only supply unique
indices as no checks are made for duplicates. I think the requirement to
eliminate duplicates should be removed from the interface.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/package-info.java
##########
@@ -16,9 +16,11 @@
*/
/**
- * Provides implementations of the Bloom filter
- * {@link org.apache.commons.collections4.bloomfilter.hasher.HashFunction
HashFunction} interface.
- *
+ * Hasher Filter implementations and tools.
+ * <p>
+ * The classes and interfaces found in this package are intended to filter
duplicates
+ * values from the {@code hasher.indices} methods. Developers of hashers are
expected to use
+ * the {@code Filter} implementation to perform the filtering.
* @since 4.5
*/
-package org.apache.commons.collections4.bloomfilter.hasher.function;
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
Review comment:
I do not understand the requirement to put this into a separate package.
The functionality is used by hashers.
In addition the `hasher` package seems redundant. Why not put all these in
the `bloomfilter` package? You cannot use a Bloom filter without a hasher.
Currently there is a circular dependency between `bloomfilter` and
`bloomfilter.hasher`:
`bloomfilter.BloomFilter.merge(Hasher)`
but `bloomfilter.hasher.Hasher.indices()` returns
`bloomfilter.IndexProducer`.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.hasher.filter.Filter;
+
+/**
+ * A Hasher that implements combinatorial hashing as as described by
+ * <a
href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch
amd Mitzenmacher</a>.
+ * <p>
+ * Common use for this hasher is to generate a byte array as the output of a
hashing
+ * or MessageDigest algorithm.</p>
+ *
+ * @since 4.5
+ */
+public final class SimpleHasher implements Hasher {
+ /**
+ * A default increment used when the requested increment is zero. This is
the same
+ * default increment used in Java's SplittableRandom random number
generator. It is the
+ * fractional representation of the golden ratio (0.618...) with a base of
2^64.
+ */
+ public static final long DEFAULT_INCREMENT = 0x9e3779b97f4a7c15L;
+
+ /**
+ * This mask is used to obtain the value of an int as if it were unsigned.
+ */
+ private static final long LONG_MASK = 0xffffffffL;
+
+ /**
+ * The initial hash value.
+ */
+ private final long initial;
+
+ /**
+ * The value to increment the hash value by.
+ */
+ private final long increment;
+
+ /**
+ * Convert bytes to long.
+ * @param byteArray the byte array to extract the values from.
+ * @param offset the offset to start extraction from.
+ * @param len the length of the extraction, may be longer than 8.
+ * @return
+ */
+ private static long toLong(byte[] byteArray, int offset, int len) {
+ long val = 0;
+ len = Math.min(len, Long.BYTES);
+ for (int i = 0; i < len; i++) {
+ val <<= 8;
+ val |= (byteArray[offset + i] & 0x00FF);
+ }
+ return val;
+ }
+
+ /**
+ * Constructs the SimpleHasher from a byte array.
+ * <p>The byte array is split in 2 and each half is interpreted as a long
value.
+ * Excess bytes are ignored. This simplifies the conversion from a Digest
or hasher algorithm output
+ * to the two values used by the SimpleHasher.</p>
+ * <p><em>If the second long is zero the DEFAULT_INCREMENT is used
instead.</em></p>
+ * @param buffer the buffer to extract the longs from.
+ * @throws IllegalArgumentException is buffer length is zero.
+ */
+ public SimpleHasher(byte[] buffer) {
+ if (buffer.length == 0) {
+ throw new IllegalArgumentException("buffer length must be greater
than 0");
+ }
+ int segment = buffer.length / 2;
+ this.initial = toLong(buffer, 0, segment);
+ long possibleIncrement = toLong(buffer, segment, buffer.length -
segment);
+ this.increment = possibleIncrement == 0 ? DEFAULT_INCREMENT :
possibleIncrement;
+ }
+
+ /**
+ * Constructs the SimpleHasher from 2 longs. The long values will be
interpreted as unsigned values.
+ * <p><em>If the increment is zero the DEFAULT_INCREMENT is used
instead.</em></p>
+ * @param initial The initial value for the hasher.
+ * @param increment The value to increment the hash by on each iteration.
+ */
+ public SimpleHasher(long initial, long increment) {
+ this.initial = initial;
+ this.increment = increment == 0 ? DEFAULT_INCREMENT : increment;
+ }
+
+ /**
+ * This method divides a long < 0 as an unsigned long and returns the
remainder.
+ *
+ * @param dividend the number to divide.
+ * @param divisor the divisor
+ * @return the remainder
+ */
+ private static int remainder(long dividend, int divisor) {
+ long longDivisor = divisor & LONG_MASK;
+ long remainder;
+ long quotient;
+ if (divisor == 1) {
+ return 0;
+ }
+
+ // Approximate the quotient and remainder
+ quotient = (dividend >>> 1) / (longDivisor >>> 1);
+ remainder = dividend - quotient * longDivisor;
+
+ // Correct the approximation
+ while (remainder < 0) {
+ remainder += longDivisor;
+ quotient--;
+ }
+ while (remainder >= longDivisor) {
+ remainder -= longDivisor;
+ quotient++;
+ }
+ return (int) remainder;
+ }
+
+ /**
+ * Calculates the modulus of an unsigned long value and an integer divisor.
+ * @param dividend the unsigned long value to divide.
+ * @param divisor the divisor
+ * @return the remainder.
+ */
+ static int mod(long dividend, int divisor) {
+ if (dividend >= 0) {
+ return (int) (dividend % divisor);
+ }
+
+ int highValue = (int) (((dividend & (LONG_MASK << Integer.SIZE)) >>
Integer.SIZE) & LONG_MASK);
+ int lowValue = (int) (dividend & LONG_MASK);
+ // the truncated value of integer division
+ int trunk = 0;
+
+ long divisorLong = divisor & LONG_MASK;
+ // Normalize the divisor
+ int shift = Integer.numberOfLeadingZeros(divisor);
+ int rem = highValue;
+ long remLong = rem & LONG_MASK;
+ if (remLong >= divisorLong) {
+ trunk = (int) (remLong / divisorLong);
+ rem = (int) (remLong - (trunk * divisorLong));
+ remLong = rem & LONG_MASK;
+ }
+ long dividendEstimate = (remLong << 32) | (lowValue & LONG_MASK);
+ if (dividendEstimate >= 0) {
+ trunk = (int) (dividendEstimate / divisorLong);
+ rem = (int) (dividendEstimate - trunk * divisorLong);
+ } else {
+ rem = remainder(dividendEstimate, divisor);
+ }
+ remLong = rem & LONG_MASK;
+
+ // revert normalization of the divisor
+ return (shift > 0) ? rem % divisor : rem;
+ }
+
+ /**
+ * Gets an IndexProducer that produces indices based on the shape.
+ * The iterator will not return the same value multiple
+ * times. Values will be returned in ascending order.
+ *
+ * @param shape {@inheritDoc}
+ * @return {@inheritDoc}
+ * @throws IllegalArgumentException {@inheritDoc}
+ */
+ @Override
+ public IndexProducer indices(final Shape shape) {
+ Objects.requireNonNull(shape, "shape");
+
+ return new IndexProducer() {
+
+ /** The index of the next item. */
+ private long next = SimpleHasher.this.initial;
+
+ @Override
+ public boolean forEachIndex(IntPredicate consumer) {
+ Objects.requireNonNull(consumer, "consumer");
+ Filter filter = new Filter(shape, consumer);
+
+ for (int functionalCount = 0; functionalCount <
shape.getNumberOfHashFunctions(); functionalCount++) {
Review comment:
Another way to get a fast modulus is to use Shapes with a power of 2
number of bits. Then you can compute a modulus with a mask:
```Java
// Here we can optimise the modulus if the number of bits is a
// power of 2 by using a mask.
int bits = shape.getNumberOfBits();
LongToIntFunction mod;
if (((bits - 1) & bits) == 0) {
// Power of 2
final int mask = bits - 1;
mod = l -> (int) l & mask;
} else {
mod = l -> mod(l, bits);
}
for (int functionalCount = 0; functionalCount <
shape.getNumberOfHashFunctions(); functionalCount++) {
if (!filter.test(mod.applyAsInt(next))) {
// reset next
next = SimpleHasher.this.initial;
return false;
}
next += SimpleHasher.this.increment;
}
```
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.hasher.filter.Filter;
+
+/**
+ * A Hasher that implements combinatorial hashing as as described by
+ * <a
href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch
amd Mitzenmacher</a>.
+ * <p>
+ * Common use for this hasher is to generate a byte array as the output of a
hashing
+ * or MessageDigest algorithm.</p>
+ *
+ * @since 4.5
+ */
+public final class SimpleHasher implements Hasher {
+ /**
+ * A default increment used when the requested increment is zero. This is
the same
+ * default increment used in Java's SplittableRandom random number
generator. It is the
+ * fractional representation of the golden ratio (0.618...) with a base of
2^64.
+ */
+ public static final long DEFAULT_INCREMENT = 0x9e3779b97f4a7c15L;
+
+ /**
+ * This mask is used to obtain the value of an int as if it were unsigned.
+ */
+ private static final long LONG_MASK = 0xffffffffL;
+
+ /**
+ * The initial hash value.
+ */
+ private final long initial;
+
+ /**
+ * The value to increment the hash value by.
+ */
+ private final long increment;
+
+ /**
+ * Convert bytes to long.
+ * @param byteArray the byte array to extract the values from.
+ * @param offset the offset to start extraction from.
+ * @param len the length of the extraction, may be longer than 8.
+ * @return
+ */
+ private static long toLong(byte[] byteArray, int offset, int len) {
+ long val = 0;
+ len = Math.min(len, Long.BYTES);
+ for (int i = 0; i < len; i++) {
+ val <<= 8;
+ val |= (byteArray[offset + i] & 0x00FF);
+ }
+ return val;
+ }
+
+ /**
+ * Constructs the SimpleHasher from a byte array.
+ * <p>The byte array is split in 2 and each half is interpreted as a long
value.
+ * Excess bytes are ignored. This simplifies the conversion from a Digest
or hasher algorithm output
+ * to the two values used by the SimpleHasher.</p>
+ * <p><em>If the second long is zero the DEFAULT_INCREMENT is used
instead.</em></p>
+ * @param buffer the buffer to extract the longs from.
+ * @throws IllegalArgumentException is buffer length is zero.
+ */
+ public SimpleHasher(byte[] buffer) {
+ if (buffer.length == 0) {
+ throw new IllegalArgumentException("buffer length must be greater
than 0");
+ }
+ int segment = buffer.length / 2;
+ this.initial = toLong(buffer, 0, segment);
+ long possibleIncrement = toLong(buffer, segment, buffer.length -
segment);
+ this.increment = possibleIncrement == 0 ? DEFAULT_INCREMENT :
possibleIncrement;
+ }
+
+ /**
+ * Constructs the SimpleHasher from 2 longs. The long values will be
interpreted as unsigned values.
+ * <p><em>If the increment is zero the DEFAULT_INCREMENT is used
instead.</em></p>
+ * @param initial The initial value for the hasher.
+ * @param increment The value to increment the hash by on each iteration.
+ */
+ public SimpleHasher(long initial, long increment) {
+ this.initial = initial;
+ this.increment = increment == 0 ? DEFAULT_INCREMENT : increment;
+ }
+
+ /**
+ * This method divides a long < 0 as an unsigned long and returns the
remainder.
+ *
+ * @param dividend the number to divide.
+ * @param divisor the divisor
+ * @return the remainder
+ */
+ private static int remainder(long dividend, int divisor) {
+ long longDivisor = divisor & LONG_MASK;
+ long remainder;
+ long quotient;
+ if (divisor == 1) {
+ return 0;
+ }
+
+ // Approximate the quotient and remainder
+ quotient = (dividend >>> 1) / (longDivisor >>> 1);
+ remainder = dividend - quotient * longDivisor;
+
+ // Correct the approximation
+ while (remainder < 0) {
+ remainder += longDivisor;
+ quotient--;
+ }
+ while (remainder >= longDivisor) {
+ remainder -= longDivisor;
+ quotient++;
+ }
+ return (int) remainder;
+ }
+
+ /**
+ * Calculates the modulus of an unsigned long value and an integer divisor.
+ * @param dividend the unsigned long value to divide.
+ * @param divisor the divisor
+ * @return the remainder.
+ */
+ static int mod(long dividend, int divisor) {
Review comment:
This method is complex. I wondered if it was faster than the
Long.unsignedRemainder. This uses BigInteger in JDK 11. However they switch to
a bit twiddling method in JDK 17 (see [Long JDK
17](https://github.com/openjdk/jdk17/blob/master/src/java.base/share/classes/java/lang/Long.java#L1696)).
I found this out when I had written a JMH benchmark for different methods. So
it is not a hotspot intrinsic, but it is about 4x faster in JDK17 than JDK11.
I tested:
- baseline: a random long
- OpenJDK11 unsignedRemainder (using BigInteger divide for negative dividend)
- OpenJDK17 unsignedRemainder
- shift (a random long modulus divisor)
- mod1 (this method)
- mod2 (a simplified method)
- mod3 (an inlined version of OpenJDK17 method assuming a positive divisor)
- mod4 (an inlined version of OpenJDK17 method assuming a positive divisor
but with a conditional load to return the final result)
- rng - a random integer modulus divisor
```
(divisor) (method) Mode Cnt Score Error Units
35677 baseline avgt 5 2.449 ± 0.093 ns/op
35677 remainderUnsigned avgt 5 13.710 ± 0.112 ns/op
35677 remainderUnsignedJdk11 avgt 5 54.739 ± 5.669 ns/op
35677 shift avgt 5 12.269 ± 0.149 ns/op
35677 mod1 avgt 5 26.754 ± 0.362 ns/op
35677 mod2 avgt 5 26.948 ± 0.260 ns/op
35677 mod3 avgt 5 13.531 ± 0.101 ns/op
35677 mod4 avgt 5 13.282 ± 0.362 ns/op
35677 rng avgt 5 4.788 ± 0.033 ns/op
```
Given that the SimpleHasher is using an increment to generate a sequence,
then computing an unsigned remainder this will not be very randomly spread out
over the range of the shape if the increment is small. It requires a long
modulus of some type to compute the result (which is slow). The likeliness of
the increment being small is low if it has been generated with a good random
hashing function as it would require e.g. 56 of the upper bits to be zero to
make an increment less than 256. This has a probability of 2^-56 = 1.39e-17.
Switching to using a fast mixing function on the current sequence from the
increment and seed should produce a faster and more robust output of indices.
It requires an int modulus which is faster. I can provide an implementation if
you desire.
Anyway, if you wish to stick with the unsigned modulus the `mod4` method is:
```Java
/**
* Calculates the modulus of an unsigned long value and an integer divisor.
* @param dividend the unsigned long value to divide.
* @param divisor the divisor (assume to be positive).
* @return the remainder.
*/
static int mod4(long dividend, int divisor) {
// See Hacker's Delight (2nd ed), section 9.3.
// Assume divisor is positive.
// Divide half the unsigned number and then double the quotient result.
final long quotient = ((dividend >>> 1) / divisor) << 1;
final long remainder = dividend - quotient * divisor;
// remainder in [0, 2 * divisor)
return (int) (remainder >= divisor ? remainder - divisor : remainder);
}
```
There is no license issue here as the method is from a published book.
For reference this is the other method I tried (it is slightly slower than
your version here but simpler):
```Java
static int mod2(long dividend, int divisor) {
if (dividend >= 0) {
return (int) (dividend % divisor);
}
// 2^63 % divisor
long mod = 1 + Long.MAX_VALUE % divisor;
// Add (dividend - 2^63) % divisor
mod += (dividend & Long.MAX_VALUE) % divisor;
// Ensure the modulus is in range
return (int) (mod % divisor);
}
```
It is simple, has 1 branch condition but uses 3 modulus operations. This is
where the slowness occurs.
Note that the shift operation is a fast as possible for a long modulus and
provides the limit of what is possible using `long %`:
```Java
(int)((dividend >>> 1) % divisor)
```
Note:
I updated the tests for this method to test edge cases. Your method with
10000 random numbers does not hit these edge cases:
```Java
@Test
void testModEdgeCases() {
for (long dividend : new long[] {-1, -2, -3, -6378683,
-23567468136887892L, Long.MIN_VALUE,
345, 678686, 67868768686878924L, Long.MAX_VALUE}) {
for (int divisor : new int[] {1, 2, 3, 5, 13, Integer.MAX_VALUE}) {
assertEquals((int) Long.remainderUnsigned(dividend, divisor),
SimpleHasher.mod(dividend, divisor),
() -> String.format("failure with dividend=%s and
divisor=%s.", dividend, divisor));
}
}
}
```
The updated `mod[24]` methods pass these tests as well as the current one
with 10000 random numbers.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java
##########
@@ -16,117 +16,47 @@
*/
package org.apache.commons.collections4.bloomfilter.hasher;
-import java.nio.charset.Charset;
-import java.util.PrimitiveIterator;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
/**
- * A Hasher represents items of arbitrary byte size as a byte representation of
- * fixed size (a hash). The hash representations can be used to create indexes
- * for a Bloom filter.
- *
- * <p>The hash for each item is created using a hash function; use of different
- * seeds allows generation of different hashes for the same item. The hashes
can
- * be dynamically converted into the bit index representation used by a Bloom
- * filter. The shape of the Bloom filter defines the number of indexes per item
- * and the range of the indexes. The hasher can generate the correct number of
- * indexes in the range required by the Bloom filter for each item it
- * represents.
- *
- * <p>Note that the process of generating hashes and mapping them to a Bloom
- * filter shape may create duplicate indexes. The hasher may generate fewer
than
- * the required number of hash functions per item if duplicates have been
- * removed. Implementations of {@code iterator()} may return duplicate values
- * and may return values in a random order. See implementation javadoc notes as
- * to the guarantees provided by the specific implementation.
- *
- * <p>Hashers have an identity based on the hashing algorithm used.
+ * A Hasher creates IndexProducer based on the hash implementation and the
+ * provided Shape.
*
* @since 4.5
*/
public interface Hasher {
/**
- * A builder to build a hasher.
+ * Creates an IndexProducer for this hasher based on the Shape.
*
- * <p>A hasher represents one or more items of arbitrary byte size. The
builder
- * contains methods to collect byte representations of items. Each method
to add
- * to the builder will add an entire item to the final hasher created by
the
- * {@link #build()} method.
+ * <p>The @{code IndexProducer} will create indices within the range
defined by the number of bits in
+ * the shape. The total number of indices will respect the number of hash
functions per item
+ * defined by the shape. However the count of indices may not be a
multiple of the number of
+ * hash functions once implementation has removed duplicates.</p>
*
- * @since 4.5
- */
- interface Builder {
-
- /**
- * Builds the hasher from all the items.
- *
- * <p>This method will clear the builder for future use.
- *
- * @return the fully constructed hasher
- */
- Hasher build();
-
- /**
- * Adds a byte array item to the hasher.
- *
- * @param item the item to add
- * @return a reference to this object
- */
- Builder with(byte[] item);
-
- /**
- * Adds a character sequence item to the hasher using the specified
{@code charset}
- * encoding.
- *
- * @param item the item to add
- * @param charset the character set
- * @return a reference to this object
- */
- default Builder with(final CharSequence item, final Charset charset) {
- return with(item.toString().getBytes(charset));
- }
-
- /**
- * Adds a character sequence item to the hasher. Each 16-bit character
is
- * converted to 2 bytes using little-endian order.
- *
- * @param item the item to add
- * @return a reference to this object
- */
- default Builder withUnencoded(final CharSequence item) {
- final int length = item.length();
- final byte[] bytes = new byte[length * 2];
- for (int i = 0; i < length; i++) {
- final char ch = item.charAt(i);
- bytes[i * 2] = (byte) ch;
- bytes[i * 2 + 1] = (byte) (ch >>> 8);
- }
- return with(bytes);
- }
- }
-
- /**
- * Gets an iterator of integers that are the bits to enable in the Bloom
- * filter based on the shape.
+ * <p>This IndexProducer must be deterministic in that it must return the
same indices for the
+ * same Shape.</p>
*
- * <p>The iterator will create indexes within the range defined by the
number of bits in
- * the shape. The total number of indexes will respect the number of hash
functions per item
- * defined by the shape. However the count of indexes may not be a
multiple of the number of
- * hash functions if the implementation has removed duplicates.
+ * <p>No guarantee is made as to order of indices.</p>
+ * <p>Duplicates indices for a single item must be removed.</p>
Review comment:
I think the requirement to eliminate duplicates should be removed from
the interface.
The SimpleBloomFilter and SparesBloomFilter will handle duplicates. A filter
should be put into the ArrayCountingBloomFilter to eliminate duplicates when
merging.
Note that eliminating duplicates is a big overhead for merge when most Bloom
filter in use already handle this efficiently.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java
##########
@@ -0,0 +1,241 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Objects;
+import java.util.TreeSet;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
+
+/**
+ * A bloom filter using a TreeSet of integers to track enabled bits. This is a
standard
+ * implementation and should work well for most low cardinality Bloom filters.
+ * @since 4.5
+ */
+public class SparseBloomFilter implements BloomFilter {
+
+ /**
+ * The bitSet that defines this BloomFilter.
+ */
+ private final TreeSet<Integer> indices;
+
+ /**
+ * The shape of this BloomFilter.
+ */
+ private final Shape shape;
+
+ /**
+ * Constructs an empty BitSetBloomFilter.
+ *
+ * @param shape The shape of the filter.
+ */
+ public SparseBloomFilter(Shape shape) {
+ Objects.requireNonNull(shape, "shape");
+ this.shape = shape;
+ this.indices = new TreeSet<>();
+ }
+
+ /**
+ * Creates an instance that is equivalent to {@code other}.
+ *
+ * @param other The bloom filter to copy.
+ */
+ public SparseBloomFilter(BloomFilter other) {
+ Objects.requireNonNull(other, "other");
+ this.shape = other.getShape();
+ this.indices = new TreeSet<>();
+ if (other.isSparse()) {
+ mergeInPlace((IndexProducer) other);
+ } else {
+ mergeInPlace(IndexProducer.fromBitMapProducer(other));
+ }
+ }
+
+ /**
+ * Constructs a populated Bloom filter.
+ * @param shape the shape for the bloom filter.
+ * @param hasher the hasher to provide the initial data.
+ */
+ public SparseBloomFilter(final Shape shape, Hasher hasher) {
+ this(shape);
+ Objects.requireNonNull(hasher, "hasher");
+ hasher.indices(shape).forEachIndex(this::add);
+ }
+
+ /**
+ * Constructs a populated Bloom filter.
+ * @param shape the shape of the filter.
+ * @param indices an index producer for the indices to to enable.
+ * @throws IllegalArgumentException if indices contains a value greater
than the number
+ * of bits in the shape.
+ */
+ public SparseBloomFilter(Shape shape, IndexProducer indices) {
+ this(shape);
+ Objects.requireNonNull(indices, "indices");
+ mergeInPlace(indices);
+ }
+
+ /**
+ * Constructs a populated Bloom filter.
+ * @param shape the shape of the filter.
+ * @param bitmaps a BitMapProducer for the bit maps to add.
Review comment:
`@param bitMaps`
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/Filter.java
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.BitMap;
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * A convenience class for Hasher implementations to filter out duplicate
indices.
+ *
+ * <p><em>If the index is negative the behavior is not defined.</em></p>
+ *
+ * <p>This is conceptually a unique filter implemented as a {@code
Predicate<int>}.</p>
+ * @since 4.5
+ */
+public final class Filter implements IntPredicate {
+ private IndexTracker tracker;
+ private int size;
+ private IntPredicate consumer;
+
+ /**
+ * Creates an instance optimized for the specified shape.
+ * @param shape The shape that is being generated.
+ * @param consumer The consumer to accept the values.
+ * @return a Filter optimized for the shape.
+ */
+ public Filter(Shape shape, IntPredicate consumer) {
+ this.size = shape.getNumberOfBits();
+ this.consumer = consumer;
+ if (BitMap.numberOfBitMaps(shape.getNumberOfBits()) * Long.BYTES <
shape.getNumberOfHashFunctions()
+ * Integer.BYTES) {
+ this.tracker = new BitMapTracker(shape);
+ } else {
+ this.tracker = new ArrayTracker(shape);
+ }
+ }
+
+ /**
+ * Creates an instance of Filter with the specified IndexTracker.
+ *
+ * @param shape The shape that is being generated.
+ * @param consumer The consumer to accept the values
+ * @param tracker The index tracker to use.
+ */
+ public Filter(Shape shape, IntPredicate consumer, IndexTracker tracker) {
+ this.size = shape.getNumberOfBits();
+ this.consumer = consumer;
+ this.tracker = tracker;
+ }
+
+ /**
+ * Test if the number has not been seen.
+ *
+ * <p>The first time a number is tested the method returns {@code true}
and returns
Review comment:
This is incorrect. It returns the result of the IntConsumer (true or
false) the first time a number is tested, then always returns true for
subsequent tests on the same number.
Note this does create an issue where the first observation of an index can
return false and then subsequent observations return true:
```Java
@Test
void contradictory() {
Shape shape = Shape.fromKM(4, 37);
Filter filter = new Filter(shape, i -> false);
Assertions.assertFalse(filter.test(3));
Assertions.assertTrue(filter.test(3));
Assertions.assertTrue(filter.test(3));
}
```
For a correct implementation the result should ideally be the same. Given
this it may better to hide this class totally from the public API. It is only
used in SimpleHasher to ensure unique indices (which is a requirement that
should be dropped IMO) and in SingleItemHasherCollection where it filters the
indices from a HasherCollection (which can be duplicates thus breaking the
current requirement to have unique indices).
The functionality may be better served up by one of:
```Java
public interface Hasher {
default IndexProducer uniqueIndices(Shape shape) {
return new IndexProducer() {
@Override
public boolean forEachIndex(IntPredicate predicate) {
Filter filter = new Filter(shape, predicate);
return Hasher.this.indices(shape).forEachIndex(filter);
}
};
}
}
public interface IndexProducer {
default IndexProducer distinct() {
// This cannot be optimised for the shape in the default
implementation.
return new IndexProducer() {
private final BitSet set = new BitSet();
@Override
public boolean forEachIndex(IntPredicate predicate) {
set.clear();
return IndexProducer.this.forEachIndex(i -> {
if (!set.get(i)) {
set.set(i);
return predicate.test(i);
}
return true;
});
}
};
}
}
```
Placing this in the `Hasher` interface allows implementations to override it
if they know they are already unique and ensures the default implementation can
be optimised for the shape. It also hides the contradictory behaviour of the
Filter as this can be package private. Any IntPredicate returning false should
fail fast the forEachIndex method and any subsequent repeats of the index will
not be consumed.
Placing it in `IndexProducer` allows a unique output of indices:
```Java
IndexProducer p;
p.distinct().asIndexArray();
```
Currently there is no requirement in the interface javadoc that
`asIndexArray()` should be distinct. This could be enforced by changing the
default implementation to:
```Java
default int[] asIndexArray() {
BitSet set = new BitSet();
forEachIndex(i -> {
set.set(i);
return true;
});
return set.stream().toArray();
```
Overriding classes should then ensure they stick to the contract to not
generate duplicates.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/filter/Filter.java
##########
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher.filter;
+
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.BitMap;
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+/**
+ * A convenience class for Hasher implementations to filter out duplicate
indices.
+ *
+ * <p><em>If the index is negative the behavior is not defined.</em></p>
+ *
+ * <p>This is conceptually a unique filter implemented as a {@code
Predicate<int>}.</p>
+ * @since 4.5
+ */
+public final class Filter implements IntPredicate {
+ private IndexTracker tracker;
+ private int size;
+ private IntPredicate consumer;
+
+ /**
+ * Creates an instance optimized for the specified shape.
+ * @param shape The shape that is being generated.
+ * @param consumer The consumer to accept the values.
+ * @return a Filter optimized for the shape.
Review comment:
No `@return`. Should this be a private constructor and factory
constructor method?
I would move this out of the filter package into the hasher package; add a
factory constructor; put the IndexTracker interface as a private inner
interface and the two implementations ArrayTracker and BitMapTracker into
private inner classes; and update the class name to reflect the functionality.
The API then becomes:
```Java
public class IndexFilter implements IntPredicate {
public static IndexFilter create(Shape shape, IntPredicate consumer);
public boolean test(int index);
}
```
This is in the interest of minimising the public API for the first cut
release. The ArrayTracker and BitMapTracker are then hidden implementation
details.
##########
File path: src/main/java/org/apache/commons/collections4/bloomfilter/BitMap.java
##########
@@ -17,46 +17,79 @@
package org.apache.commons.collections4.bloomfilter;
/**
- * Contains functions to convert {@code int} indices into Bloom filter bit
positions.
+ * Contains functions to convert {@code int} indices into Bloom filter bit
positions and visa versa.
+ *
+ * <p>The functions view an array of longs as a collection of bit maps each
containing 64 bits. The bits are arranged
+ * in memory as a little-endian long value. This matches the requirements of
the BitMapProducer interface.</p>
+ *
+ * @since 4.5
*/
-public final class BloomFilterIndexer {
+public class BitMap {
/** A bit shift to apply to an integer to divided by 64 (2^6). */
private static final int DIVIDE_BY_64 = 6;
/** Do not instantiate. */
- private BloomFilterIndexer() {}
+ private BitMap() {
+ }
+
+ /**
+ * Calculates the number of bit maps (longs) required for the numberOfBits
parameter.
+ *
+ * <p><em>If the input is negative the behavior is not defined.</em></p>
+ *
+ * @param numberOfBits the number of bits to store in the array of bit
maps.
+ * @return the number of bit maps necessary.
+ */
+ public static int numberOfBitMaps(int numberOfBits) {
+ return ((numberOfBits - 1) >> DIVIDE_BY_64) + 1;
+ }
+
+ /**
+ * Checks if the specified index bit is enabled in the array of bit maps.
+ *
+ * If the bit specified by idx is not in the bit map false is returned.
+ *
+ * @param bitMaps The array of bit maps.
+ * @param idx the index of the bit to locate.
+ * @return {@code true} if the bit is enabled, {@code false} otherwise.
+ * @throws IndexOutOfBoundsException if idx specifies a bit not in the
range being tracked.
+ */
+ public static boolean contains(long[] bitMaps, int idx) {
Review comment:
For consistency all index parameters in this class should have the same
name. You have used `idx` and `bitIndex` for different methods. Java's `BitSet`
class uses `bitIndex` so I would change to that.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SimpleHasher.java
##########
@@ -0,0 +1,217 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.hasher.filter.Filter;
+
+/**
+ * A Hasher that implements combinatorial hashing as as described by
+ * <a
href="https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf">Krisch
amd Mitzenmacher</a>.
+ * <p>
+ * Common use for this hasher is to generate a byte array as the output of a
hashing
+ * or MessageDigest algorithm.</p>
+ *
+ * @since 4.5
+ */
+public final class SimpleHasher implements Hasher {
+ /**
+ * A default increment used when the requested increment is zero. This is
the same
+ * default increment used in Java's SplittableRandom random number
generator. It is the
+ * fractional representation of the golden ratio (0.618...) with a base of
2^64.
+ */
+ public static final long DEFAULT_INCREMENT = 0x9e3779b97f4a7c15L;
+
+ /**
+ * This mask is used to obtain the value of an int as if it were unsigned.
+ */
+ private static final long LONG_MASK = 0xffffffffL;
+
+ /**
+ * The initial hash value.
+ */
+ private final long initial;
+
+ /**
+ * The value to increment the hash value by.
+ */
+ private final long increment;
+
+ /**
+ * Convert bytes to long.
+ * @param byteArray the byte array to extract the values from.
+ * @param offset the offset to start extraction from.
+ * @param len the length of the extraction, may be longer than 8.
+ * @return
+ */
+ private static long toLong(byte[] byteArray, int offset, int len) {
+ long val = 0;
+ len = Math.min(len, Long.BYTES);
+ for (int i = 0; i < len; i++) {
+ val <<= 8;
+ val |= (byteArray[offset + i] & 0x00FF);
Review comment:
A personal preference is to use lower case letters. Or if you prefer
upper case then use it also for the `LONG_MASK` and golden ratio constant just
to be consistent. I would drop the leading `00` to have `0xff`.
##########
File path:
src/test/java/org/apache/commons/collections4/bloomfilter/hasher/AbstractHasherTest.java
##########
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import org.apache.commons.collections4.bloomfilter.AbstractIndexProducerTest;
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.junit.Test;
+
+public abstract class AbstractHasherTest extends AbstractIndexProducerTest {
+
+ protected abstract Hasher createHasher();
+
+ protected abstract Hasher createEmptyHasher();
+
+ /**
+ * The shape of the Hashers filters for testing.
+ * <ul>
+ * <li>Hash functions (k) = 17
+ * <li>Number of bits (m) = 72
+ * </ul>
+ * @return the testing shape.
+ */
+ protected final Shape getTestShape() {
+ return Shape.fromKM(17, 72);
+ }
+
+ @Override
+ protected IndexProducer createProducer() {
+ return createHasher().indices(getTestShape());
+ }
+
+ @Override
+ protected IndexProducer createEmptyProducer() {
+ return createEmptyHasher().indices(getTestShape());
+ }
+
+ @Test
+ public void testSize() {
+ assertEquals(1, createHasher().size());
+ assertEquals(0, createEmptyHasher().size());
+ }
+
+ @Test
+ public void testIsEmpty() {
+ assertFalse(createHasher().isEmpty());
+ assertTrue(createEmptyHasher().isEmpty());
+ }
+}
Review comment:
You have no tests for a hasher to show that the IndexProducer interface
will return the correct number of hash functions for the shape and number of
items in the hasher. As such the SingleItemHasherCollection fails this test:
This fails:
```Java
@ParameterizedTest
@CsvSource({
"17, 72",
"3, 14",
"5, 67868",
})
public void testHashing(int k, int m) {
int[] count = {0};
Hasher hasher = createHasher();
hasher.indices(Shape.fromKM(k, m)).forEachIndex(i -> {
Assertions.assertTrue(i >= 0 && i < m, () -> "Out of range: " +
i + ", m=" + m);
count[0]++;
return true;
});
Assertions.assertEquals(k * hasher.size(), count[0],
() -> String.format("Did not produce k=%d * m=%d indices", k,
hasher.size()));
}
```
The SingleItemHasherCollection needs a rethink. If you wish to combine lots
of hashers as a single item then you must pick k indices at random from the
entire output of indices from all the hashers. Otherwise you are adding too
many indices to your Bloom filter for your 'single' item.
If this is not the intention then it should be documented that the
SingleItemHasherCollection reports its size as 1 when non-empty but may
generate a number of indices larger than that requested by the Shape.
It could be fixed by implementing n choose k on the entire output of
indices. But this requires a random number generator to choose the k.
Collections does not currently depend on Commons RNG. This does provide an
efficient n choose k sampler. But then a SingleItemHasherCollection would have
to be constructed using a UniformRandomProvider. It may be better to drop this
class for this initial release.
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/SingleItemHasherCollection.java
##########
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter.hasher;
+
+import java.util.Collection;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
+import org.apache.commons.collections4.bloomfilter.Shape;
+import org.apache.commons.collections4.bloomfilter.hasher.filter.Filter;
+
+/**
+ * A collection of Hashers that are combined to be a single item. This
differs from
+ * the HasherCollection in that the HasherCollection counts each Hasher in the
collection as
+ * a different item, or in the case of an enclosed HasherCollection multiple
items. This collection
+ * assumes that all hashers are combined to make a single item.
+ *
+ * @since 4.5
+ */
+public class SingleItemHasherCollection extends HasherCollection {
+
+ /**
+ * Constructs an empty SingleItemHasherCollection.
+ */
+ public SingleItemHasherCollection() {
+ super();
+ }
+
+ /**
+ * Constructs a SingleItemHasherCollection from a collection of Hasher
objects.
+ *
+ * @param hashers A collections of Hashers to build the indices with.
+ */
+ public SingleItemHasherCollection(Collection<Hasher> hashers) {
+ super(hashers);
+ }
+
+ /**
+ * Constructor.
+ *
+ * @param hashers A list of Hashers to initialize the collection with.
+ */
+ public SingleItemHasherCollection(Hasher... hashers) {
+ super(hashers);
+ }
+
+ /**
+ * Produces unique indices.
+ *
+ * <p>Specifically, this method create an IndexProducer that will not
return duplicate indices. The effect is
+ * to make the entire collection appear as one item. This useful when
working with complex Bloom filters like the
+ * CountingBloomFilter.</p>
+ *
+ * @param shape The shape of the desired Bloom filter.
+ * @return an IndexProducer that only produces unique values.
+ */
+ @Override
+ public IndexProducer indices(final Shape shape) {
+ Objects.requireNonNull(shape, "shape");
+ IndexProducer baseProducer = super.indices(shape);
+
+ return new IndexProducer() {
+ @Override
+ public boolean forEachIndex(IntPredicate consumer) {
+ Objects.requireNonNull(consumer, "consumer");
+ int size = getHashers().size() == 0 ? 1 : getHashers().size();
+ Shape filterShape =
Shape.fromKM(shape.getNumberOfHashFunctions() * size,
Review comment:
Multiplying the number of bits by the size is wrong. The filter should
be constructed to be able to filter indices up to `nbits`. Filtering up to
`size * nbits` is wasteful as higher indices are not generated.
If the indices method is updated to drop the requirement to eliminate
duplicates then this filter is not required.
##########
File path: src/conf/checkstyle.xml
##########
@@ -63,5 +63,22 @@ limitations under the License.
<!-- Indentation style recommended by Oracle -->
<property name="caseIndent" value="0"/>
</module>
+ <!-- start of custom modules -->
Review comment:
Why add commented out rules? I assumed it is because there are a lot of
other source files that fail these rules too which is true. I put these back in
and your code is OK. The rest of the code has 1200+ errors.
Just use the rules to fix your style issues then remove these from the
commit.
I may add a placeholder task to in Jira to add some more rules to clean up
the code. The `JavadocStyle` rule is something that would benefit the online
javadoc of the codebase.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]