aherbert commented on a change in pull request #258: URL: https://github.com/apache/commons-collections/pull/258#discussion_r806265481
########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.TreeSet; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * A bloom filter using a TreeSet of integers to track enabled bits. This is a standard + * implementation and should work well for most low cardinality Bloom filters. + * @since 4.5 + */ +public class SparseBloomFilter implements BloomFilter { + + /** + * The bitSet that defines this BloomFilter. + */ + private final TreeSet<Integer> indices; + + /** + * The shape of this BloomFilter. + */ + private final Shape shape; + + /** + * Constructs an empty BitSetBloomFilter. + * + * @param shape The shape of the filter. + */ + public SparseBloomFilter(Shape shape) { + Objects.requireNonNull(shape, "shape"); + this.shape = shape; + this.indices = new TreeSet<>(); + } + + /** + * Creates an instance that is equivalent to {@code other}. + * + * @param other The bloom filter to copy. + */ + public SparseBloomFilter(BloomFilter other) { + Objects.requireNonNull(other, "other"); + this.shape = other.getShape(); + this.indices = new TreeSet<>(); + if (other.isSparse()) { + mergeInPlace((IndexProducer) other); + } else { + mergeInPlace(IndexProducer.fromBitMapProducer(other)); + } + } + + private void checkIndices(Shape shape) { + if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); + } + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape for the bloom filter. + * @param hasher the hasher to provide the initial data. + */ + public SparseBloomFilter(final Shape shape, Hasher hasher) { + this(shape); + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this::add); + checkIndices(shape); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param indices an index producer for the indices to to enable. + * @throws IllegalArgumentException if indices contains a value greater than the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, IndexProducer indices) { + this(shape); + Objects.requireNonNull(indices, "indices"); + indices.forEachIndex(this::add); + checkIndices(shape); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param bitMaps a BitMapProducer for the bit maps to add. + * @throws IllegalArgumentException if the bit maps contain a value greater than the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) { + this(shape); + Objects.requireNonNull(bitMaps, "bitMaps"); + mergeInPlace(IndexProducer + .fromBitMapProducer(new CheckBitMapCount(bitMaps, BitMap.numberOfBitMaps(shape.getNumberOfBits())))); + } + + @Override + public long[] asBitMapArray() { + long[] result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; + LongPredicate filler = new LongPredicate() { + int idx = 0; + + @Override + public boolean test(long value) { + result[idx++] = value; + return true; + } + }; + forEachBitMap(filler); + return result; + } + + @Override + public SparseBloomFilter copy() { + SparseBloomFilter result = new SparseBloomFilter(shape); + result.indices.addAll(indices); + return result; + } + + /** + * Adds the index to the indices. + * @param idx the index to add. + * @return {@code true} always + */ + private boolean add(int idx) { + indices.add(idx); + return true; + } + + /** + * Performs a merge in place using an IndexProducer. + * @param indexProducer the IndexProducer to merge from. + * @throws IllegalArgumentException if producer sends illegal value. + */ + private void mergeInPlace(IndexProducer indexProducer) { + indexProducer.forEachIndex(this::add); + if (!this.indices.isEmpty()) { + if (this.indices.last() >= shape.getNumberOfBits()) { + throw new IllegalArgumentException(String.format("Value in list %s is greater than maximum value (%s)", + this.indices.last(), shape.getNumberOfBits())); + } + if (this.indices.first() < 0) { + throw new IllegalArgumentException( + String.format("Value in list %s is less than 0", this.indices.first())); + } + } + } + + @Override + public boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + mergeInPlace(hasher.indices(shape)); + return true; + } + + @Override + public boolean mergeInPlace(BloomFilter other) { + Objects.requireNonNull(other, "other"); + IndexProducer producer = other.isSparse() ? (IndexProducer) other : IndexProducer.fromBitMapProducer(other); + mergeInPlace(producer); + return true; + } + + @Override + public Shape getShape() { + return shape; + } + + @Override + public boolean isSparse() { + return true; + } + + @Override + public int cardinality() { + return indices.size(); + } + + @Override + public boolean forEachIndex(IntPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + for (int value : indices) { + if (!consumer.test(value)) { + return false; + } + } + return true; + } + + @Override + public boolean forEachBitMap(LongPredicate consumer) { + Objects.requireNonNull(consumer, "consumer"); + int limit = BitMap.numberOfBitMaps(shape.getNumberOfBits()); + /* + * because our indices are always in order we can shorten the time necessary to + * create the longs for the consumer + */ + // the currenlty constructed bitMap + long bitMap = 0; + // the bitmap we are working on + int idx = 0; + for (int i : indices) { + while (BitMap.getLongIndex(i) != idx) { + if (!consumer.test(bitMap)) { + return false; + } + bitMap = 0; + idx++; + } + bitMap |= BitMap.getLongBit(i); + } + // we fall through with data in the bitMap + if (!consumer.test(bitMap)) { + return false; + } + // account for hte bitMap in the previous block + the next one + idx++; + // while there are more blocks to generate send zero to the consumer. + while (idx < limit) { + if (!consumer.test(0L)) { + return false; + } + idx++; + } + return true; + } + + @Override + public boolean contains(IndexProducer indexProducer) { + return indexProducer.forEachIndex(indices::contains); + } + + @Override + public boolean contains(BitMapProducer bitMapProducer) { + return contains(IndexProducer.fromBitMapProducer( Review comment: There is no documented requirement for the contains method to accept only BitMapProducer with the correct number of bitmaps for the shape. The use of CheckBitMapCount is not required. This raising an exception prevents BitMapProducers from using early exit when the remaining bitmaps are all zeros. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java ########## @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * A bloom filter using an array of bit maps to track enabled bits. This is a standard + * implementation and should work well for most Bloom filters. + * @since 4.5 + */ +public class SimpleBloomFilter implements BloomFilter { + + /** + * The array of bit map longs that defines this Bloom filter. Will be null if the filter is empty. + */ + private final long[] bitMap; + + /** + * The Shape of this Bloom filter. + */ + private final Shape shape; + + /** + * The cardinality of this Bloom filter. + */ + private int cardinality; Review comment: Why store the cardinality? If desired then this should be set as -1 when a merge is done and computed lazily. ```Java @Override public int cardinality() { // Lazy evaluation with caching int c = cardinality; if (c < 0) { for (long bits : bitMap) { c += Long.bitCount(bits); } cardinality = c; } return c; } ``` In the current scheme I can merge 50 filters and have to put up with 50 repeat calculations of the cardinality slowing it down. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java ########## @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * An object that produces indices of a Bloom filter. + * <p><em> + * The default implementation of {@code asIndexArray} is slow. Implementers should reimplement the + * method where possible.</em></p> + * + * @since 4.5 + */ +@FunctionalInterface +public interface IndexProducer { + + /** + * Each index is passed to the predicate. The predicate is applied to each + * index value, if the predicate returns {@code false} the execution is stopped, {@code false} + * is returned, and no further indices are processed. + * + * <p>Any exceptions thrown by the action are relayed to the caller.</p> + * + * <p>Indices ordering is not guaranteed</p> + * + * @param predicate the action to be performed for each non-zero bit index. + * @return {@code true} if all indexes return true from consumer, {@code false} otherwise. + * @throws NullPointerException if the specified action is null + */ + boolean forEachIndex(IntPredicate predicate); + + /** + * Creates an IndexProducer from an array of integers. + * @param values the index values + * @return an IndexProducer that uses the values. + */ + static IndexProducer fromIntArray(final int[] values) { + return new IndexProducer() { + + @Override + public boolean forEachIndex(IntPredicate predicate) { + for (int value : values) { + if (!predicate.test(value)) { + return false; + } + } + return true; + } + }; + } + + /** + * Creates an IndexProducer from a {@code BitMapProducer}. + * @param producer the {@code BitMapProducer} + * @return a new {@code IndexProducer}. + */ + static IndexProducer fromBitMapProducer(BitMapProducer producer) { + Objects.requireNonNull(producer, "producer"); + return new IndexProducer() { + @Override + public boolean forEachIndex(IntPredicate consumer) { + LongPredicate longPredicate = new LongPredicate() { + int wordIdx = 0; + + @Override + public boolean test(long word) { + int i = wordIdx; + while (word != 0) { + if ((word & 1) == 1) { + if (!consumer.test(i)) { + return false; + } + } + word >>>= 1; + i++; + } + wordIdx += 64; + return true; + } + }; + return producer.forEachBitMap(longPredicate::test); + } + }; + } + + /** + * Return a copy of the IndexProducer data as an int array. + * <p><em> + * The default implementation of this method is slow. It is recommended + * that implementing classes reimplement this method. + * </em></p> + * @return An int array of the data. + */ + default int[] asIndexArray() { Review comment: Avoid boxing in the default implementation using a BitSet. The BitSet will resize more efficiently than an ArrayList. This also ensures the indices are sorted and distinct. The stream toArray() from the BitSet is likely quite inefficient though so this is more a optimisation of memory consumption verses speed. ```Java default int[] asIndexArray() { BitSet result = new BitSet(); forEachIndex(i -> { result.set(i); return true; }); return result.stream().toArray(); } ``` ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java ########## @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.function.LongPredicate; + +/** + * Produces bit map longs for a Bloom filter. + * + * Each bit map is a little-endian long value representing a block of bits of in a filter. + * + * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} is the + * number of bits in the filter and {@code ceil} is the ceiling function. + * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit + * index is enabled. + * </p><p><em> + * The default implementations of the {@code makePredicate()} and {@code asBitMapArray} methods + * are slow and should be reimplemented in the implementing classes where possible.</em></p> + * + * @since 4.5 + */ +@FunctionalInterface +public interface BitMapProducer { + + /** + * Each bit map is passed to the predicate in order. The predicate is applied to each + * bit map value, if the predicate returns {@code false} the execution is stopped, {@code false} + * is returned, and no further bit maps are processed. + * + * <p>If the producer is empty this method will return true.</p> + * + * <p>Any exceptions thrown by the action are relayed to the caller.</p> + * + * @param predicate the function to execute + * @return {@code true} if all bit maps returned {@code true}, {@code false} otherwise. + * @throws NullPointerException if the specified consumer is null + */ + boolean forEachBitMap(LongPredicate predicate); + + /** + * Applies the {@code func} to each bit map pair in order. + * <p> + * Creates a LongPredicate that is used in apply {@code func} to this BitMapProducer + * and another. For example:</p> + * <pre> + * BitMapProducer a = ....; + * BitMapProducer b = ....; + * LongPredicate predicate = a.makePredicate( (x,y) -> x==y ); + * boolean result = b.apply( predicate ); + * </pre> + * <p>The above example will execute a.bitmapValue == b.bitmapValue for every value in b. + * </p><p> + * Notes: + * </p><ul> + * <li>The resulting LongPredicate should only be used once.</li> + * <li>Any changes made to the {@code func} arguments will not survive outside of the {@code func} call.</li> + * <li>For an example of how to use {@code makePredicate} to apply non-binary functions across all pairs of + * bit maps see the SetOperations code.</li> + * </ul> + * <p> + * <em>The default implementation of this method uses {@code asBitMapArray()} It is recommended that implementations + * of BitMapProducer that have local arrays reimplement this method.</em></p> + * + * @param func The function to apply. + * @return A LongPredicate that tests this BitMapProducers bitmap values in order. + */ + default LongPredicate makePredicate(LongBiPredicate func) { Review comment: I think creating a predicate that can be used outside of the intended scope is not ideal. Calling it too many times will generate an IOOB exception. The makePredicate method is poor encapsulation and has a name that does not describe the intended usage. It is better to hide the predicate so that it is ensured to be single use: ```Java default boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) { long[] ary = asBitMapArray(); LongPredicate p = new LongPredicate() { int idx = 0; @Override public boolean test(long other) { return func.test(idx > ary.length ? 0L : ary[idx++], other); } }; return other.forEachBitMap(p); } ``` The makePredicate is only overridden in SimpleBloomFilter. This can simply repeat the default `forEachBitMapPair` method without first dumping the bitmaps to an array. However the above method does not address Shape mismatch. This actually works: ```Java default boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) { long[] ary = asBitMapArray(); class CountingLongPredicate implements LongPredicate { int idx = 0; @Override public boolean test(long other) { return func.test(idx == ary.length ? 0 : ary[idx++], other); } boolean forEachRemaining() { while (idx != ary.length && func.test(ary[idx], 0)) { idx++; } return idx == ary.length; } } CountingLongPredicate p = new CountingLongPredicate(); if (!other.forEachBitMap(p)) { return false; } // Consume the rest return p.forEachRemaining(); } ``` This is a quick hack. Ideally the CountingLongPredicate should be package private and have a constructor accepting `long[]`. It can then be reused by the SimpleBloomFilter with the direct array reference with 2 lines of code: ```Java boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) { CountingLongPredicate p = new CountingLongPredicate(bitMap); return other.forEachBitMap(p) && p.forEachRemaining(); } ``` ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.TreeSet; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * A bloom filter using a TreeSet of integers to track enabled bits. This is a standard + * implementation and should work well for most low cardinality Bloom filters. + * @since 4.5 + */ +public class SparseBloomFilter implements BloomFilter { + + /** + * The bitSet that defines this BloomFilter. + */ + private final TreeSet<Integer> indices; + + /** + * The shape of this BloomFilter. + */ + private final Shape shape; + + /** + * Constructs an empty BitSetBloomFilter. + * + * @param shape The shape of the filter. + */ + public SparseBloomFilter(Shape shape) { + Objects.requireNonNull(shape, "shape"); + this.shape = shape; + this.indices = new TreeSet<>(); + } + + /** + * Creates an instance that is equivalent to {@code other}. + * + * @param other The bloom filter to copy. + */ + public SparseBloomFilter(BloomFilter other) { + Objects.requireNonNull(other, "other"); + this.shape = other.getShape(); + this.indices = new TreeSet<>(); + if (other.isSparse()) { + mergeInPlace((IndexProducer) other); + } else { + mergeInPlace(IndexProducer.fromBitMapProducer(other)); + } + } + + private void checkIndices(Shape shape) { + if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); + } + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape for the bloom filter. + * @param hasher the hasher to provide the initial data. + */ + public SparseBloomFilter(final Shape shape, Hasher hasher) { + this(shape); + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this::add); + checkIndices(shape); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param indices an index producer for the indices to to enable. + * @throws IllegalArgumentException if indices contains a value greater than the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, IndexProducer indices) { + this(shape); + Objects.requireNonNull(indices, "indices"); + indices.forEachIndex(this::add); + checkIndices(shape); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param bitMaps a BitMapProducer for the bit maps to add. + * @throws IllegalArgumentException if the bit maps contain a value greater than the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) { + this(shape); + Objects.requireNonNull(bitMaps, "bitMaps"); + mergeInPlace(IndexProducer + .fromBitMapProducer(new CheckBitMapCount(bitMaps, BitMap.numberOfBitMaps(shape.getNumberOfBits())))); Review comment: This use of CheckBitMapCount can be replaced with a simple merge and then a call to checkIndices. Using the CheckBitMapCount would not detect an error for the case where e.g. bits = 65, but the producer outputs [-1, -1] for the two bit maps so populates indices [0, 127]. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java ########## @@ -16,138 +16,282 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Hasher; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; -import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher; +import java.util.Objects; +import java.util.function.LongPredicate; /** * The interface that describes a Bloom filter. + * <p> + * <em>See implementation notes for BitMapProducer and IndexProducer.</em> + * </p> + * @see BitMapProducer + * @see IndexProducer * @since 4.5 */ -public interface BloomFilter { - - // Query Operations +public interface BloomFilter extends IndexProducer, BitMapProducer { /** - * Gets the shape of this filter. - * - * @return the shape of this filter + * Creates a new instance of the BloomFilter with the same properties as the current one. + * @return a copy of this BloomFilter */ - Shape getShape(); + BloomFilter copy(); + + // Query Operations /** - * Gets an array of little-endian long values representing the bits of this filter. + * This method is used to determine the best method for matching. * - * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} is the - * number of bits in the filter and {@code ceil} is the ceiling function. - * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit - * index is enabled. + * <p>For `sparse` implementations + * the {@code forEachIndex(IntConsumer consumer)} method is more efficient. For non `sparse` implementations + * the {@code forEachBitMap(LongConsumer consumer)} is more efficient. Implementers should determine if it is easier + * for the implementation to produce indexes of bit map blocks.</p> * - * @return the {@code long[]} representation of this filter + * @return {@code true} if the implementation is sparse {@code false} otherwise. + * @see BitMap */ - long[] getBits(); + boolean isSparse(); /** - * Creates a StaticHasher that contains the indexes of the bits that are on in this - * filter. - * - * @return a StaticHasher for that produces this Bloom filter + * Gets the shape that was used when the filter was built. + * @return The shape the filter was built with. */ - StaticHasher getHasher(); + Shape getShape(); /** - * Returns {@code true} if this filter contains the specified filter. Specifically this + * Returns {@code true} if this filter contains the specified filter. + * + * <p>Specifically this * returns {@code true} if this filter is enabled for all bits that are enabled in the * {@code other} filter. Using the bit representations this is - * effectively {@code (this AND other) == other}. + * effectively {@code (this AND other) == other}.</p> * * @param other the other Bloom filter - * @return true if this filter is enabled for all enabled bits in the other filter - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter + * @return true if all enabled bits in the other filter are enabled in this filter. */ - boolean contains(BloomFilter other); + default boolean contains(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return isSparse() ? contains((IndexProducer) other) : contains((BitMapProducer) other); + } /** - * Returns {@code true} if this filter contains the specified decomposed Bloom filter. - * Specifically this returns {@code true} if this filter is enabled for all bit indexes - * identified by the {@code hasher}. Using the bit representations this is - * effectively {@code (this AND hasher) == hasher}. + * Returns {@code true} if this filter contains the bits specified in the hasher. + * + * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code hasher}. Using the bit map representations this is + * effectively {@code (this AND hasher) == hasher}.</p> * * @param hasher the hasher to provide the indexes * @return true if this filter is enabled for all bits specified by the hasher - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ - boolean contains(Hasher hasher); + default boolean contains(Hasher hasher) { + Objects.requireNonNull(hasher, "Hasher"); + Shape shape = getShape(); + return contains(hasher.indices(shape)); + } + + /** + * Returns {@code true} if this filter contains the indices specified IndexProducer. + * + * <p>Specifically this returns {@code true} if this filter is enabled for all bit indexes + * identified by the {@code IndexProducer}.</p> + * + * @param indexProducer the IndexProducer to provide the indexes + * @return {@code true} if this filter is enabled for all bits specified by the IndexProducer + */ + boolean contains(IndexProducer indexProducer); - // Modification Operations + /** + * Returns {@code true} if this filter contains the bits specified in the bit maps produced by the + * bitMapProducer. + * + * @param bitMapProducer the the {@code BitMapProducer} to provide the bit maps. + * @return {@code true} if this filter is enabled for all bits specified by the bit maps + */ + default boolean contains(BitMapProducer bitMapProducer) { + return bitMapProducer.forEachBitMap(this.makePredicate((x, y) -> (x & y) == y)); + } + + // update operations /** - * Merges the specified Bloom filter into this Bloom filter. Specifically all bit indexes - * that are enabled in the {@code other} filter will be enabled in this filter. + * Merges the specified Bloom filter with this Bloom filter creating a new Bloom filter. * - * <p>Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the {@code other} Bloom filter. + * <p>Specifically all bit indexes that are enabled in the {@code other} and in @code this} filter will be + * enabled in the resulting filter.</p> * * @param other the other Bloom filter + * @return The new Bloom filter. + */ + default BloomFilter merge(BloomFilter other) { + Objects.requireNonNull(other, "other"); + BloomFilter result = copy(); + result.mergeInPlace(other); + return result; + } + + /** + * Merges the specified Hasher with this Bloom filter and returns a new Bloom filter. + * + * <p>Specifically all bit indexes that are identified by the {@code hasher} and in {@code this} Bloom filter + * be enabled in the resulting filter.</p> + * + * @param hasher the hasher to provide the indices + * @return the new Bloom filter. + */ + default BloomFilter merge(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + BloomFilter result = copy(); + result.mergeInPlace(hasher); + return result; + } + + /** + * Merges the specified Bloom filter into this Bloom filter. + * + * <p>Specifically all + * bit indexes that are identified by the {@code other} will be enabled in this filter.</p> + * + * <p><em>Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the {@code other} Bloom filter.</em> This state may occur in complex Bloom filter implementations like + * counting Bloom filters.</p> + * + * @param other The bloom filter to merge into this one. * @return true if the merge was successful - * @throws IllegalArgumentException if the shape of the other filter does not match - * the shape of this filter */ - boolean merge(BloomFilter other); + boolean mergeInPlace(BloomFilter other); /** - * Merges the specified decomposed Bloom filter into this Bloom filter. Specifically all + * Merges the specified hasher into this Bloom filter. Specifically all * bit indexes that are identified by the {@code hasher} will be enabled in this filter. * - * <p>Note: This method should return {@code true} even if no additional bit indexes were - * enabled. A {@code false} result indicates that this filter is not ensured to contain - * the specified decomposed Bloom filter. + * <p><em>Note: This method should return {@code true} even if no additional bit indexes were + * enabled. A {@code false} result indicates that this filter may or may not contain + * the {@code other} Bloom filter.</em> This state may occur in complex Bloom filter implementations like + * counting Bloom filters.</p> * - * @param hasher the hasher to provide the indexes + * @param hasher The hasher to merge. * @return true if the merge was successful - * @throws IllegalArgumentException if the hasher cannot generate indices for the shape of - * this filter */ - boolean merge(Hasher hasher); + default boolean mergeInPlace(Hasher hasher) { + Objects.requireNonNull(hasher, "hasher"); + Shape shape = getShape(); + // create the bloomfilter that is most likely to merge quickly with this one + BloomFilter result = isSparse() ? new SparseBloomFilter(shape, hasher) : new SimpleBloomFilter(shape, hasher); + return mergeInPlace(result); + } // Counting Operations + /** + * Determines if the bloom filter is "full". + * + * <p>Full is defined as having no unset bits.</p> + * + * @return {@code true} if the filter is full, {@code false} otherwise. + */ + default boolean isFull() { + return cardinality() == getShape().getNumberOfBits(); + } + /** * Gets the cardinality (number of enabled bits) of this Bloom filter. * - * <p>This is also known as the Hamming value.</p> + * <p>This is also known as the Hamming value or Hamming number.</p> * * @return the cardinality of this filter */ int cardinality(); /** - * Performs a logical "AND" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the Bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this AND other)} + * <p>By default this is the rounding of the {@code Shape.estimateN(cardinality)} calculation for the + * shape and cardinality of this filter.</p> + * + * <p>This produces an estimate roughly equivalent to the number of Hashers that have been merged into the filter.</p> + * + * @return an estimate of the number of items in the bloom filter. + * @see Shape#estimateN(int) */ - int andCardinality(BloomFilter other); + default int estimateN() { + return (int) Math.round(getShape().estimateN(cardinality())); + } /** - * Performs a logical "OR" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the union of this Bloom filter with the other bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this OR other)} + * <p>By default this is the {@code estimateN()} of the merging of this filter with the {@code other} filter.</p> + * + * <p>This produces an estimate roughly equivalent to the number of unique Hashers that have been merged into either + * of the filters.</p> + * + * @param other The other Bloom filter + * @return an estimate of the number of items in the union. + * @see #estimateN() */ - int orCardinality(BloomFilter other); + default int estimateUnion(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return this.merge(other).estimateN(); + } /** - * Performs a logical "XOR" with the other Bloom filter and returns the cardinality - * (number of enabled bits) of the result. + * Estimates the number of items in the intersection of this Bloom filter with the other bloom filter. * - * @param other the other Bloom filter - * @return the cardinality of the result of {@code (this XOR other)} + * <p>By default this is the {@code estimateN() + other.estimateN() - estimateUnion(other)} </p> + * + * <p>This produces estimate is roughly equivalent to the number of unique Hashers that have been merged into both + * of the filters.</p> + * + * @param other The other Bloom filter + * @return an estimate of the number of items in the intersection. */ - int xorCardinality(BloomFilter other); + default int estimateIntersection(BloomFilter other) { + Objects.requireNonNull(other, "other"); + return estimateN() + other.estimateN() - estimateUnion(other); + } + + /** + * Convenience class to test that a bitMapProducer is producing the proper number of bitMaps. + * Primarily used in constructors. + */ + class CheckBitMapCount implements BitMapProducer { Review comment: Remove this redundant class. It is cluttering the public API. The BitMapProducer interface is free to generate bitmaps up to a maximum starting from 0. It is of limited use and where you have used it there are are alternatives that catch more bugs since this counts number of bit maps and does not check the maximum bit index. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java ########## @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * A bloom filter using an array of bit maps to track enabled bits. This is a standard + * implementation and should work well for most Bloom filters. + * @since 4.5 + */ +public class SimpleBloomFilter implements BloomFilter { Review comment: I would make this class final. I would make all the implementations final. I do not see why most of the classes are not final as why would you extend them? Currently if this is extended a child class has limited ability to do anything as the bitmap array is private. If not final then the correct parts of the class should be made `protected` to allow efficient copy() by child classes. The simplest way is: ```Java protected SimpleBloomFilter(SimpleBloomFilter source) { // copy private fields } ``` To be used by a derived class as: ```Java MyBloomFilter(MyBloomFilter source) { super(source); // copy other stuff } @Override public MyBloomFilter copy() { return new MyBloomFilter(this); } ``` I would not make the private fields protected. That will avoid a derived class causing trouble with the values. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java ########## @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * A bloom filter using an array of bit maps to track enabled bits. This is a standard + * implementation and should work well for most Bloom filters. + * @since 4.5 + */ +public class SimpleBloomFilter implements BloomFilter { + + /** + * The array of bit map longs that defines this Bloom filter. Will be null if the filter is empty. + */ + private final long[] bitMap; + + /** + * The Shape of this Bloom filter. + */ + private final Shape shape; + + /** + * The cardinality of this Bloom filter. + */ + private int cardinality; + + /** + * Creates an empty instance. + * + * @param shape The shape for the filter. + */ + public SimpleBloomFilter(Shape shape) { + Objects.requireNonNull(shape, "shape"); + this.shape = shape; + this.bitMap = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; + this.cardinality = 0; + } + + /** + * Creates an instance that is equivalent to {@code other}. + * + * @param other The bloom filter to copy. + */ + public SimpleBloomFilter(BloomFilter other) { + Objects.requireNonNull(other, "other"); + this.shape = other.getShape(); + this.bitMap = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; + this.cardinality = 0; + if (other.isSparse()) { + mergeInPlace((IndexProducer) other); + } else { + mergeInPlace((BitMapProducer) other); + } + } + + /** + * Creates a populated instance. + * @param shape The shape for the filter. + * @param hasher the Hasher to initialize the filter with. + */ + public SimpleBloomFilter(final Shape shape, Hasher hasher) { + this(shape); + Objects.requireNonNull(hasher, "hasher"); + mergeInPlace(hasher); + } + + /** + * Creates a populated instance. + * @param shape The shape for the filter. + * @param indices the IndexProducer to initialize the filter with. + * @throws IllegalArgumentException if producer sends illegal value. + */ + public SimpleBloomFilter(final Shape shape, IndexProducer indices) { + this(shape); + Objects.requireNonNull(indices, "indices"); + mergeInPlace(indices); + } + + /** + * Creates a populated instance. + * @param shape The shape for the filter. + * @param bitMaps the BitMapProducer to initialize the filter with. + * @throws IllegalArgumentException if the producer returns too many or too few bit maps. + */ + public SimpleBloomFilter(final Shape shape, BitMapProducer bitMaps) { + this(shape); + Objects.requireNonNull(bitMaps, "bitMaps"); + mergeInPlace(bitMaps); + } + + @Override + public long[] asBitMapArray() { + return Arrays.copyOf(bitMap, bitMap.length); + } + + @Override + public LongPredicate makePredicate(LongBiPredicate func) { + return new LongPredicate() { + int idx = 0; + + @Override + public boolean test(long other) { + return func.test(idx >= bitMap.length ? 0 : bitMap[idx++], other); + } + }; + } + + @Override + public SimpleBloomFilter copy() { Review comment: Copy is done wrong. It should use `bitMap.clone()`. Creating a new array then using system array copy is slower as the JRE must first zero fill the allocated memory, then copy into it. This should be done with a copy constructor: ```Java private SimpleBloomFilter(SimpleBloomFilter source) { this.shape = source.shape; this.bitMap = source.bitMap.clone(); this.cardinality = source.cardinality; } ``` ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java ########## @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Objects; +import java.util.TreeSet; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; + +/** + * A bloom filter using a TreeSet of integers to track enabled bits. This is a standard + * implementation and should work well for most low cardinality Bloom filters. + * @since 4.5 + */ +public class SparseBloomFilter implements BloomFilter { + + /** + * The bitSet that defines this BloomFilter. + */ + private final TreeSet<Integer> indices; + + /** + * The shape of this BloomFilter. + */ + private final Shape shape; + + /** + * Constructs an empty BitSetBloomFilter. + * + * @param shape The shape of the filter. + */ + public SparseBloomFilter(Shape shape) { + Objects.requireNonNull(shape, "shape"); + this.shape = shape; + this.indices = new TreeSet<>(); + } + + /** + * Creates an instance that is equivalent to {@code other}. + * + * @param other The bloom filter to copy. + */ + public SparseBloomFilter(BloomFilter other) { + Objects.requireNonNull(other, "other"); + this.shape = other.getShape(); + this.indices = new TreeSet<>(); + if (other.isSparse()) { + mergeInPlace((IndexProducer) other); + } else { + mergeInPlace(IndexProducer.fromBitMapProducer(other)); + } + } + + private void checkIndices(Shape shape) { + if (this.indices.floor(-1) != null || this.indices.ceiling(shape.getNumberOfBits()) != null) { + throw new IllegalArgumentException( + String.format("Filter only accepts values in the [0,%d) range", shape.getNumberOfBits())); + } + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape for the bloom filter. + * @param hasher the hasher to provide the initial data. + */ + public SparseBloomFilter(final Shape shape, Hasher hasher) { + this(shape); + Objects.requireNonNull(hasher, "hasher"); + hasher.indices(shape).forEachIndex(this::add); + checkIndices(shape); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param indices an index producer for the indices to to enable. + * @throws IllegalArgumentException if indices contains a value greater than the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, IndexProducer indices) { + this(shape); + Objects.requireNonNull(indices, "indices"); + indices.forEachIndex(this::add); + checkIndices(shape); + } + + /** + * Constructs a populated Bloom filter. + * @param shape the shape of the filter. + * @param bitMaps a BitMapProducer for the bit maps to add. + * @throws IllegalArgumentException if the bit maps contain a value greater than the number + * of bits in the shape. + */ + public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) { + this(shape); + Objects.requireNonNull(bitMaps, "bitMaps"); + mergeInPlace(IndexProducer + .fromBitMapProducer(new CheckBitMapCount(bitMaps, BitMap.numberOfBitMaps(shape.getNumberOfBits())))); + } + + @Override + public long[] asBitMapArray() { Review comment: This is very inefficient. Avoid using forEachBitMap. That must composite build each bitmap as it goes. The same operations can just be done directly on the output array which will run entirely branchless apart from the iterator of `indices`: ```Java @Override public long[] asBitMapArray() { long[] result = new long[BitMap.numberOfBitMaps(shape.getNumberOfBits())]; for (int i : indices) { BitMap.set(result, i); } return result; } ``` ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java ########## @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.function.LongPredicate; + +/** + * Produces bit map longs for a Bloom filter. + * + * Each bit map is a little-endian long value representing a block of bits of in a filter. + * + * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} is the + * number of bits in the filter and {@code ceil} is the ceiling function. + * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit + * index is enabled. + * </p><p><em> + * The default implementations of the {@code makePredicate()} and {@code asBitMapArray} methods + * are slow and should be reimplemented in the implementing classes where possible.</em></p> + * + * @since 4.5 + */ +@FunctionalInterface +public interface BitMapProducer { + + /** + * Each bit map is passed to the predicate in order. The predicate is applied to each + * bit map value, if the predicate returns {@code false} the execution is stopped, {@code false} + * is returned, and no further bit maps are processed. + * + * <p>If the producer is empty this method will return true.</p> + * + * <p>Any exceptions thrown by the action are relayed to the caller.</p> + * + * @param predicate the function to execute + * @return {@code true} if all bit maps returned {@code true}, {@code false} otherwise. + * @throws NullPointerException if the specified consumer is null + */ + boolean forEachBitMap(LongPredicate predicate); + + /** + * Applies the {@code func} to each bit map pair in order. + * <p> + * Creates a LongPredicate that is used in apply {@code func} to this BitMapProducer + * and another. For example:</p> + * <pre> + * BitMapProducer a = ....; + * BitMapProducer b = ....; + * LongPredicate predicate = a.makePredicate( (x,y) -> x==y ); + * boolean result = b.apply( predicate ); + * </pre> + * <p>The above example will execute a.bitmapValue == b.bitmapValue for every value in b. + * </p><p> + * Notes: + * </p><ul> + * <li>The resulting LongPredicate should only be used once.</li> + * <li>Any changes made to the {@code func} arguments will not survive outside of the {@code func} call.</li> + * <li>For an example of how to use {@code makePredicate} to apply non-binary functions across all pairs of + * bit maps see the SetOperations code.</li> + * </ul> + * <p> + * <em>The default implementation of this method uses {@code asBitMapArray()} It is recommended that implementations + * of BitMapProducer that have local arrays reimplement this method.</em></p> + * + * @param func The function to apply. + * @return A LongPredicate that tests this BitMapProducers bitmap values in order. + */ + default LongPredicate makePredicate(LongBiPredicate func) { + long[] ary = asBitMapArray(); + + return new LongPredicate() { + int idx = 0; + + @Override + public boolean test(long other) { + return func.test(idx >= ary.length ? 0L : ary[idx++], other); + } + }; + } + + /** + * Return a copy of the BitMapProducer data as a bit map array. + * <p> + * The default implementation of this method is slow. It is recommended + * that implementing classes reimplement this method. + * </p> + * @return An array of bit map data. + */ + default long[] asBitMapArray() { Review comment: We should avoid boxing. So the default implementation can be made better: ```Java default long[] asBitMapArray() { class Bits { private long[] data = new long[16]; private int size; boolean add(long bits) { if (size == data.length) { // This will throw an out-of-memory error if there are too many bits. // Since bits are addressed using 32-bit signed integer indices // the maximum length should be ~2^31 / 2^6 = ~2^25. // Any more is a broken implementation. data = Arrays.copyOf(data, size * 2); } data[size++] = bits; return true; } long[] toArray() { // Edge case to avoid a large array copy return size == data.length ? data : Arrays.copyOf(data, size); } } Bits bits = new Bits(); forEachBitMap(bits::add); return bits.toArray(); } ``` ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java ########## @@ -16,147 +16,172 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import java.util.function.LongBinaryOperator; +import java.util.function.LongPredicate; /** - * Implementations of set operations on Bloom filters. + * Implementations of set operations on BitMapProducers. * + * @since 4.5 */ public final class SetOperations { Review comment: You have rewritten this class to entirely depend on the `makePredicate` idea where the predicate can send zeros if its array of bitmaps runs out before the other provider does. However this is not mirrored. A simple test with different shapes shows this fails: ```Java Shape shape = Shape.fromKM(3, 128); Shape shape2 = Shape.fromKM(3, 192); // ... filter1 = new SparseBloomFilter(shape2, IndexProducer.fromIntArray(new int[] { 1, 63, 185})); filter2 = new SparseBloomFilter(shape, IndexProducer.fromIntArray(new int[] { 5, 64, 69 })); // This fails! //assertEquals(6, SetOperations.orCardinality(filter1, filter2)); assertEquals(6, SetOperations.orCardinality(filter2, filter1)); ``` ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/LongBiPredicate.java ########## @@ -14,12 +14,20 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.commons.collections4.bloomfilter; /** - * Provides classes and interfaces to define the shape of a Bloom filter and the conversion - * of generic bytes to a hash of bit indexes to be used with a Bloom filter. - * + * A bi function that accepts long values. Review comment: Adapt the Javadoc from java.util.function: Represents a predicate (boolean-valued function) of two arguments. This is the primitive type specialization of {@link java.util.function.BiPredicate BiPredicate} for {@code long}. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java ########## @@ -16,147 +16,172 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import java.util.function.LongBinaryOperator; +import java.util.function.LongPredicate; /** - * Implementations of set operations on Bloom filters. + * Implementations of set operations on BitMapProducers. * + * @since 4.5 */ public final class SetOperations { /** - * Calculates the Cosine distance between two Bloom filters. + * Calculates the cardinality of the result of a LongBinaryOperator using the + * {@code BitMapProducer.makePredicate} method. + * @param first the first BitMapProducer + * @param second the second BitMapProducer + * @param op a long binary operation on where x = {@code first} and y = {@code second} bitmap producers. + * @return the calculated cardinality. + */ + private static int cardinality(BitMapProducer first, BitMapProducer second, LongBinaryOperator op) { + int[] cardinality = new int[1]; + + LongPredicate lp = first.makePredicate((x, y) -> { + cardinality[0] += Long.bitCount(op.applyAsLong(x, y)); + return true; + }); + second.forEachBitMap(lp); + return cardinality[0]; + } + + /** + * Calculates the cardinality of a BitMapProducer. By necessity this method will visit each bit map + * created by the producer. + * @param producer the Producer to calculate the cardinality for. + * @return the cardinality of the bit maps produced by the producer. + */ + public static int cardinality(BitMapProducer producer) { + int[] cardinality = new int[1]; + producer.forEachBitMap(l -> { + cardinality[0] += Long.bitCount(l); + return true; + }); + return cardinality[0]; + } + + /** + * Calculates the cardinality of the logical {@code AND} of the bit maps for the two filters. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code AND} of the filters. + */ + public static int andCardinality(final BitMapProducer first, final BitMapProducer second) { + return cardinality(first, second, (x, y) -> x & y); + } + + /** + * Calculates the cardinality of the logical {@code OR} of the bit maps for the two filters. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code OR} of the filters. + */ + public static int orCardinality(final BitMapProducer first, final BitMapProducer second) { + return cardinality(first, second, (x, y) -> x | y); + } + + /** + * Calculates the cardinality of the logical {@code XOR} of the bit maps for the two filters. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer + * @return the cardinality of the {@code XOR} of the filters. + */ + public static int xorCardinality(final BitMapProducer first, final BitMapProducer second) { + return cardinality(first, second, (x, y) -> x ^ y); + } + + /** + * Calculates the Cosine distance between two BitMapProducer. * * <p>Cosine distance is defined as {@code 1 - Cosine similarity}</p> * - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the jaccard distance. */ - public static double cosineDistance(final BloomFilter first, final BloomFilter second) { + public static double cosineDistance(final BitMapProducer first, final BitMapProducer second) { return 1.0 - cosineSimilarity(first, second); } /** - * Calculates the Cosine similarity between two Bloom filters. + * Calculates the Cosine similarity between two BitMapProducers. * <p> Also known as Orchini similarity and the Tucker coefficient of congruence or * Ochiai similarity.</p> * - * <p>If either filter is empty (no enabled bits) the result is 0 (zero)</p> + * <p>If either producer is empty the result is 0 (zero)</p> * - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Cosine similarity. */ - public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int numerator = first.andCardinality(second); - return numerator == 0 ? 0 : numerator / (Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality())); + public static double cosineSimilarity(final BitMapProducer first, final BitMapProducer second) { + final int numerator = andCardinality(first, second); + // Given that the cardinality is an int then the product as a double will not + // overflow, we can use one sqrt: + return numerator == 0 ? 0 : numerator / Math.sqrt(cardinality(first) * cardinality(second)); } /** - * Estimates the number of items in the intersection of the sets represented by two - * Bloom filters. + * Calculates the Cosine similarity between two Bloom filters. + * <p> Also known as Orchini similarity and the Tucker coefficient of congruence or + * Ochiai similarity.</p> * - * @param first the first Bloom filter. - * @param second the second Bloom filter. - * @return an estimate of the size of the intersection between the two filters. - */ - public static long estimateIntersectionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - // do subtraction early to avoid Long overflow. - return estimateSize(first) - estimateUnionSize(first, second) + estimateSize(second); - } - - /** - * Estimates the number of items in the Bloom filter based on the shape and the number - * of bits that are enabled. + * <p>If either filter is empty (no enabled bits) the result is 0 (zero)</p> * - * @param filter the Bloom filter to estimate size for. - * @return an estimate of the number of items that were placed in the Bloom filter. - */ - public static long estimateSize(final BloomFilter filter) { - final Shape shape = filter.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - filter.cardinality() * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); - } - - /** - * Estimates the number of items in the union of the sets represented by two - * Bloom filters. + * <p>This is a version of cosineSimilarity optimized for Bloom filters.</p> * * @param first the first Bloom filter. * @param second the second Bloom filter. - * @return an estimate of the size of the union between the two filters. + * @return the Cosine similarity. */ - public static long estimateUnionSize(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final Shape shape = first.getShape(); - final double estimate = -(shape.getNumberOfBits() * - Math.log(1.0 - first.orCardinality(second) * 1.0 / shape.getNumberOfBits())) / - shape.getNumberOfHashFunctions(); - return Math.round(estimate); + public static double cosineSimilarity(final BloomFilter first, final BloomFilter second) { + final int numerator = andCardinality(first, second); + // Given that the cardinality is an int then the product as a double will not + // overflow, we can use one sqrt: + return numerator == 0 ? 0 : numerator / Math.sqrt(first.cardinality() * second.cardinality()); } /** - * Calculates the Hamming distance between two Bloom filters. + * Calculates the Hamming distance between two BitMapProducers. * - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Hamming distance. */ - public static int hammingDistance(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - return first.xorCardinality(second); + public static int hammingDistance(final BitMapProducer first, final BitMapProducer second) { + return xorCardinality(first, second); } /** - * Calculates the Jaccard distance between two Bloom filters. + * Calculates the Jaccard distance between two BitMapProducer. * * <p>Jaccard distance is defined as {@code 1 - Jaccard similarity}</p> * - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Jaccard distance. */ - public static double jaccardDistance(final BloomFilter first, final BloomFilter second) { + public static double jaccardDistance(final BitMapProducer first, final BitMapProducer second) { return 1.0 - jaccardSimilarity(first, second); } /** - * Calculates the Jaccard similarity between two Bloom filters. + * Calculates the Jaccard similarity between two BitMapProducer. * * <p>Also known as Jaccard index, Intersection over Union, and Jaccard similarity coefficient</p> * - * @param first the first Bloom filter. - * @param second the second Bloom filter. + * @param first the first BitMapProducer. + * @param second the second BitMapProducer. * @return the Jaccard similarity. */ - public static double jaccardSimilarity(final BloomFilter first, final BloomFilter second) { - verifyShape(first, second); - final int orCard = first.orCardinality(second); - // if the orCard is zero then the hamming distance will also be zero. - return orCard == 0 ? 0 : hammingDistance(first, second) / (double) orCard; - } - - /** - * Verifies the Bloom filters have the same shape. - * - * @param first the first filter to check. - * @param second the second filter to check. - * @throws IllegalArgumentException if the shapes are not the same. - */ - private static void verifyShape(final BloomFilter first, final BloomFilter second) { - if (!first.getShape().equals(second.getShape())) { - throw new IllegalArgumentException(String.format("Shape %s is not the same as %s", - first.getShape(), second.getShape())); - } + public static double jaccardSimilarity(final BitMapProducer first, final BitMapProducer second) { + final int intersection = andCardinality(first, second); Review comment: There is chance for efficiency here. Using my updated method: ```Java public static double jaccardSimilarity(final BitMapProducer first, final BitMapProducer second) { int[] cardinality = new int[2]; first.forEachBitMapPair(second, (x, y) -> { cardinality[0] += Long.bitCount(x & y); cardinality[1] += Long.bitCount(x | y); return true; }); final int intersection = cardinality[0]; return intersection == 0 ? 0 : intersection / (double) cardinality[1]; } ``` It passes all the current tests in SetOperations. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java ########## @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.function.LongPredicate; + +/** + * Produces bit map longs for a Bloom filter. + * + * Each bit map is a little-endian long value representing a block of bits of in a filter. + * + * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} is the + * number of bits in the filter and {@code ceil} is the ceiling function. + * Bits 0-63 are in the first long. A value of 1 at a bit position indicates the bit + * index is enabled. + * </p><p><em> + * The default implementations of the {@code makePredicate()} and {@code asBitMapArray} methods + * are slow and should be reimplemented in the implementing classes where possible.</em></p> + * + * @since 4.5 + */ +@FunctionalInterface +public interface BitMapProducer { Review comment: Naming conventions between the to and from method in this and IndexProducer do not align: ``` IndexProducer: // Update to use the ... varargs syntax fromIntArray(int[] array) -> fromIndexArray(int... array) int[] toIndexArray() BitMapProducer: fromLongArray(long... array) -> fromBitMapArray(long... array) long[] toBitMapArray(); ``` I would also order the methods in this interface to match IndexProducer. This means moving `toBitMapArray()` as the last method. ########## File path: src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java ########## @@ -16,147 +16,172 @@ */ package org.apache.commons.collections4.bloomfilter; -import org.apache.commons.collections4.bloomfilter.hasher.Shape; +import java.util.function.LongBinaryOperator; +import java.util.function.LongPredicate; /** - * Implementations of set operations on Bloom filters. + * Implementations of set operations on BitMapProducers. * + * @since 4.5 */ public final class SetOperations { /** - * Calculates the Cosine distance between two Bloom filters. + * Calculates the cardinality of the result of a LongBinaryOperator using the + * {@code BitMapProducer.makePredicate} method. + * @param first the first BitMapProducer + * @param second the second BitMapProducer + * @param op a long binary operation on where x = {@code first} and y = {@code second} bitmap producers. + * @return the calculated cardinality. + */ + private static int cardinality(BitMapProducer first, BitMapProducer second, LongBinaryOperator op) { + int[] cardinality = new int[1]; + Review comment: With my updated method this would be: ```Java int[] cardinality = new int[1]; first.forEachBitMapPair(second, (x, y) -> { cardinality[0] += Long.bitCount(op.applyAsLong(x, y)); return true; }); return cardinality[0]; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
