aherbert commented on a change in pull request #258:
URL: 
https://github.com/apache/commons-collections/pull/258#discussion_r806265481



##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Objects;
+import java.util.TreeSet;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using a TreeSet of integers to track enabled bits. This is a 
standard
+ * implementation and should work well for most low cardinality Bloom filters.
+ * @since 4.5
+ */
+public class SparseBloomFilter implements BloomFilter {
+
+    /**
+     * The bitSet that defines this BloomFilter.
+     */
+    private final TreeSet<Integer> indices;
+
+    /**
+     * The shape of this BloomFilter.
+     */
+    private final Shape shape;
+
+    /**
+     * Constructs an empty BitSetBloomFilter.
+     *
+     * @param shape The shape of the filter.
+     */
+    public SparseBloomFilter(Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        this.shape = shape;
+        this.indices = new TreeSet<>();
+    }
+
+    /**
+     * Creates an instance that is equivalent to {@code other}.
+     *
+     * @param other The bloom filter to copy.
+     */
+    public SparseBloomFilter(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        this.shape = other.getShape();
+        this.indices = new TreeSet<>();
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace(IndexProducer.fromBitMapProducer(other));
+        }
+    }
+
+    private void checkIndices(Shape shape) {
+        if (this.indices.floor(-1) != null || 
this.indices.ceiling(shape.getNumberOfBits()) != null) {
+            throw new IllegalArgumentException(
+                    String.format("Filter only accepts values in the [0,%d) 
range", shape.getNumberOfBits()));
+        }
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape for the bloom filter.
+     * @param hasher the hasher to provide the initial data.
+     */
+    public SparseBloomFilter(final Shape shape, Hasher hasher) {
+        this(shape);
+        Objects.requireNonNull(hasher, "hasher");
+        hasher.indices(shape).forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param indices an index producer for the indices to to enable.
+     * @throws IllegalArgumentException if indices contains a value greater 
than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, IndexProducer indices) {
+        this(shape);
+        Objects.requireNonNull(indices, "indices");
+        indices.forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param bitMaps a BitMapProducer for the bit maps to add.
+     * @throws IllegalArgumentException if the bit maps contain a value 
greater than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) {
+        this(shape);
+        Objects.requireNonNull(bitMaps, "bitMaps");
+        mergeInPlace(IndexProducer
+                .fromBitMapProducer(new CheckBitMapCount(bitMaps, 
BitMap.numberOfBitMaps(shape.getNumberOfBits()))));
+    }
+
+    @Override
+    public long[] asBitMapArray() {
+        long[] result = new 
long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
+        LongPredicate filler = new LongPredicate() {
+            int idx = 0;
+
+            @Override
+            public boolean test(long value) {
+                result[idx++] = value;
+                return true;
+            }
+        };
+        forEachBitMap(filler);
+        return result;
+    }
+
+    @Override
+    public SparseBloomFilter copy() {
+        SparseBloomFilter result = new SparseBloomFilter(shape);
+        result.indices.addAll(indices);
+        return result;
+    }
+
+    /**
+     * Adds the index to the indices.
+     * @param idx the index to add.
+     * @return {@code true} always
+     */
+    private boolean add(int idx) {
+        indices.add(idx);
+        return true;
+    }
+
+    /**
+     * Performs a merge in place using an IndexProducer.
+     * @param indexProducer the IndexProducer to merge from.
+     * @throws IllegalArgumentException if producer sends illegal value.
+     */
+    private void mergeInPlace(IndexProducer indexProducer) {
+        indexProducer.forEachIndex(this::add);
+        if (!this.indices.isEmpty()) {
+            if (this.indices.last() >= shape.getNumberOfBits()) {
+                throw new IllegalArgumentException(String.format("Value in 
list %s is greater than maximum value (%s)",
+                        this.indices.last(), shape.getNumberOfBits()));
+            }
+            if (this.indices.first() < 0) {
+                throw new IllegalArgumentException(
+                        String.format("Value in list %s is less than 0", 
this.indices.first()));
+            }
+        }
+    }
+
+    @Override
+    public boolean mergeInPlace(Hasher hasher) {
+        Objects.requireNonNull(hasher, "hasher");
+        mergeInPlace(hasher.indices(shape));
+        return true;
+    }
+
+    @Override
+    public boolean mergeInPlace(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        IndexProducer producer = other.isSparse() ? (IndexProducer) other : 
IndexProducer.fromBitMapProducer(other);
+        mergeInPlace(producer);
+        return true;
+    }
+
+    @Override
+    public Shape getShape() {
+        return shape;
+    }
+
+    @Override
+    public boolean isSparse() {
+        return true;
+    }
+
+    @Override
+    public int cardinality() {
+        return indices.size();
+    }
+
+    @Override
+    public boolean forEachIndex(IntPredicate consumer) {
+        Objects.requireNonNull(consumer, "consumer");
+        for (int value : indices) {
+            if (!consumer.test(value)) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    @Override
+    public boolean forEachBitMap(LongPredicate consumer) {
+        Objects.requireNonNull(consumer, "consumer");
+        int limit = BitMap.numberOfBitMaps(shape.getNumberOfBits());
+        /*
+         * because our indices are always in order we can shorten the time 
necessary to
+         * create the longs for the consumer
+         */
+        // the currenlty constructed bitMap
+        long bitMap = 0;
+        // the bitmap we are working on
+        int idx = 0;
+        for (int i : indices) {
+            while (BitMap.getLongIndex(i) != idx) {
+                if (!consumer.test(bitMap)) {
+                    return false;
+                }
+                bitMap = 0;
+                idx++;
+            }
+            bitMap |= BitMap.getLongBit(i);
+        }
+        // we fall through with data in the bitMap
+        if (!consumer.test(bitMap)) {
+            return false;
+        }
+        // account for hte bitMap in the previous block + the next one
+        idx++;
+        // while there are more blocks to generate send zero to the consumer.
+        while (idx < limit) {
+            if (!consumer.test(0L)) {
+                return false;
+            }
+            idx++;
+        }
+        return true;
+    }
+
+    @Override
+    public boolean contains(IndexProducer indexProducer) {
+        return indexProducer.forEachIndex(indices::contains);
+    }
+
+    @Override
+    public boolean contains(BitMapProducer bitMapProducer) {
+        return contains(IndexProducer.fromBitMapProducer(

Review comment:
       There is no documented requirement for the contains method to accept 
only BitMapProducer with the correct number of bitmaps for the shape. The use 
of CheckBitMapCount is not required.
   
   This raising an exception prevents BitMapProducers from using early exit 
when the remaining bitmaps are all zeros.

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using an array of bit maps to track enabled bits. This is a 
standard
+ * implementation and should work well for most Bloom filters.
+ * @since 4.5
+ */
+public class SimpleBloomFilter implements BloomFilter {
+
+    /**
+     * The array of bit map longs that defines this Bloom filter.  Will be 
null if the filter is empty.
+     */
+    private final long[] bitMap;
+
+    /**
+     * The Shape of this Bloom filter.
+     */
+    private final Shape shape;
+
+    /**
+     * The cardinality of this Bloom filter.
+     */
+    private int cardinality;

Review comment:
       Why store the cardinality? If desired then this should be set as -1 when 
a merge is done and computed lazily.
   ```Java
   @Override
   public int cardinality() {
       // Lazy evaluation with caching
       int c = cardinality;
       if (c < 0) {
           for (long bits : bitMap) {
               c += Long.bitCount(bits);
           }
           cardinality = c;
       }
       return c;
   }
   ```
   In the current scheme I can merge 50 filters and have to put up with 50 
repeat calculations of the cardinality slowing it down.
   

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/IndexProducer.java
##########
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * An object that produces indices of a Bloom filter.
+ * <p><em>
+ * The default implementation of {@code asIndexArray} is slow.  Implementers 
should reimplement the
+ * method where possible.</em></p>
+ *
+ * @since 4.5
+ */
+@FunctionalInterface
+public interface IndexProducer {
+
+    /**
+     * Each index is passed to the predicate.  The predicate is applied to each
+     * index value, if the predicate returns {@code false} the execution is 
stopped, {@code false}
+     * is returned, and no further indices are processed.
+     *
+     * <p>Any exceptions thrown by the action are relayed to the caller.</p>
+     *
+     * <p>Indices ordering is not guaranteed</p>
+     *
+     * @param predicate the action to be performed for each non-zero bit index.
+     * @return {@code true} if all indexes return true from consumer, {@code 
false} otherwise.
+     * @throws NullPointerException if the specified action is null
+     */
+    boolean forEachIndex(IntPredicate predicate);
+
+    /**
+     * Creates an IndexProducer from an array of integers.
+     * @param values the index values
+     * @return an IndexProducer that uses the values.
+     */
+    static IndexProducer fromIntArray(final int[] values) {
+        return new IndexProducer() {
+
+            @Override
+            public boolean forEachIndex(IntPredicate predicate) {
+                for (int value : values) {
+                    if (!predicate.test(value)) {
+                        return false;
+                    }
+                }
+                return true;
+            }
+        };
+    }
+
+    /**
+     * Creates an IndexProducer from a {@code BitMapProducer}.
+     * @param producer the {@code BitMapProducer}
+     * @return a new {@code IndexProducer}.
+     */
+    static IndexProducer fromBitMapProducer(BitMapProducer producer) {
+        Objects.requireNonNull(producer, "producer");
+        return new IndexProducer() {
+            @Override
+            public boolean forEachIndex(IntPredicate consumer) {
+                LongPredicate longPredicate = new LongPredicate() {
+                    int wordIdx = 0;
+
+                    @Override
+                    public boolean test(long word) {
+                        int i = wordIdx;
+                        while (word != 0) {
+                            if ((word & 1) == 1) {
+                                if (!consumer.test(i)) {
+                                    return false;
+                                }
+                            }
+                            word >>>= 1;
+                            i++;
+                        }
+                        wordIdx += 64;
+                        return true;
+                    }
+                };
+                return producer.forEachBitMap(longPredicate::test);
+            }
+        };
+    }
+
+    /**
+     * Return a copy of the IndexProducer data as an int array.
+     * <p><em>
+     * The default implementation of this method is slow.  It is recommended
+     * that implementing classes reimplement this method.
+     * </em></p>
+     * @return An int array of the data.
+     */
+    default int[] asIndexArray() {

Review comment:
       Avoid boxing in the default implementation using a BitSet. The BitSet 
will resize more efficiently than an ArrayList. This also ensures the indices 
are sorted and distinct. The stream toArray() from the BitSet is likely quite 
inefficient though so this is more a optimisation of memory consumption verses 
speed.
   
   ```Java
   default int[] asIndexArray() {
       BitSet result = new BitSet();
       forEachIndex(i -> {
           result.set(i);
           return true;
       });
       return result.stream().toArray();
   }
   ```

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.LongPredicate;
+
+/**
+ * Produces bit map longs for a Bloom filter.
+ *
+ * Each bit map is a little-endian long value representing a block of bits of 
in a filter.
+ *
+ * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} 
is the
+ * number of bits in the filter and {@code ceil} is the ceiling function.
+ * Bits 0-63 are in the first long. A value of 1 at a bit position indicates 
the bit
+ * index is enabled.
+ * </p><p><em>
+ * The default implementations of the {@code makePredicate()} and {@code 
asBitMapArray} methods
+ * are slow and should be reimplemented in the implementing classes where 
possible.</em></p>
+ *
+ * @since 4.5
+ */
+@FunctionalInterface
+public interface BitMapProducer {
+
+    /**
+     * Each bit map is passed to the predicate in order.  The predicate is 
applied to each
+     * bit map value, if the predicate returns {@code false} the execution is 
stopped, {@code false}
+     * is returned, and no further bit maps are processed.
+     *
+     * <p>If the producer is empty this method will return true.</p>
+     *
+     * <p>Any exceptions thrown by the action are relayed to the caller.</p>
+     *
+     * @param predicate the function to execute
+     * @return {@code true} if all bit maps returned {@code true}, {@code 
false} otherwise.
+     * @throws NullPointerException if the specified consumer is null
+     */
+    boolean forEachBitMap(LongPredicate predicate);
+
+    /**
+     * Applies the {@code func} to each bit map pair in order.
+     * <p>
+     * Creates a LongPredicate that is used in apply {@code func} to this 
BitMapProducer
+     * and another.  For example:</p>
+     * <pre>
+     * BitMapProducer a = ....;
+     * BitMapProducer b = ....;
+     * LongPredicate predicate = a.makePredicate( (x,y) -&gt; x==y );
+     * boolean result = b.apply( predicate );
+     * </pre>
+     * <p>The above example will execute a.bitmapValue == b.bitmapValue for 
every value in b.
+     * </p><p>
+     * Notes:
+     * </p><ul>
+     * <li>The resulting LongPredicate should only be used once.</li>
+     * <li>Any changes made to the {@code func} arguments will not survive 
outside of the {@code func} call.</li>
+     * <li>For an example of how to use {@code makePredicate} to apply 
non-binary functions across all pairs of
+     * bit maps see the SetOperations code.</li>
+     * </ul>
+     * <p>
+     * <em>The default implementation of this method uses {@code 
asBitMapArray()}  It is recommended that implementations
+     * of BitMapProducer that have local arrays reimplement this 
method.</em></p>
+     *
+     * @param func The function to apply.
+     * @return A LongPredicate that tests this BitMapProducers bitmap values 
in order.
+     */
+    default LongPredicate makePredicate(LongBiPredicate func) {

Review comment:
       I think creating a predicate that can be used outside of the intended 
scope is not ideal. Calling it too many times will generate an IOOB exception. 
The makePredicate method is poor encapsulation and has a name that does not 
describe the intended usage. It is better to hide the predicate so that it is 
ensured to be single use:
   
   ```Java
   default boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate 
func) {
       long[] ary = asBitMapArray();
   
       LongPredicate p = new LongPredicate() {
           int idx = 0;
   
           @Override
           public boolean test(long other) {
               return func.test(idx > ary.length ? 0L : ary[idx++], other);
           }
       };
       return other.forEachBitMap(p);
   }
   ```
   
   The makePredicate is only overridden in SimpleBloomFilter. This can simply 
repeat the default `forEachBitMapPair`  method without first dumping the 
bitmaps to an array.
   
   However the above method does not address Shape mismatch. This actually 
works:
   ```Java
      default boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate 
func) {
           long[] ary = asBitMapArray();
           class CountingLongPredicate implements LongPredicate {
               int idx = 0;
   
               @Override
               public boolean test(long other) {
                   return func.test(idx == ary.length ? 0 : ary[idx++], other);
               }
   
               boolean forEachRemaining() {
                   while (idx != ary.length && func.test(ary[idx], 0)) {
                       idx++;
                   }
                   return idx == ary.length;
               }
           }
   
           CountingLongPredicate p = new CountingLongPredicate();
           if (!other.forEachBitMap(p)) {
               return false;
           }
           // Consume the rest
           return p.forEachRemaining();
       }
   ```
   This is a quick hack. Ideally the CountingLongPredicate should be package 
private and have a constructor accepting `long[]`. It can then be reused by the 
SimpleBloomFilter with the direct array reference with 2 lines of code:
   ```Java
   boolean forEachBitMapPair(BitMapProducer other, LongBiPredicate func) {
           CountingLongPredicate p = new CountingLongPredicate(bitMap);
           return other.forEachBitMap(p) && p.forEachRemaining();
   }
   ```

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Objects;
+import java.util.TreeSet;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using a TreeSet of integers to track enabled bits. This is a 
standard
+ * implementation and should work well for most low cardinality Bloom filters.
+ * @since 4.5
+ */
+public class SparseBloomFilter implements BloomFilter {
+
+    /**
+     * The bitSet that defines this BloomFilter.
+     */
+    private final TreeSet<Integer> indices;
+
+    /**
+     * The shape of this BloomFilter.
+     */
+    private final Shape shape;
+
+    /**
+     * Constructs an empty BitSetBloomFilter.
+     *
+     * @param shape The shape of the filter.
+     */
+    public SparseBloomFilter(Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        this.shape = shape;
+        this.indices = new TreeSet<>();
+    }
+
+    /**
+     * Creates an instance that is equivalent to {@code other}.
+     *
+     * @param other The bloom filter to copy.
+     */
+    public SparseBloomFilter(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        this.shape = other.getShape();
+        this.indices = new TreeSet<>();
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace(IndexProducer.fromBitMapProducer(other));
+        }
+    }
+
+    private void checkIndices(Shape shape) {
+        if (this.indices.floor(-1) != null || 
this.indices.ceiling(shape.getNumberOfBits()) != null) {
+            throw new IllegalArgumentException(
+                    String.format("Filter only accepts values in the [0,%d) 
range", shape.getNumberOfBits()));
+        }
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape for the bloom filter.
+     * @param hasher the hasher to provide the initial data.
+     */
+    public SparseBloomFilter(final Shape shape, Hasher hasher) {
+        this(shape);
+        Objects.requireNonNull(hasher, "hasher");
+        hasher.indices(shape).forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param indices an index producer for the indices to to enable.
+     * @throws IllegalArgumentException if indices contains a value greater 
than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, IndexProducer indices) {
+        this(shape);
+        Objects.requireNonNull(indices, "indices");
+        indices.forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param bitMaps a BitMapProducer for the bit maps to add.
+     * @throws IllegalArgumentException if the bit maps contain a value 
greater than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) {
+        this(shape);
+        Objects.requireNonNull(bitMaps, "bitMaps");
+        mergeInPlace(IndexProducer
+                .fromBitMapProducer(new CheckBitMapCount(bitMaps, 
BitMap.numberOfBitMaps(shape.getNumberOfBits()))));

Review comment:
       This use of CheckBitMapCount can be replaced with a simple merge and 
then a call to checkIndices. Using the CheckBitMapCount would not detect an 
error for the case where e.g. bits = 65, but the producer outputs [-1, -1] for 
the two bit maps so populates indices [0, 127].

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/BloomFilter.java
##########
@@ -16,138 +16,282 @@
  */
 package org.apache.commons.collections4.bloomfilter;
 
-import org.apache.commons.collections4.bloomfilter.hasher.Hasher;
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
-import org.apache.commons.collections4.bloomfilter.hasher.StaticHasher;
+import java.util.Objects;
+import java.util.function.LongPredicate;
 
 /**
  * The interface that describes a Bloom filter.
+ * <p>
+ * <em>See implementation notes for BitMapProducer and IndexProducer.</em>
+ * </p>
+ * @see BitMapProducer
+ * @see IndexProducer
  * @since 4.5
  */
-public interface BloomFilter {
-
-    // Query Operations
+public interface BloomFilter extends IndexProducer, BitMapProducer {
 
     /**
-     * Gets the shape of this filter.
-     *
-     * @return the shape of this filter
+     * Creates a new instance of the BloomFilter with the same properties as 
the current one.
+     * @return a copy of this BloomFilter
      */
-    Shape getShape();
+    BloomFilter copy();
+
+    // Query Operations
 
     /**
-     * Gets an array of little-endian long values representing the bits of 
this filter.
+     * This method is used to determine the best method for matching.
      *
-     * <p>The returned array will have length {@code ceil(m / 64)} where 
{@code m} is the
-     * number of bits in the filter and {@code ceil} is the ceiling function.
-     * Bits 0-63 are in the first long. A value of 1 at a bit position 
indicates the bit
-     * index is enabled.
+     * <p>For `sparse` implementations
+     * the {@code forEachIndex(IntConsumer consumer)} method is more 
efficient.  For non `sparse` implementations
+     * the {@code forEachBitMap(LongConsumer consumer)} is more efficient.  
Implementers should determine if it is easier
+     * for the implementation to produce indexes of bit map blocks.</p>
      *
-     * @return the {@code long[]} representation of this filter
+     * @return {@code true} if the implementation is sparse {@code false} 
otherwise.
+     * @see BitMap
      */
-    long[] getBits();
+    boolean isSparse();
 
     /**
-     * Creates a StaticHasher that contains the indexes of the bits that are 
on in this
-     * filter.
-     *
-     * @return a StaticHasher for that produces this Bloom filter
+     * Gets the shape that was used when the filter was built.
+     * @return The shape the filter was built with.
      */
-    StaticHasher getHasher();
+    Shape getShape();
 
     /**
-     * Returns {@code true} if this filter contains the specified filter. 
Specifically this
+     * Returns {@code true} if this filter contains the specified filter.
+     *
+     * <p>Specifically this
      * returns {@code true} if this filter is enabled for all bits that are 
enabled in the
      * {@code other} filter. Using the bit representations this is
-     * effectively {@code (this AND other) == other}.
+     * effectively {@code (this AND other) == other}.</p>
      *
      * @param other the other Bloom filter
-     * @return true if this filter is enabled for all enabled bits in the 
other filter
-     * @throws IllegalArgumentException if the shape of the other filter does 
not match
-     * the shape of this filter
+     * @return true if all enabled bits in the other filter are enabled in 
this filter.
      */
-    boolean contains(BloomFilter other);
+    default boolean contains(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        return isSparse() ? contains((IndexProducer) other) : 
contains((BitMapProducer) other);
+    }
 
     /**
-     * Returns {@code true} if this filter contains the specified decomposed 
Bloom filter.
-     * Specifically this returns {@code true} if this filter is enabled for 
all bit indexes
-     * identified by the {@code hasher}. Using the bit representations this is
-     * effectively {@code (this AND hasher) == hasher}.
+     * Returns {@code true} if this filter contains the bits specified in the 
hasher.
+     *
+     * <p>Specifically this returns {@code true} if this filter is enabled for 
all bit indexes
+     * identified by the {@code hasher}. Using the bit map representations 
this is
+     * effectively {@code (this AND hasher) == hasher}.</p>
      *
      * @param hasher the hasher to provide the indexes
      * @return true if this filter is enabled for all bits specified by the 
hasher
-     * @throws IllegalArgumentException if the hasher cannot generate indices 
for the shape of
-     * this filter
      */
-    boolean contains(Hasher hasher);
+    default boolean contains(Hasher hasher) {
+        Objects.requireNonNull(hasher, "Hasher");
+        Shape shape = getShape();
+        return contains(hasher.indices(shape));
+    }
+
+    /**
+     * Returns {@code true} if this filter contains the indices specified 
IndexProducer.
+     *
+     * <p>Specifically this returns {@code true} if this filter is enabled for 
all bit indexes
+     * identified by the {@code IndexProducer}.</p>
+     *
+     * @param indexProducer the IndexProducer to provide the indexes
+     * @return {@code true} if this filter is enabled for all bits specified 
by the IndexProducer
+     */
+    boolean contains(IndexProducer indexProducer);
 
-    // Modification Operations
+    /**
+     * Returns {@code true} if this filter contains the bits specified in the 
bit maps produced by the
+     * bitMapProducer.
+     *
+     * @param bitMapProducer the the {@code BitMapProducer} to provide the bit 
maps.
+     * @return {@code true} if this filter is enabled for all bits specified 
by the bit maps
+     */
+    default boolean contains(BitMapProducer bitMapProducer) {
+        return bitMapProducer.forEachBitMap(this.makePredicate((x, y) -> (x & 
y) == y));
+    }
+
+    // update operations
 
     /**
-     * Merges the specified Bloom filter into this Bloom filter. Specifically 
all bit indexes
-     * that are enabled in the {@code other} filter will be enabled in this 
filter.
+     * Merges the specified Bloom filter with this Bloom filter creating a new 
Bloom filter.
      *
-     * <p>Note: This method should return {@code true} even if no additional 
bit indexes were
-     * enabled. A {@code false} result indicates that this filter is not 
ensured to contain
-     * the {@code other} Bloom filter.
+     * <p>Specifically all bit indexes that are enabled in the {@code other} 
and in @code this} filter will be
+     * enabled in the resulting filter.</p>
      *
      * @param other the other Bloom filter
+     * @return The new Bloom filter.
+     */
+    default BloomFilter merge(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        BloomFilter result = copy();
+        result.mergeInPlace(other);
+        return result;
+    }
+
+    /**
+     * Merges the specified Hasher with this Bloom filter and returns a new 
Bloom filter.
+     *
+     * <p>Specifically all bit indexes that are identified by the {@code 
hasher} and in {@code this} Bloom filter
+     * be enabled in the resulting filter.</p>
+     *
+     * @param hasher the hasher to provide the indices
+     * @return the new Bloom filter.
+     */
+    default BloomFilter merge(Hasher hasher) {
+        Objects.requireNonNull(hasher, "hasher");
+        BloomFilter result = copy();
+        result.mergeInPlace(hasher);
+        return result;
+    }
+
+    /**
+     * Merges the specified Bloom filter into this Bloom filter.
+     *
+     * <p>Specifically all
+     * bit indexes that are identified by the {@code other} will be enabled in 
this filter.</p>
+     *
+     * <p><em>Note: This method should return {@code true} even if no 
additional bit indexes were
+     * enabled. A {@code false} result indicates that this filter may or may 
not contain
+     * the {@code other} Bloom filter.</em>  This state may occur in complex 
Bloom filter implementations like
+     * counting Bloom filters.</p>
+     *
+     * @param other The bloom filter to merge into this one.
      * @return true if the merge was successful
-     * @throws IllegalArgumentException if the shape of the other filter does 
not match
-     * the shape of this filter
      */
-    boolean merge(BloomFilter other);
+    boolean mergeInPlace(BloomFilter other);
 
     /**
-     * Merges the specified decomposed Bloom filter into this Bloom filter. 
Specifically all
+     * Merges the specified hasher into this Bloom filter. Specifically all
      * bit indexes that are identified by the {@code hasher} will be enabled 
in this filter.
      *
-     * <p>Note: This method should return {@code true} even if no additional 
bit indexes were
-     * enabled. A {@code false} result indicates that this filter is not 
ensured to contain
-     * the specified decomposed Bloom filter.
+     * <p><em>Note: This method should return {@code true} even if no 
additional bit indexes were
+     * enabled. A {@code false} result indicates that this filter may or may 
not contain
+     * the {@code other} Bloom filter.</em>  This state may occur in complex 
Bloom filter implementations like
+     * counting Bloom filters.</p>
      *
-     * @param hasher the hasher to provide the indexes
+     * @param hasher The hasher to merge.
      * @return true if the merge was successful
-     * @throws IllegalArgumentException if the hasher cannot generate indices 
for the shape of
-     * this filter
      */
-    boolean merge(Hasher hasher);
+    default boolean mergeInPlace(Hasher hasher) {
+        Objects.requireNonNull(hasher, "hasher");
+        Shape shape = getShape();
+        // create the bloomfilter that is most likely to merge quickly with 
this one
+        BloomFilter result = isSparse() ? new SparseBloomFilter(shape, hasher) 
: new SimpleBloomFilter(shape, hasher);
+        return mergeInPlace(result);
+    }
 
     // Counting Operations
 
+    /**
+     * Determines if the bloom filter is "full".
+     *
+     * <p>Full is defined as having no unset bits.</p>
+     *
+     * @return {@code true} if the filter is full, {@code false} otherwise.
+     */
+    default boolean isFull() {
+        return cardinality() == getShape().getNumberOfBits();
+    }
+
     /**
      * Gets the cardinality (number of enabled bits) of this Bloom filter.
      *
-     * <p>This is also known as the Hamming value.</p>
+     * <p>This is also known as the Hamming value or Hamming number.</p>
      *
      * @return the cardinality of this filter
      */
     int cardinality();
 
     /**
-     * Performs a logical "AND" with the other Bloom filter and returns the 
cardinality
-     * (number of enabled bits) of the result.
+     * Estimates the number of items in the Bloom filter.
      *
-     * @param other the other Bloom filter
-     * @return the cardinality of the result of {@code (this AND other)}
+     * <p>By default this is the rounding of the {@code 
Shape.estimateN(cardinality)} calculation for the
+     * shape and cardinality of this filter.</p>
+     *
+     * <p>This produces an estimate roughly equivalent to the number of 
Hashers that have been merged into the filter.</p>
+     *
+     * @return an estimate of the number of items in the bloom filter.
+     * @see Shape#estimateN(int)
      */
-    int andCardinality(BloomFilter other);
+    default int estimateN() {
+        return (int) Math.round(getShape().estimateN(cardinality()));
+    }
 
     /**
-     * Performs a logical "OR" with the other Bloom filter and returns the 
cardinality
-     * (number of enabled bits) of the result.
+     * Estimates the number of items in the union of this Bloom filter with 
the other bloom filter.
      *
-     * @param other the other Bloom filter
-     * @return the cardinality of the result of {@code (this OR other)}
+     * <p>By default this is the {@code estimateN()} of the merging of this 
filter with the {@code other} filter.</p>
+     *
+     * <p>This produces an estimate roughly equivalent to the number of unique 
Hashers that have been merged into either
+     * of the filters.</p>
+     *
+     * @param other The other Bloom filter
+     * @return an estimate of the number of items in the union.
+     * @see #estimateN()
      */
-    int orCardinality(BloomFilter other);
+    default int estimateUnion(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        return this.merge(other).estimateN();
+    }
 
     /**
-     * Performs a logical "XOR" with the other Bloom filter and returns the 
cardinality
-     * (number of enabled bits) of the result.
+     * Estimates the number of items in the intersection of this Bloom filter 
with the other bloom filter.
      *
-     * @param other the other Bloom filter
-     * @return the cardinality of the result of {@code (this XOR other)}
+     * <p>By default this is the {@code estimateN() + other.estimateN() - 
estimateUnion(other)} </p>
+     *
+     * <p>This produces estimate is roughly equivalent to the number of unique 
Hashers that have been merged into both
+     * of the filters.</p>
+     *
+     * @param other The other Bloom filter
+     * @return an estimate of the number of items in the intersection.
      */
-    int xorCardinality(BloomFilter other);
+    default int estimateIntersection(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        return estimateN() + other.estimateN() - estimateUnion(other);
+    }
+
+    /**
+     * Convenience class to test that a bitMapProducer is producing the proper 
number of bitMaps.
+     * Primarily used in constructors.
+     */
+    class CheckBitMapCount implements BitMapProducer {

Review comment:
       Remove this redundant class. It is cluttering the public API. The 
BitMapProducer interface is free to generate bitmaps up to a maximum starting 
from 0. It is of limited use and where you have used it there are are 
alternatives that catch more bugs since this counts number of bit maps and does 
not check the maximum bit index.

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using an array of bit maps to track enabled bits. This is a 
standard
+ * implementation and should work well for most Bloom filters.
+ * @since 4.5
+ */
+public class SimpleBloomFilter implements BloomFilter {

Review comment:
       I would make this class final. I would make all the implementations 
final. I do not see why most of the classes are not final as why would you 
extend them?
   
   Currently if this is extended a child class has limited ability to do 
anything as the bitmap array is private.
   
   If not final then the correct parts of the class should be made `protected` 
to allow efficient copy() by child classes. The simplest way is:
   ```Java
   protected SimpleBloomFilter(SimpleBloomFilter source) {
       // copy private fields
   }
   ```
   To be used by a derived class as:
   ```Java
   MyBloomFilter(MyBloomFilter source) {
       super(source);
       // copy other stuff
   }
   @Override
   public MyBloomFilter copy() {
        return new MyBloomFilter(this);
   }
   ```
   I would not make the private fields protected. That will avoid a derived 
class causing trouble with the values.

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SimpleBloomFilter.java
##########
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Arrays;
+import java.util.Objects;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using an array of bit maps to track enabled bits. This is a 
standard
+ * implementation and should work well for most Bloom filters.
+ * @since 4.5
+ */
+public class SimpleBloomFilter implements BloomFilter {
+
+    /**
+     * The array of bit map longs that defines this Bloom filter.  Will be 
null if the filter is empty.
+     */
+    private final long[] bitMap;
+
+    /**
+     * The Shape of this Bloom filter.
+     */
+    private final Shape shape;
+
+    /**
+     * The cardinality of this Bloom filter.
+     */
+    private int cardinality;
+
+    /**
+     * Creates an empty instance.
+     *
+     * @param shape The shape for the filter.
+     */
+    public SimpleBloomFilter(Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        this.shape = shape;
+        this.bitMap = new 
long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
+        this.cardinality = 0;
+    }
+
+    /**
+     * Creates an instance that is equivalent to {@code other}.
+     *
+     * @param other The bloom filter to copy.
+     */
+    public SimpleBloomFilter(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        this.shape = other.getShape();
+        this.bitMap = new 
long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
+        this.cardinality = 0;
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace((BitMapProducer) other);
+        }
+    }
+
+    /**
+     * Creates a populated instance.
+     * @param shape The shape for the filter.
+     * @param hasher the Hasher to initialize the filter with.
+     */
+    public SimpleBloomFilter(final Shape shape, Hasher hasher) {
+        this(shape);
+        Objects.requireNonNull(hasher, "hasher");
+        mergeInPlace(hasher);
+    }
+
+    /**
+     * Creates a populated instance.
+     * @param shape The shape for the filter.
+     * @param indices the IndexProducer to initialize the filter with.
+     * @throws IllegalArgumentException if producer sends illegal value.
+     */
+    public SimpleBloomFilter(final Shape shape, IndexProducer indices) {
+        this(shape);
+        Objects.requireNonNull(indices, "indices");
+        mergeInPlace(indices);
+    }
+
+    /**
+     * Creates a populated instance.
+     * @param shape The shape for the filter.
+     * @param bitMaps the BitMapProducer to initialize the filter with.
+     * @throws IllegalArgumentException if the producer returns too many or 
too few bit maps.
+     */
+    public SimpleBloomFilter(final Shape shape, BitMapProducer bitMaps) {
+        this(shape);
+        Objects.requireNonNull(bitMaps, "bitMaps");
+        mergeInPlace(bitMaps);
+    }
+
+    @Override
+    public long[] asBitMapArray() {
+        return Arrays.copyOf(bitMap, bitMap.length);
+    }
+
+    @Override
+    public LongPredicate makePredicate(LongBiPredicate func) {
+        return new LongPredicate() {
+            int idx = 0;
+
+            @Override
+            public boolean test(long other) {
+                return func.test(idx >= bitMap.length ? 0 : bitMap[idx++], 
other);
+            }
+        };
+    }
+
+    @Override
+    public SimpleBloomFilter copy() {

Review comment:
       Copy is done wrong. It should use `bitMap.clone()`. Creating a new array 
then using system array copy is slower as the JRE must first zero fill the 
allocated memory, then copy into it. 
   
   This should be done with a copy constructor:
   
   ```Java
   private SimpleBloomFilter(SimpleBloomFilter source) {
       this.shape = source.shape;
       this.bitMap = source.bitMap.clone();
       this.cardinality = source.cardinality;
   }
   ```

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SparseBloomFilter.java
##########
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.Objects;
+import java.util.TreeSet;
+import java.util.function.IntPredicate;
+import java.util.function.LongPredicate;
+
+/**
+ * A bloom filter using a TreeSet of integers to track enabled bits. This is a 
standard
+ * implementation and should work well for most low cardinality Bloom filters.
+ * @since 4.5
+ */
+public class SparseBloomFilter implements BloomFilter {
+
+    /**
+     * The bitSet that defines this BloomFilter.
+     */
+    private final TreeSet<Integer> indices;
+
+    /**
+     * The shape of this BloomFilter.
+     */
+    private final Shape shape;
+
+    /**
+     * Constructs an empty BitSetBloomFilter.
+     *
+     * @param shape The shape of the filter.
+     */
+    public SparseBloomFilter(Shape shape) {
+        Objects.requireNonNull(shape, "shape");
+        this.shape = shape;
+        this.indices = new TreeSet<>();
+    }
+
+    /**
+     * Creates an instance that is equivalent to {@code other}.
+     *
+     * @param other The bloom filter to copy.
+     */
+    public SparseBloomFilter(BloomFilter other) {
+        Objects.requireNonNull(other, "other");
+        this.shape = other.getShape();
+        this.indices = new TreeSet<>();
+        if (other.isSparse()) {
+            mergeInPlace((IndexProducer) other);
+        } else {
+            mergeInPlace(IndexProducer.fromBitMapProducer(other));
+        }
+    }
+
+    private void checkIndices(Shape shape) {
+        if (this.indices.floor(-1) != null || 
this.indices.ceiling(shape.getNumberOfBits()) != null) {
+            throw new IllegalArgumentException(
+                    String.format("Filter only accepts values in the [0,%d) 
range", shape.getNumberOfBits()));
+        }
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape for the bloom filter.
+     * @param hasher the hasher to provide the initial data.
+     */
+    public SparseBloomFilter(final Shape shape, Hasher hasher) {
+        this(shape);
+        Objects.requireNonNull(hasher, "hasher");
+        hasher.indices(shape).forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param indices an index producer for the indices to to enable.
+     * @throws IllegalArgumentException if indices contains a value greater 
than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, IndexProducer indices) {
+        this(shape);
+        Objects.requireNonNull(indices, "indices");
+        indices.forEachIndex(this::add);
+        checkIndices(shape);
+    }
+
+    /**
+     * Constructs a populated Bloom filter.
+     * @param shape the shape of the filter.
+     * @param bitMaps a BitMapProducer for the bit maps to add.
+     * @throws IllegalArgumentException if the bit maps contain a value 
greater than the number
+     * of bits in the shape.
+     */
+    public SparseBloomFilter(Shape shape, BitMapProducer bitMaps) {
+        this(shape);
+        Objects.requireNonNull(bitMaps, "bitMaps");
+        mergeInPlace(IndexProducer
+                .fromBitMapProducer(new CheckBitMapCount(bitMaps, 
BitMap.numberOfBitMaps(shape.getNumberOfBits()))));
+    }
+
+    @Override
+    public long[] asBitMapArray() {

Review comment:
       This is very inefficient. Avoid using forEachBitMap. That must composite 
build each bitmap as it goes. The same operations can just be done directly on 
the output array which will run entirely branchless apart from the iterator of 
`indices`:
   
   ```Java
   @Override
   public long[] asBitMapArray() {
       long[] result = new 
long[BitMap.numberOfBitMaps(shape.getNumberOfBits())];
       for (int i : indices) {
           BitMap.set(result, i);
       }
       return result;
   }
   ```

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.LongPredicate;
+
+/**
+ * Produces bit map longs for a Bloom filter.
+ *
+ * Each bit map is a little-endian long value representing a block of bits of 
in a filter.
+ *
+ * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} 
is the
+ * number of bits in the filter and {@code ceil} is the ceiling function.
+ * Bits 0-63 are in the first long. A value of 1 at a bit position indicates 
the bit
+ * index is enabled.
+ * </p><p><em>
+ * The default implementations of the {@code makePredicate()} and {@code 
asBitMapArray} methods
+ * are slow and should be reimplemented in the implementing classes where 
possible.</em></p>
+ *
+ * @since 4.5
+ */
+@FunctionalInterface
+public interface BitMapProducer {
+
+    /**
+     * Each bit map is passed to the predicate in order.  The predicate is 
applied to each
+     * bit map value, if the predicate returns {@code false} the execution is 
stopped, {@code false}
+     * is returned, and no further bit maps are processed.
+     *
+     * <p>If the producer is empty this method will return true.</p>
+     *
+     * <p>Any exceptions thrown by the action are relayed to the caller.</p>
+     *
+     * @param predicate the function to execute
+     * @return {@code true} if all bit maps returned {@code true}, {@code 
false} otherwise.
+     * @throws NullPointerException if the specified consumer is null
+     */
+    boolean forEachBitMap(LongPredicate predicate);
+
+    /**
+     * Applies the {@code func} to each bit map pair in order.
+     * <p>
+     * Creates a LongPredicate that is used in apply {@code func} to this 
BitMapProducer
+     * and another.  For example:</p>
+     * <pre>
+     * BitMapProducer a = ....;
+     * BitMapProducer b = ....;
+     * LongPredicate predicate = a.makePredicate( (x,y) -&gt; x==y );
+     * boolean result = b.apply( predicate );
+     * </pre>
+     * <p>The above example will execute a.bitmapValue == b.bitmapValue for 
every value in b.
+     * </p><p>
+     * Notes:
+     * </p><ul>
+     * <li>The resulting LongPredicate should only be used once.</li>
+     * <li>Any changes made to the {@code func} arguments will not survive 
outside of the {@code func} call.</li>
+     * <li>For an example of how to use {@code makePredicate} to apply 
non-binary functions across all pairs of
+     * bit maps see the SetOperations code.</li>
+     * </ul>
+     * <p>
+     * <em>The default implementation of this method uses {@code 
asBitMapArray()}  It is recommended that implementations
+     * of BitMapProducer that have local arrays reimplement this 
method.</em></p>
+     *
+     * @param func The function to apply.
+     * @return A LongPredicate that tests this BitMapProducers bitmap values 
in order.
+     */
+    default LongPredicate makePredicate(LongBiPredicate func) {
+        long[] ary = asBitMapArray();
+
+        return new LongPredicate() {
+            int idx = 0;
+
+            @Override
+            public boolean test(long other) {
+                return func.test(idx >= ary.length ? 0L : ary[idx++], other);
+            }
+        };
+    }
+
+    /**
+     * Return a copy of the BitMapProducer data as a bit map array.
+     * <p>
+     * The default implementation of this method is slow.  It is recommended
+     * that implementing classes reimplement this method.
+     * </p>
+     * @return An array of bit map data.
+     */
+    default long[] asBitMapArray() {

Review comment:
       We should avoid boxing. So the default implementation can be made better:
   ```Java
   default long[] asBitMapArray() {
       class Bits {
           private long[] data = new long[16];
           private int size;
   
           boolean add(long bits) {
               if (size == data.length) {
                   // This will throw an out-of-memory error if there are too 
many bits.
                   // Since bits are addressed using 32-bit signed integer 
indices 
                   // the maximum length should be ~2^31 / 2^6 = ~2^25.
                   // Any more is a broken implementation.
                   data = Arrays.copyOf(data, size * 2);
               }
               data[size++] = bits;
               return true;
           }
   
           long[] toArray() {
               // Edge case to avoid a large array copy
               return size == data.length ? data : Arrays.copyOf(data, size);
           }
       }
       Bits bits = new Bits();
       forEachBitMap(bits::add);
       return bits.toArray();
   }
   ```

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
##########
@@ -16,147 +16,172 @@
  */
 package org.apache.commons.collections4.bloomfilter;
 
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import java.util.function.LongBinaryOperator;
+import java.util.function.LongPredicate;
 
 /**
- * Implementations of set operations on Bloom filters.
+ * Implementations of set operations on BitMapProducers.
  *
+ * @since 4.5
  */
 public final class SetOperations {

Review comment:
       You have rewritten this class to entirely depend on the `makePredicate` 
idea where the predicate can send zeros if its array of bitmaps runs out before 
the other provider does. However this is not mirrored. A simple test with 
different shapes shows this fails:
   
   ```Java
           Shape shape = Shape.fromKM(3, 128);
           Shape shape2 = Shape.fromKM(3, 192);
           // ...
           filter1 = new SparseBloomFilter(shape2, 
IndexProducer.fromIntArray(new int[] { 1, 63, 185}));
           filter2 = new SparseBloomFilter(shape, 
IndexProducer.fromIntArray(new int[] { 5, 64, 69 }));
           // This fails!
           //assertEquals(6, SetOperations.orCardinality(filter1, filter2));
           assertEquals(6, SetOperations.orCardinality(filter2, filter1));
   ```

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/LongBiPredicate.java
##########
@@ -14,12 +14,20 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.commons.collections4.bloomfilter;
 
 /**
- * Provides classes and interfaces to define the shape of a Bloom filter and 
the conversion
- * of generic bytes to a hash of bit indexes to be used with a Bloom filter.
- *
+ * A bi function that accepts long values.

Review comment:
       Adapt the Javadoc from java.util.function:
   
   Represents a predicate (boolean-valued function) of two arguments.
   This is the primitive type specialization of {@link 
java.util.function.BiPredicate BiPredicate} for {@code long}.

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
##########
@@ -16,147 +16,172 @@
  */
 package org.apache.commons.collections4.bloomfilter;
 
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import java.util.function.LongBinaryOperator;
+import java.util.function.LongPredicate;
 
 /**
- * Implementations of set operations on Bloom filters.
+ * Implementations of set operations on BitMapProducers.
  *
+ * @since 4.5
  */
 public final class SetOperations {
 
     /**
-     * Calculates the Cosine distance between two Bloom filters.
+     * Calculates the cardinality of the result of a LongBinaryOperator using 
the
+     * {@code BitMapProducer.makePredicate} method.
+     * @param first the first BitMapProducer
+     * @param second the second BitMapProducer
+     * @param op a long binary operation on where x = {@code first} and y = 
{@code second} bitmap producers.
+     * @return the calculated cardinality.
+     */
+    private static int cardinality(BitMapProducer first, BitMapProducer 
second, LongBinaryOperator op) {
+        int[] cardinality = new int[1];
+
+        LongPredicate lp = first.makePredicate((x, y) -> {
+            cardinality[0] += Long.bitCount(op.applyAsLong(x, y));
+            return true;
+        });
+        second.forEachBitMap(lp);
+        return cardinality[0];
+    }
+
+    /**
+     * Calculates the cardinality of a BitMapProducer.  By necessity this 
method will visit each bit map
+     * created by the producer.
+     * @param producer the Producer to calculate the cardinality for.
+     * @return the cardinality of the bit maps produced by the producer.
+     */
+    public static int cardinality(BitMapProducer producer) {
+        int[] cardinality = new int[1];
+        producer.forEachBitMap(l -> {
+            cardinality[0] += Long.bitCount(l);
+            return true;
+        });
+        return cardinality[0];
+    }
+
+    /**
+     * Calculates the cardinality of the logical {@code AND} of the bit maps 
for the two filters.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer
+     * @return the cardinality of the {@code AND} of the filters.
+     */
+    public static int andCardinality(final BitMapProducer first, final 
BitMapProducer second) {
+        return cardinality(first, second, (x, y) -> x & y);
+    }
+
+    /**
+     * Calculates the cardinality of the logical {@code OR} of the bit maps 
for the two filters.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer
+     * @return the cardinality of the {@code OR} of the filters.
+     */
+    public static int orCardinality(final BitMapProducer first, final 
BitMapProducer second) {
+        return cardinality(first, second, (x, y) -> x | y);
+    }
+
+    /**
+     * Calculates the cardinality of the logical {@code XOR} of the bit maps 
for the two filters.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer
+     * @return the cardinality of the {@code XOR} of the filters.
+     */
+    public static int xorCardinality(final BitMapProducer first, final 
BitMapProducer second) {
+        return cardinality(first, second, (x, y) -> x ^ y);
+    }
+
+    /**
+     * Calculates the Cosine distance between two BitMapProducer.
      *
      * <p>Cosine distance is defined as {@code 1 - Cosine similarity}</p>
      *
-     * @param first the first Bloom filter.
-     * @param second the second Bloom filter.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer.
      * @return the jaccard distance.
      */
-    public static double cosineDistance(final BloomFilter first, final 
BloomFilter second) {
+    public static double cosineDistance(final BitMapProducer first, final 
BitMapProducer second) {
         return 1.0 - cosineSimilarity(first, second);
     }
 
     /**
-     * Calculates the Cosine similarity between two Bloom filters.
+     * Calculates the Cosine similarity between two BitMapProducers.
      * <p> Also known as Orchini similarity and the Tucker coefficient of 
congruence or
      * Ochiai similarity.</p>
      *
-     * <p>If either filter is empty (no enabled bits) the result is 0 
(zero)</p>
+     * <p>If either producer is empty the result is 0 (zero)</p>
      *
-     * @param first the first Bloom filter.
-     * @param second the second Bloom filter.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer.
      * @return the Cosine similarity.
      */
-    public static double cosineSimilarity(final BloomFilter first, final 
BloomFilter second) {
-        verifyShape(first, second);
-        final int numerator = first.andCardinality(second);
-        return numerator == 0 ? 0 : numerator / 
(Math.sqrt(first.cardinality()) * Math.sqrt(second.cardinality()));
+    public static double cosineSimilarity(final BitMapProducer first, final 
BitMapProducer second) {
+        final int numerator = andCardinality(first, second);
+        // Given that the cardinality is an int then the product as a double 
will not
+        // overflow, we can use one sqrt:
+        return numerator == 0 ? 0 : numerator / Math.sqrt(cardinality(first) * 
cardinality(second));
     }
 
     /**
-     * Estimates the number of items in the intersection of the sets 
represented by two
-     * Bloom filters.
+     * Calculates the Cosine similarity between two Bloom filters.
+     * <p> Also known as Orchini similarity and the Tucker coefficient of 
congruence or
+     * Ochiai similarity.</p>
      *
-     * @param first the first Bloom filter.
-     * @param second the second Bloom filter.
-     * @return an estimate of the size of the intersection between the two 
filters.
-     */
-    public static long estimateIntersectionSize(final BloomFilter first, final 
BloomFilter second) {
-        verifyShape(first, second);
-        // do subtraction early to avoid Long overflow.
-        return estimateSize(first) - estimateUnionSize(first, second) + 
estimateSize(second);
-    }
-
-    /**
-     * Estimates the number of items in the Bloom filter based on the shape 
and the number
-     * of bits that are enabled.
+     * <p>If either filter is empty (no enabled bits) the result is 0 
(zero)</p>
      *
-     * @param filter the Bloom filter to estimate size for.
-     * @return an estimate of the number of items that were placed in the 
Bloom filter.
-     */
-    public static long estimateSize(final BloomFilter filter) {
-        final Shape shape = filter.getShape();
-        final double estimate = -(shape.getNumberOfBits() *
-            Math.log(1.0 - filter.cardinality() * 1.0 / 
shape.getNumberOfBits())) /
-            shape.getNumberOfHashFunctions();
-        return Math.round(estimate);
-    }
-
-    /**
-     * Estimates the number of items in the union of the sets represented by 
two
-     * Bloom filters.
+     * <p>This is a version of cosineSimilarity optimized for Bloom 
filters.</p>
      *
      * @param first the first Bloom filter.
      * @param second the second Bloom filter.
-     * @return an estimate of the size of the union between the two filters.
+     * @return the Cosine similarity.
      */
-    public static long estimateUnionSize(final BloomFilter first, final 
BloomFilter second) {
-        verifyShape(first, second);
-        final Shape shape = first.getShape();
-        final double estimate = -(shape.getNumberOfBits() *
-            Math.log(1.0 - first.orCardinality(second) * 1.0 / 
shape.getNumberOfBits())) /
-            shape.getNumberOfHashFunctions();
-        return Math.round(estimate);
+    public static double cosineSimilarity(final BloomFilter first, final 
BloomFilter second) {
+        final int numerator = andCardinality(first, second);
+        // Given that the cardinality is an int then the product as a double 
will not
+        // overflow, we can use one sqrt:
+        return numerator == 0 ? 0 : numerator / Math.sqrt(first.cardinality() 
* second.cardinality());
     }
 
     /**
-     * Calculates the Hamming distance between two Bloom filters.
+     * Calculates the Hamming distance between two BitMapProducers.
      *
-     * @param first the first Bloom filter.
-     * @param second the second Bloom filter.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer.
      * @return the Hamming distance.
      */
-    public static int hammingDistance(final BloomFilter first, final 
BloomFilter second) {
-        verifyShape(first, second);
-        return first.xorCardinality(second);
+    public static int hammingDistance(final BitMapProducer first, final 
BitMapProducer second) {
+        return xorCardinality(first, second);
     }
 
     /**
-     * Calculates the Jaccard distance between two Bloom filters.
+     * Calculates the Jaccard distance between two BitMapProducer.
      *
      * <p>Jaccard distance is defined as {@code 1 - Jaccard similarity}</p>
      *
-     * @param first the first Bloom filter.
-     * @param second the second Bloom filter.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer.
      * @return the Jaccard distance.
      */
-    public static double jaccardDistance(final BloomFilter first, final 
BloomFilter second) {
+    public static double jaccardDistance(final BitMapProducer first, final 
BitMapProducer second) {
         return 1.0 - jaccardSimilarity(first, second);
     }
 
     /**
-     * Calculates the Jaccard similarity between two Bloom filters.
+     * Calculates the Jaccard similarity between two BitMapProducer.
      *
      * <p>Also known as Jaccard index, Intersection over Union, and Jaccard 
similarity coefficient</p>
      *
-     * @param first the first Bloom filter.
-     * @param second the second Bloom filter.
+     * @param first the first BitMapProducer.
+     * @param second the second BitMapProducer.
      * @return the Jaccard similarity.
      */
-    public static double jaccardSimilarity(final BloomFilter first, final 
BloomFilter second) {
-        verifyShape(first, second);
-        final int orCard = first.orCardinality(second);
-        // if the orCard is zero then the hamming distance will also be zero.
-        return orCard == 0 ? 0 : hammingDistance(first, second) / (double) 
orCard;
-    }
-
-    /**
-     * Verifies the Bloom filters have the same shape.
-     *
-     * @param first the first filter to check.
-     * @param second the second filter to check.
-     * @throws IllegalArgumentException if the shapes are not the same.
-     */
-    private static void verifyShape(final BloomFilter first, final BloomFilter 
second) {
-        if (!first.getShape().equals(second.getShape())) {
-            throw new IllegalArgumentException(String.format("Shape %s is not 
the same as %s",
-                first.getShape(), second.getShape()));
-        }
+    public static double jaccardSimilarity(final BitMapProducer first, final 
BitMapProducer second) {
+        final int intersection = andCardinality(first, second);

Review comment:
       There is chance for efficiency here. Using my updated method:
   ```Java
       public static double jaccardSimilarity(final BitMapProducer first, final 
BitMapProducer second) {
           int[] cardinality = new int[2];
           first.forEachBitMapPair(second, (x, y) -> {
               cardinality[0] += Long.bitCount(x & y);
               cardinality[1] += Long.bitCount(x | y);
               return true;
           });
           final int intersection = cardinality[0];
           return intersection == 0 ? 0 : intersection / (double) 
cardinality[1];
       }
   ```
   It passes all the current tests in SetOperations.
   

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/BitMapProducer.java
##########
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.collections4.bloomfilter;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+import java.util.function.LongPredicate;
+
+/**
+ * Produces bit map longs for a Bloom filter.
+ *
+ * Each bit map is a little-endian long value representing a block of bits of 
in a filter.
+ *
+ * <p>The returned array will have length {@code ceil(m / 64)} where {@code m} 
is the
+ * number of bits in the filter and {@code ceil} is the ceiling function.
+ * Bits 0-63 are in the first long. A value of 1 at a bit position indicates 
the bit
+ * index is enabled.
+ * </p><p><em>
+ * The default implementations of the {@code makePredicate()} and {@code 
asBitMapArray} methods
+ * are slow and should be reimplemented in the implementing classes where 
possible.</em></p>
+ *
+ * @since 4.5
+ */
+@FunctionalInterface
+public interface BitMapProducer {

Review comment:
       Naming conventions between the to and from method in this and 
IndexProducer do not align:
   ```
   IndexProducer:
   // Update to use the ... varargs syntax
   fromIntArray(int[] array) -> fromIndexArray(int... array)
   int[] toIndexArray()
   
   BitMapProducer:
   fromLongArray(long... array) -> fromBitMapArray(long... array)
   long[] toBitMapArray();
   ```
   I would also order the methods in this interface to match IndexProducer. 
This means moving `toBitMapArray()` as the last method.

##########
File path: 
src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
##########
@@ -16,147 +16,172 @@
  */
 package org.apache.commons.collections4.bloomfilter;
 
-import org.apache.commons.collections4.bloomfilter.hasher.Shape;
+import java.util.function.LongBinaryOperator;
+import java.util.function.LongPredicate;
 
 /**
- * Implementations of set operations on Bloom filters.
+ * Implementations of set operations on BitMapProducers.
  *
+ * @since 4.5
  */
 public final class SetOperations {
 
     /**
-     * Calculates the Cosine distance between two Bloom filters.
+     * Calculates the cardinality of the result of a LongBinaryOperator using 
the
+     * {@code BitMapProducer.makePredicate} method.
+     * @param first the first BitMapProducer
+     * @param second the second BitMapProducer
+     * @param op a long binary operation on where x = {@code first} and y = 
{@code second} bitmap producers.
+     * @return the calculated cardinality.
+     */
+    private static int cardinality(BitMapProducer first, BitMapProducer 
second, LongBinaryOperator op) {
+        int[] cardinality = new int[1];
+

Review comment:
       With my updated method this would be:
   ```Java
           int[] cardinality = new int[1];
           first.forEachBitMapPair(second, (x, y) -> {
               cardinality[0] += Long.bitCount(op.applyAsLong(x, y));
               return true;
           });
           return cardinality[0];
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to