Claudenw commented on a change in pull request #258:
URL:
https://github.com/apache/commons-collections/pull/258#discussion_r800216009
##########
File path:
src/main/java/org/apache/commons/collections4/bloomfilter/hasher/Hasher.java
##########
@@ -16,117 +16,127 @@
*/
package org.apache.commons.collections4.bloomfilter.hasher;
-import java.nio.charset.Charset;
-import java.util.PrimitiveIterator;
+import org.apache.commons.collections4.bloomfilter.Shape;
+
+import java.util.function.IntPredicate;
+
+import org.apache.commons.collections4.bloomfilter.BitMap;
+import org.apache.commons.collections4.bloomfilter.IndexProducer;
/**
- * A Hasher represents items of arbitrary byte size as a byte representation of
- * fixed size (a hash). The hash representations can be used to create indexes
- * for a Bloom filter.
- *
- * <p>The hash for each item is created using a hash function; use of different
- * seeds allows generation of different hashes for the same item. The hashes
can
- * be dynamically converted into the bit index representation used by a Bloom
- * filter. The shape of the Bloom filter defines the number of indexes per item
- * and the range of the indexes. The hasher can generate the correct number of
- * indexes in the range required by the Bloom filter for each item it
- * represents.
- *
- * <p>Note that the process of generating hashes and mapping them to a Bloom
- * filter shape may create duplicate indexes. The hasher may generate fewer
than
- * the required number of hash functions per item if duplicates have been
- * removed. Implementations of {@code iterator()} may return duplicate values
- * and may return values in a random order. See implementation javadoc notes as
- * to the guarantees provided by the specific implementation.
- *
- * <p>Hashers have an identity based on the hashing algorithm used.
+ * A Hasher creates IndexProducer based on the hash implementation and the
+ * provided Shape.
*
* @since 4.5
*/
public interface Hasher {
/**
- * A builder to build a hasher.
+ * Creates an IndexProducer for this hasher based on the Shape.
*
- * <p>A hasher represents one or more items of arbitrary byte size. The
builder
- * contains methods to collect byte representations of items. Each method
to add
- * to the builder will add an entire item to the final hasher created by
the
- * {@link #build()} method.
+ * <p>The @{code IndexProducer} will create indices within the range
defined by the number of bits in
+ * the shape. The total number of indices will respect the number of hash
functions per item
+ * defined by the shape. However the count of indices may not be a
multiple of the number of
+ * hash functions once implementation has removed duplicates.</p>
*
- * @since 4.5
+ * <p>This IndexProducer must be deterministic in that it must return the
same indices for the
+ * same Shape.</p>
+ *
+ * <p>No guarantee is made as to order of indices.</p>
+ * <p>Duplicates indices for a single item must be removed.</p>
+ *
+ * @param shape the shape of the desired Bloom filter.
+ * @return the iterator of integers
*/
- interface Builder {
+ IndexProducer indices(Shape shape);
- /**
- * Builds the hasher from all the items.
- *
- * <p>This method will clear the builder for future use.
- *
- * @return the fully constructed hasher
- */
- Hasher build();
+ /**
+ * Gets the number of items that will be hashed by the {@code IndexProducer}.
+ * @return The number of items that will be hashed by the {@code IndexProducer}.
+ */
+ int size();
- /**
- * Adds a byte array item to the hasher.
- *
- * @param item the item to add
- * @return a reference to this object
- */
- Builder with(byte[] item);
+ /**
+ * Returns true if there are no items to be hashed.
+ * @return {@code true} if there are no items to be hashed.
+ */
+ default boolean isEmpty() {
+ return size() == 0;
+ }
+
+ /**
+ * A convenience class for Hasher implementations to filter out duplicate
indices.
+ *
+ * <p><em>If the index is negative the behavior is not defined.</em></p>
+ *
+ * <p>This is conceptually a unique filter implemented as a {@code Predicate<int>}.</p>
+ * @since 4.5
+ */
+ class Filter {
+ private long[] bits;
+ private int size;
/**
- * Adds a character sequence item to the hasher using the specified
{@code charset}
- * encoding.
+ * Constructor.
*
- * @param item the item to add
- * @param charset the character set
- * @return a reference to this object
+ * @param size The number of numbers to track. Values from 0 to size-1
will be tracked.
*/
- default Builder with(final CharSequence item, final Charset charset) {
- return with(item.toString().getBytes(charset));
+ public Filter(int size) {
Review comment:
Filter structure was changed as noted elsewhere.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]