This is an automated email from the ASF dual-hosted git repository. jmalkin pushed a commit to branch bloom_review_changes in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit c899316611332de91267f51d3e8313f792ebaf25 Author: jmalkin <[email protected]> AuthorDate: Wed Feb 28 21:13:57 2024 -0800 Changes mostly based on review feedback, but without modifying the review branch since changes are small and scattered --- .../org/apache/datasketches/common/Family.java | 2 +- .../datasketches/filters/bloomfilter/BitArray.java | 49 +++++++++++++--------- .../filters/bloomfilter/BloomFilter.java | 14 ++++++- .../filters/bloomfilter/BloomFilterBuilder.java | 8 ++++ 4 files changed, 50 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/apache/datasketches/common/Family.java b/src/main/java/org/apache/datasketches/common/Family.java index a8a94b4c..38c517c1 100644 --- a/src/main/java/org/apache/datasketches/common/Family.java +++ b/src/main/java/org/apache/datasketches/common/Family.java @@ -156,7 +156,7 @@ public enum Family { /** * Bloom Filter */ - BLOOMFILTER(21, "BLOOMFILTER", 3, 3); + BLOOMFILTER(21, "BLOOMFILTER", 4, 4); ; private static final Map<Integer, Family> lookupID = new HashMap<>(); diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java index 9d2242ee..dd8943fe 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java @@ -19,6 +19,8 @@ package org.apache.datasketches.filters.bloomfilter; +import java.util.Arrays; + import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.SketchesStateException; import org.apache.datasketches.memory.Memory; @@ -33,9 +35,11 @@ import org.apache.datasketches.memory.WritableMemory; final class BitArray { // MAX_BITS using longs, based on array indices being capped at Integer.MAX_VALUE private static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; - private static final long DATA_OFFSET = 8; // offset into memory for start of data array + private static final long NUMBITSSET_OFFSET = 8; // offset into memory for numBitsSet + private static final long DATA_OFFSET = 16; // offset into memory for start of data array private long numBitsSet_; // if -1, need to recompute value + private boolean isDirty_; private long[] data_; // creates an array of a given size @@ -49,16 +53,15 @@ final class BitArray { final int numLongs = (int) Math.ceil(numBits / 64.0); numBitsSet_ = 0; + isDirty_ = false; data_ = new long[numLongs]; } // uses the provided array - BitArray(final long[] data) { + BitArray(final long numBitsSet, final long[] data) { data_ = data; - numBitsSet_ = 0; - for (long val : data) { - numBitsSet_ += Long.bitCount(val); - } + isDirty_ = numBitsSet < 0; + numBitsSet_ = numBitsSet; } // reads a serialized image, but the BitArray is not fully self-describing so requires @@ -71,15 +74,18 @@ final class BitArray { if (isEmpty) { return new BitArray((long) numLongs * Long.SIZE); - } + } + + // will be -1 if dirty + final long numBitsSet = mem.getLong(NUMBITSSET_OFFSET); final long[] data = new long[numLongs]; mem.getLongArray(DATA_OFFSET, data, 0, numLongs); - return new BitArray(data); + return new BitArray(numBitsSet, data); } boolean isEmpty() { - return numBitsSet_ == 0; + return getNumBitsSet() == 0; } // queries a single bit in the array @@ -88,11 +94,10 @@ final class BitArray { } // sets a single bit in the array without querying, meaning the method - // cannot properly track the number of bits set. - // instead changes numBitsSet_ to indicate that the value is unreliable + // cannot properly track the number of bits set so set isDirty = true void setBit(final long index) { data_[(int) index >>> 6] |= 1L << index; - numBitsSet_ = -1; // use as a dirty flag + isDirty_ = true; } // returns existing value of bit @@ -103,7 +108,7 @@ final class BitArray { return true; // already seen } else { data_[offset] |= mask; - if (numBitsSet_ != -1) { ++numBitsSet_; } + ++numBitsSet_; // increment regardless of isDirty_ return false; // new set } } @@ -112,7 +117,7 @@ final class BitArray { // O(1) if only getAndSetBit() has been used // O(data_.length) if setBit() has ever been used long getNumBitsSet() { - if (numBitsSet_ == -1) { + if (isDirty_) { numBitsSet_ = 0; for (long val : data_) { numBitsSet_ += Long.bitCount(val); @@ -137,6 +142,7 @@ final class BitArray { numBitsSet += Long.bitCount(data_[i]); } numBitsSet_ = numBitsSet; + isDirty_ = false; } // applies logical AND @@ -151,29 +157,31 @@ final class BitArray { numBitsSet += Long.bitCount(data_[i]); } numBitsSet_ = numBitsSet; + isDirty_ = false; } // applies bitwise inversion void invert() { - if (numBitsSet_ == -1) { + if (isDirty_) { numBitsSet_ = 0; for (int i = 0; i < data_.length; ++i) { data_[i] = ~data_[i]; numBitsSet_ += Long.bitCount(data_[i]); } + isDirty_ = false; } else { for (int i = 0; i < data_.length; ++i) { data_[i] = ~data_[i]; } - numBitsSet_ = getCapacity() - numBitsSet_; + numBitsSet_ = getCapacity() - numBitsSet_; } } long getSerializedSizeBytes() { // We only really need an int for array length but this will keep everything // aligned to 8 bytes. - // Always write array length, even if empty - return isEmpty() ? Long.BYTES : Long.BYTES * (1L + data_.length); + // Always write array length and numBitsSet, even if empty + return isEmpty() ? Long.BYTES : Long.BYTES * (2L + data_.length); } void writeToMemory(final WritableMemory wmem) { @@ -184,6 +192,7 @@ final class BitArray { wmem.putInt(0, data_.length); if (!isEmpty()) { + wmem.putLong(NUMBITSSET_OFFSET, isDirty_ ? -1 : numBitsSet_); wmem.putLongArray(DATA_OFFSET, data_, 0, data_.length); } } @@ -212,8 +221,8 @@ final class BitArray { // clears the array void reset() { - final int numLongs = data_.length; - data_ = new long[numLongs]; + Arrays.fill(data_, 0); numBitsSet_ = 0; + isDirty_ = false; } } diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java index 00c6dfe9..33df6e55 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java @@ -57,7 +57,7 @@ import org.apache.datasketches.memory.XxHash; */ public final class BloomFilter { // maximum number of longs in the array with space for a header at serialization - private static final long MAX_SIZE = (Integer.MAX_VALUE - 3) * (long) Long.SIZE; + private static final long MAX_SIZE = (Integer.MAX_VALUE - Family.BLOOMFILTER.getMaxPreLongs()) * (long) Long.SIZE; private static final int SER_VER = 1; private static final int EMPTY_FLAG_MASK = 4; @@ -135,6 +135,13 @@ public final class BloomFilter { return new BloomFilter(numHashes, seed, bitArray); } + /** + * Resets the BloomFilter to an empty state + */ + public void reset() { + bitArray_.reset(); + } + /** * Checks if the BloomFilter has processed any items * @return True if the BloomFilter is empty, otherwise False @@ -648,7 +655,7 @@ public final class BloomFilter { } /* - * A Bloom Filter's serialized image always uses 3 longs of preamble, whether empty or not: + * A Bloom Filter's serialized image always uses 4 longs of preamble, whether empty or not: * * <pre> * Long || Start Byte Adr: @@ -661,6 +668,9 @@ public final class BloomFilter { * * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | * 2 ||-------BitArray Length (in longs)----------|-----------Unused------------------| + * + * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | + * 2 ||---------------------------------NumBitsSet------------------------------------| * </pre> * * The raw BitArray bits, if non-empty start at byte 24. diff --git a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java index 0934338d..22356330 100644 --- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java +++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java @@ -23,6 +23,14 @@ import java.util.concurrent.ThreadLocalRandom; import org.apache.datasketches.common.SketchesArgumentException; +/** + * <p>This class provides methods to help estimate the correct paramters to use when + * creating a Bloom filter, and methods to create the filter using those values.</p> + * + * <p>The underlying math is described in the + * <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'> + * Wikipedia article on Bloom filters</a>.</p> + */ public final class BloomFilterBuilder { /** --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
