This is an automated email from the ASF dual-hosted git repository.

jmalkin pushed a commit to branch bloom_review_changes
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit c899316611332de91267f51d3e8313f792ebaf25
Author: jmalkin <[email protected]>
AuthorDate: Wed Feb 28 21:13:57 2024 -0800

    Changes mostly based on review feedback, but without modifying the review 
branch since changes are small and scattered
---
 .../org/apache/datasketches/common/Family.java     |  2 +-
 .../datasketches/filters/bloomfilter/BitArray.java | 49 +++++++++++++---------
 .../filters/bloomfilter/BloomFilter.java           | 14 ++++++-
 .../filters/bloomfilter/BloomFilterBuilder.java    |  8 ++++
 4 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/common/Family.java 
b/src/main/java/org/apache/datasketches/common/Family.java
index a8a94b4c..38c517c1 100644
--- a/src/main/java/org/apache/datasketches/common/Family.java
+++ b/src/main/java/org/apache/datasketches/common/Family.java
@@ -156,7 +156,7 @@ public enum Family {
   /**
    * Bloom Filter
    */
-  BLOOMFILTER(21, "BLOOMFILTER", 3, 3);
+  BLOOMFILTER(21, "BLOOMFILTER", 4, 4);
   ;
 
   private static final Map<Integer, Family> lookupID = new HashMap<>();
diff --git 
a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java 
b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
index 9d2242ee..dd8943fe 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.filters.bloomfilter;
 
+import java.util.Arrays;
+
 import org.apache.datasketches.common.SketchesArgumentException;
 import org.apache.datasketches.common.SketchesStateException;
 import org.apache.datasketches.memory.Memory;
@@ -33,9 +35,11 @@ import org.apache.datasketches.memory.WritableMemory;
 final class BitArray {
   // MAX_BITS using longs, based on array indices being capped at 
Integer.MAX_VALUE
   private static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; 
-  private static final long DATA_OFFSET = 8; // offset into memory for start 
of data array
+  private static final long NUMBITSSET_OFFSET = 8; // offset into memory for 
numBitsSet
+  private static final long DATA_OFFSET = 16; // offset into memory for start 
of data array
 
   private long numBitsSet_;  // if -1, need to recompute value
+  private boolean isDirty_;
   private long[] data_;
 
   // creates an array of a given size
@@ -49,16 +53,15 @@ final class BitArray {
 
     final int numLongs = (int) Math.ceil(numBits / 64.0);
     numBitsSet_ = 0;
+    isDirty_ = false;
     data_ = new long[numLongs];
   }
 
   // uses the provided array
-  BitArray(final long[] data) {
+  BitArray(final long numBitsSet, final long[] data) {
     data_ = data;
-    numBitsSet_ = 0;
-    for (long val : data) {
-      numBitsSet_ += Long.bitCount(val);
-    }
+    isDirty_ = numBitsSet < 0;
+    numBitsSet_ = numBitsSet;
   }
 
   // reads a serialized image, but the BitArray is not fully self-describing 
so requires
@@ -71,15 +74,18 @@ final class BitArray {
     
     if (isEmpty) {
       return new BitArray((long) numLongs * Long.SIZE);
-    }
+    }    
+
+    // will be -1 if dirty
+    final long numBitsSet = mem.getLong(NUMBITSSET_OFFSET);
 
     final long[] data = new long[numLongs];
     mem.getLongArray(DATA_OFFSET, data, 0, numLongs);
-    return new BitArray(data);
+    return new BitArray(numBitsSet, data);
   }
 
   boolean isEmpty() {
-    return numBitsSet_ == 0;
+    return getNumBitsSet() == 0;
   }
 
   // queries a single bit in the array
@@ -88,11 +94,10 @@ final class BitArray {
   }
 
   // sets a single bit in the array without querying, meaning the method
-  // cannot properly track the number of bits set. 
-  // instead changes numBitsSet_ to indicate that the value is unreliable
+  // cannot properly track the number of bits set so set isDirty = true
   void setBit(final long index) {
     data_[(int) index >>> 6] |= 1L << index;
-    numBitsSet_ = -1; // use as a dirty flag
+    isDirty_ = true;
   }
 
   // returns existing value of bit
@@ -103,7 +108,7 @@ final class BitArray {
       return true; // already seen
     } else {
       data_[offset] |= mask;
-      if (numBitsSet_ != -1) { ++numBitsSet_; }
+      ++numBitsSet_; // increment regardless of isDirty_
       return false; // new set
     }
   }
@@ -112,7 +117,7 @@ final class BitArray {
   // O(1) if only getAndSetBit() has been used
   // O(data_.length) if setBit() has ever been used
   long getNumBitsSet() {
-    if (numBitsSet_ == -1) {
+    if (isDirty_) {
       numBitsSet_ = 0;
       for (long val : data_) {
         numBitsSet_ += Long.bitCount(val);
@@ -137,6 +142,7 @@ final class BitArray {
       numBitsSet += Long.bitCount(data_[i]);
     }
     numBitsSet_ = numBitsSet;
+    isDirty_ = false;
   }
 
   // applies logical AND
@@ -151,29 +157,31 @@ final class BitArray {
       numBitsSet += Long.bitCount(data_[i]);
     }
     numBitsSet_ = numBitsSet;
+    isDirty_ = false;
   }
 
   // applies bitwise inversion
   void invert() {
-    if (numBitsSet_ == -1) {
+    if (isDirty_) {
       numBitsSet_ = 0;
       for (int i = 0; i < data_.length; ++i) {
         data_[i] = ~data_[i];
         numBitsSet_ += Long.bitCount(data_[i]);
       }
+      isDirty_ = false;
     } else {
       for (int i = 0; i < data_.length; ++i) {
         data_[i] = ~data_[i];
       }
-      numBitsSet_ = getCapacity() - numBitsSet_;  
+      numBitsSet_ = getCapacity() - numBitsSet_;
     }
   }
 
   long getSerializedSizeBytes() {
     // We only really need an int for array length but this will keep 
everything
     // aligned to 8 bytes.
-    // Always write array length, even if empty
-    return isEmpty() ? Long.BYTES : Long.BYTES * (1L + data_.length);
+    // Always write array length and numBitsSet, even if empty
+    return isEmpty() ? Long.BYTES : Long.BYTES * (2L + data_.length);
   }
 
   void writeToMemory(final WritableMemory wmem) {
@@ -184,6 +192,7 @@ final class BitArray {
     wmem.putInt(0, data_.length);
     
     if (!isEmpty()) {
+      wmem.putLong(NUMBITSSET_OFFSET, isDirty_ ? -1 : numBitsSet_);
       wmem.putLongArray(DATA_OFFSET, data_, 0, data_.length);
     }
   }
@@ -212,8 +221,8 @@ final class BitArray {
 
   // clears the array
   void reset() {
-    final int numLongs = data_.length;
-    data_ = new long[numLongs];
+    Arrays.fill(data_, 0);
     numBitsSet_ = 0;
+    isDirty_ = false;
   }
 }
diff --git 
a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java 
b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
index 00c6dfe9..33df6e55 100644
--- a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
+++ b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java
@@ -57,7 +57,7 @@ import org.apache.datasketches.memory.XxHash;
  */
 public final class BloomFilter {
   // maximum number of longs in the array with space for a header at 
serialization
-  private static final long MAX_SIZE = (Integer.MAX_VALUE - 3) * (long) 
Long.SIZE;
+  private static final long MAX_SIZE = (Integer.MAX_VALUE - 
Family.BLOOMFILTER.getMaxPreLongs()) * (long) Long.SIZE;
   private static final int SER_VER = 1;
   private static final int EMPTY_FLAG_MASK = 4;
 
@@ -135,6 +135,13 @@ public final class BloomFilter {
     return new BloomFilter(numHashes, seed, bitArray);
   }
 
+  /**
+   * Resets the BloomFilter to an empty state
+   */
+  public void reset() {
+    bitArray_.reset();
+  }
+
   /**
    * Checks if the BloomFilter has processed any items
    * @return True if the BloomFilter is empty, otherwise False
@@ -648,7 +655,7 @@ public final class BloomFilter {
   }
 
 /*
- * A Bloom Filter's serialized image always uses 3 longs of preamble, whether 
empty or not:
+ * A Bloom Filter's serialized image always uses 4 longs of preamble, whether 
empty or not:
  *
  * <pre>
  * Long || Start Byte Adr:
@@ -661,6 +668,9 @@ public final class BloomFilter {
  *
  *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22  
 |   23   |
  *  2   ||-------BitArray Length (in 
longs)----------|-----------Unused------------------|
+ *
+ *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30  
 |   31   |
+ *  2   
||---------------------------------NumBitsSet------------------------------------|
  *  </pre>
  * 
  * The raw BitArray bits, if non-empty start at byte 24.
diff --git 
a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
 
b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
index 0934338d..22356330 100644
--- 
a/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
+++ 
b/src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilterBuilder.java
@@ -23,6 +23,14 @@ import java.util.concurrent.ThreadLocalRandom;
 
 import org.apache.datasketches.common.SketchesArgumentException;
 
+/**
+ * <p>This class provides methods to help estimate the correct paramters to 
use when
+ * creating a Bloom filter, and methods to create the filter using those 
values.</p>
+ * 
+ * <p>The underlying math is described in the
+ * <a 
href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'>
+ * Wikipedia article on Bloom filters</a>.</p>
+ */
 public final class BloomFilterBuilder {
 
   /**


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to