Re: [PR] Bloom Filter (datasketches-java)

via GitHub Mon, 26 Feb 2024 23:05:52 -0800


github-advanced-security[bot] commented on code in PR #513:
URL: https://github.com/apache/datasketches-java/pull/513#discussion_r1503732112



##########
src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java:
##########
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.bloomfilter;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.memory.Memory;
+import org.apache.datasketches.memory.WritableMemory;
+
+/**
+ * This class holds an array of bits suitable for use in a Bloom Filter
+ * 
+ * <p>Rounds the number of bits up to the smallest multiple of 64 (one long)
+ * that is not smaller than the specified number.
+ */
+final class BitArray {
+  // MAX_BITS using longs, based on array indices being capped at 
Integer.MAX_VALUE
+  private static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; 
+  private static final long DATA_OFFSET = 8; // offset into memory for start 
of data array
+
+  private long numBitsSet_;  // if -1, need to recompute value
+  private long[] data_;
+
+  // creates an array of a given size
+  BitArray(final long numBits) {
+    if (numBits <= 0) {
+      throw new SketchesArgumentException("Number of bits must be strictly 
positive. Found: " + numBits);
+    }
+    if (numBits > MAX_BITS) {
+      throw new SketchesArgumentException("Number of bits may not exceed " + 
MAX_BITS + ". Found: " + numBits);
+    }
+
+    final int numLongs = (int) Math.ceil(numBits / 64.0);
+    numBitsSet_ = 0;
+    data_ = new long[numLongs];
+  }
+
+  // uses the provided array
+  BitArray(final long[] data) {
+    data_ = data;
+    numBitsSet_ = 0;
+    for (long val : data) {
+      numBitsSet_ += Long.bitCount(val);
+    }
+  }
+
+  // reads a serialized image, but the BitArray is not fully self-describing 
so requires
+  // a flag to indicate whether the array is empty
+  static BitArray heapify(final Memory mem, final boolean isEmpty) {
+    final int numLongs = mem.getInt(0);
+    if (numLongs < 0) {
+      throw new SketchesArgumentException("Possible corruption: Must have 
strictly positive array size. Found: " + numLongs);
+    }
+    
+    if (isEmpty) {
+      return new BitArray(numLongs * Long.SIZE);
+    }
+
+    final long[] data = new long[numLongs];
+    mem.getLongArray(DATA_OFFSET, data, 0, numLongs);
+    return new BitArray(data);
+  }
+
+  boolean isEmpty() {
+    return numBitsSet_ == 0;
+  }
+
+  // queries a single bit in the array
+  boolean getBit(final long index) {
+    return (data_[(int) index >>> 6] & (1L << index)) != 0 ? true : false;
+  }
+
+  // sets a single bit in the array without querying, meaning the method
+  // cannot properly track the number of bits set. 
+  // instead changes numBitsSet_ to indicate that the value is unreliable
+  void setBit(final long index) {
+    data_[(int) index >>> 6] |= 1L << index;
+    numBitsSet_ = -1; // use as a dirty flag
+  }
+
+  // returns existing value of bit
+  boolean getAndSetBit(final long index) {
+    final int offset = (int) index >>> 6;
+    final long mask = 1L << index;
+    if ((data_[offset] & mask) != 0) {
+      return true; // already seen
+    } else {
+      data_[offset] |= mask;
+      if (numBitsSet_ != -1) { ++numBitsSet_; }
+      return false; // new set
+    }
+  }
+
+  // may need to recompute value:
+  // O(1) if only getAndSetBit() has been used
+  // O(data_.length) if setBit() has ever been used
+  long getNumBitsSet() {
+    if (numBitsSet_ == -1) {
+      numBitsSet_ = 0;
+      for (long val : data_) {
+        numBitsSet_ += Long.bitCount(val);
+      }  
+    }
+    return numBitsSet_;
+  }
+
+  long getCapacity() { return (long) data_.length * Long.SIZE; }
+
+  int getArrayLength() { return data_.length; }
+
+  // applies logical OR
+  void union(final BitArray other) {
+    if (data_.length != other.data_.length) {
+      throw new SketchesArgumentException("Cannot union bit arrays with 
unequal lengths");
+    }
+
+    long numBitsSet = 0;
+    for (int i = 0; i < data_.length; ++i) {
+      data_[i] |= other.data_[i];
+      numBitsSet += Long.bitCount(data_[i]);
+    }
+    numBitsSet_ = numBitsSet;
+  }
+
+  // applies logical AND
+  void intersect(final BitArray other) {
+    if (data_.length != other.data_.length) {
+      throw new SketchesArgumentException("Cannot intersect bit arrays with 
unequal lengths");
+    }
+
+    long numBitsSet = 0;
+    for (int i = 0; i < data_.length; ++i) {
+      data_[i] &= other.data_[i];
+      numBitsSet += Long.bitCount(data_[i]);
+    }
+    numBitsSet_ = numBitsSet;
+  }
+
+  // applies bitwise inversion
+  void invert() {
+    if (numBitsSet_ == -1) {
+      numBitsSet_ = 0;
+      for (int i = 0; i < data_.length; ++i) {
+        data_[i] = ~data_[i];
+        numBitsSet_ += Long.bitCount(data_[i]);
+      }
+    } else {
+      for (int i = 0; i < data_.length; ++i) {
+        data_[i] = ~data_[i];
+      }
+      numBitsSet_ = getCapacity() - numBitsSet_;  
+    }
+  }
+
+  long getSerializedSizeBytes() {
+    // We only really need an int for array length but this will keep 
everything
+    // aligned to 8 bytes.
+    // Always write array length, even if empty
+    return isEmpty() ? Long.BYTES : Long.BYTES * (1 + data_.length);

Review Comment:
   ## Result of multiplication cast to wider type
   
   Potential overflow in [int multiplication](1) before it is converted to long 
by use in a return context.
   
   [Show more 
details](https://github.com/apache/datasketches-java/security/code-scanning/692)



##########
src/main/java/org/apache/datasketches/filters/bloomfilter/BloomFilter.java:
##########
@@ -0,0 +1,747 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.bloomfilter;
+
+import static org.apache.datasketches.common.Util.LS;
+
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.apache.datasketches.common.Family;
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.memory.Memory;
+import org.apache.datasketches.memory.WritableMemory;
+import org.apache.datasketches.memory.XxHash;
+
+/**
+ * <p>A Bloom filter is a data structure that can be used for probabilistic
+ * set membership.</p>
+ *
+ * <p>When querying a Bloom filter, there are no false negatives. Specifically:
+ * When querying an item that has already been inserted to the filter, the 
filter will
+ * always indicate that the item is present. There is a chance of false 
positives, where
+ * querying an item that has never been presented to the filter will indicate 
that the
+ * item has already been seen. Consequently, any query should be interpreted as
+ * "might have seen."</p>
+ *
+ * <p>A standard Bloom filter is unlike typical sketches in that it is not 
sub-linear
+ * in size and does not resize itself. A Bloom filter will work up to a target 
number of
+ * distinct items, beyond which it will saturate and the false positive rate 
will start to
+ * increase. The size of a Bloom filter will be linear in the expected number 
of
+ * distinct items.</p>
+ *
+ * <p>See the BloomFilterBuilder class for methods to create a filter sized 
correctly
+ * for a target number of distinct elements and a target false positive 
probability.</p>
+ *
+ * <p>This implementation uses xxHash64 and follows the approach in Kirsch and 
Mitzenmacher,
+ * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley 
Interscience, 2008,
+ * pp. 187-218.</p>
+ */
+public final class BloomFilter {
+  // maximum number of longs in the array with space for a header at 
serialization
+  private static final long MAX_SIZE = Integer.MAX_VALUE * (long) Long.SIZE - 
3;
+  private static final int SER_VER = 1;
+  private static final int EMPTY_FLAG_MASK = 4;
+
+  private long seed_;            // hash seed
+  private short numHashes_;      // number of hash values
+  private BitArray bitArray_;    // the actual data bits
+
+  /**
+   * Creates a BloomFilter with given number of bits and number of hash 
functions,
+   * and a random seed.
+   *
+   * @param numBits The size of the BloomFilter, in bits
+   * @param numHashes The number of hash functions to apply to items
+   */
+  public BloomFilter(final long numBits, final int numHashes) {
+    this(numBits, numHashes, ThreadLocalRandom.current().nextLong());   
+  }
+
+  /**
+   * Creates a BloomFilter with given number of bits and number of hash 
functions,
+   * and a user-specified seed.
+   * 
+   * @param numBits The size of the BloomFilter, in bits
+   * @param numHashes The number of hash functions to apply to items
+   * @param seed The base hash seed
+   */
+  public BloomFilter(final long numBits, final int numHashes, final long seed) 
{
+    checkArgument(numBits > MAX_SIZE, "Size of BloomFilter must be <= " + 
MAX_SIZE
+      + ". Requested: " + numBits);
+    checkArgument(numHashes < 1, "Must specify a strictly positive number of 
hash functions. "
+      + "Requested: " + numHashes);
+    checkArgument(numHashes > Short.MAX_VALUE, "Number of hashes cannot exceed 
" + Short.MAX_VALUE
+      + ". Requested: " + numHashes);
+    
+    seed_ = seed;
+    numHashes_ = (short) numHashes;
+    bitArray_ = new BitArray(numBits);
+  }
+
+  // Constructor used with heapify()
+  BloomFilter(final short numHashes, final long seed, final BitArray bitArray) 
{
+    seed_ = seed;
+    numHashes_ = numHashes;
+    bitArray_ = bitArray;
+  }
+
+  /**
+   * Reads a serialized image of a BloomFilter from the provided Memory
+   * @param mem Memory containing a previously serialized BloomFilter
+   * @return a BloomFilter object
+   */
+  public static BloomFilter heapify(final Memory mem) {
+    int offsetBytes = 0;
+    final int preLongs = mem.getByte(offsetBytes++);
+    final int serVer = mem.getByte(offsetBytes++);
+    final int familyID = mem.getByte(offsetBytes++);
+    final int flags = mem.getByte(offsetBytes++);
+
+    checkArgument(preLongs < Family.BLOOMFILTER.getMinPreLongs() || preLongs > 
Family.BLOOMFILTER.getMaxPreLongs(),
+      "Possible corruption: Incorrect number of preamble bytes specified in 
header");
+    checkArgument(serVer != SER_VER, "Possible corruption: Unrecognized 
serialization version: " + serVer);
+    checkArgument(familyID != Family.BLOOMFILTER.getID(), "Possible 
corruption: Incorrect FamilyID for bloom filter. Found: " + familyID);
+    
+    final short numHashes = mem.getShort(offsetBytes);
+    offsetBytes += Integer.BYTES; // increment by 4 even after reading 2
+    checkArgument(numHashes < 1, "Possible corruption: Need strictly positive 
number of hash functions. Found: " + numHashes);
+
+    final long seed = mem.getLong(offsetBytes);
+    offsetBytes += Long.BYTES;
+
+    final boolean isEmpty = (flags & EMPTY_FLAG_MASK) != 0;
+
+    final BitArray bitArray = BitArray.heapify(mem.region(offsetBytes, 
mem.getCapacity() - offsetBytes), isEmpty);
+
+    return new BloomFilter(numHashes, seed, bitArray);
+  }
+
+  /**
+   * Checks if the BloomFilter has processed any items
+   * @return True if the BloomFilter is empty, otherwise False
+   */
+  public boolean isEmpty() { return bitArray_.isEmpty(); }
+  
+  /**
+   * Returns the number of bits in the BloomFilter that are set to 1.
+   * @return The number of bits in use in this filter
+   */
+  public long getBitsUsed() { return bitArray_.getNumBitsSet(); }
+
+  /**
+   * Returns the total number of bits in the BloomFilter.
+   * @return The total size of the BloomFilter
+   */
+  public long getCapacity() { return bitArray_.getCapacity(); }
+
+  /**
+   * Returns the configured number of hash functions for this BloomFilter
+   * @return The number of hash functions to apply to inputs
+   */
+  public short getNumHashes() { return numHashes_; }
+
+  /**
+   * Returns the hash seed for this BloomFilter.
+   * @return The hash seed for this filter
+   */
+  public long getSeed() { return seed_; }
+
+  /**
+   * Returns the percentage of all bits in the BloomFilter set to 1.
+   * @return the percentage of bits in the filter set to 1
+   */
+  public double getFillPercentage() {
+    return (double) bitArray_.getNumBitsSet() / bitArray_.getCapacity();
+  }
+
+  // UPDATE METHODS
+  /**
+   * Updates the filter with the provided long value.
+   * @param item an item with which to update the filter
+   */
+  public void update(final long item) {
+    final long h0 = XxHash.hashLong(item, seed_);
+    final long h1 = XxHash.hashLong(item, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided double value. The value is
+   * canonicalized (NaN and infinities) prior to updating.
+   * @param item an item with which to update the filter
+   */
+  public void update(final double item) {
+    // canonicalize all NaN & +/- infinity forms    
+    final long[] data = { Double.doubleToLongBits(item) };
+    final long h0 = XxHash.hashLongArr(data, 0, 1, seed_);
+    final long h1 = XxHash.hashLongArr(data, 0, 1, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided String.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link 
#update(char[])}
+   * method and will generally be a little slower depending on the complexity 
of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to update the filter
+   */
+  public void update(final String item) {
+    if (item == null || item.isEmpty()) { return; }
+    final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
+    final long h0 = XxHash.hashByteArr(strBytes, 0, strBytes.length, seed_);
+    final long h1 = XxHash.hashByteArr(strBytes, 0, strBytes.length, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided byte[].
+   * @param data an array with which to update the filter
+   */
+  public void update(final byte[] data) {
+    if (data == null) { return; }
+    final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided char[].
+   * @param data an array with which to update the filter
+   */
+  public void update(final char[] data) {
+    if (data == null) { return; }
+    final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided short[].
+   * @param data an array with which to update the filter
+   */
+  public void update(final short[] data) {
+    if (data == null) { return; }
+    final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided int[].
+   * @param data an array with which to update the filter
+   */
+  public void update(final int[] data) {
+    if (data == null) { return; }
+    final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided long[].
+   * @param data an array with which to update the filter
+   */
+  public void update(final long[] data) {
+    if (data == null) { return; }
+    final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
+    updateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the data in the provided Memory.
+   * @param mem a Memory object with which to update the filter
+   */
+  public void update(final Memory mem) {
+    if (mem == null) { return; }
+    final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
+    final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
+    updateInternal(h0, h1);
+  }
+
+  // Internal method to apply updates given pre-computed hashes
+  private void updateInternal(final long h0, final long h1) {
+    final long numBits = bitArray_.getCapacity();
+    for (int i = 1; i <= numHashes_; ++i) {
+      // right-shift to ensure non-negative value
+      final long hashIndex = ((h0 + i * h1) >>> 1) % numBits;
+      bitArray_.setBit(hashIndex);
+    }
+  }
+
+  // QUERY-AND-UPDATE METHODS 
+
+  /**
+   * Updates the filter with the provided long and 
+   * returns the result from querying that value prior to the update.
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update
+   */
+  public boolean queryAndUpdate(final long item) {
+    final long h0 = XxHash.hashLong(item, seed_);
+    final long h1 = XxHash.hashLong(item, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+  
+  /**
+   * Updates the filter with the provided double and 
+   * returns the result from querying that value prior to the update.
+   * The double is canonicalized (NaN and +/- infinity) in the call.
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update
+   */
+  public boolean queryAndUpdate(final double item) {
+    // canonicalize all NaN & +/- infinity forms    
+    final long[] data = { Double.doubleToLongBits(item) };
+    final long h0 = XxHash.hashLongArr(data, 0, 1, seed_);
+    final long h1 = XxHash.hashLongArr(data, 0, 1, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided String and 
+   * returns the result from querying that value prior to the update.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link 
#queryAndUpdate(char[])}
+   * method and will generally be a little slower depending on the complexity 
of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to update the filter
+   * @return The query result prior to applying the update, or false if item 
is null
+   */
+  public boolean queryAndUpdate(final String item) {
+    if (item == null || item.isEmpty()) { return false; }
+    final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
+    final long h0 = XxHash.hashByteArr(strBytes, 0, strBytes.length, seed_);
+    final long h1 = XxHash.hashByteArr(strBytes, 0, strBytes.length, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided byte[] and 
+   * returns the result from querying that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data 
is null
+   */
+  public boolean queryAndUpdate(final byte[] data) {
+    final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+  
+  /**
+   * Updates the filter with the provided char[] and 
+   * returns the result from querying that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data 
is null
+   */
+  public boolean queryAndUpdate(final char[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided short[] and 
+   * returns the result from querying that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data 
is null
+   */
+  public boolean queryAndUpdate(final short[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided int[] and 
+   * returns the result from querying that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data 
is null
+   */
+  public boolean queryAndUpdate(final int[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided long[] and 
+   * returns the result from querying that array prior to the update.
+   * @param data an array with which to update the filter
+   * @return The query result prior to applying the update, or false if data 
is null
+   */
+  public boolean queryAndUpdate(final long[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  /**
+   * Updates the filter with the provided Memory and 
+   * returns the result from querying that Memory prior to the update.
+   * @param mem a Memory object with which to update the filter
+   * @return The query result prior to applying the update, or false if mem is 
null
+   */
+  public boolean queryAndUpdate(final Memory mem) {
+    if (mem == null) { return false; }
+    final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
+    final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
+    return queryAndUpdateInternal(h0, h1);
+  }
+
+  // Internal query-and-update method given pre-computed hashes
+  private boolean queryAndUpdateInternal(final long h0, final long h1) {
+    final long numBits = bitArray_.getCapacity();
+    boolean valueAlreadyExists = true;
+    for (int i = 1; i <= numHashes_; ++i) {
+      final long hashIndex = ((h0 + i * h1) >>> 1) % numBits;
+      // returns old value of bit
+      valueAlreadyExists &= bitArray_.getAndSetBit(hashIndex);
+    }
+    return valueAlreadyExists;
+  }
+
+  // QUERY METHODS
+  /**
+   * Queries the filter with the provided long and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item
+   */
+  public boolean query(final long item) {
+    final long h0 = XxHash.hashLong(item, seed_);
+    final long h1 = XxHash.hashLong(item, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided double and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible. Double values are
+   * canonicalized (NaN and +/- infinity) before querying.
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item
+   */
+  public boolean query(final double item) {
+    // canonicalize all NaN & +/- infinity forms    
+    final long[] data = { Double.doubleToLongBits(item) };
+    final long h0 = XxHash.hashLongArr(data, 0, 1, seed_);
+    final long h1 = XxHash.hashLongArr(data, 0, 1, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided String and returns whether the
+   * value <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * The string is converted to a byte array using UTF8 encoding.
+   *
+   * <p>Note: this will not produce the same output hash values as the {@link 
#query(char[])}
+   * method and will generally be a little slower depending on the complexity 
of the UTF8 encoding.
+   * </p>
+   *
+   * @param item an item with which to query the filter
+   * @return The result of querying the filter with the given item, or false 
if item is null
+   */
+  public boolean query(final String item) {
+    if (item == null || item.isEmpty()) { return false; }    
+    final byte[] strBytes = item.getBytes(StandardCharsets.UTF_8);
+    final long h0 = XxHash.hashByteArr(strBytes, 0, strBytes.length, seed_);
+    final long h1 = XxHash.hashByteArr(strBytes, 0, strBytes.length, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided byte[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false 
if data is null
+   */
+  public boolean query(final byte[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashByteArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashByteArr(data, 0, data.length, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided char[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false 
if data is null
+   */
+  public boolean query(final char[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashCharArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashCharArr(data, 0, data.length, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided short[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false 
if data is null
+   */
+  public boolean query(final short[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashShortArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashShortArr(data, 0, data.length, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided int[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false 
if data is null
+   */
+  public boolean query(final int[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashIntArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashIntArr(data, 0, data.length, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided long[] and returns whether the
+   * array <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param data an array with which to query the filter
+   * @return The result of querying the filter with the given data, or false 
if data is null
+   */
+  public boolean query(final long[] data) {
+    if (data == null) { return false; }
+    final long h0 = XxHash.hashLongArr(data, 0, data.length, seed_);
+    final long h1 = XxHash.hashLongArr(data, 0, data.length, h0);
+    return queryInternal(h0, h1);
+  }
+
+  /**
+   * Queries the filter with the provided Memory and returns whether the
+   * data <em>might</em> have been seen previously. The filter's expected
+   * False Positive Probability determines the chances of a true result being
+   * a false positive. False negatives are never possible.
+   * @param mem a Memory array with which to query the filter
+   * @return The result of querying the filter with the given Memory, or false 
if mem is null
+   */
+  public boolean query(final Memory mem) {
+    if (mem == null) { return false; }
+    final long h0 = mem.xxHash64(0, mem.getCapacity(), seed_);
+    final long h1 = mem.xxHash64(0, mem.getCapacity(), h0);
+    return queryInternal(h0, h1);
+  }
+
+  // Internal method to query the filter given pre-computed hashes
+  private boolean queryInternal(final long h0, final long h1) {
+    final long numBits = bitArray_.getCapacity();
+    for (int i = 1; i <= numHashes_; ++i) {
+      final long hashIndex = ((h0 + i * h1) >>> 1) % numBits;
+      // returns old value of bit
+      if (!bitArray_.getBit(hashIndex)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // OTHER OPERATIONS
+  /**
+   * Unions two BloomFilters by applying a logical OR. The result will 
recognize
+   * any values seen by either filter (as well as false positives).
+   * @param other A BloomFilter to union with this one
+   */
+  public void union(final BloomFilter other) {
+    if (other == null) { return; }
+    if (!isCompatible(other)) {
+      throw new SketchesArgumentException("Cannot union sketches with 
different seeds, hash functions, or sizes");
+    }
+
+    bitArray_.union(other.bitArray_);
+  }
+
+  /**
+   * Intersects two BloomFilters by applying a logical AND. The result will 
recognize
+   * only values seen by both filters (as well as false positives).
+   * @param other A BloomFilter to intersect with this one
+   */
+  public void intersect(final BloomFilter other) {
+    if (other == null) { return; }
+    if (!isCompatible(other)) {
+      throw new SketchesArgumentException("Cannot union sketches with 
different seeds, hash functions, or sizes");
+    }
+
+    bitArray_.intersect(other.bitArray_);
+  }
+
+  /**
+   * Inverts all the bits of the BloomFilter. Approximately inverts the notion 
of set-membership.
+   */
+  public void invert() {
+    bitArray_.invert();
+  }
+
+  /**
+   * Helps identify if two BloomFilters may be unioned or intersected.
+   * @param other A BloomFilter to check for compatibility with this one
+   * @return True if the filters are compatible, otherwise false
+   */
+  public boolean isCompatible(final BloomFilter other) {
+    if (other == null
+        || seed_ != other.seed_
+        || numHashes_ != other.numHashes_
+        || bitArray_.getArrayLength() != other.bitArray_.getArrayLength()) {
+          return false;
+    }
+    return true;
+  }
+
+  /**
+   * Returns the length of this BloomFilter when serialized, in bytes
+   * @return The length of this BloomFilter when serialized, in bytes
+   */
+  public long getSerializedSizeBytes() {
+    long sizeBytes = 2 * Long.BYTES; // basic sketch info + baseSeed

Review Comment:
   ## Result of multiplication cast to wider type
   
   Potential overflow in [int multiplication](1) before it is converted to long 
by use in an assignment context.
   
   [Show more 
details](https://github.com/apache/datasketches-java/security/code-scanning/693)



##########
src/main/java/org/apache/datasketches/filters/bloomfilter/BitArray.java:
##########
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.bloomfilter;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.memory.Memory;
+import org.apache.datasketches.memory.WritableMemory;
+
+/**
+ * This class holds an array of bits suitable for use in a Bloom Filter
+ * 
+ * <p>Rounds the number of bits up to the smallest multiple of 64 (one long)
+ * that is not smaller than the specified number.
+ */
+final class BitArray {
+  // MAX_BITS using longs, based on array indices being capped at 
Integer.MAX_VALUE
+  private static final long MAX_BITS = Integer.MAX_VALUE * (long) Long.SIZE; 
+  private static final long DATA_OFFSET = 8; // offset into memory for start 
of data array
+
+  private long numBitsSet_;  // if -1, need to recompute value
+  private long[] data_;
+
+  // creates an empty (all zero) array able to hold at least numBits bits;
+  // storage is allocated in whole longs, so capacity is rounded up to a multiple of 64.
+  // throws SketchesArgumentException if numBits is not in the range (0, MAX_BITS]
+  BitArray(final long numBits) {
+    if (numBits <= 0) {
+      throw new SketchesArgumentException(
+          "Number of bits must be strictly positive. Found: " + numBits);
+    }
+    if (numBits > MAX_BITS) {
+      throw new SketchesArgumentException(
+          "Number of bits may not exceed " + MAX_BITS + ". Found: " + numBits);
+    }
+
+    // ceiling division by 64; numBits <= MAX_BITS so the sum cannot overflow
+    // and the quotient always fits in an int
+    final long longsNeeded = (numBits + Long.SIZE - 1) / Long.SIZE;
+    data_ = new long[(int) longsNeeded];
+    numBitsSet_ = 0;
+  }
+
+  // wraps the provided array directly (no copy is made) and tallies the bits
+  // already set in it so that numBitsSet_ starts out accurate
+  BitArray(final long[] data) {
+    long bitsSet = 0;
+    for (int i = 0; i < data.length; ++i) {
+      bitsSet += Long.bitCount(data[i]);
+    }
+    data_ = data;
+    numBitsSet_ = bitsSet;
+  }
+
+  // reads a serialized image, but the BitArray is not fully self-describing 
so requires
+  // a flag to indicate whether the array is empty
+  static BitArray heapify(final Memory mem, final boolean isEmpty) {
+    final int numLongs = mem.getInt(0);
+    if (numLongs < 0) {
+      throw new SketchesArgumentException("Possible corruption: Must have 
strictly positive array size. Found: " + numLongs);
+    }
+    
+    if (isEmpty) {
+      return new BitArray(numLongs * Long.SIZE);

Review Comment:
   ## Result of multiplication cast to wider type
   
   Potential overflow in [int multiplication](1) before it is converted to long 
by use in an invocation context.
   
   [Show more 
details](https://github.com/apache/datasketches-java/security/code-scanning/691)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Bloom Filter (datasketches-java)

Reply via email to