Claudenw commented on code in PR #402: URL: https://github.com/apache/commons-collections/pull/402#discussion_r1255448741
########## src/main/java/org/apache/commons/collections4/bloomfilter/LayeredBloomFilter.java: ########## @@ -0,0 +1,416 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.commons.collections4.bloomfilter; + +import java.util.Arrays; +import java.util.NoSuchElementException; +import java.util.Objects; +import java.util.function.IntPredicate; +import java.util.function.LongPredicate; +import java.util.function.Predicate; + +/** + * Layered Bloom filters are described in Zhiwang, Cen; Jungang, Xu; Jian, Sun + * (2010), "A multi-layer Bloom filter for duplicated URL detection", Proc. 3rd + * International Conference on Advanced Computer Theory and Engineering (ICACTE + * 2010), vol. 1, pp. V1–586–V1–591, doi:10.1109/ICACTE.2010.5578947, ISBN + * 978-1-4244-6539-2, S2CID 3108985 + * <p> + * In short, Layered Bloom filter contains several bloom filters arranged in + * layers. + * </p> + * <ul> + * <li>When membership in the filter is checked each layer in turn is checked + * and if a match is found {@code true} is returned.</li> + * <li>When merging each bloom filter is merged into the last newest filter in + * the list of layers.</li> + * <li>When questions of cardinality are asked the cardinality of the union of + * the enclosed Bloom filters is used.</li> + * </ul> + * The net result is that the layered Bloom filter can be populated with more + * items than the Shape would indicate and yet still return a false positive + * rate in line with the Shape and not the over population. + * <p> + * This implementation uses a LayerManager to handle the manipulation of the + * layers. + * </p> + * <ul> + * <li>Level 0 is the oldest layer and the highest level is the newest.</li> + * <li>There is always at least one enclosed filter.</li> + * <li>The newest filter is the {@code target} into which merges are performed. + * <li>Whenever the target is retrieved, or a {@code merge} operation is + * performed the code checks if any older layers should be removed, and if so + * removes them. It also checks it a new layer should be added, and if so adds + * it and sets the {@code target} before the operation.</li> + * </ul> + */ +public class LayeredBloomFilter implements BloomFilter, BloomFilterProducer { + private final Shape shape; + private LayerManager layerManager; + + /** + * Creates a fixed size layered bloom filter that adds new filters to the list, + * but never merges them. List will never exceed maxDepth. As additional filters + * are added earlier filters are removed. + * + * @param shape The shape for the enclosed Bloom filters + * @param maxDepth The maximum depth of layers. + * @return An empty layered Bloom filter of the specified shape and depth. + */ + public static LayeredBloomFilter fixed(final Shape shape, int maxDepth) { + LayerManager manager = LayerManager.builder().withExtendCheck(LayerManager.ExtendCheck.ADVANCE_ON_POPULATED) + .withCleanup(LayerManager.Cleanup.onMaxSize(maxDepth)).withSuplier(() -> new SimpleBloomFilter(shape)) + .build(); + return new LayeredBloomFilter(shape, manager); + } + + /** + * Constructor. + * + * @param shape the Shape of the enclosed Bloom filters + * @param layerManager the LayerManager to manage the layers. + */ + public LayeredBloomFilter(Shape shape, LayerManager layerManager) { + this.shape = shape; + this.layerManager = layerManager; + } + + @Override + public LayeredBloomFilter copy() { + return new LayeredBloomFilter(shape, layerManager.copy()); + } + + /** + * Gets the depth of the deepest layer. + * + * @return the depth of the deepest layer. + */ + public final int getDepth() { + return layerManager.getDepth(); + } + + /** + * Gets the Bloom filter at the specified depth + * + * @param depth the depth of the filter to return. + * @return the Bloom filter at the specified depth. + * @throws NoSuchElementException if depth is not in the range [0,getDepth()) + */ + public BloomFilter get(int depth) { + return layerManager.get(depth); + } + + @Override + public int cardinality() { + return SetOperations.cardinality(this); + } + + @Override + public final void clear() { + layerManager.clear(); + } + + /** + * Clears the Bloom filter (removes all set bits) at the specified level. + * + * @param level the level to clear. + */ + public final void clear(int level) { + layerManager.get(level).clear(); + } + + /** + * Get the Bloom filter that is currently being merged into. This method ensures + * that the {@code target} filter meets the criteria specified within the + * {@code LayerManager}. if the {@code LayerManager} determines that a new + * target filter is required it is constructed and used. + * + * @return the current Bloom filter. + * @see LayerManager + */ + public final BloomFilter target() { + return layerManager.target(); + } + + /** + * Processes the Bloom filters in depth order with the most recent filters + * first. Each filter is passed to the predicate in turn. The function exits on + * the first {@code false} returned by the predicate. + * + * @param bloomFilterPredicate the predicate to execute. + * @return {@code true} if all filters passed the predicate, {@code false} + * otherwise. + */ + @Override + public final boolean forEachBloomFilter(Predicate<BloomFilter> bloomFilterPredicate) { + return layerManager.forEachBloomFilter(bloomFilterPredicate); + } + + /** + * Create a standard (non-layered) Bloom filter by merging all of the layers. + * + * @return the merged bloom filter. + */ + public BloomFilter flatten() { + BloomFilter bf = new SimpleBloomFilter(shape); + forEachBloomFilter(bf::merge); + return bf; + } + + /** + * Finds the layers in which the Hasher is found. + * + * @param hasher the Hasher to search for. + * @return an array of layer indices in which the Bloom filter is found. + */ + public int[] find(final Hasher hasher) { + SimpleBloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(hasher); + return find(bf); + } + + /** + * Finds the layers in which the IndexProducer is found. + * + * @param indexProducer the Index producer to search for. + * @return an array of layer indices in which the Bloom filter is found. + */ + public int[] find(final IndexProducer indexProducer) { + SimpleBloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(indexProducer); + return find(bf); + } + + /** + * Finds the layers in which the BitMapProducer is found. + * + * @param bitMapProducer the BitMapProducer to search for. + * @return an array of layer indices in which the Bloom filter is found. + */ + public int[] find(final BitMapProducer bitMapProducer) { + SimpleBloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(bitMapProducer); + return find(bf); + } + + /** + * Finds the layers in which the Bloom filter is found. + * + * @param bf the Bloom filter to search for. + * @return an array of layer indices in which the Bloom filter is found. + */ + public int[] find(BloomFilter bf) { + Finder finder = new Finder(bf); + forEachBloomFilter(finder); + return finder.getResult(); + } + + /** + * Returns {@code true} if this any layer contained by this filter contains the + * specified filter. + * <p> + * If the {@code other} is another Layered Bloom filter each filter within the + * {@code other} is checked to see if it exits within this filter. + * </p> + * + * @param other the other Bloom filter + * @return {@code true} if this filter contains the other filter. + */ + @Override + public boolean contains(final BloomFilter other) { + if (other instanceof LayeredBloomFilter) { + boolean[] result = { true }; + // return false when we have found a match. + ((LayeredBloomFilter) other).forEachBloomFilter(x -> { + result[0] &= contains(x); + return result[0]; + }); + return result[0]; + } + return !forEachBloomFilter(x -> !x.contains(other)); + } + + /** + * Creates a Bloom filter from a Hasher. + * + * @param hasher the hasher to create the filter from. + * @return the BloomFilter. + */ + private BloomFilter createFilter(final Hasher hasher) { + SimpleBloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(hasher); + return bf; + } + + /** + * Creates a Bloom filter from an IndexProducer. + * + * @param indexProducer the IndexProducer to create the filter from. + * @return the BloomFilter. + */ + private BloomFilter createFilter(final IndexProducer indexProducer) { + SimpleBloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(indexProducer); + return bf; + } + + /** + * Creates a Bloom filter from a BitMapProducer. + * + * @param bitMapProducer the BitMapProducer to create the filter from. + * @return the BloomFilter. + */ + private BloomFilter createFilter(final BitMapProducer bitMapProducer) { + SimpleBloomFilter bf = new SimpleBloomFilter(shape); + bf.merge(bitMapProducer); + return bf; + } + + @Override + public int characteristics() { + return 0; + } + + @Override + public final Shape getShape() { + return shape; + } + + @Override + public boolean contains(final Hasher hasher) { + return contains(createFilter(hasher)); + } + + @Override + public boolean contains(final BitMapProducer bitMapProducer) { + return contains(createFilter(bitMapProducer)); + } + + @Override + public boolean contains(IndexProducer indexProducer) { + return contains(createFilter(indexProducer)); + } + + @Override + public boolean merge(BloomFilter bf) { + return target().merge(bf); + } + + @Override + public boolean merge(IndexProducer indexProducer) { + return target().merge(indexProducer); + } + + @Override + public boolean merge(BitMapProducer bitMapProducer) { + return target().merge(bitMapProducer); + } + + @Override + public boolean forEachIndex(IntPredicate predicate) { + return forEachBloomFilter(bf -> bf.forEachIndex(predicate)); + } + + @Override + public boolean forEachBitMap(LongPredicate predicate) { Review Comment: By definition forEachBitMap may only return the number of BitMaps specified by the Shape or fewer, never more. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
