Luo Chen has submitted this change and it was merged. Change subject: [ASTERIXDB-2243][STO] Fix BloomFilter size estimation ......................................................................
[ASTERIXDB-2243][STO] Fix BloomFilter size estimation - user model changes: no - storage format changes: no - interface changes: no Details: - Fix the bloom filter size estimation by using the actual number of elements after bulk loading. This prevents the bloom filter size from growing larger and larger under update-heavy workloads, where most of the ingested records are deleted through merge. Change-Id: Ib4054797d969efcfceb86f91b5321d34480e25c3 Reviewed-on: https://asterix-gerrit.ics.uci.edu/2285 Sonar-Qube: Jenkins <[email protected]> Reviewed-by: Michael Blow <[email protected]> Integration-Tests: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Contrib: Jenkins <[email protected]> --- M hyracks-fullstack/hyracks/hyracks-storage-am-bloomfilter/src/main/java/org/apache/hyracks/storage/am/bloomfilter/impls/BloomFilter.java 1 file changed, 11 insertions(+), 8 deletions(-) Approvals: Anon. E. Moose #1000171: Jenkins: Verified; No violations found; ; Verified Michael Blow: Looks good to me, approved diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-bloomfilter/src/main/java/org/apache/hyracks/storage/am/bloomfilter/impls/BloomFilter.java b/hyracks-fullstack/hyracks/hyracks-storage-am-bloomfilter/src/main/java/org/apache/hyracks/storage/am/bloomfilter/impls/BloomFilter.java index 3d8782a..6c16bd1 100644 --- a/hyracks-fullstack/hyracks/hyracks-storage-am-bloomfilter/src/main/java/org/apache/hyracks/storage/am/bloomfilter/impls/BloomFilter.java +++ b/hyracks-fullstack/hyracks/hyracks-storage-am-bloomfilter/src/main/java/org/apache/hyracks/storage/am/bloomfilter/impls/BloomFilter.java @@ -57,7 +57,7 @@ private int numHashes; private long numElements; private long numBits; - // keep trace of the version of the bloomfilter to be backward compatible + // keep track of the version of the bloomfilter to be backward compatible private int version; private final int numBitsPerPage; private final int numBlocksPerPage; @@ -277,27 +277,30 @@ public class 
BloomFilterBuilder implements IIndexBulkLoader { private final long[] hashes = BloomFilter.createHashArray(); - private final long numElements; + private final long estimatedNumElements; private final int numHashes; private final long numBits; private final int numPages; + private long actualNumElements; private final IFIFOPageQueue queue; private final ICachedPage[] pages; private ICachedPage metaDataPage = null; - public BloomFilterBuilder(long numElements, int numHashes, int numBitsPerElement) throws HyracksDataException { + public BloomFilterBuilder(long estimatedNumElemenets, int numHashes, int numBitsPerElement) + throws HyracksDataException { if (!isActivated) { throw HyracksDataException.create(ErrorCode.CANNOT_CREATE_BLOOM_FILTER_BUILDER_FOR_INACTIVE_FILTER); } queue = bufferCache.createFIFOQueue(); - this.numElements = numElements; + this.estimatedNumElements = estimatedNumElemenets; this.numHashes = numHashes; - numBits = this.numElements * numBitsPerElement; + numBits = this.estimatedNumElements * numBitsPerElement; long tmp = (long) Math.ceil(numBits / (double) numBitsPerPage); if (tmp > Integer.MAX_VALUE) { throw HyracksDataException.create(ErrorCode.CANNOT_CREATE_BLOOM_FILTER_WITH_NUMBER_OF_PAGES, tmp); } numPages = (int) tmp; + actualNumElements = 0; pages = new ICachedPage[numPages]; int currentPageId = 1; while (currentPageId <= numPages) { @@ -327,7 +330,7 @@ } metaDataPage.getBuffer().putInt(NUM_PAGES_OFFSET, numPages); metaDataPage.getBuffer().putInt(NUM_HASHES_USED_OFFSET, numHashes); - metaDataPage.getBuffer().putLong(NUM_ELEMENTS_OFFSET, numElements); + metaDataPage.getBuffer().putLong(NUM_ELEMENTS_OFFSET, actualNumElements); metaDataPage.getBuffer().putLong(NUM_BITS_OFFSET, numBits); metaDataPage.getBuffer().putInt(VERSION_OFFSET, BLOCKED_BLOOM_FILTER_VERSION); } @@ -337,6 +340,7 @@ if (numPages == 0) { throw HyracksDataException.create(ErrorCode.CANNOT_ADD_TUPLES_TO_DUMMY_BLOOM_FILTER); } + actualNumElements++; 
MurmurHash128Bit.hash3_x64_128(tuple, keyFields, SEED, hashes); long hash = Math.abs(hashes[0] % numBits); @@ -367,7 +371,7 @@ bufferCache.finishQueue(); BloomFilter.this.numBits = numBits; BloomFilter.this.numHashes = numHashes; - BloomFilter.this.numElements = numElements; + BloomFilter.this.numElements = actualNumElements; BloomFilter.this.numPages = numPages; BloomFilter.this.version = BLOCKED_BLOOM_FILTER_VERSION; } @@ -383,6 +387,5 @@ bufferCache.returnPage(metaDataPage, false); } } - } } -- To view, visit https://asterix-gerrit.ics.uci.edu/2285 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ib4054797d969efcfceb86f91b5321d34480e25c3 Gerrit-PatchSet: 4 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Luo Chen <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Ian Maxon <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Luo Chen <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]> Gerrit-Reviewer: abdullah alamoudi <[email protected]>
