This is an automated email from the ASF dual-hosted git repository. charlie pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git
commit 145301a14b862601813cc70bb92eb19b7a773a14 Author: c-dickens <[email protected]> AuthorDate: Fri Aug 2 11:43:21 2024 +0100 Simplified experiment code and added trials --- .../filters/QuotientFilterExpansionProfile.java | 107 +++++++++++++++++++++ .../filters/QuotientFilterExpansionJob.conf | 43 +++++++++ 2 files changed, 150 insertions(+) diff --git a/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterExpansionProfile.java b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterExpansionProfile.java new file mode 100644 index 0000000..c07d226 --- /dev/null +++ b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterExpansionProfile.java @@ -0,0 +1,107 @@ +package org.apache.datasketches.characterization.filters; + +import org.apache.datasketches.Job; +import org.apache.datasketches.JobProfile; +import org.apache.datasketches.filters.quotientfilter.QuotientFilter; + +import java.util.ArrayList; + +import static org.apache.datasketches.common.Util.pwr2SeriesNext; + +public class QuotientFilterExpansionProfile implements JobProfile{ + + private Job job; + long vIn = 0L; + int startLgU; + int endLgU; + int uPPO; + int lgK; + int startLenFprint; + public ArrayList<Long> negativeItems = new ArrayList<>(); + int lgTrials; + int numTrials; + + @Override + public void start(final Job job) { + this.job = job; + runExpansionTrials(); + } + + @Override + public void shutdown() { } + + @Override + public void cleanup() { } + + private void runExpansionTrials(){ + //final ArrayList<Long> negativeItems = new ArrayList<>(); + startLgU = Integer.parseInt(job.getProperties().mustGet("startLgU")); + endLgU = Integer.parseInt(job.getProperties().mustGet("endLgU")); + uPPO = Integer.parseInt(job.getProperties().mustGet("Universe_uPPO")); + lgK = Integer.parseInt(job.getProperties().mustGet("startLgNumSlots")); + startLenFprint = Integer.parseInt(job.getProperties().mustGet("startLenFprint")); + final int lgNumQueries = Integer.parseInt(job.getProperties().mustGet("lgNumQueries")); + lgTrials = Integer.parseInt(job.getProperties().mustGet("lgTrials")); + final long numQueries = 1L << lgNumQueries; + final int numTrials = 1 << lgTrials; + + job.println(getHeader()); + for (int t = 0; t < numTrials; t++) doTrial(numQueries, startLgU, endLgU); + } + + /* + The number of collisions in N keys inserted to a length M hash table is approximately N^2/2M. + This can be seen through the recursive relationship + */ + private void doTrial(long numQueries, long startLgU, long endLgU){ + QuotientFilter qf = new QuotientFilter(lgK, startLenFprint+3); + final StringBuilder dataStr = new StringBuilder(); + + // Populate the negative items. Do this first to easily keep it separate from input item set. + negativeItems.clear(); + for (long i = 0; i < numQueries; i++ ) {negativeItems.add(++vIn) ;} + + // vIn is now the starting point; add a specified number of items from this location to the end of the range. + long startPoint = 0L; + long inputCardinality = (1L << startLgU) ; + long numInsertions; + long maxCardinality = (1L << endLgU) ; // end point + + while(inputCardinality < maxCardinality) { + numInsertions = inputCardinality - startPoint; + for (long i = 0; i < numInsertions; i++) { qf.insert(++vIn);} + startPoint = inputCardinality; + + // test the false positive rate + long numFalsePositive = 0; + for (long negItem : negativeItems) { + if (qf.search(negItem)) ++numFalsePositive; + } + double fpr = (double) numFalsePositive / numQueries; + process(inputCardinality, qf.getNumEntries(), qf.getNumSlots(), qf.getFingerprintLength(), fpr, dataStr); + job.println(dataStr.toString()); + inputCardinality = pwr2SeriesNext(uPPO, inputCardinality); + } + } + + private static void process(final long numInput, final long numEntries, final long numSlots, final int fPrintLen, + final double falsePositiveRate, final StringBuilder sb){ + // OUTPUT + sb.setLength(0); + sb.append(numSlots).append(TAB); + sb.append(numInput).append(TAB); + sb.append(fPrintLen).append(TAB); + sb.append(numEntries).append(TAB); + sb.append(falsePositiveRate); + } + + private String getHeader() { + final StringBuilder sb = new StringBuilder(); + sb.append("NumSlots").append(TAB); + sb.append("NumInput").append(TAB); + sb.append("FPrintLen").append(TAB); + sb.append("NumEntries").append(TAB); + sb.append("FalsePositiveRate"); + return sb.toString(); + } +} diff --git a/src/main/resources/filters/QuotientFilterExpansionJob.conf b/src/main/resources/filters/QuotientFilterExpansionJob.conf new file mode 100644 index 0000000..3879123 --- /dev/null +++ b/src/main/resources/filters/QuotientFilterExpansionJob.conf @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Uniques Profile +Universe_lgU=15 # Maximum log2 of the input set. +Universe_uPPO=4 #The horizontal x-resolution of trials points + + +# Trials Profile +Trials_lgMinT=0 +Trials_lgMaxT=0 +Trials_TPPO=1 +Trials_lgMinBpU=1 +Trials_lgMaxBpU=5 + +# Date-Time Profile +TimeZone=PST +TimeZoneOffset=-28800000 # offset in millisec +FileNameDateFormat=yyyyMMdd'_'HHmmssz +ReadableDateFormat=yyyy/MM/dd HH:mm:ss z + +#Job Profile +JobProfile=org.apache.datasketches.characterization.filters.QuotientFilterExpansionProfile +startLgU = 9 +endLgU = 12 +startLgNumSlots = 10 +startLenFprint = 5 +lgNumQueries = 10 +lgTrials = 3 \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
