This is an automated email from the ASF dual-hosted git repository.

charlie pushed a commit to branch master
in repository 
https://gitbox.apache.org/repos/asf/datasketches-characterization.git

commit 145301a14b862601813cc70bb92eb19b7a773a14
Author: c-dickens <[email protected]>
AuthorDate: Fri Aug 2 11:43:21 2024 +0100

    Simplified experiment code and added trials
---
 .../filters/QuotientFilterExpansionProfile.java    | 107 +++++++++++++++++++++
 .../filters/QuotientFilterExpansionJob.conf        |  43 +++++++++
 2 files changed, 150 insertions(+)

diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterExpansionProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterExpansionProfile.java
new file mode 100644
index 0000000..c07d226
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterExpansionProfile.java
@@ -0,0 +1,107 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.Job;
+import org.apache.datasketches.JobProfile;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilter;
+
+import java.util.ArrayList;
+
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public class QuotientFilterExpansionProfile implements JobProfile{
+
+    private Job job;
+    long vIn = 0L;
+    int startLgU;
+    int endLgU;
+    int uPPO;
+    int lgK;
+    int startLenFprint;
+    public ArrayList<Long> negativeItems = new ArrayList<>();
+    int lgTrials;
+    int numTrials;
+
+    @Override
+    public void start(final Job job) {
+        this.job = job;
+        runExpansionTrials();
+    }
+
+    @Override
+    public void shutdown() { }
+
+    @Override
+    public void cleanup() { }
+
+    private void runExpansionTrials(){
+        //final ArrayList<Long> negativeItems = new ArrayList<>();
+        startLgU = Integer.parseInt(job.getProperties().mustGet("startLgU"));
+        endLgU = Integer.parseInt(job.getProperties().mustGet("endLgU"));
+        uPPO = Integer.parseInt(job.getProperties().mustGet("Universe_uPPO"));
+        lgK = Integer.parseInt(job.getProperties().mustGet("startLgNumSlots"));
+        startLenFprint = 
Integer.parseInt(job.getProperties().mustGet("startLenFprint"));
+        final int lgNumQueries = 
Integer.parseInt(job.getProperties().mustGet("lgNumQueries"));
+        lgTrials = Integer.parseInt(job.getProperties().mustGet("lgTrials"));
+        final long numQueries = 1L << lgNumQueries;
+        final int numTrials = 1 << lgTrials;
+
+        job.println(getHeader());
+        for (int t = 0; t < numTrials; t++)  doTrial(numQueries, startLgU, 
endLgU);
+    }
+
+    /*
+    The number of collisions in N keys inserted to a length M hash table is 
approximately N^2/2M.
+    This can be seen through the recursive relationship
+     */
+    private void doTrial(long numQueries, long startLgU, long endLgU){
+        QuotientFilter qf = new QuotientFilter(lgK, startLenFprint+3);
+        final StringBuilder dataStr = new StringBuilder();
+
+        // Populate the negative items.  Do this first to easily keep it 
separate from input item set.
+        negativeItems.clear();
+        for (long i = 0; i < numQueries; i++ ) {negativeItems.add(++vIn) ;}
+
+        // vIn is now the starting point; add a specified number of items from 
this location to the end of the range.
+        long startPoint = 0L;
+        long inputCardinality = (1L << startLgU) ;
+        long numInsertions;
+        long maxCardinality = (1L << endLgU) ; // end point
+
+        while(inputCardinality < maxCardinality) {
+            numInsertions = inputCardinality - startPoint;
+            for (long i = 0; i < numInsertions; i++) { qf.insert(++vIn);}
+            startPoint = inputCardinality;
+
+            // test the false positive rate
+            long numFalsePositive = 0;
+            for (long negItem : negativeItems) {
+                if (qf.search(negItem)) ++numFalsePositive;
+            }
+            double fpr = (double) numFalsePositive / numQueries;
+            process(inputCardinality, qf.getNumEntries(), qf.getNumSlots(), 
qf.getFingerprintLength(), fpr, dataStr);
+            job.println(dataStr.toString());
+            inputCardinality = pwr2SeriesNext(uPPO, inputCardinality);
+        }
+    }
+
+    private static void process(final long numInput, final long numEntries, 
final long numSlots, final int fPrintLen,
+                                final double falsePositiveRate, final 
StringBuilder sb){
+        // OUTPUT
+        sb.setLength(0);
+        sb.append(numSlots).append(TAB);
+        sb.append(numInput).append(TAB);
+        sb.append(fPrintLen).append(TAB);
+        sb.append(numEntries).append(TAB);
+        sb.append(falsePositiveRate);
+    }
+
+    private String getHeader() {
+        final StringBuilder sb = new StringBuilder();
+        sb.append("NumSlots").append(TAB);
+        sb.append("NumInput").append(TAB);
+        sb.append("FPrintLen").append(TAB);
+        sb.append("NumEntries").append(TAB);
+        sb.append("FalsePositiveRate");
+        return sb.toString();
+    }
+}
diff --git a/src/main/resources/filters/QuotientFilterExpansionJob.conf 
b/src/main/resources/filters/QuotientFilterExpansionJob.conf
new file mode 100644
index 0000000..3879123
--- /dev/null
+++ b/src/main/resources/filters/QuotientFilterExpansionJob.conf
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Uniques Profile
+Universe_lgU=15 # Maximum log2 of the input set.
+Universe_uPPO=4   #The horizontal x-resolution of trials points
+
+
+# Trials Profile
+Trials_lgMinT=0
+Trials_lgMaxT=0
+Trials_TPPO=1
+Trials_lgMinBpU=1
+Trials_lgMaxBpU=5
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.QuotientFilterExpansionProfile
+startLgU = 9
+endLgU = 12
+startLgNumSlots = 10
+startLenFprint = 5
+lgNumQueries = 10
+lgTrials = 3
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to