This is an automated email from the ASF dual-hosted git repository.

charlie pushed a commit to branch master
in repository 
https://gitbox.apache.org/repos/asf/datasketches-characterization.git


The following commit(s) were added to refs/heads/master by this push:
     new 69f906e  Added space profiles
69f906e is described below

commit 69f906eeeb2949e79b09346702eaddd89317fd09
Author: c-dickens <[email protected]>
AuthorDate: Mon Jul 29 15:46:54 2024 +0100

    Added space profiles
---
 .../characterization/filters/BaseSizeProfile.java  | 161 +++++++++++++++++++++
 .../filters/BloomFilterSizeProfile.java            |  49 +++++++
 .../filters/QuotientFilterSizeProfile.java         |  47 ++++++
 .../resources/filters/BloomFilterSpaceJob.conf     |  45 ++++++
 .../resources/filters/QuotientFilterSpaceJob.conf  |  41 ++++++
 5 files changed, 343 insertions(+)

diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/BaseSizeProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/BaseSizeProfile.java
new file mode 100644
index 0000000..23ecf8e
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/BaseSizeProfile.java
@@ -0,0 +1,161 @@
+package org.apache.datasketches.characterization.filters;
+import org.apache.datasketches.Job;
+import org.apache.datasketches.JobProfile;
+import org.apache.datasketches.Properties;
+import org.apache.datasketches.characterization.AccuracyStats;
+import org.apache.datasketches.quantiles.DoublesSketch;
+
+import java.util.ArrayList;
+
+import static java.lang.Math.log;
+import static java.lang.Math.pow;
+import static org.apache.datasketches.GaussianRanks.GAUSSIANS_4SD;
+import static org.apache.datasketches.common.Util.milliSecToString;
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+class trialResults{
+    long filterSizeBits;
+    double measuredFalsePositiveRate;
+
+    int numHashbits;
+}
+
+public abstract class BaseSizeProfile implements JobProfile{
+    Job job;
+    public Properties prop;
+    double targetFpp;
+    public long vIn = 0;
+    int lgMinT;
+    int lgMaxT;
+    int tPPO;
+    int lgMinU;
+    int lgMaxU;
+    int uPPO;
+    long inputCardinality;
+    public ArrayList<Long> inputItems = new ArrayList<>();
+    public ArrayList<Long> negativeItems = new ArrayList<>();
+    int numTrials;
+    int lgMinBpU;
+    int lgMaxBpU;
+    double slope;
+
+
+    @Override
+    public void start(final Job job) {
+
+        this.job = job;
+        prop = job.getProperties();
+        targetFpp = Double.parseDouble(prop.mustGet("targetFpp"));
+        //Uniques Profile
+        lgMinU = Integer.parseInt(prop.mustGet("Trials_lgMinU"));
+        lgMaxU = Integer.parseInt(prop.mustGet("Trials_lgMaxU"));
+        uPPO = Integer.parseInt(prop.mustGet("Trials_UPPO"));
+        inputCardinality = (int)pwr2SeriesNext(uPPO, 1L<<lgMinU);
+
+        //Trials Profile
+        lgMinT = Integer.parseInt(prop.mustGet("Trials_lgMinT"));
+        lgMaxT = Integer.parseInt(prop.mustGet("Trials_lgMaxT"));
+        tPPO = Integer.parseInt(prop.mustGet("Trials_TPPO"));
+
+        // Uniques profile
+        lgMinBpU = Integer.parseInt(prop.mustGet("Trials_lgMinBpU"));
+        lgMaxBpU = Integer.parseInt(prop.mustGet("Trials_lgMaxBpU"));
+        slope = (double) (lgMaxT - lgMinT) / (lgMinBpU - lgMaxBpU);
+
+        configure();
+        doTrials();
+        shutdown();
+        cleanup();
+    }
+
+    @Override
+    public void shutdown() {}
+
+    @Override
+    public void cleanup() {}
+
+    public abstract void configure();
+
+    // In here should have the logic to initialize a new sketch for a 
different number of input items for each trial.
+    //public abstract long doTrial(long inputCardinality, double targetFpp);
+    public abstract trialResults doTrial(long inputCardinality, double 
targetFpp);
+
+    /*
+     This experiment varies the cardinality of the input and measures the 
filter size required
+     to obtain an input false positive rate.
+     Inputs:
+     - falsePositiveRate: the target false positive rate
+     */
+    private void doTrials() {
+        final StringBuilder dataStr = new StringBuilder();
+        final int minT = 1 << lgMinT;
+        final int maxT = 1 << lgMaxT;
+        final long maxU = 1L << lgMaxU;
+        job.println(getHeader());
+
+        while(inputCardinality < maxU) {
+            numTrials = getNumTrials(inputCardinality);
+            //doTrial(inputCardinality, targetFpp, final long numQueries);
+            inputCardinality = (int)pwr2SeriesNext(uPPO, inputCardinality);
+            //long filterNumBits = doTrial(inputCardinality, targetFpp) ;
+            trialResults results =  doTrial(inputCardinality, targetFpp) ;
+            process(inputCardinality, results.filterSizeBits, numTrials,
+                    results.measuredFalsePositiveRate,  results.numHashbits, 
dataStr);
+            job.println(dataStr.toString()) ;
+        }
+    }
+
+
+    /**
+     * Computes the number of trials for a given current number of uniques for 
a
+     * trial set. This is used to decrease the number of trials
+     * as the number of uniques increase.
+     *
+     * @param curU the given current number of uniques for a trial set.
+     * @return the number of trials for a given current number of uniques for a
+     * trial set.
+     */
+    private int getNumTrials(final long curU) {
+        final int minBpU = 1 << lgMinBpU;
+        final int maxBpU = 1 << lgMaxBpU;
+        final int maxT = 1 << lgMaxT;
+        final int minT = 1 << lgMinT;
+        if (lgMinT == lgMaxT || curU <= minBpU) {
+            return maxT;
+        }
+        if (curU >= maxBpU) {
+            return minT;
+        }
+        final double lgCurU = log(curU) / LN2;
+        final double lgTrials = slope * (lgCurU - lgMinBpU) + lgMaxT;
+        return (int) pow(2.0, lgTrials);
+    }
+
+    /**
+     * Processes the results of a trial and appends them to a StringBuilder in 
a tab-separated format.
+     *
+     * @param inputCardinality The number of hashes used in the trial.
+     * @param sb The StringBuilder to which the results are appended.
+     */
+    private static void process(final long inputCardinality, final long 
sizeInBits, final long numTrials,
+                                final double falsePositiveRate, final int 
numHashBits, final StringBuilder sb) {
+        // OUTPUT
+        sb.setLength(0);
+        sb.append(inputCardinality).append(TAB);
+        sb.append(sizeInBits).append(TAB);
+        sb.append(numTrials).append(TAB);
+        sb.append(falsePositiveRate).append(TAB);
+        sb.append(numHashBits);
+    }
+
+    private String getHeader() {
+        final StringBuilder sb = new StringBuilder();
+        sb.append("TrueU").append(TAB);
+        sb.append("Size").append(TAB);
+        sb.append("NumTrials").append(TAB);
+        sb.append("FalsePositiveRate").append(TAB);
+        sb.append("NumHashBits");
+        return sb.toString();
+    }
+
+}
diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterSizeProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterSizeProfile.java
new file mode 100644
index 0000000..e90ad46
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterSizeProfile.java
@@ -0,0 +1,49 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.bloomfilter.BloomFilter;
+import org.apache.datasketches.filters.bloomfilter.BloomFilterBuilder;
+
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public class BloomFilterSizeProfile extends BaseSizeProfile {
+    protected BloomFilter sketch;
+
+    public void configure() {}
+
+    @Override
+    //public long doTrial(long inputCardinality, double targetFpp) {
+    public trialResults doTrial(long inputCardinality, double targetFpp){
+        trialResults result = new trialResults();
+        final long numBits = 
BloomFilterBuilder.suggestNumFilterBits(inputCardinality, targetFpp);
+        final short numHashes = 
BloomFilterBuilder.suggestNumHashes(inputCardinality, numBits);
+        sketch = BloomFilterBuilder.createBySize(numBits, (int)numHashes-4, 
348675132L);
+
+        //sketch = BloomFilterBuilder.createByAccuracy(inputCardinality, 
targetFpp);
+        long numQueries = pwr2SeriesNext(1, 1L<<(sketch.getNumHashes()+4));
+        // Build the test sets but clear them first so that they have the 
correct cardinality and no surplus is added.
+        inputItems.clear();
+        negativeItems.clear();
+        long item;
+        for (long i = 0; i < inputCardinality; i++){
+            item = ++vIn;
+            inputItems.add(item) ;
+            sketch.update(item);
+        }
+
+        for (long i = inputCardinality; i < inputCardinality+numQueries; i++ ) 
{
+            item = ++vIn;
+            negativeItems.add(item) ;
+        }
+
+        // Check the number of false positives
+        long numFalsePositive = 0 ;
+        for (int i = 0; i < negativeItems.size(); i++){
+            if (sketch.query(negativeItems.get(i))) ++numFalsePositive ;
+        }
+        result.filterSizeBits = sketch.getCapacity();
+        result.measuredFalsePositiveRate = (double)numFalsePositive / 
numQueries;
+        result.numHashbits = sketch.getNumHashes();
+        return result;
+    }
+
+}
\ No newline at end of file
diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterSizeProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterSizeProfile.java
new file mode 100644
index 0000000..ad91001
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterSizeProfile.java
@@ -0,0 +1,47 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.quotientfilter.QuotientFilter;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
+
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public class QuotientFilterSizeProfile extends BaseSizeProfile {
+    protected QuotientFilter sketch;
+
+    public void configure() {}
+
+    @Override
+    //public long doTrial(long inputCardinality, double targetFpp) {
+    public trialResults doTrial(long inputCardinality, double targetFpp){
+        trialResults result = new trialResults();
+        int fingerprintLength = 
QuotientFilterBuilder.suggestFingerprintLength(targetFpp);
+        int lgNumSlots = 
QuotientFilterBuilder.suggestLgNumSlots(inputCardinality); // Make sure to 
check this line if you want -1 or not
+        sketch = new QuotientFilter(lgNumSlots, fingerprintLength+3);
+
+        long numQueries = pwr2SeriesNext(1, 1L<<(fingerprintLength+1));
+        // Build the test sets but clear them first so that they have the 
correct cardinality and no surplus is added.
+        inputItems.clear();
+        negativeItems.clear();
+        long item;
+        for (long i = 0; i < inputCardinality; i++){
+            item = ++vIn;
+            inputItems.add(item) ;
+            sketch.insert(item);
+        }
+
+        for (long i = inputCardinality; i < inputCardinality+numQueries; i++ ) 
{
+            item = ++vIn;
+            negativeItems.add(item) ;
+        }
+
+        // Check the number of false positives
+        long numFalsePositive = 0 ;
+        for (int i = 0; i < negativeItems.size(); i++){
+            if (sketch.search(negativeItems.get(i))) ++numFalsePositive ;
+        }
+        result.filterSizeBits =  sketch.getSpaceUse();
+        result.measuredFalsePositiveRate = (double)numFalsePositive / 
numQueries;
+        result.numHashbits = fingerprintLength;
+        return result;
+    }
+}
diff --git a/src/main/resources/filters/BloomFilterSpaceJob.conf 
b/src/main/resources/filters/BloomFilterSpaceJob.conf
new file mode 100644
index 0000000..c6008a4
--- /dev/null
+++ b/src/main/resources/filters/BloomFilterSpaceJob.conf
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+## Uniques Profile
+Trials_lgMinU=0  #The starting # of uniques that is printed at the end.
+Trials_lgMaxU=20 #How high the # uniques go
+Trials_UPPO=10   #The horizontal x-resolution of trials points
+
+## Trials Profile
+Trials_lgMinT=10  #Min trials at tail (high counts) 4
+Trials_lgMaxT=14  #Min trials at tail (high counts) 4
+Trials_TPPO=1     #how often intermediate results are printed
+Trials_lgMinBpU=1   #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5  #stop the downward slope of trials at this LgU
+
+Trials_lgQK=12   #size of quantiles sketch
+Trials_interData=true
+Trials_postPMFs=false
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.BloomFilterSizeProfile
+targetFpp=1e-3
+
diff --git a/src/main/resources/filters/QuotientFilterSpaceJob.conf 
b/src/main/resources/filters/QuotientFilterSpaceJob.conf
new file mode 100644
index 0000000..17086a2
--- /dev/null
+++ b/src/main/resources/filters/QuotientFilterSpaceJob.conf
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+## Uniques Profile
+Trials_lgMinU=0  #The starting # of uniques that is printed at the end.
+Trials_lgMaxU=20 #How high the # uniques go
+Trials_UPPO=32   #The horizontal x-resolution of trials points
+
+## Trials Profile
+Trials_lgMinT=14  #Min trials at tail (high counts) 4
+Trials_lgMaxT=16  #Min trials at tail (high counts) 4
+Trials_TPPO=1     #how often intermediate results are printed
+Trials_lgMinBpU=1   #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5  #stop the downward slope of trials at this LgU
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.QuotientFilterSizeProfile
+targetFpp=1e-6
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to