This is an automated email from the ASF dual-hosted git repository.
charlie pushed a commit to branch master
in repository
https://gitbox.apache.org/repos/asf/datasketches-characterization.git
The following commit(s) were added to refs/heads/master by this push:
new 69f906e Added space profiles
69f906e is described below
commit 69f906eeeb2949e79b09346702eaddd89317fd09
Author: c-dickens <[email protected]>
AuthorDate: Mon Jul 29 15:46:54 2024 +0100
Added space profiles
---
.../characterization/filters/BaseSizeProfile.java | 161 +++++++++++++++++++++
.../filters/BloomFilterSizeProfile.java | 49 +++++++
.../filters/QuotientFilterSizeProfile.java | 47 ++++++
.../resources/filters/BloomFilterSpaceJob.conf | 45 ++++++
.../resources/filters/QuotientFilterSpaceJob.conf | 41 ++++++
5 files changed, 343 insertions(+)
diff --git
a/src/main/java/org/apache/datasketches/characterization/filters/BaseSizeProfile.java
b/src/main/java/org/apache/datasketches/characterization/filters/BaseSizeProfile.java
new file mode 100644
index 0000000..23ecf8e
--- /dev/null
+++
b/src/main/java/org/apache/datasketches/characterization/filters/BaseSizeProfile.java
@@ -0,0 +1,161 @@
+package org.apache.datasketches.characterization.filters;
+import org.apache.datasketches.Job;
+import org.apache.datasketches.JobProfile;
+import org.apache.datasketches.Properties;
+import org.apache.datasketches.characterization.AccuracyStats;
+import org.apache.datasketches.quantiles.DoublesSketch;
+
+import java.util.ArrayList;
+
+import static java.lang.Math.log;
+import static java.lang.Math.pow;
+import static org.apache.datasketches.GaussianRanks.GAUSSIANS_4SD;
+import static org.apache.datasketches.common.Util.milliSecToString;
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+class trialResults{
+ long filterSizeBits;
+ double measuredFalsePositiveRate;
+
+ int numHashbits;
+}
+
+public abstract class BaseSizeProfile implements JobProfile{
+ Job job;
+ public Properties prop;
+ double targetFpp;
+ public long vIn = 0;
+ int lgMinT;
+ int lgMaxT;
+ int tPPO;
+ int lgMinU;
+ int lgMaxU;
+ int uPPO;
+ long inputCardinality;
+ public ArrayList<Long> inputItems = new ArrayList<>();
+ public ArrayList<Long> negativeItems = new ArrayList<>();
+ int numTrials;
+ int lgMinBpU;
+ int lgMaxBpU;
+ double slope;
+
+
+ @Override
+ public void start(final Job job) {
+
+ this.job = job;
+ prop = job.getProperties();
+ targetFpp = Double.parseDouble(prop.mustGet("targetFpp"));
+ //Uniques Profile
+ lgMinU = Integer.parseInt(prop.mustGet("Trials_lgMinU"));
+ lgMaxU = Integer.parseInt(prop.mustGet("Trials_lgMaxU"));
+ uPPO = Integer.parseInt(prop.mustGet("Trials_UPPO"));
+ inputCardinality = (int)pwr2SeriesNext(uPPO, 1L<<lgMinU);
+
+ //Trials Profile
+ lgMinT = Integer.parseInt(prop.mustGet("Trials_lgMinT"));
+ lgMaxT = Integer.parseInt(prop.mustGet("Trials_lgMaxT"));
+ tPPO = Integer.parseInt(prop.mustGet("Trials_TPPO"));
+
+ // Uniques profile
+ lgMinBpU = Integer.parseInt(prop.mustGet("Trials_lgMinBpU"));
+ lgMaxBpU = Integer.parseInt(prop.mustGet("Trials_lgMaxBpU"));
+ slope = (double) (lgMaxT - lgMinT) / (lgMinBpU - lgMaxBpU);
+
+ configure();
+ doTrials();
+ shutdown();
+ cleanup();
+ }
+
+ @Override
+ public void shutdown() {}
+
+ @Override
+ public void cleanup() {}
+
+ public abstract void configure();
+
+ // In here should have the logic to initialize a new sketch for a
different number of input items for each trial.
+ //public abstract long doTrial(long inputCardinality, double targetFpp);
+ public abstract trialResults doTrial(long inputCardinality, double
targetFpp);
+
+ /*
+ This experiment varies the cardinality of the input and measures the
filter size required
+ to obtain an input false positive rate.
+ Inputs:
+ - falsePositiveRate: the target false positive rate
+ */
+ private void doTrials() {
+ final StringBuilder dataStr = new StringBuilder();
+ final int minT = 1 << lgMinT;
+ final int maxT = 1 << lgMaxT;
+ final long maxU = 1L << lgMaxU;
+ job.println(getHeader());
+
+ while(inputCardinality < maxU) {
+ numTrials = getNumTrials(inputCardinality);
+ //doTrial(inputCardinality, targetFpp, final long numQueries);
+ inputCardinality = (int)pwr2SeriesNext(uPPO, inputCardinality);
+ //long filterNumBits = doTrial(inputCardinality, targetFpp) ;
+ trialResults results = doTrial(inputCardinality, targetFpp) ;
+ process(inputCardinality, results.filterSizeBits, numTrials,
+ results.measuredFalsePositiveRate, results.numHashbits,
dataStr);
+ job.println(dataStr.toString()) ;
+ }
+ }
+
+
+ /**
+ * Computes the number of trials for a given current number of uniques for
a
+ * trial set. This is used to decrease the number of trials
+ * as the number of uniques increase.
+ *
+ * @param curU the given current number of uniques for a trial set.
+ * @return the number of trials for a given current number of uniques for a
+ * trial set.
+ */
+ private int getNumTrials(final long curU) {
+ final int minBpU = 1 << lgMinBpU;
+ final int maxBpU = 1 << lgMaxBpU;
+ final int maxT = 1 << lgMaxT;
+ final int minT = 1 << lgMinT;
+ if (lgMinT == lgMaxT || curU <= minBpU) {
+ return maxT;
+ }
+ if (curU >= maxBpU) {
+ return minT;
+ }
+ final double lgCurU = log(curU) / LN2;
+ final double lgTrials = slope * (lgCurU - lgMinBpU) + lgMaxT;
+ return (int) pow(2.0, lgTrials);
+ }
+
+ /**
+ * Processes the results of a trial and appends them to a StringBuilder in
a tab-separated format.
+ *
+ * @param inputCardinality The number of hashes used in the trial.
+ * @param sb The StringBuilder to which the results are appended.
+ */
+ private static void process(final long inputCardinality, final long
sizeInBits, final long numTrials,
+ final double falsePositiveRate, final int
numHashBits, final StringBuilder sb) {
+ // OUTPUT
+ sb.setLength(0);
+ sb.append(inputCardinality).append(TAB);
+ sb.append(sizeInBits).append(TAB);
+ sb.append(numTrials).append(TAB);
+ sb.append(falsePositiveRate).append(TAB);
+ sb.append(numHashBits);
+ }
+
+ private String getHeader() {
+ final StringBuilder sb = new StringBuilder();
+ sb.append("TrueU").append(TAB);
+ sb.append("Size").append(TAB);
+ sb.append("NumTrials").append(TAB);
+ sb.append("FalsePositiveRate").append(TAB);
+ sb.append("NumHashBits");
+ return sb.toString();
+ }
+
+}
diff --git
a/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterSizeProfile.java
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterSizeProfile.java
new file mode 100644
index 0000000..e90ad46
--- /dev/null
+++
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterSizeProfile.java
@@ -0,0 +1,49 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.bloomfilter.BloomFilter;
+import org.apache.datasketches.filters.bloomfilter.BloomFilterBuilder;
+
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public class BloomFilterSizeProfile extends BaseSizeProfile {
+ protected BloomFilter sketch;
+
+ public void configure() {}
+
+ @Override
+ //public long doTrial(long inputCardinality, double targetFpp) {
+ public trialResults doTrial(long inputCardinality, double targetFpp){
+ trialResults result = new trialResults();
+ final long numBits =
BloomFilterBuilder.suggestNumFilterBits(inputCardinality, targetFpp);
+ final short numHashes =
BloomFilterBuilder.suggestNumHashes(inputCardinality, numBits);
+ sketch = BloomFilterBuilder.createBySize(numBits, (int)numHashes-4,
348675132L);
+
+ //sketch = BloomFilterBuilder.createByAccuracy(inputCardinality,
targetFpp);
+ long numQueries = pwr2SeriesNext(1, 1L<<(sketch.getNumHashes()+4));
+ // Build the test sets but clear them first so that they have the
correct cardinality and no surplus is added.
+ inputItems.clear();
+ negativeItems.clear();
+ long item;
+ for (long i = 0; i < inputCardinality; i++){
+ item = ++vIn;
+ inputItems.add(item) ;
+ sketch.update(item);
+ }
+
+ for (long i = inputCardinality; i < inputCardinality+numQueries; i++ )
{
+ item = ++vIn;
+ negativeItems.add(item) ;
+ }
+
+ // Check the number of false positives
+ long numFalsePositive = 0 ;
+ for (int i = 0; i < negativeItems.size(); i++){
+ if (sketch.query(negativeItems.get(i))) ++numFalsePositive ;
+ }
+ result.filterSizeBits = sketch.getCapacity();
+ result.measuredFalsePositiveRate = (double)numFalsePositive /
numQueries;
+ result.numHashbits = sketch.getNumHashes();
+ return result;
+ }
+
+}
\ No newline at end of file
diff --git
a/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterSizeProfile.java
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterSizeProfile.java
new file mode 100644
index 0000000..ad91001
--- /dev/null
+++
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterSizeProfile.java
@@ -0,0 +1,47 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.quotientfilter.QuotientFilter;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
+
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public class QuotientFilterSizeProfile extends BaseSizeProfile {
+ protected QuotientFilter sketch;
+
+ public void configure() {}
+
+ @Override
+ //public long doTrial(long inputCardinality, double targetFpp) {
+ public trialResults doTrial(long inputCardinality, double targetFpp){
+ trialResults result = new trialResults();
+ int fingerprintLength =
QuotientFilterBuilder.suggestFingerprintLength(targetFpp);
+ int lgNumSlots =
QuotientFilterBuilder.suggestLgNumSlots(inputCardinality); // Make sure to
check this line if you want -1 or not
+ sketch = new QuotientFilter(lgNumSlots, fingerprintLength+3);
+
+ long numQueries = pwr2SeriesNext(1, 1L<<(fingerprintLength+1));
+ // Build the test sets but clear them first so that they have the
correct cardinality and no surplus is added.
+ inputItems.clear();
+ negativeItems.clear();
+ long item;
+ for (long i = 0; i < inputCardinality; i++){
+ item = ++vIn;
+ inputItems.add(item) ;
+ sketch.insert(item);
+ }
+
+ for (long i = inputCardinality; i < inputCardinality+numQueries; i++ )
{
+ item = ++vIn;
+ negativeItems.add(item) ;
+ }
+
+ // Check the number of false positives
+ long numFalsePositive = 0 ;
+ for (int i = 0; i < negativeItems.size(); i++){
+ if (sketch.search(negativeItems.get(i))) ++numFalsePositive ;
+ }
+ result.filterSizeBits = sketch.getSpaceUse();
+ result.measuredFalsePositiveRate = (double)numFalsePositive /
numQueries;
+ result.numHashbits = fingerprintLength;
+ return result;
+ }
+}
diff --git a/src/main/resources/filters/BloomFilterSpaceJob.conf
b/src/main/resources/filters/BloomFilterSpaceJob.conf
new file mode 100644
index 0000000..c6008a4
--- /dev/null
+++ b/src/main/resources/filters/BloomFilterSpaceJob.conf
@@ -0,0 +1,45 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+## Uniques Profile
+Trials_lgMinU=0 #The starting # of uniques that is printed at the end.
+Trials_lgMaxU=20 #How high the # uniques go
+Trials_UPPO=10 #The horizontal x-resolution of trials points
+
+## Trials Profile
+Trials_lgMinT=10 #Min trials at tail (high counts) 4
+Trials_lgMaxT=14 #Min trials at tail (high counts) 4
+Trials_TPPO=1 #how often intermediate results are printed
+Trials_lgMinBpU=1 #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5 #stop the downward slope of trials at this LgU
+
+Trials_lgQK=12 #size of quantiles sketch
+Trials_interData=true
+Trials_postPMFs=false
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.BloomFilterSizeProfile
+targetFpp=1e-3
+
diff --git a/src/main/resources/filters/QuotientFilterSpaceJob.conf
b/src/main/resources/filters/QuotientFilterSpaceJob.conf
new file mode 100644
index 0000000..17086a2
--- /dev/null
+++ b/src/main/resources/filters/QuotientFilterSpaceJob.conf
@@ -0,0 +1,41 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+## Uniques Profile
+Trials_lgMinU=0 #The starting # of uniques that is printed at the end.
+Trials_lgMaxU=20 #How high the # uniques go
+Trials_UPPO=32 #The horizontal x-resolution of trials points
+
+## Trials Profile
+Trials_lgMinT=14 #Min trials at tail (high counts) 4
+Trials_lgMaxT=16 #Min trials at tail (high counts) 4
+Trials_TPPO=1 #how often intermediate results are printed
+Trials_lgMinBpU=1 #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5 #stop the downward slope of trials at this LgU
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.QuotientFilterSizeProfile
+targetFpp=1e-6
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]