This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch master
in repository
https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
The following commit(s) were added to refs/heads/master by this push:
new 7f0203d Added error bounds to Engagement Example.
7f0203d is described below
commit 7f0203d990be4ea7729e39d8d47774752e3935f7
Author: Lee Rhodes <[email protected]>
AuthorDate: Mon Oct 28 17:38:24 2019 -0700
Added error bounds to Engagement Example.
Added subset computations of estimate, UB, LB to Generic Tuple Sketch.
---
.../org/apache/datasketches/fdt/FdtSketch.java | 41 -----------
.../java/org/apache/datasketches/tuple/Sketch.java | 40 +++++++++++
.../tuple/aninteger/EngagementTest.java | 84 +++++++++-------------
3 files changed, 73 insertions(+), 92 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
b/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
index 8a90336..2a85118 100644
--- a/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
+++ b/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
@@ -23,7 +23,6 @@ import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS;
import java.util.List;
-import org.apache.datasketches.BinomialBoundsN;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.tuple.strings.ArrayOfStringsSketch;
@@ -125,46 +124,6 @@ public class FdtSketch extends ArrayOfStringsSketch {
return new PostProcessor(this, group, sep);
}
- /**
- * Gets the estimate of the true distinct population of subset tuples
represented by the count
- * of entries in a group. This is primarily used internally.
- * @param numSubsetEntries number of entries for a chosen subset of the
sketch.
- * @return the estimate of the true distinct population of subset tuples
represented by the count
- * of entries in a group.
- */
- public double getEstimate(final int numSubsetEntries) {
- if (!isEstimationMode()) { return numSubsetEntries; }
- return numSubsetEntries / getTheta();
- }
-
- /**
- * Gets the estimate of the lower bound of the true distinct population
represented by the count
- * of entries in a group.
- * @param numStdDev
- * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of
Standard Deviations</a>
- * @param numSubsetEntries number of entries for a chosen subset of the
sketch.
- * @return the estimate of the lower bound of the true distinct population
represented by the count
- * of entries in a group.
- */
- public double getLowerBound(final int numStdDev, final int numSubsetEntries)
{
- if (!isEstimationMode()) { return numSubsetEntries; }
- return BinomialBoundsN.getLowerBound(numSubsetEntries, getTheta(),
numStdDev, isEmpty());
- }
-
- /**
- * Gets the estimate of the upper bound of the true distinct population
represented by the count
- * of entries in a group.
- * @param numStdDev
- * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of
Standard Deviations</a>
- * @param numSubsetEntries number of entries for a chosen subset of the
sketch.
- * @return the estimate of the upper bound of the true distinct population
represented by the count
- * of entries in a group.
- */
- public double getUpperBound(final int numStdDev, final int numSubsetEntries)
{
- if (!isEstimationMode()) { return numSubsetEntries; }
- return BinomialBoundsN.getUpperBound(numSubsetEntries, getTheta(),
numStdDev, isEmpty());
- }
-
// Restricted
/**
diff --git a/src/main/java/org/apache/datasketches/tuple/Sketch.java
b/src/main/java/org/apache/datasketches/tuple/Sketch.java
index 5e777c9..651602a 100644
--- a/src/main/java/org/apache/datasketches/tuple/Sketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/Sketch.java
@@ -76,6 +76,46 @@ public abstract class Sketch<S extends Summary> {
}
/**
+ * Gets the estimate of the true distinct population of subset tuples
represented by the count
+ * of entries in a subset of the total retained entries of the sketch.
+ * @param numSubsetEntries number of entries for a chosen subset of the
sketch.
+ * @return the estimate of the true distinct population of subset tuples
represented by the count
+ * of entries in a subset of the total retained entries of the sketch.
+ */
+ public double getEstimate(final int numSubsetEntries) {
+ if (!isEstimationMode()) { return numSubsetEntries; }
+ return numSubsetEntries / getTheta();
+ }
+
+ /**
+ * Gets the estimate of the lower bound of the true distinct population
represented by the count
+ * of entries in a subset of the total retained entries of the sketch.
+ * @param numStdDev
+ * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of
Standard Deviations</a>
+ * @param numSubsetEntries number of entries for a chosen subset of the
sketch.
+ * @return the estimate of the lower bound of the true distinct population
represented by the count
+ * of entries in a subset of the total retained entries of the sketch.
+ */
+ public double getLowerBound(final int numStdDev, final int numSubsetEntries)
{
+ if (!isEstimationMode()) { return numSubsetEntries; }
+ return BinomialBoundsN.getLowerBound(numSubsetEntries, getTheta(),
numStdDev, isEmpty());
+ }
+
+ /**
+ * Gets the estimate of the upper bound of the true distinct population
represented by the count
+ * of entries in a subset of the total retained entries of the sketch.
+ * @param numStdDev
+ * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of
Standard Deviations</a>
+ * @param numSubsetEntries number of entries for a chosen subset of the
sketch.
+ * @return the estimate of the upper bound of the true distinct population
represented by the count
+ * of entries in a subset of the total retained entries of the sketch.
+ */
+ public double getUpperBound(final int numStdDev, final int numSubsetEntries)
{
+ if (!isEstimationMode()) { return numSubsetEntries; }
+ return BinomialBoundsN.getUpperBound(numSubsetEntries, getTheta(),
numStdDev, isEmpty());
+ }
+
+ /**
* <a href="{@docRoot}/resources/dictionary.html#empty">See Empty</a>
* @return true if empty.
*/
diff --git
a/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
b/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
index 6199f4b..6286896 100644
--- a/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
+++ b/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
@@ -24,7 +24,6 @@ import static java.lang.Math.log;
import static java.lang.Math.round;
import static
org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode.AlwaysOne;
import static org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode.Sum;
-import static org.testng.Assert.assertEquals;
import org.apache.datasketches.tuple.CompactSketch;
import org.apache.datasketches.tuple.SketchIterator;
@@ -36,18 +35,19 @@ import org.testng.annotations.Test;
*/
@SuppressWarnings("javadoc")
public class EngagementTest {
+ public static final int numStdDev = 2;
@Test
public void computeEngagementHistogram() {
- int lgK = 12;
- int K = 1 << lgK; // = 4096
+ int lgK = 8; //Using a larger sketch, say >= 9 will produce exact results
for this little example
+ int K = 1 << lgK;
int days = 30;
int v = 0;
IntegerSketch[] skArr = new IntegerSketch[days];
for (int i = 0; i < days; i++) {
skArr[i] = new IntegerSketch(lgK, AlwaysOne);
}
- for (int i = 0; i <= days; i++) { //31 generating indices
+ for (int i = 0; i <= days; i++) { //31 generating indices for symmetry
int numIds = numIDs(days, i);
int numDays = numDays(days, i);
int myV = v++;
@@ -58,9 +58,7 @@ public class EngagementTest {
}
v += numIds;
}
-
- int numVisits = unionOps(K, Sum, skArr);
- assertEquals(numVisits, 897);
+ unionOps(K, Sum, skArr);
}
private static int numIDs(int totalDays, int index) {
@@ -75,7 +73,7 @@ public class EngagementTest {
return (int)(round(exp(((d - i) * log(d)) / d)));
}
- private static int unionOps(int K, IntegerSummary.Mode mode, IntegerSketch
... sketches) {
+ private static void unionOps(int K, IntegerSummary.Mode mode, IntegerSketch
... sketches) {
IntegerSummarySetOperations setOps = new IntegerSummarySetOperations(mode,
mode);
Union<IntegerSummary> union = new Union<>(K, setOps);
int len = sketches.length;
@@ -86,59 +84,43 @@ public class EngagementTest {
CompactSketch<IntegerSummary> result = union.getResult();
SketchIterator<IntegerSummary> itr = result.iterator();
- int[] freqArr = new int[len +1];
+ int[] numDaysArr = new int[len + 1]; //zero index ignored
while (itr.next()) {
- int value = itr.getSummary().getValue();
- freqArr[value]++;
+ //For each unique visitor from the result sketch, get the # days visited
+ int numDaysVisited = itr.getSummary().getValue();
+ //increment the number of visitors that visited numDays
+ numDaysArr[numDaysVisited]++; //values range from 1 to 30
}
+
println("Engagement Histogram:");
- printf("%12s,%12s\n","Days Visited", "Visitors");
- int sumVisitors = 0;
+ println("Number of Unique Visitors by Number of Days Visited");
+ printf("%12s%12s%12s%12s\n","Days Visited", "Estimate", "LB", "UB");
int sumVisits = 0;
- for (int i = 0; i < freqArr.length; i++) {
- int visits = freqArr[i];
- if (visits == 0) { continue; }
- sumVisitors += visits;
- sumVisits += (visits * i);
- printf("%12d,%12d\n", i, visits);
- }
- println("Total Visitors: " + sumVisitors);
- println("Total Visits : " + sumVisits);
- return sumVisits;
- }
-
- @Test
- public void simpleCheckAlwaysOneIntegerSketch() {
- int lgK = 12;
- int K = 1 << lgK; // = 4096
-
- IntegerSketch a1Sk1 = new IntegerSketch(lgK, AlwaysOne);
- IntegerSketch a1Sk2 = new IntegerSketch(lgK, AlwaysOne);
-
- int m = 2 * K;
- for (int key = 0; key < m; key++) {
- a1Sk1.update(key, 1);
- a1Sk2.update(key + (m/2), 1); //overlap by 1/2 = 1.5m = 12288.
- }
- int numVisits = unionOps(K, AlwaysOne, a1Sk1, a1Sk2);
- assertEquals(numVisits, K);
- }
-
- @Test
- public void checkPwrLaw() {
- int days = 30;
- for (int i = 0; i <= days; i++) {
- int numIds = numIDs(days, i);
- int numDays = numDays(days, i);
- printf("%6d%6d%6d\n", i, numIds, numDays);
+ for (int i = 0; i < numDaysArr.length; i++) {
+ int visitorsAtDaysVisited = numDaysArr[i];
+ if (visitorsAtDaysVisited == 0) { continue; }
+ int lbVisitorsAtDaysVisited = (int)
round(result.getLowerBound(numStdDev, visitorsAtDaysVisited));
+ int ubVisitorsAtDaysVisited = (int)
round(result.getUpperBound(numStdDev, visitorsAtDaysVisited));
+ sumVisits += visitorsAtDaysVisited * i;
+ printf("%12d%12d%12d%12d\n",
+ i, visitorsAtDaysVisited, lbVisitorsAtDaysVisited,
ubVisitorsAtDaysVisited);
}
+ double visitors = result.getEstimate();
+ double lbVisitors = result.getLowerBound(numStdDev);
+ double ubVisitors = result.getUpperBound(numStdDev);
+ int lbVisits = (int) round((sumVisits * lbVisitors)/visitors);
+ int ubVisits = (int) round((sumVisits * ubVisitors)/visitors);
+ printf("\n%12s%12s%12s%12s\n","Totals", "Estimate", "LB", "UB");
+ printf("%12s%12d%12d%12d\n", "Visitors",
+ (int)round(visitors), (int)round(lbVisitors), (int)round(ubVisitors));
+ printf("%12s%12d%12d%12d\n", "Visits", sumVisits, lbVisits, ubVisits);
}
/**
* @param o object to print
*/
- static void println(Object o) {
+ private static void println(Object o) {
printf("%s\n", o.toString());
}
@@ -146,7 +128,7 @@ public class EngagementTest {
* @param fmt format
* @param args arguments
*/
- static void printf(String fmt, Object ... args) {
+ private static void printf(String fmt, Object ... args) {
//System.out.printf(fmt, args); //Enable/Disable printing here
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]