This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch master
in repository 
https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git


The following commit(s) were added to refs/heads/master by this push:
     new 7f0203d  Added error bounds to Engagement Example.
7f0203d is described below

commit 7f0203d990be4ea7729e39d8d47774752e3935f7
Author: Lee Rhodes <[email protected]>
AuthorDate: Mon Oct 28 17:38:24 2019 -0700

    Added error bounds to Engagement Example.
    
    Added subset computations of estimate, UB, LB to Generic Tuple Sketch.
---
 .../org/apache/datasketches/fdt/FdtSketch.java     | 41 -----------
 .../java/org/apache/datasketches/tuple/Sketch.java | 40 +++++++++++
 .../tuple/aninteger/EngagementTest.java            | 84 +++++++++-------------
 3 files changed, 73 insertions(+), 92 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/fdt/FdtSketch.java 
b/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
index 8a90336..2a85118 100644
--- a/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
+++ b/src/main/java/org/apache/datasketches/fdt/FdtSketch.java
@@ -23,7 +23,6 @@ import static org.apache.datasketches.Util.MAX_LG_NOM_LONGS;
 
 import java.util.List;
 
-import org.apache.datasketches.BinomialBoundsN;
 import org.apache.datasketches.SketchesArgumentException;
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.tuple.strings.ArrayOfStringsSketch;
@@ -125,46 +124,6 @@ public class FdtSketch extends ArrayOfStringsSketch {
     return new PostProcessor(this, group, sep);
   }
 
-  /**
-   * Gets the estimate of the true distinct population of subset tuples 
represented by the count
-   * of entries in a group. This is primarily used internally.
-   * @param numSubsetEntries number of entries for a chosen subset of the 
sketch.
-   * @return the estimate of the true distinct population of subset tuples 
represented by the count
-   * of entries in a group.
-   */
-  public double getEstimate(final int numSubsetEntries) {
-    if (!isEstimationMode()) { return numSubsetEntries; }
-    return numSubsetEntries / getTheta();
-  }
-
-  /**
-   * Gets the estimate of the lower bound of the true distinct population 
represented by the count
-   * of entries in a group.
-   * @param numStdDev
-   * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of 
Standard Deviations</a>
-   * @param numSubsetEntries number of entries for a chosen subset of the 
sketch.
-   * @return the estimate of the lower bound of the true distinct population 
represented by the count
-   * of entries in a group.
-   */
-  public double getLowerBound(final int numStdDev, final int numSubsetEntries) 
{
-    if (!isEstimationMode()) { return numSubsetEntries; }
-    return BinomialBoundsN.getLowerBound(numSubsetEntries, getTheta(), 
numStdDev, isEmpty());
-  }
-
-  /**
-   * Gets the estimate of the upper bound of the true distinct population 
represented by the count
-   * of entries in a group.
-   * @param numStdDev
-   * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of 
Standard Deviations</a>
-   * @param numSubsetEntries number of entries for a chosen subset of the 
sketch.
-   * @return the estimate of the upper bound of the true distinct population 
represented by the count
-   * of entries in a group.
-   */
-  public double getUpperBound(final int numStdDev, final int numSubsetEntries) 
{
-    if (!isEstimationMode()) { return numSubsetEntries; }
-    return BinomialBoundsN.getUpperBound(numSubsetEntries, getTheta(), 
numStdDev, isEmpty());
-  }
-
   // Restricted
 
   /**
diff --git a/src/main/java/org/apache/datasketches/tuple/Sketch.java 
b/src/main/java/org/apache/datasketches/tuple/Sketch.java
index 5e777c9..651602a 100644
--- a/src/main/java/org/apache/datasketches/tuple/Sketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/Sketch.java
@@ -76,6 +76,46 @@ public abstract class Sketch<S extends Summary> {
   }
 
   /**
+   * Gets the estimate of the true distinct population of subset tuples 
represented by the count
+   * of entries in a subset of the total retained entries of the sketch.
+   * @param numSubsetEntries number of entries for a chosen subset of the 
sketch.
+   * @return the estimate of the true distinct population of subset tuples 
represented by the count
+   * of entries in a subset of the total retained entries of the sketch.
+   */
+  public double getEstimate(final int numSubsetEntries) {
+    if (!isEstimationMode()) { return numSubsetEntries; }
+    return numSubsetEntries / getTheta();
+  }
+
+  /**
+   * Gets the estimate of the lower bound of the true distinct population 
represented by the count
+   * of entries in a subset of the total retained entries of the sketch.
+   * @param numStdDev
+   * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of 
Standard Deviations</a>
+   * @param numSubsetEntries number of entries for a chosen subset of the 
sketch.
+   * @return the estimate of the lower bound of the true distinct population 
represented by the count
+   * of entries in a subset of the total retained entries of the sketch.
+   */
+  public double getLowerBound(final int numStdDev, final int numSubsetEntries) 
{
+    if (!isEstimationMode()) { return numSubsetEntries; }
+    return BinomialBoundsN.getLowerBound(numSubsetEntries, getTheta(), 
numStdDev, isEmpty());
+  }
+
+  /**
+   * Gets the estimate of the upper bound of the true distinct population 
represented by the count
+   * of entries in a subset of the total retained entries of the sketch.
+   * @param numStdDev
+   * <a href="{@docRoot}/resources/dictionary.html#numStdDev">See Number of 
Standard Deviations</a>
+   * @param numSubsetEntries number of entries for a chosen subset of the 
sketch.
+   * @return the estimate of the upper bound of the true distinct population 
represented by the count
+   * of entries in a subset of the total retained entries of the sketch.
+   */
+  public double getUpperBound(final int numStdDev, final int numSubsetEntries) 
{
+    if (!isEstimationMode()) { return numSubsetEntries; }
+    return BinomialBoundsN.getUpperBound(numSubsetEntries, getTheta(), 
numStdDev, isEmpty());
+  }
+
+  /**
    * <a href="{@docRoot}/resources/dictionary.html#empty">See Empty</a>
    * @return true if empty.
    */
diff --git 
a/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java 
b/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
index 6199f4b..6286896 100644
--- a/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
+++ b/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java
@@ -24,7 +24,6 @@ import static java.lang.Math.log;
 import static java.lang.Math.round;
 import static 
org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode.AlwaysOne;
 import static org.apache.datasketches.tuple.aninteger.IntegerSummary.Mode.Sum;
-import static org.testng.Assert.assertEquals;
 
 import org.apache.datasketches.tuple.CompactSketch;
 import org.apache.datasketches.tuple.SketchIterator;
@@ -36,18 +35,19 @@ import org.testng.annotations.Test;
  */
 @SuppressWarnings("javadoc")
 public class EngagementTest {
+  public static final int numStdDev = 2;
 
   @Test
   public void computeEngagementHistogram() {
-    int lgK = 12;
-    int K = 1 << lgK; // = 4096
+    int lgK = 8; //Using a larger sketch, say >= 9 will produce exact results 
for this little example
+    int K = 1 << lgK;
     int days = 30;
     int v = 0;
     IntegerSketch[] skArr = new IntegerSketch[days];
     for (int i = 0; i < days; i++) {
       skArr[i] = new IntegerSketch(lgK, AlwaysOne);
     }
-    for (int i = 0; i <= days; i++) { //31 generating indices
+    for (int i = 0; i <= days; i++) { //31 generating indices for symmetry
       int numIds = numIDs(days, i);
       int numDays = numDays(days, i);
       int myV = v++;
@@ -58,9 +58,7 @@ public class EngagementTest {
       }
       v += numIds;
     }
-
-    int numVisits = unionOps(K, Sum, skArr);
-    assertEquals(numVisits, 897);
+    unionOps(K, Sum, skArr);
   }
 
   private static int numIDs(int totalDays, int index) {
@@ -75,7 +73,7 @@ public class EngagementTest {
     return (int)(round(exp(((d - i) * log(d)) / d)));
   }
 
-  private static int unionOps(int K, IntegerSummary.Mode mode, IntegerSketch 
... sketches) {
+  private static void unionOps(int K, IntegerSummary.Mode mode, IntegerSketch 
... sketches) {
     IntegerSummarySetOperations setOps = new IntegerSummarySetOperations(mode, 
mode);
     Union<IntegerSummary> union = new Union<>(K, setOps);
     int len = sketches.length;
@@ -86,59 +84,43 @@ public class EngagementTest {
     CompactSketch<IntegerSummary> result = union.getResult();
     SketchIterator<IntegerSummary> itr = result.iterator();
 
-    int[] freqArr = new int[len +1];
+    int[] numDaysArr = new int[len + 1]; //zero index ignored
 
     while (itr.next()) {
-      int value = itr.getSummary().getValue();
-      freqArr[value]++;
+      //For each unique visitor from the result sketch, get the # days visited
+      int numDaysVisited = itr.getSummary().getValue();
+      //increment the number of visitors that visited numDays
+      numDaysArr[numDaysVisited]++; //values range from 1 to 30
     }
+
     println("Engagement Histogram:");
-    printf("%12s,%12s\n","Days Visited", "Visitors");
-    int sumVisitors = 0;
+    println("Number of Unique Visitors by Number of Days Visited");
+    printf("%12s%12s%12s%12s\n","Days Visited", "Estimate", "LB", "UB");
     int sumVisits = 0;
-    for (int i = 0; i < freqArr.length; i++) {
-      int visits = freqArr[i];
-      if (visits == 0) { continue; }
-      sumVisitors += visits;
-      sumVisits += (visits * i);
-      printf("%12d,%12d\n", i, visits);
-    }
-    println("Total Visitors: " + sumVisitors);
-    println("Total Visits  : " + sumVisits);
-    return sumVisits;
-  }
-
-  @Test
-  public void simpleCheckAlwaysOneIntegerSketch() {
-    int lgK = 12;
-    int K = 1 << lgK; // = 4096
-
-    IntegerSketch a1Sk1 = new IntegerSketch(lgK, AlwaysOne);
-    IntegerSketch a1Sk2 = new IntegerSketch(lgK, AlwaysOne);
-
-    int m = 2 * K;
-    for (int key = 0; key < m; key++) {
-      a1Sk1.update(key, 1);
-      a1Sk2.update(key + (m/2), 1); //overlap by 1/2 = 1.5m = 12288.
-    }
-    int numVisits = unionOps(K, AlwaysOne, a1Sk1, a1Sk2);
-    assertEquals(numVisits, K);
-  }
-
-  @Test
-  public void checkPwrLaw() {
-    int days = 30;
-    for (int i = 0; i <= days; i++) {
-      int numIds = numIDs(days, i);
-      int numDays = numDays(days, i);
-      printf("%6d%6d%6d\n", i, numIds, numDays);
+    for (int i = 0; i < numDaysArr.length; i++) {
+      int visitorsAtDaysVisited = numDaysArr[i];
+      if (visitorsAtDaysVisited == 0) { continue; }
+      int lbVisitorsAtDaysVisited = (int) 
round(result.getLowerBound(numStdDev, visitorsAtDaysVisited));
+      int ubVisitorsAtDaysVisited = (int) 
round(result.getUpperBound(numStdDev, visitorsAtDaysVisited));
+      sumVisits += visitorsAtDaysVisited * i;
+      printf("%12d%12d%12d%12d\n",
+          i, visitorsAtDaysVisited, lbVisitorsAtDaysVisited, 
ubVisitorsAtDaysVisited);
     }
+    double visitors = result.getEstimate();
+    double lbVisitors = result.getLowerBound(numStdDev);
+    double ubVisitors = result.getUpperBound(numStdDev);
+    int lbVisits = (int) round((sumVisits * lbVisitors)/visitors);
+    int ubVisits = (int) round((sumVisits * ubVisitors)/visitors);
+    printf("\n%12s%12s%12s%12s\n","Totals", "Estimate", "LB", "UB");
+    printf("%12s%12d%12d%12d\n", "Visitors",
+        (int)round(visitors), (int)round(lbVisitors), (int)round(ubVisitors));
+    printf("%12s%12d%12d%12d\n", "Visits", sumVisits, lbVisits, ubVisits);
   }
 
   /**
    * @param o object to print
    */
-  static void println(Object o) {
+  private static void println(Object o) {
     printf("%s\n", o.toString());
   }
 
@@ -146,7 +128,7 @@ public class EngagementTest {
    * @param fmt format
    * @param args arguments
    */
-  static void printf(String fmt, Object ... args) {
+  private static void printf(String fmt, Object ... args) {
     //System.out.printf(fmt, args); //Enable/Disable printing here
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to