This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tdigest_pmf_cdf
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit e6741622c796046d6bdb9368ac903f1beffe3358
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Thu Oct 31 17:59:53 2024 -0700

    implemented getPMF() and getCDF()
---
 .../apache/datasketches/tdigest/TDigestDouble.java | 47 ++++++++++++++++++++++
 .../datasketches/tdigest/TDigestDoubleTest.java    | 13 ++++--
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java 
b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
index 1e340851..478b8012 100644
--- a/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
+++ b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
@@ -32,6 +32,7 @@ import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.memory.WritableBuffer;
 import org.apache.datasketches.memory.WritableMemory;
 import org.apache.datasketches.quantilescommon.QuantilesAPI;
+import org.apache.datasketches.quantilescommon.QuantilesUtil;
 
 /**
  * t-Digest for estimating quantiles and ranks.
@@ -125,6 +126,7 @@ public final class TDigestDouble {
   /**
    * Process buffered values and merge centroids if needed
    */
+  // this method will become private in the next major version
   public void compress() {
     if (numBuffered_ == 0) { return; }
     final int num = numBuffered_ + numCentroids_;
@@ -277,6 +279,51 @@ public final class TDigestDouble {
     return weightedAverage(centroidWeights_[numCentroids_ - 1], w1, maxValue_, 
w2);
   }
 
+  /**
+   * Returns an approximation to the Probability Mass Function (PMF) of the 
input stream
+   * given a set of split points.
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing 
values
+   * that divide the input domain into <i>m+1</i> consecutive disjoint 
intervals (bins).
+   *
+   * @return an array of m+1 doubles each of which is an approximation
+   * to the fraction of the input stream values (the mass) that fall into one 
of those intervals.
+   * @throws SketchesStateException if sketch is empty.
+   */
+  public double[] getPMF(final double[] splitPoints) {
+    final double[] buckets = getCDF(splitPoints);
+    for (int i = buckets.length; i-- > 1; ) {
+      buckets[i] -= buckets[i - 1];
+    }
+    return buckets;
+  }
+
+  /**
+   * Returns an approximation to the Cumulative Distribution Function (CDF), 
which is the
+   * cumulative analog of the PMF, of the input stream given a set of split 
points.
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing 
values
+   * that divide the input domain into <i>m+1</i> consecutive disjoint 
intervals.
+   *
+   * @return an array of m+1 doubles, which are a consecutive approximation to 
the CDF
+   * of the input stream given the splitPoints. The value at array position j 
of the returned
+   * CDF array is the sum of the returned values in positions 0 through j of 
the returned PMF
+   * array. This can be viewed as array of ranks of the given split points 
plus one more value
+   * that is always 1.
+   * @throws SketchesStateException if sketch is empty.
+   */
+  public double[] getCDF(final double[] splitPoints) {
+    if (isEmpty()) { throw new SketchesStateException(QuantilesAPI.EMPTY_MSG); 
}
+    QuantilesUtil.checkDoublesSplitPointsOrder(splitPoints);
+    final int len = splitPoints.length + 1;
+    final double[] ranks = new double[len];
+    for (int i = 0; i < len - 1; i++) {
+      ranks[i] = getRank(splitPoints[i]);
+    }
+    ranks[len - 1] = 1.0;
+    return ranks;
+  }
+
   /**
    * Computes size needed to serialize the current state.
    * @return size in bytes needed to serialize this tdigest
diff --git 
a/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java 
b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
index db043cff..55baa83e 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
@@ -41,6 +41,8 @@ public class TDigestDoubleTest {
     assertThrows(SketchesStateException.class, () -> td.getMaxValue());
     assertThrows(SketchesStateException.class, () -> td.getRank(0));
     assertThrows(SketchesStateException.class, () -> td.getQuantile(0.5));
+    assertThrows(SketchesStateException.class, () -> td.getPMF(new 
double[]{0}));
+    assertThrows(SketchesStateException.class, () -> td.getCDF(new 
double[]{0}));
   }
 
   @Test
@@ -65,9 +67,6 @@ public class TDigestDoubleTest {
     final TDigestDouble td = new TDigestDouble();
     final int n = 10000;
     for (int i = 0; i < n; i++) td.update(i);
-//    System.out.println(td.toString(true));
-//    td.compress();
-//    System.out.println(td.toString(true));
     assertFalse(td.isEmpty());
     assertEquals(td.getTotalWeight(), n);
     assertEquals(td.getMinValue(), 0);
@@ -82,6 +81,14 @@ public class TDigestDoubleTest {
     assertEquals(td.getQuantile(0.9), n * 0.9, n * 0.9 * 0.01);
     assertEquals(td.getQuantile(0.95), n * 0.95, n * 0.95 * 0.01);
     assertEquals(td.getQuantile(1), n - 1);
+    final double[] pmf = td.getPMF(new double[] {n / 2});
+    assertEquals(pmf.length, 2);
+    assertEquals(pmf[0], 0.5, 0.0001);
+    assertEquals(pmf[1], 0.5, 0.0001);
+    final double[] cdf = td.getCDF(new double[] {n / 2});
+    assertEquals(cdf.length, 2);
+    assertEquals(cdf[0], 0.5, 0.0001);
+    assertEquals(cdf[1], 1.0);
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to