[2/2] [math] MATH-1274: representation of Kolmogorov-Smirnov statistic as integral value

oertl Wed, 16 Sep 2015 11:37:23 -0700

MATH-1274: representation of Kolmogorov-Smirnov statistic as integral
value


Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/1c9c43c1
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/1c9c43c1
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/1c9c43c1

Branch: refs/heads/MATH_3_X
Commit: 1c9c43c1d4bb76d7e47cdfc9c6681a38305a95e4
Parents: 1cdaba9
Author: Otmar Ertl <[email protected]>
Authored: Wed Sep 16 20:34:46 2015 +0200
Committer: Otmar Ertl <[email protected]>
Committed: Wed Sep 16 20:34:46 2015 +0200

----------------------------------------------------------------------
 src/changes/changes.xml                         |   3 +
 .../stat/inference/KolmogorovSmirnovTest.java   | 195 ++++++++++++++++---
 2 files changed, 170 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-math/blob/1c9c43c1/src/changes/changes.xml
----------------------------------------------------------------------
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 2d1fecd..d1eaf45 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -51,6 +51,9 @@ If the output is not quite correct, check for invisible 
trailing spaces!
   </properties>
   <body>
     <release version="3.6" date="XXXX-XX-XX" description="">
+      <action dev="oertl" type="update" issue="MATH-1274">
+        Representation of Kolmogorov-Smirnov statistic as integral value.
+      </action>
       <action dev="luc" type="add" issue="MATH-1273" due-to="Qualtagh">
         Added negative zero support in FastMath.pow.
       </action>

http://git-wip-us.apache.org/repos/asf/commons-math/blob/1c9c43c1/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
 
b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
index 74bede9..eac1546 100644
--- 
a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
+++ 
b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java
@@ -21,7 +21,6 @@ import java.math.BigDecimal;
 import java.util.Arrays;
 import java.util.Iterator;
 
-import org.apache.commons.math3.util.Precision;
 import org.apache.commons.math3.distribution.RealDistribution;
 import org.apache.commons.math3.exception.InsufficientDataException;
 import org.apache.commons.math3.exception.MathArithmeticException;
@@ -220,7 +219,10 @@ public class KolmogorovSmirnovTest {
      * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to 
compute the p-value. This
      * is accomplished by enumerating all partitions of the combined sample 
into two subsamples of
      * the respective sample sizes, computing \(D_{n,m}\) for each partition 
and returning the
-     * proportion of partitions that give \(D\) values exceeding the observed 
value.</li>
+     * proportion of partitions that give \(D\) values exceeding the observed 
value. In the very
+     * small sample case, if there are ties in the data, the actual sample 
values (including ties)
+     * are used in generating the partitions (which are basically multi-set 
partitions in this
+     * case).</li>
      * <li>For mid-size samples (product of sample sizes greater than or equal 
to
      * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value 
#LARGE_SAMPLE_PRODUCT}), Monte Carlo
      * simulation is used to compute the p-value. The simulation randomly 
generates partitions and
@@ -243,10 +245,10 @@ public class KolmogorovSmirnovTest {
     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean 
strict) {
         final long lengthProduct = (long) x.length * y.length;
         if (lengthProduct < SMALL_SAMPLE_PRODUCT) {
-            return exactP(kolmogorovSmirnovStatistic(x, y), x.length, 
y.length, strict);
+            return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + 
((strict)?1l:0l), x.length, y.length);
         }
         if (lengthProduct < LARGE_SAMPLE_PRODUCT) {
-            return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, 
y.length, strict, MONTE_CARLO_ITERATIONS);
+            return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, 
y) + ((strict)?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS);
         }
         return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, 
y.length);
     }
@@ -285,6 +287,25 @@ public class KolmogorovSmirnovTest {
      * @throws NullArgumentException if either {@code x} or {@code y} is null
      */
     public double kolmogorovSmirnovStatistic(double[] x, double[] y) {
+        return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * 
(long)y.length));
+    }
+
+    /**
+     * Computes the two-sample Kolmogorov-Smirnov test statistic, 
\(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\)
+     * where \(n\) is the length of {@code x}, \(m\) is the length of {@code 
y}, \(F_n\) is the
+     * empirical distribution that puts mass \(1/n\) at each of the values in 
{@code x} and \(F_m\)
+     * is the empirical distribution of the {@code y} values. Finally \(n m 
D_{n,m}\) is returned
+     * as long value.
+     *
+     * @param x first sample
+     * @param y second sample
+     * @return test statistic \(n m D_{n,m}\) used to evaluate the null 
hypothesis that {@code x} and
+     *         {@code y} represent samples from the same underlying 
distribution
+     * @throws InsufficientDataException if either {@code x} or {@code y} does 
not have length at
+     *         least 2
+     * @throws NullArgumentException if either {@code x} or {@code y} is null
+     */
+    private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) {
         checkArray(x);
         checkArray(y);
         // Copy and sort the sample arrays
@@ -297,23 +318,26 @@ public class KolmogorovSmirnovTest {
 
         int rankX = 0;
         int rankY = 0;
+        long curD = 0l;
 
         // Find the max difference between cdf_x and cdf_y
-        double supD = 0d;
+        long supD = 0l;
         do {
             double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : 
sy[rankY];
             while(rankX < n && Double.compare(sx[rankX], z) == 0) {
                 rankX += 1;
+                curD += m;
             }
             while(rankY < m && Double.compare(sy[rankY], z) == 0) {
                 rankY += 1;
+                curD -= n;
             }
-            final double cdf_x = rankX / (double) n;
-            final double cdf_y = rankY / (double) m;
-            final double curD = FastMath.abs(cdf_x - cdf_y);
             if (curD > supD) {
                 supD = curD;
             }
+            else if (-curD > supD) {
+                supD = -curD;
+            }
         } while(rankX < n && rankY < m);
         return supD;
     }
@@ -858,6 +882,32 @@ public class KolmogorovSmirnovTest {
     }
 
     /**
+     * Given a d-statistic in the range [0, 1] and the two sample sizes n and 
m,
+     * an integral d-statistic in the range [0, n*m] is calculated, that can 
be used for
+     * comparison with other integral d-statistics. Depending whether {@code 
strict} is
+     * {@code true} or not, the returned value divided by (n*m) is greater than
+     * (resp greater than or equal to) the given d value (allowing some 
tolerance).
+     *
+     * @param d a d-statistic in the range [0, 1]
+     * @param n first sample size
+     * @param m second sample size
+     * @param strict whether the returned value divided by (n*m) is allowed to 
be equal to d
+     * @return the integral d-statistic in the range [0, n*m]
+     */
+    private static long calculateIntegralD(double d, int n, int m, boolean 
strict) {
+        final double tol = 1e-12;  // d-values within tol of one another are 
considered equal
+        long nm = n * (long)m;
+        long upperBound = (long)FastMath.ceil((d - tol) * nm);
+        long lowerBound = (long)FastMath.floor((d + tol) * nm);
+        if (strict && lowerBound == upperBound) {
+            return upperBound + 1l;
+        }
+        else {
+            return upperBound;
+        }
+    }
+
+    /**
      * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; 
otherwise \(P(D_{n,m} \ge
      * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. 
See
      * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the 
definition of \(D_{n,m}\).
@@ -882,11 +932,28 @@ public class KolmogorovSmirnovTest {
      *         greater than (resp. greater than or equal to) {@code d}
      */
     public double exactP(double d, int n, int m, boolean strict) {
+        return integralExactP(calculateIntegralD(d, n, m, strict), n, m);
+    }
+
+    /**
+     * Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the
+     * 2-sample Kolmogorov-Smirnov statistic.
+     * <p>
+     * Here d is the D-statistic represented as long value.
+     * The real D-statistic is obtained by dividing d by n*m.
+     * See also {@link #exactP(double, int, int, boolean)}.
+     *
+     * @param d integral D-statistic
+     * @param n first sample size
+     * @param m second sample size
+     * @return probability that a randomly selected m-n partition of m + n 
generates \(D_{n,m}\)
+     *         greater than or equal to {@code d/(n*m)}
+     */
+    private double integralExactP(long d, int n, int m) {
         Iterator<int[]> combinationsIterator = 
CombinatoricsUtils.combinationsIterator(n + m, n);
         long tail = 0;
         final double[] nSet = new double[n];
         final double[] mSet = new double[m];
-        final double tol = 1e-12;  // d-values within tol of one another are 
considered equal
         while (combinationsIterator.hasNext()) {
             // Generate an n-set
             final int[] nSetI = combinationsIterator.next();
@@ -900,9 +967,67 @@ public class KolmogorovSmirnovTest {
                     mSet[k++] = i;
                 }
             }
-            final double curD = kolmogorovSmirnovStatistic(nSet, mSet);
-            final int order = Precision.compareTo(curD, d, tol);
-            if (order > 0 || (order == 0 && !strict)) {
+            final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
+            if (curD >= d) {
+                tail++;
+            }
+        }
+        return (double) tail / (double) 
CombinatoricsUtils.binomialCoefficient(n + m, n);
+    }
+
+    /**
+     * Computes the exact p value for a two-sample Kolmogorov-Smirnov test with
+     * {@code x} and {@code y} as samples, possibly containing ties. This 
method
+     * uses the same implementation as {@link #exactP(double, int, int, 
boolean)}
+     * with the exception that it examines partitions of the combined sample,
+     * preserving ties in the data.  What is returned is the exact probability
+     * that a random partition of the combined dataset into a subset of size
+     * {@code x.length} and another of size {@code y.length} yields a \(D\)
+     * value greater than (resp greater than or equal to) \(D(x,y)\).
+     * <p>
+     * This method should not be used on large samples (a good rule of thumb is
+     * to keep the product of the sample sizes less than
+     * {@link #SMALL_SAMPLE_PRODUCT} when using this method).  If the data do
+     * not contain ties, {@link #exactP(double[], double[], boolean)} should be
+     * used instead of this method.</p>
+     *
+     * @param x first sample
+     * @param y second sample
+     * @param strict whether or not the inequality in the null hypothesis is 
strict
+     * @return p-value
+     */
+    public double exactP(double[] x, double[] y, boolean strict) {
+        final long d = integralKolmogorovSmirnovStatistic(x, y);
+        final int n = x.length;
+        final int m = y.length;
+
+        // Concatenate x and y into universe, preserving ties in the data
+        final double[] universe = new double[n + m];
+        System.arraycopy(x, 0, universe, 0, n);
+        System.arraycopy(y, 0, universe, n, m);
+
+        // Iterate over all n, m partitions of the n + m elements in the 
universe,
+        // Computing D for each one
+        Iterator<int[]> combinationsIterator = 
CombinatoricsUtils.combinationsIterator(n + m, n);
+        long tail = 0;
+        final double[] nSet = new double[n];
+        final double[] mSet = new double[m];
+        while (combinationsIterator.hasNext()) {
+            // Generate an n-set
+            final int[] nSetI = combinationsIterator.next();
+            // Copy the elements of the universe in the n-set to nSet
+            // and the others to mSet
+            int j = 0;
+            int k = 0;
+            for (int i = 0; i < n + m; i++) {
+                if (j < n && nSetI[j] == i) {
+                    nSet[j++] = universe[i];
+                } else {
+                    mSet[k++] = universe[i];
+                }
+            }
+            final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet);
+            if (curD > d || (curD == d && !strict)) {
                 tail++;
             }
         }
@@ -974,35 +1099,49 @@ public class KolmogorovSmirnovTest {
      */
     public double monteCarloP(final double d, final int n, final int m, final 
boolean strict,
                               final int iterations) {
+        return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, 
iterations);
+    }
+
+    /**
+     * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) 
where \(D_{n,m}\) is the
+     * 2-sample Kolmogorov-Smirnov statistic.
+     * <p>
+     * Here d is the D-statistic represented as long value.
+     * The real D-statistic is obtained by dividing d by n*m.
+     * See also {@link #monteCarloP(double, int, int, boolean, int)}.
+     *
+     * @param d integral D-statistic
+     * @param n first sample size
+     * @param m second sample size
+     * @param iterations number of random partitions to generate
+     * @return proportion of randomly generated m-n partitions of m + n that 
result in \(D_{n,m}\)
+     *         greater than or equal to {@code d/(n*m))}
+     */
+    private double integralMonteCarloP(final long d, final int n, final int m, 
final int iterations) {
+
         // ensure that nn is always the max of (n, m) to require fewer random 
numbers
         final int nn = FastMath.max(n, m);
         final int mm = FastMath.min(n, m);
         final int sum = nn + mm;
-        final double tol = 1e-12;  // d-values within tol of one another are 
considered equal
 
         int tail = 0;
         final boolean b[] = new boolean[sum];
         for (int i = 0; i < iterations; i++) {
             fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng);
-            int rankN = b[0] ? 1 : 0;
-            int rankM = b[0] ? 0 : 1;
-            boolean previous = b[0];
-            for(int j = 1; j < b.length; ++j) {
-                if (b[j] != previous) {
-                    final double cdf_n = rankN / (double) nn;
-                    final double cdf_m = rankM / (double) mm;
-                    final double curD = FastMath.abs(cdf_n - cdf_m);
-                    final int order = Precision.compareTo(curD, d, tol);
-                    if (order > 0 || (order == 0 && !strict)) {
+            long curD = 0l;
+            for(int j = 0; j < b.length; ++j) {
+                if (b[j]) {
+                    curD += mm;
+                    if (curD >= d) {
                         tail++;
                         break;
                     }
-                }
-                previous = b[j];
-                if (b[j]) {
-                    rankN++;
                 } else {
-                    rankM++;
+                    curD -= nn;
+                    if (curD <= -d) {
+                        tail++;
+                        break;
+                    }
                 }
             }
         }

[2/2] [math] MATH-1274: representation of Kolmogorov-Smirnov statistic as integral value

Reply via email to