Repository: commons-math Updated Branches: refs/heads/MATH_3_X 56a628776 -> 2983ff81b
Improved performance and accuracy of 2-sample KS tests. JIRA: MATH-1310. Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/2983ff81 Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/2983ff81 Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/2983ff81 Branch: refs/heads/MATH_3_X Commit: 2983ff81b17638ea041d73676049d002b9096781 Parents: 56a6287 Author: Phil Steitz <[email protected]> Authored: Thu Dec 31 15:02:11 2015 -0700 Committer: Phil Steitz <[email protected]> Committed: Thu Dec 31 15:02:11 2015 -0700 ---------------------------------------------------------------------- src/changes/changes.xml | 3 + .../stat/inference/KolmogorovSmirnovTest.java | 161 +++++++++---------- src/test/R/KolmogorovSmirnovTestCases.R | 14 +- .../inference/KolmogorovSmirnovTestTest.java | 44 ++++- 4 files changed, 134 insertions(+), 88 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-math/blob/2983ff81/src/changes/changes.xml ---------------------------------------------------------------------- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2f94e04..8b7a409 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -51,6 +51,9 @@ If the output is not quite correct, check for invisible trailing spaces! </properties> <body> <release version="3.6" date="XXXX-XX-XX" description=""> + <action dev="psteitz" type="fix" issue="MATH-1310"> + Improved performance and accuracy of 2-sample KolmogorovSmirnov tests. + </action> <action dev="luc" type="fix" issue="MATH-1297"> Detect start failures with multi-step ODE integrators. </action> http://git-wip-us.apache.org/repos/asf/commons-math/blob/2983ff81/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java index b9f2ec0..6b70e9b 100644 --- a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java @@ -20,7 +20,6 @@ package org.apache.commons.math3.stat.inference; import java.math.BigDecimal; import java.util.Arrays; import java.util.HashSet; -import java.util.Iterator; import org.apache.commons.math3.distribution.EnumeratedRealDistribution; import org.apache.commons.math3.distribution.RealDistribution; @@ -69,14 +68,9 @@ import org.apache.commons.math3.util.MathUtils; * default 2-sample test method, {@link #kolmogorovSmirnovTest(double[], double[])} works as * follows: * <ul> - * <li>For very small samples (where the product of the sample sizes is less than - * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value for the - * 2-sample test.</li> - * <li>For mid-size samples (product of sample sizes greater than or equal to - * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo - * simulation is used to compute the p-value. The simulation randomly generates partitions of \(m + - * n\) into an \(m\)-set and an \(n\)-set and reports the proportion that give \(D\) values - * exceeding the observed value.</li> + * <li>For small samples (where the product of the sample sizes is less than + * {@value #LARGE_SAMPLE_PRODUCT}), the method presented in [4] is used to compute the + * exact p-value for the 2-sample test.</li> * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the asymptotic * distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} for details on * the approximation.</li> @@ -97,8 +91,6 @@ import org.apache.commons.math3.util.MathUtils; * The methods used by the 2-sample default implementation are also exposed directly: * <ul> * <li>{@link #exactP(double, int, int, boolean)} computes exact 2-sample p-values</li> - * <li>{@link #monteCarloP(double, int, int, boolean, int)} computes 2-sample p-values by Monte - * Carlo simulation</li> * <li>{@link #approximateP(double, int, int)} uses the asymptotic distribution The {@code boolean} * arguments in the first two methods allow the probability used to estimate the p-value to be * expressed using strict or non-strict inequality. See @@ -115,6 +107,8 @@ import org.apache.commons.math3.util.MathUtils; * <li>[3] Jasjeet S. Sekhon. 2011. <a href="http://www.jstatsoft.org/article/view/v042i07"> * Multivariate and Propensity Score Matching Software with Automated Balance Optimization: * The Matching package for R</a> Journal of Statistical Software, 42(7): 1-52.</li> + * <li>[4] Wilcox, Rand. 2012. Introduction to Robust Estimation and Hypothesis Testing, + * Chapter 5, 3rd Ed. Academic Press.</li> * </ul> * <br/> * Note that [1] contains an error in computing h, refer to <a @@ -136,16 +130,19 @@ public class KolmogorovSmirnovTest { /** Convergence criterion for the sums in #pelzGood(double, double, int)} */ protected static final double PG_SUM_RELATIVE_ERROR = 1.0e-10; - /** When product of sample sizes is less than this value, 2-sample K-S test is exact */ + /** No longer used. */ + @Deprecated protected static final int SMALL_SAMPLE_PRODUCT = 200; /** * When product of sample sizes exceeds this value, 2-sample K-S test uses asymptotic - * distribution for strict inequality p-value. + * distribution to compute the p-value. */ protected static final int LARGE_SAMPLE_PRODUCT = 10000; - /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)} */ + /** Default number of iterations used by {@link #monteCarloP(double, int, int, boolean, int)}. + * Deprecated as of version 3.6, as this method is no longer needed. */ + @Deprecated protected static final int MONTE_CARLO_ITERATIONS = 1000000; /** Random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */ @@ -160,9 +157,12 @@ public class KolmogorovSmirnovTest { /** * Construct a KolmogorovSmirnovTest with the provided random data generator. + * The #monteCarloP(double, int, int, boolean, int) that uses the generator supplied to this + * constructor is deprecated as of version 3.6. * * @param rng random data generator used by {@link #monteCarloP(double, int, int, boolean, int)} */ + @Deprecated public KolmogorovSmirnovTest(RandomGenerator rng) { this.rng = rng; } @@ -226,18 +226,9 @@ public class KolmogorovSmirnovTest { * {@code y.length} will strictly exceed (if {@code strict} is {@code true}) or be at least as * large as {@code strict = false}) as {@code kolmogorovSmirnovStatistic(x, y)}. * <ul> - * <li>For very small samples (where the product of the sample sizes is less than - * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This - * is accomplished by enumerating all partitions of the combined sample into two subsamples of - * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the - * proportion of partitions that give \(D\) values exceeding the observed value. In the very - * small sample case, if there are ties in the data, the actual sample values (including ties) - * are used in generating the partitions (which are basically multi-set partitions in this - * case).</li> - * <li>For mid-size samples (product of sample sizes greater than or equal to - * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo - * simulation is used to compute the p-value. The simulation randomly generates partitions and - * reports the proportion that give \(D\) values exceeding the observed value.</li> + * <li>For small samples (where the product of the sample sizes is less than + * {@value #LARGE_SAMPLE_PRODUCT}), the exact p-value is computed using the method presented + * in [4], implemented in {@link #exactP(double, int, int, boolean)}. </li> * <li>When the product of the sample sizes exceeds {@value #LARGE_SAMPLE_PRODUCT}, the * asymptotic distribution of \(D_{n,m}\) is used. See {@link #approximateP(double, int, int)} * for details on the approximation.</li> @@ -274,11 +265,8 @@ public class KolmogorovSmirnovTest { xa = x; ya = y; } - if (lengthProduct < SMALL_SAMPLE_PRODUCT) { - return integralExactP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length); - } if (lengthProduct < LARGE_SAMPLE_PRODUCT) { - return integralMonteCarloP(integralKolmogorovSmirnovStatistic(xa, ya) + (strict?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS); + return exactP(kolmogorovSmirnovStatistic(xa, ya), x.length, y.length, strict); } return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); } @@ -1000,16 +988,8 @@ public class KolmogorovSmirnovTest { * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). * <p> - * The returned probability is exact, obtained by enumerating all partitions of {@code m + n} - * into {@code m} and {@code n} sets, computing \(D_{n,m}\) for each partition and counting the - * number of partitions that yield \(D_{n,m}\) values exceeding (resp. greater than or equal to) - * {@code d}. - * </p> - * <p> - * <strong>USAGE NOTE</strong>: Since this method enumerates all combinations in \({m+n} \choose - * {n}\), it is very slow if called for large {@code m, n}. For this reason, - * {@link #kolmogorovSmirnovTest(double[], double[])} uses this only for {@code m * n < } - * {@value #SMALL_SAMPLE_PRODUCT}. + * The returned probability is exact, implemented by unwinding the recursive function + * definitions presented in [4] (class javadoc). * </p> * * @param d D-statistic value @@ -1020,50 +1000,10 @@ public class KolmogorovSmirnovTest { * greater than (resp. greater than or equal to) {@code d} */ public double exactP(double d, int n, int m, boolean strict) { - return integralExactP(calculateIntegralD(d, n, m, strict), n, m); - } - - /** - * Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the - * 2-sample Kolmogorov-Smirnov statistic. - * <p> - * Here d is the D-statistic represented as long value. - * The real D-statistic is obtained by dividing d by n*m. - * See also {@link #exactP(double, int, int, boolean)}. - * - * @param d integral D-statistic - * @param n first sample size - * @param m second sample size - * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) - * greater than or equal to {@code d/(n*m)} - */ - private double integralExactP(long d, int n, int m) { - Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n); - long tail = 0; - final double[] nSet = new double[n]; - final double[] mSet = new double[m]; - while (combinationsIterator.hasNext()) { - // Generate an n-set - final int[] nSetI = combinationsIterator.next(); - // Copy the n-set to nSet and its complement to mSet - int j = 0; - int k = 0; - for (int i = 0; i < n + m; i++) { - if (j < n && nSetI[j] == i) { - nSet[j++] = i; - } else { - mSet[k++] = i; - } - } - final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet); - if (curD >= d) { - tail++; - } - } - return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n); + return 1 - n(m, n, m, n, calculateIntegralD(d, m, n, strict), strict) / + CombinatoricsUtils.binomialCoefficientDouble(n + m, m); } - /** * Uses the Kolmogorov-Smirnov distribution to approximate \(P(D_{n,m} > d)\) where \(D_{n,m}\) * is the 2-sample Kolmogorov-Smirnov statistic. See @@ -1270,4 +1210,61 @@ public class KolmogorovSmirnovTest { data[i] += dist.sample(); } } + + /** + * The function C(i, j) defined in [4] (class javadoc), formula (5.5). + * defined to return 1 if |i/n - j/m| <= c; 0 otherwise. Here c is scaled up + * and recoded as a long to avoid rounding errors in comparison tests, so what + * is actually tested is |im - jn| <= cmn. + * + * @param i first path parameter + * @param j second path paramter + * @param m first sample size + * @param n second sample size + * @param cmn integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)}) + * @param strict whether or not the null hypothesis uses strict inequality + * @return C(i,j) for given m, n, c + */ + private static int c(int i, int j, int m, int n, long cmn, boolean strict) { + if (strict) { + return FastMath.abs(i*(long)n - j*(long)m) <= cmn ? 1 : 0; + } + return FastMath.abs(i*(long)n - j*(long)m) < cmn ? 1 : 0; + } + + /** + * The function N(i, j) defined in [4] (class javadoc). + * Returns the number of paths over the lattice {(i,j) : 0 <= i <= n, 0 <= j <= m} + * from (0,0) to (i,j) satisfying C(h,k, m, n, c) = 1 for each (h,k) on the path. + * The return value is integral, but subject to overflow, so it is maintained and + * returned as a double. + * + * @param i first path parameter + * @param j second path parameter + * @param m first sample size + * @param n second sample size + * @param cnm integral D-statistic (see {@link #calculateIntegralD(double, int, int, boolean)}) + * @param strict whether or not the null hypothesis uses strict inequality + * @return number or paths to (i, j) from (0,0) representing D-values as large as c for given m, n + */ + private static double n(int i, int j, int m, int n, long cnm, boolean strict) { + /* + * Unwind the recursive definition given in [4]. + * Compute n(1,1), n(1,2)...n(2,1), n(2,2)... up to n(i,j), one row at a time. + * When n(i,*) are being computed, lag[] holds the values of n(i - 1, *). + */ + final double[] lag = new double[n]; + double last = 0; + for (int k = 0; k < n; k++) { + lag[k] = c(0, k + 1, m, n, cnm, strict); + } + for (int k = 1; k <= i; k++) { + last = c(k, 0, m, n, cnm, strict); + for (int l = 1; l <= j; l++) { + lag[l - 1] = c(k, l, m, n, cnm, strict) * (last + lag[l - 1]); + last = lag[l - 1]; + } + } + return last; + } } http://git-wip-us.apache.org/repos/asf/commons-math/blob/2983ff81/src/test/R/KolmogorovSmirnovTestCases.R ---------------------------------------------------------------------- diff --git a/src/test/R/KolmogorovSmirnovTestCases.R b/src/test/R/KolmogorovSmirnovTestCases.R index 3146325..b456291 100644 --- a/src/test/R/KolmogorovSmirnovTestCases.R +++ b/src/test/R/KolmogorovSmirnovTestCases.R @@ -156,6 +156,12 @@ uniform <- c(0.7930305, 0.6424382, 0.8747699, 0.7156518, 0.1845909, 0.2022326, smallSample1 <- c(6, 7, 9, 13, 19, 21, 22, 23, 24) smallSample2 <- c(10, 11, 12, 16, 20, 27, 28, 32, 44, 54) +smallSample3 <- c(6, 7, 9, 13, 19, 21, 22, 23, 24, 29, 30, 34, 36, 41, 45, 47, 51, 63, 33, 91) +smallSample4 <- c(10, 11, 12, 16, 20, 27, 28, 32, 44, 54, 56, 57, 64, 69, 71, 80, 81, 88, 90) +smallSample5 <- c(-10, -5, 17, 21, 22, 23, 24, 30, 44, 50, 56, 57, 59, 67, 73, 75, 77, 78, 79, 80, 81, 83, 84, 85, 88, 90, + 92, 93, 94, 95, 98, 100, 101, 103, 105, 110) +smallSample6 <- c(-2, -1, 0, 10, 14, 15, 16, 20, 25, 26, 27, 31, 32, 33, 34, 45, 47, 48, 51, 52, 53, 54, 60, 61, 62, 63, + 74, 82, 106, 107, 109, 11, 112, 113, 114) bootSample1 <- c(0, 2, 4, 6, 8, 8, 10, 15, 22, 30, 33, 36, 38) bootSample2 <- c(9, 17, 20, 33, 40, 51, 60, 60, 72, 90, 101) roundingSample1 <- c(2,4,6,8,9,10,11,12,13) @@ -185,7 +191,13 @@ verifyTwoSampleLargeSamples(gaussian, gaussian2, 0.0319983962391632, 0.202352941 "Two sample N(0, 1) vs N(0, 1.6)") verifyTwoSampleSmallSamplesExact(smallSample1, smallSample2, 0.105577085453247, .5, tol, -"Two sample small samples exact") +"Two sample small samples exact 1") + +verifyTwoSampleSmallSamplesExact(smallSample3, smallSample4, 0.046298660942952, 0.426315789473684, tol, +"Two sample small samples exact 2") + +verifyTwoSampleSmallSamplesExact(smallSample5, smallSample6, 0.00300743602233366, 0.41031746031746, tol, +"Two sample small samples exact 3") verifyTwoSampleBootstrap(bootSample1, bootSample2, 0.0059, 1E-3, "Two sample bootstrap - isolated failures possible") verifyTwoSampleBootstrap(gaussian, gaussian2, 0.0237, 1E-2, "Two sample bootstrap - isolated failures possible") http://git-wip-us.apache.org/repos/asf/commons-math/blob/2983ff81/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java b/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java index 0a8a419..5c8df3f 100644 --- a/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java +++ b/src/test/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTestTest.java @@ -25,7 +25,6 @@ import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.apache.commons.math3.random.RandomGenerator; import org.apache.commons.math3.random.Well19937c; -import org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest; import org.apache.commons.math3.util.CombinatoricsUtils; import org.apache.commons.math3.util.FastMath; import org.apache.commons.math3.util.MathArrays; @@ -181,12 +180,48 @@ public class KolmogorovSmirnovTestTest { final double[] smallSample2 = { 10, 11, 12, 16, 20, 27, 28, 32, 44, 54 }; - // Reference values from R, version 2.15.3 - R uses non-strict inequality in null hypothesis + // Reference values from R, version 3.2.0 - R uses non-strict inequality in null hypothesis Assert .assertEquals(0.105577085453247, test.kolmogorovSmirnovTest(smallSample1, smallSample2, false), TOLERANCE); Assert.assertEquals(0.5, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE); } + /** Small samples - exact p-value, checked against R */ + @Test + public void testTwoSampleSmallSampleExact2() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + final double[] smallSample1 = { + 6, 7, 9, 13, 19, 21, 22, 23, 24, 29, 30, 34, 36, 41, 45, 47, 51, 63, 33, 91 + }; + final double[] smallSample2 = { + 10, 11, 12, 16, 20, 27, 28, 32, 44, 54, 56, 57, 64, 69, 71, 80, 81, 88, 90 + }; + // Reference values from R, version 3.2.0 - R uses non-strict inequality in null hypothesis + Assert + .assertEquals(0.0462986609, test.kolmogorovSmirnovTest(smallSample1, smallSample2, false), TOLERANCE); + Assert.assertEquals(0.4263157895, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE); + } + + /** Small samples - exact p-value, checked against R */ + @Test + public void testTwoSampleSmallSampleExact3() { + final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(); + final double[] smallSample1 = { + -10, -5, 17, 21, 22, 23, 24, 30, 44, 50, 56, 57, 59, 67, 73, 75, 77, 78, 79, 80, 81, 83, 84, 85, 88, 90, + 92, 93, 94, 95, 98, 100, 101, 103, 105, 110 + }; + final double[] smallSample2 = { + -2, -1, 0, 10, 14, 15, 16, 20, 25, 26, 27, 31, 32, 33, 34, 45, 47, 48, 51, 52, 53, 54, 60, 61, 62, 63, + 74, 82, 106, 107, 109, 11, 112, 113, 114 + }; + // Reference values from R, version 3.2.0 - R uses non-strict inequality in null hypothesis + Assert + .assertEquals(0.00300743602, test.kolmogorovSmirnovTest(smallSample1, smallSample2, false), TOLERANCE); + Assert.assertEquals(0.4103174603, test.kolmogorovSmirnovStatistic(smallSample1, smallSample2), TOLERANCE); + Assert + .assertEquals(0.00300743602, test.kolmogorovSmirnovTest(smallSample2, smallSample1, false), TOLERANCE); + } + /** * Checks exact p-value computations using critical values from Table 9 in V.K Rohatgi, An * Introduction to Probability and Mathematical Statistics, Wiley, 1976, ISBN 0-471-73135-8. @@ -279,10 +314,9 @@ public class KolmogorovSmirnovTestTest { } /** - * Verifies that Monte Carlo simulation gives results close to exact p values. This test is a - * little long-running (more than two minutes on a fast machine), so is disabled by default. + * Verifies that Monte Carlo simulation gives results close to exact p values. */ - // @Test + @Test public void testTwoSampleMonteCarlo() { final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest(new Well19937c(1000)); final int sampleSize = 14;
