MATH-1274: representation of Kolmogorov-Smirnov statistic as integral value
Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/1c9c43c1 Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/1c9c43c1 Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/1c9c43c1 Branch: refs/heads/MATH_3_X Commit: 1c9c43c1d4bb76d7e47cdfc9c6681a38305a95e4 Parents: 1cdaba9 Author: Otmar Ertl <[email protected]> Authored: Wed Sep 16 20:34:46 2015 +0200 Committer: Otmar Ertl <[email protected]> Committed: Wed Sep 16 20:34:46 2015 +0200 ---------------------------------------------------------------------- src/changes/changes.xml | 3 + .../stat/inference/KolmogorovSmirnovTest.java | 195 ++++++++++++++++--- 2 files changed, 170 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/commons-math/blob/1c9c43c1/src/changes/changes.xml ---------------------------------------------------------------------- diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 2d1fecd..d1eaf45 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -51,6 +51,9 @@ If the output is not quite correct, check for invisible trailing spaces! </properties> <body> <release version="3.6" date="XXXX-XX-XX" description=""> + <action dev="oertl" type="update" issue="MATH-1274"> + Representation of Kolmogorov-Smirnov statistic as integral value. + </action> <action dev="luc" type="add" issue="MATH-1273" due-to="Qualtagh"> Added negative zero support in FastMath.pow. </action> http://git-wip-us.apache.org/repos/asf/commons-math/blob/1c9c43c1/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java index 74bede9..eac1546 100644 --- a/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java +++ b/src/main/java/org/apache/commons/math3/stat/inference/KolmogorovSmirnovTest.java @@ -21,7 +21,6 @@ import java.math.BigDecimal; import java.util.Arrays; import java.util.Iterator; -import org.apache.commons.math3.util.Precision; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.exception.InsufficientDataException; import org.apache.commons.math3.exception.MathArithmeticException; @@ -220,7 +219,10 @@ public class KolmogorovSmirnovTest { * {@value #SMALL_SAMPLE_PRODUCT}), the exact distribution is used to compute the p-value. This * is accomplished by enumerating all partitions of the combined sample into two subsamples of * the respective sample sizes, computing \(D_{n,m}\) for each partition and returning the - * proportion of partitions that give \(D\) values exceeding the observed value.</li> + * proportion of partitions that give \(D\) values exceeding the observed value. In the very + * small sample case, if there are ties in the data, the actual sample values (including ties) + * are used in generating the partitions (which are basically multi-set partitions in this + * case).</li> * <li>For mid-size samples (product of sample sizes greater than or equal to * {@value #SMALL_SAMPLE_PRODUCT} but less than {@value #LARGE_SAMPLE_PRODUCT}), Monte Carlo * simulation is used to compute the p-value. The simulation randomly generates partitions and @@ -243,10 +245,10 @@ public class KolmogorovSmirnovTest { public double kolmogorovSmirnovTest(double[] x, double[] y, boolean strict) { final long lengthProduct = (long) x.length * y.length; if (lengthProduct < SMALL_SAMPLE_PRODUCT) { - return exactP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict); + return integralExactP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length); } if (lengthProduct < LARGE_SAMPLE_PRODUCT) { - return monteCarloP(kolmogorovSmirnovStatistic(x, y), x.length, y.length, strict, MONTE_CARLO_ITERATIONS); + return integralMonteCarloP(integralKolmogorovSmirnovStatistic(x, y) + ((strict)?1l:0l), x.length, y.length, MONTE_CARLO_ITERATIONS); } return approximateP(kolmogorovSmirnovStatistic(x, y), x.length, y.length); } @@ -285,6 +287,25 @@ public class KolmogorovSmirnovTest { * @throws NullArgumentException if either {@code x} or {@code y} is null */ public double kolmogorovSmirnovStatistic(double[] x, double[] y) { + return integralKolmogorovSmirnovStatistic(x, y)/((double)(x.length * (long)y.length)); + } + + /** + * Computes the two-sample Kolmogorov-Smirnov test statistic, \(D_{n,m}=\sup_x |F_n(x)-F_m(x)|\) + * where \(n\) is the length of {@code x}, \(m\) is the length of {@code y}, \(F_n\) is the + * empirical distribution that puts mass \(1/n\) at each of the values in {@code x} and \(F_m\) + * is the empirical distribution of the {@code y} values. Finally \(n m D_{n,m}\) is returned + * as long value. + * + * @param x first sample + * @param y second sample + * @return test statistic \(n m D_{n,m}\) used to evaluate the null hypothesis that {@code x} and + * {@code y} represent samples from the same underlying distribution + * @throws InsufficientDataException if either {@code x} or {@code y} does not have length at + * least 2 + * @throws NullArgumentException if either {@code x} or {@code y} is null + */ + private long integralKolmogorovSmirnovStatistic(double[] x, double[] y) { checkArray(x); checkArray(y); // Copy and sort the sample arrays @@ -297,23 +318,26 @@ public class KolmogorovSmirnovTest { int rankX = 0; int rankY = 0; + long curD = 0l; // Find the max difference between cdf_x and cdf_y - double supD = 0d; + long supD = 0l; do { double z = Double.compare(sx[rankX], sy[rankY]) <= 0 ? sx[rankX] : sy[rankY]; while(rankX < n && Double.compare(sx[rankX], z) == 0) { rankX += 1; + curD += m; } while(rankY < m && Double.compare(sy[rankY], z) == 0) { rankY += 1; + curD -= n; } - final double cdf_x = rankX / (double) n; - final double cdf_y = rankY / (double) m; - final double curD = FastMath.abs(cdf_x - cdf_y); if (curD > supD) { supD = curD; } + else if (-curD > supD) { + supD = -curD; + } } while(rankX < n && rankY < m); return supD; } @@ -858,6 +882,32 @@ public class KolmogorovSmirnovTest { } /** + * Given a d-statistic in the range [0, 1] and the two sample sizes n and m, + * an integral d-statistic in the range [0, n*m] is calculated, that can be used for + * comparison with other integral d-statistics. Depending whether {@code strict} is + * {@code true} or not, the returned value divided by (n*m) is greater than + * (resp greater than or equal to) the given d value (allowing some tolerance). + * + * @param d a d-statistic in the range [0, 1] + * @param n first sample size + * @param m second sample size + * @param strict whether the returned value divided by (n*m) is allowed to be equal to d + * @return the integral d-statistic in the range [0, n*m] + */ + private static long calculateIntegralD(double d, int n, int m, boolean strict) { + final double tol = 1e-12; // d-values within tol of one another are considered equal + long nm = n * (long)m; + long upperBound = (long)FastMath.ceil((d - tol) * nm); + long lowerBound = (long)FastMath.floor((d + tol) * nm); + if (strict && lowerBound == upperBound) { + return upperBound + 1l; + } + else { + return upperBound; + } + } + + /** * Computes \(P(D_{n,m} > d)\) if {@code strict} is {@code true}; otherwise \(P(D_{n,m} \ge * d)\), where \(D_{n,m}\) is the 2-sample Kolmogorov-Smirnov statistic. See * {@link #kolmogorovSmirnovStatistic(double[], double[])} for the definition of \(D_{n,m}\). @@ -882,11 +932,28 @@ public class KolmogorovSmirnovTest { * greater than (resp. greater than or equal to) {@code d} */ public double exactP(double d, int n, int m, boolean strict) { + return integralExactP(calculateIntegralD(d, n, m, strict), n, m); + } + + /** + * Computes \(P(D_{n,m} >= d/(n*m))\), where \(D_{n,m}\) is the + * 2-sample Kolmogorov-Smirnov statistic. + * <p> + * Here d is the D-statistic represented as long value. + * The real D-statistic is obtained by dividing d by n*m. + * See also {@link #exactP(double, int, int, boolean)}. + * + * @param d integral D-statistic + * @param n first sample size + * @param m second sample size + * @return probability that a randomly selected m-n partition of m + n generates \(D_{n,m}\) + * greater than or equal to {@code d/(n*m)} + */ + private double integralExactP(long d, int n, int m) { Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n); long tail = 0; final double[] nSet = new double[n]; final double[] mSet = new double[m]; - final double tol = 1e-12; // d-values within tol of one another are considered equal while (combinationsIterator.hasNext()) { // Generate an n-set final int[] nSetI = combinationsIterator.next(); @@ -900,9 +967,67 @@ public class KolmogorovSmirnovTest { mSet[k++] = i; } } - final double curD = kolmogorovSmirnovStatistic(nSet, mSet); - final int order = Precision.compareTo(curD, d, tol); - if (order > 0 || (order == 0 && !strict)) { + final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet); + if (curD >= d) { + tail++; + } + } + return (double) tail / (double) CombinatoricsUtils.binomialCoefficient(n + m, n); + } + + /** + * Computes the exact p value for a two-sample Kolmogorov-Smirnov test with + * {@code x} and {@code y} as samples, possibly containing ties. This method + * uses the same implementation as {@link #exactP(double, int, int, boolean)} + * with the exception that it examines partitions of the combined sample, + * preserving ties in the data. What is returned is the exact probability + * that a random partition of the combined dataset into a subset of size + * {@code x.length} and another of size {@code y.length} yields a \(D\) + * value greater than (resp greater than or equal to) \(D(x,y)\). + * <p> + * This method should not be used on large samples (a good rule of thumb is + * to keep the product of the sample sizes less than + * {@link #SMALL_SAMPLE_PRODUCT} when using this method). If the data do + * not contain ties, {@link #exactP(double[], double[], boolean)} should be + * used instead of this method.</p> + * + * @param x first sample + * @param y second sample + * @param strict whether or not the inequality in the null hypothesis is strict + * @return p-value + */ + public double exactP(double[] x, double[] y, boolean strict) { + final long d = integralKolmogorovSmirnovStatistic(x, y); + final int n = x.length; + final int m = y.length; + + // Concatenate x and y into universe, preserving ties in the data + final double[] universe = new double[n + m]; + System.arraycopy(x, 0, universe, 0, n); + System.arraycopy(y, 0, universe, n, m); + + // Iterate over all n, m partitions of the n + m elements in the universe, + // Computing D for each one + Iterator<int[]> combinationsIterator = CombinatoricsUtils.combinationsIterator(n + m, n); + long tail = 0; + final double[] nSet = new double[n]; + final double[] mSet = new double[m]; + while (combinationsIterator.hasNext()) { + // Generate an n-set + final int[] nSetI = combinationsIterator.next(); + // Copy the elements of the universe in the n-set to nSet + // and the others to mSet + int j = 0; + int k = 0; + for (int i = 0; i < n + m; i++) { + if (j < n && nSetI[j] == i) { + nSet[j++] = universe[i]; + } else { + mSet[k++] = universe[i]; + } + } + final long curD = integralKolmogorovSmirnovStatistic(nSet, mSet); + if (curD > d || (curD == d && !strict)) { tail++; } } @@ -974,35 +1099,49 @@ public class KolmogorovSmirnovTest { */ public double monteCarloP(final double d, final int n, final int m, final boolean strict, final int iterations) { + return integralMonteCarloP(calculateIntegralD(d, n, m, strict), n, m, iterations); + } + + /** + * Uses Monte Carlo simulation to approximate \(P(D_{n,m} >= d/(n*m))\) where \(D_{n,m}\) is the + * 2-sample Kolmogorov-Smirnov statistic. + * <p> + * Here d is the D-statistic represented as long value. + * The real D-statistic is obtained by dividing d by n*m. + * See also {@link #monteCarloP(double, int, int, boolean, int)}. + * + * @param d integral D-statistic + * @param n first sample size + * @param m second sample size + * @param iterations number of random partitions to generate + * @return proportion of randomly generated m-n partitions of m + n that result in \(D_{n,m}\) + * greater than or equal to {@code d/(n*m))} + */ + private double integralMonteCarloP(final long d, final int n, final int m, final int iterations) { + // ensure that nn is always the max of (n, m) to require fewer random numbers final int nn = FastMath.max(n, m); final int mm = FastMath.min(n, m); final int sum = nn + mm; - final double tol = 1e-12; // d-values within tol of one another are considered equal int tail = 0; final boolean b[] = new boolean[sum]; for (int i = 0; i < iterations; i++) { fillBooleanArrayRandomlyWithFixedNumberTrueValues(b, nn, rng); - int rankN = b[0] ? 1 : 0; - int rankM = b[0] ? 0 : 1; - boolean previous = b[0]; - for(int j = 1; j < b.length; ++j) { - if (b[j] != previous) { - final double cdf_n = rankN / (double) nn; - final double cdf_m = rankM / (double) mm; - final double curD = FastMath.abs(cdf_n - cdf_m); - final int order = Precision.compareTo(curD, d, tol); - if (order > 0 || (order == 0 && !strict)) { + long curD = 0l; + for(int j = 0; j < b.length; ++j) { + if (b[j]) { + curD += mm; + if (curD >= d) { tail++; break; } - } - previous = b[j]; - if (b[j]) { - rankN++; } else { - rankM++; + curD -= nn; + if (curD <= -d) { + tail++; + break; + } } } }
