Repository: commons-math
Updated Branches:
  refs/heads/master 980554552 -> bddcfd5a5


MATH-1405

As per the discussion on the JIRA page:
 * Modified method "hasTies" in order to use a single way for detecting ties.
 * Modified method "jitter": Add a random value that will modify the least
   significant bits of the number to be jittered.
 * Simplified method "fixTies".
 * When NaN values are present in the data, an exception is thrown.

Formatting of Javadoc.
Added unit tests.


Project: http://git-wip-us.apache.org/repos/asf/commons-math/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-math/commit/b0b23c17
Tree: http://git-wip-us.apache.org/repos/asf/commons-math/tree/b0b23c17
Diff: http://git-wip-us.apache.org/repos/asf/commons-math/diff/b0b23c17

Branch: refs/heads/master
Commit: b0b23c179ac55334f760aa29fce262c73a909268
Parents: e280c86
Author: Gilles <[email protected]>
Authored: Tue Mar 7 13:06:29 2017 +0100
Committer: Gilles <[email protected]>
Committed: Tue Mar 7 13:06:29 2017 +0100

----------------------------------------------------------------------
 .../stat/inference/KolmogorovSmirnovTest.java   | 140 +++++++++----------
 .../inference/KolmogorovSmirnovTestTest.java    |  90 ++++++++++--
 2 files changed, 140 insertions(+), 90 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-math/blob/b0b23c17/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
 
b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
index 09999ea..368bc23 100644
--- 
a/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
+++ 
b/src/main/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTest.java
@@ -31,6 +31,7 @@ import 
org.apache.commons.math4.exception.NullArgumentException;
 import org.apache.commons.math4.exception.NumberIsTooLargeException;
 import org.apache.commons.math4.exception.OutOfRangeException;
 import org.apache.commons.math4.exception.TooManyIterationsException;
+import org.apache.commons.math4.exception.NotANumberException;
 import org.apache.commons.math4.exception.util.LocalizedFormats;
 import org.apache.commons.math4.fraction.BigFraction;
 import org.apache.commons.math4.fraction.BigFractionField;
@@ -243,15 +244,16 @@ public class KolmogorovSmirnovTest {
      * If ties are known to be present in the data, {@link 
#bootstrap(double[], double[], int, boolean)}
      * may be used as an alternative method for estimating the p-value.</p>
      *
-     * @param x first sample dataset
-     * @param y second sample dataset
-     * @param strict whether or not the probability to compute is expressed as 
a strict inequality
-     *        (ignored for large samples)
-     * @return p-value associated with the null hypothesis that {@code x} and 
{@code y} represent
-     *         samples from the same distribution
-     * @throws InsufficientDataException if either {@code x} or {@code y} does 
not have length at
-     *         least 2
-     * @throws NullArgumentException if either {@code x} or {@code y} is null
+     * @param x first sample dataset.
+     * @param y second sample dataset.
+     * @param strict whether or not the probability to compute is expressed as
+     * a strict inequality (ignored for large samples).
+     * @return p-value associated with the null hypothesis that {@code x} and
+     * {@code y} represent samples from the same distribution.
+     * @throws InsufficientDataException if either {@code x} or {@code y} does
+     * not have length at least 2.
+     * @throws NullArgumentException if either {@code x} or {@code y} is null.
+     * @throws NotANumberException if the input arrays contain NaN values.
      * @see #bootstrap(double[], double[], int, boolean)
      */
     public double kolmogorovSmirnovTest(double[] x, double[] y, boolean 
strict) {
@@ -1121,80 +1123,61 @@ public class KolmogorovSmirnovTest {
     }
 
     /**
-     * If there are no ties in the combined dataset formed from x and y, this
-     * method is a no-op.  If there are ties, a uniform random deviate in
-     * (-minDelta / 2, minDelta / 2) - {0} is added to each value in x and y, 
where
-     * minDelta is the minimum difference between unequal values in the 
combined
-     * sample.  A fixed seed is used to generate the jitter, so repeated 
activations
-     * with the same input arrays result in the same values.
+     * If there are no ties in the combined dataset formed from x and y,
+     * this method is a no-op.
+     * If there are ties, a uniform random deviate in
+     * is added to each value in x and y, and this method overwrites the
+     * data in x and y with the jittered values.
      *
-     * NOTE: if there are ties in the data, this method overwrites the data in
-     * x and y with the jittered values.
-     *
-     * @param x first sample
-     * @param y second sample
+     * @param x First sample.
+     * @param y Second sample.
+     * @throw NotANumberException if any of the input arrays contain
+     * a NaN value.
      */
     private static void fixTies(double[] x, double[] y) {
-       final double[] values = MathArrays.unique(MathArrays.concatenate(x,y));
-       if (values.length == x.length + y.length) {
-           return;  // There are no ties
-       }
-
-       // Find the smallest difference between values, or 1 if all values are 
the same
-       double minDelta = 1;
-       double prev = values[0];
-       double delta = 1;
-       for (int i = 1; i < values.length; i++) {
-          delta = prev - values[i];
-          if (delta < minDelta) {
-              minDelta = delta;
-          }
-          prev = values[i];
-       }
-       minDelta /= 2;
-
-       // Add jitter using a fixed seed (so same arguments always give same 
results),
-       // low-initialization-overhead generator.
-       final UniformRandomProvider rng = 
RandomSource.create(RandomSource.TWO_CMRES, 654321);
-
-       // It is theoretically possible that jitter does not break ties, so 
repeat
-       // until all ties are gone.  Bound the loop and throw MIE if bound is 
exceeded.
-       int ct = 0;
-       boolean ties = true;
-       do {
-           jitter(x, rng, minDelta);
-           jitter(y, rng, minDelta);
-           ties = hasTies(x, y);
-           ct++;
-           // If jittering hasn't resolved ties, "minDelta" may be too small.
-           minDelta *= 2;
-       } while (ties && ct < 1000);
-       if (ties) {
-           throw new MathInternalError(); // Should never happen.
-       }
+        if (hasTies(x, y)) {
+            // Add jitter using a fixed seed (so same arguments always give 
same results),
+            // low-initialization-overhead generator.
+            final UniformRandomProvider rng = 
RandomSource.create(RandomSource.TWO_CMRES, 7654321);
+
+            // It is theoretically possible that jitter does not break ties, 
so repeat
+            // until all ties are gone.  Bound the loop and throw MIE if bound 
is exceeded.
+            int ct = 0;
+            boolean ties = true;
+            do {
+                jitter(x, rng, 10);
+                jitter(y, rng, 10);
+                ties = hasTies(x, y);
+                ++ct;
+            } while (ties && ct < 10);
+            if (ties) {
+                throw new MathInternalError(); // Should never happen.
+            }
+        }
     }
 
     /**
-     * Returns true iff there are ties in the combined sample
-     * formed from x and y.
+     * Returns true iff there are ties in the combined sample formed from
+     * x and y.
      *
-     * @param x first sample
-     * @param y second sample
-     * @return true if x and y together contain ties
+     * @param x First sample.
+     * @param y Second sample.
+     * @return true if x and y together contain ties.
+     * @throw NotANumberException if any of the input arrays contain
+     * a NaN value.
      */
     private static boolean hasTies(double[] x, double[] y) {
-        final HashSet<Double> values = new HashSet<>();
-            for (int i = 0; i < x.length; i++) {
-                if (!values.add(x[i])) {
-                    return true;
-                }
-            }
-            for (int i = 0; i < y.length; i++) {
-                if (!values.add(y[i])) {
-                    return true;
-                }
-            }
-        return false;
+       final double[] values = MathArrays.unique(MathArrays.concatenate(x, y));
+
+       // "unique" moves NaN to the head of the output array.
+       if (Double.isNaN(values[0])) {
+           throw new NotANumberException();
+       }
+       if (values.length == x.length + y.length) {
+           return false;  // There are no ties.
+       }
+
+       return true;
     }
 
     /**
@@ -1207,10 +1190,13 @@ public class KolmogorovSmirnovTest {
      * @param sampler probability distribution to sample for jitter values
      * @throws NullPointerException if either of the parameters is null
      */
-    private static void jitter(final double[] data, final 
UniformRandomProvider rng, final double delta) {
+    private static void jitter(double[] data,
+                               UniformRandomProvider rng,
+                               int ulp) {
+        final int range = ulp * 2;
         for (int i = 0; i < data.length; i++) {
-            final double d = delta * (2 * rng.nextDouble() - 1);
-            data[i] += d;
+            final int rand = rng.nextInt(range) - ulp;
+            data[i] += rand * Math.ulp(data[i]);
         }
     }
 

http://git-wip-us.apache.org/repos/asf/commons-math/blob/b0b23c17/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
 
b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
index c3aa553..4ad0872 100644
--- 
a/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
+++ 
b/src/test/java/org/apache/commons/math4/stat/inference/KolmogorovSmirnovTestTest.java
@@ -28,6 +28,7 @@ import org.apache.commons.rng.UniformRandomProvider;
 import org.apache.commons.math4.util.CombinatoricsUtils;
 import org.apache.commons.math4.util.FastMath;
 import org.apache.commons.math4.util.MathArrays;
+import org.apache.commons.math4.exception.NotANumberException;
 import org.junit.Assert;
 import org.junit.Test;
 
@@ -440,30 +441,93 @@ public class KolmogorovSmirnovTestTest {
     @Test
     public void testTwoSampleWithManyTiesAndVerySmallDelta() {
         // Cf. MATH-1405
+
         final double[] x = {
-            0.000000, 0.000000, 1.000000,
-            1.000000, 1.500000, 1.600000,
-            1.700000, 1.800000, 1.900000, 2.000000, 2.000000000000001
+            0.0, 0.0,
+            1.0, 1.0,
+            1.5,
+            1.6,
+            1.7,
+            1.8,
+            1.9,
+            2.0,
+            2.000000000000001
         };
 
         final double[] y = {
-            0.000000, 0.000000, 10.000000,
-            10.000000, 11.000000, 11.000000,
-            11.000000, 15.000000, 16.000000,
-            17.000000, 18.000000, 19.000000, 20.000000, 20.000000000000001
+            0.0, 0.0,
+            10.0, 10.0,
+            11.0, 11.0, 11.0,
+            15.0,
+            16.0,
+            17.0,
+            18.0,
+            19.0,
+            20.0,
+            20.000000000000001
         };
 
-        // These values result in an initial calculated minDelta of 
4.440892098500626E-16,
-        // which is too small to jitter the existing values to new ones bc of 
floating-point
-        // precision.
-        // MATH-1405 adds functionality to iteratively increase minDelta until 
a noticeable
-        // jitter occurs.
-
         final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
         Assert.assertEquals(1.12173015e-5, test.kolmogorovSmirnovTest(x, y), 
1e-6);
     }
 
     @Test
+    public void testTwoSampleWithManyTiesAndExtremeValues() {
+        // Cf. MATH-1405
+
+        final double[] largeX = {
+            Double.MAX_VALUE, Double.MAX_VALUE,
+            1e40, 1e40,
+            2e40, 2e40,
+            1e30,
+            2e30,
+            3e30,
+            4e30,
+            5e10,
+            6e10,
+            7e10,
+            8e10
+        };
+
+        final double[] smallY = {
+            Double.MIN_VALUE,
+            2 * Double.MIN_VALUE,
+            1e-40, 1e-40,
+            2e-40, 2e-40,
+            1e-30,
+            2e-30,
+            3e-30,
+            4e-30,
+            5e-10,
+            6e-10,
+            7e-10,
+            8e-10
+        };
+
+        final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
+        Assert.assertEquals(0, test.kolmogorovSmirnovTest(largeX, smallY), 
1e-10);
+    }
+
+    @Test(expected=NotANumberException.class)
+    public void testTwoSampleWithTiesAndNaN1() {
+        // Cf. MATH-1405
+
+        final double[] x = { 1, Double.NaN, 3, 4 };
+        final double[] y = { 1, 2, 3, 4 };
+        new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
+    }
+
+    @Test(expected=NotANumberException.class)
+    public void testTwoSampleWithTiesAndNaN2() {
+        // Cf. MATH-1405
+
+        final double[] x = { 1, 2, 3, 4 };
+        final double[] y = { 1, 2, Double.NaN, 4 };
+
+        new KolmogorovSmirnovTest().kolmogorovSmirnovTest(x, y);
+    }
+
+    @Test
     public void testTwoSamplesAllEqual() {
         int iterations = 10_000;
         final KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();

Reply via email to