This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch move_ks_test in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit cb82e09d90b91cc51b5357b34c76b509f15523bb Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue May 18 18:41:47 2021 -0700 move KS test from Util to its own class --- .../datasketches/quantiles/KolmogorovSmirnov.java | 111 +++++++++++++++++ .../org/apache/datasketches/quantiles/Util.java | 84 ++----------- .../quantiles/KolmogorovSmirnovTest.java | 134 +++++++++++++++++++++ .../apache/datasketches/quantiles/UtilTest.java | 89 -------------- 4 files changed, 253 insertions(+), 165 deletions(-) diff --git a/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java new file mode 100644 index 0000000..848272f --- /dev/null +++ b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantiles; + +/** + * Kolmogorov-Smirnov Test + * See <a href="https://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test">Kolmogorov–Smirnov Test</a> + */ +final class KolmogorovSmirnov { + + /** + * Computes the raw delta area between two quantile sketches for the + * {@link #kolmogorovSmirnovTest(DoublesSketch, DoublesSketch, double) + * Kolmogorov-Smirnov Test} + * method. + * @param sketch1 Input DoubleSketch 1 + * @param sketch2 Input DoubleSketch 2 + * @return the raw delta area between two quantile sketches + */ + public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) { + final DoublesAuxiliary p = new DoublesAuxiliary(sketch1); + final DoublesAuxiliary q = new DoublesAuxiliary(sketch2); + + final double[] pSamplesArr = p.auxSamplesArr_; + final double[] qSamplesArr = q.auxSamplesArr_; + final long[] pCumWtsArr = p.auxCumWtsArr_; + final long[] qCumWtsArr = q.auxCumWtsArr_; + final int pSamplesArrLen = pSamplesArr.length; + final int qSamplesArrLen = qSamplesArr.length; + + final double n1 = sketch1.getN(); + final double n2 = sketch2.getN(); + + double deltaArea = 0; + int i = 0; + int j = 0; + + while ((i < pSamplesArrLen) && (j < qSamplesArrLen)) { + deltaArea = Math.max(deltaArea, Math.abs(pCumWtsArr[i] / n1 - qCumWtsArr[j] / n2)); + if (pSamplesArr[i] < qSamplesArr[j]) { + i++; + } else if (qSamplesArr[j] < pSamplesArr[i]) { + j++; + } else { + i++; + j++; + } + } + + deltaArea = Math.max(deltaArea, Math.abs(pCumWtsArr[i] / n1 - qCumWtsArr[j] / n2)); + return deltaArea; + } + + /** + * Computes the adjusted delta area threshold for the + * {@link #kolmogorovSmirnovTest(DoublesSketch, DoublesSketch, double) Kolmogorov-Smirnov Test} + * method. + * This adjusts the computed threshold by the error epsilons of the two given sketches. + * @param sketch1 Input DoubleSketch 1 + * @param sketch2 Input DoubleSketch 2 + * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05. + * @return the adjusted threshold to be compared with the raw delta area. + */ + public static double computeKSThreshold(final DoublesSketch sketch1, + final DoublesSketch sketch2, + final double tgtPvalue) { + final double r1 = sketch1.getRetainedItems(); + final double r2 = sketch2.getRetainedItems(); + final double alpha = tgtPvalue; + final double alphaFactor = Math.sqrt(-0.5 * Math.log(0.5 * alpha)); + final double deltaAreaThreshold = alphaFactor * Math.sqrt((r1 + r2) / (r1 * r2)); + final double eps1 = sketch1.getNormalizedRankError(false); + final double eps2 = sketch2.getNormalizedRankError(false); + return deltaAreaThreshold + eps1 + eps2; + } + + /** + * Performs the Kolmogorov-Smirnov Test between two quantiles sketches. + * Note: if the given sketches have insufficient data or if the sketch sizes are too small, + * this will return false. + * @param sketch1 Input DoubleSketch 1 + * @param sketch2 Input DoubleSketch 2 + * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05. + * @return Boolean indicating whether we can reject the null hypothesis (that the sketches + * reflect the same underlying distribution) using the provided tgtPValue. + */ + public static boolean kolmogorovSmirnovTest(final DoublesSketch sketch1, + final DoublesSketch sketch2, final double tgtPvalue) { + final double delta = computeKSDelta(sketch1, sketch2); + final double thresh = computeKSThreshold(sketch1, sketch2, tgtPvalue); + return delta > thresh; + } + +} diff --git a/src/main/java/org/apache/datasketches/quantiles/Util.java b/src/main/java/org/apache/datasketches/quantiles/Util.java index c60bc21..a61a2c2 100644 --- a/src/main/java/org/apache/datasketches/quantiles/Util.java +++ b/src/main/java/org/apache/datasketches/quantiles/Util.java @@ -73,52 +73,9 @@ final class Util { * @param sketch2 Input DoubleSketch 2 * @return the raw delta area between two quantile sketches */ - public static double computeKSDelta(final DoublesSketch sketch1, - final DoublesSketch sketch2) { - final DoublesAuxiliary p = new DoublesAuxiliary(sketch1); - final DoublesAuxiliary q = new DoublesAuxiliary(sketch2); - - final double[] pSamplesArr = p.auxSamplesArr_; - final double[] qSamplesArr = q.auxSamplesArr_; - final long[] pCumWtsArr = p.auxCumWtsArr_; - final long[] qCumWtsArr = q.auxCumWtsArr_; - final int pSamplesArrLen = pSamplesArr.length; - final int qSamplesArrLen = qSamplesArr.length; - - final double n1 = sketch1.getN(); - final double n2 = sketch2.getN(); - - //Compute D from the two distributions - double deltaArea = 0.0; - int i = getNextIndex(pSamplesArr, -1); - int j = getNextIndex(qSamplesArr, -1); - - // We're done if either array reaches the end - while ((i < pSamplesArrLen) && (j < qSamplesArrLen)) { - final double pSample = pSamplesArr[i]; - final double qSample = qSamplesArr[j]; - final long pWt = pCumWtsArr[i]; - final long qWt = qCumWtsArr[j]; - final double pNormWt = pWt / n1; - final double qNormWt = qWt / n2; - final double pMinusQ = Math.abs(pNormWt - qNormWt); - final double curD = deltaArea; - deltaArea = Math.max(curD, pMinusQ); - - //Increment i or j or both - if (pSample == qSample) { - i = getNextIndex(pSamplesArr, i); - j = getNextIndex(qSamplesArr, j); - } else if (pSample < qSample) { - i = getNextIndex(pSamplesArr, i); - } else { - j = getNextIndex(qSamplesArr, j); - } - } - - //This is D, the delta difference in area of the two distributions - deltaArea = Math.max(deltaArea, Math.abs((pCumWtsArr[i] / n1) - (qCumWtsArr[j] / n2))); - return deltaArea; + @Deprecated + public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) { + return KolmogorovSmirnov.computeKSDelta(sketch1, sketch2); } /** @@ -132,19 +89,11 @@ final class Util { * @param tgtPvalue Target p-value. Typically .001 to .1, e.g., .05. * @return the adjusted threshold to be compared with the raw delta area. */ + @Deprecated public static double computeKSThreshold(final DoublesSketch sketch1, final DoublesSketch sketch2, final double tgtPvalue) { - final double r1 = sketch1.getRetainedItems(); - final double r2 = sketch2.getRetainedItems(); - final double alpha = tgtPvalue; - final double alphaFactor = Math.sqrt(-0.5 * Math.log(0.5 * alpha)); - final double deltaAreaThreshold = alphaFactor * Math.sqrt((r1 + r2) / (r1 * r2)); - final double eps1 = Util.getNormalizedRankError(sketch1.getK(), false); - final double eps2 = Util.getNormalizedRankError(sketch2.getK(), false); - - final double adjDeltaAreaThreshold = deltaAreaThreshold + eps1 + eps2; - return adjDeltaAreaThreshold; + return KolmogorovSmirnov.computeKSThreshold(sketch1, sketch2, tgtPvalue); } /** @@ -157,32 +106,15 @@ final class Util { * @return Boolean indicating whether we can reject the null hypothesis (that the sketches * reflect the same underlying distribution) using the provided tgtPValue. */ + @Deprecated public static boolean kolmogorovSmirnovTest(final DoublesSketch sketch1, final DoublesSketch sketch2, final double tgtPvalue) { - final double delta = computeKSDelta(sketch1, sketch2); - final double thresh = computeKSThreshold(sketch1, sketch2, tgtPvalue); - return delta > thresh; - } - - private static final int getNextIndex(final double[] samplesArr, final int stIdx) { - int idx = stIdx + 1; - final int samplesArrLen = samplesArr.length; - - if (idx >= samplesArrLen) { return samplesArrLen; } - - // if we have a sequence of equal values, use the last one of the sequence - final double val = samplesArr[idx]; - int nxtIdx = idx + 1; - while ((nxtIdx < samplesArrLen) && (samplesArr[nxtIdx] == val)) { - idx = nxtIdx; - ++nxtIdx; - } - return idx; + return KolmogorovSmirnov.kolmogorovSmirnovTest(sketch1, sketch2, tgtPvalue); } /** * Gets the normalized rank error given k and pmf for the Quantiles DoubleSketch and ItemsSketch. - * @param k the configuation parameter + * @param k the configuration parameter * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function. * Otherwise, it is the "single-sided" normalized rank error for all the other queries. * @return if pmf is true, the normalized rank error for the getPMF() function. diff --git a/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java b/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java new file mode 100644 index 0000000..e69d688 --- /dev/null +++ b/src/test/java/org/apache/datasketches/quantiles/KolmogorovSmirnovTest.java @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.quantiles; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import java.util.Random; + +import org.testng.annotations.Test; + +import org.apache.datasketches.SketchesArgumentException; + +@SuppressWarnings("javadoc") +public class KolmogorovSmirnovTest { + + @Test + public void checkKomologorovSmirnovStatistic1() { + final int k = 256; + final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); + final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); + + final Random rand = new Random(); + + final int n = (3 * k) - 1; + for (int i = 0; i < n; ++i) { + final double x = rand.nextGaussian(); + s1.update(x + 100); + s2.update(x); + } + + assertEquals(KolmogorovSmirnov.computeKSDelta(s1, s2), 1.0, 1E-6); + println("D = " + KolmogorovSmirnov.computeKSDelta(s1, s2)); + } + + @Test + public void checkKomologorovSmirnovStatistic2() { + final int k = 256; + final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); + final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); + + final Random rand = new Random(); + + final int n = (3 * k) - 1; + for (int i = 0; i < n; ++i) { + final double x = rand.nextGaussian(); + s1.update(x); + s2.update(x); + } + + assertEquals(KolmogorovSmirnov.computeKSDelta(s1, s2), 0, .01); + println("D = " + KolmogorovSmirnov.computeKSDelta(s1, s2)); + } + + @Test + public void checkKomologorovSmirnovStatistic3() { + final int k = 2048; + final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); + final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); + final double tgtPvalue = .05; + + final Random rand = new Random(); + + final int n = (3 * k) - 1; + for (int i = 0; i < n; ++i) { + final double x = rand.nextGaussian(); + s1.update(x + .05); + s2.update(x); + } + + double D = KolmogorovSmirnov.computeKSDelta(s1, s2); + double thresh = KolmogorovSmirnov.computeKSThreshold(s1, s2, tgtPvalue); + final boolean reject = KolmogorovSmirnov.kolmogorovSmirnovTest(s1, s2, tgtPvalue); + println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh + + "\nNull Hypoth Rejected = " + reject); + assertFalse(reject); + } + + @Test + public void checkKomologorovSmirnovStatistic4() { + final int k = 8192; + final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); + final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); + final double tgtPvalue = .05; + + final Random rand = new Random(); + + final int n = (3 * k) - 1; + for (int i = 0; i < n; ++i) { + final double x = rand.nextGaussian(); + s1.update(x + .05); + s2.update(x); + } + + double D = KolmogorovSmirnov.computeKSDelta(s1, s2); + double thresh = KolmogorovSmirnov.computeKSThreshold(s1, s2, tgtPvalue); + final boolean reject = KolmogorovSmirnov.kolmogorovSmirnovTest(s1, s2, tgtPvalue); + println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh + + "\nNull Hypoth Rejected = " + reject); + assertTrue(reject); + } + + + @Test + public void printlnTest() { + println("PRINTING: "+this.getClass().getName()); + } + + /** + * @param s value to print + */ + static void println(String s) { + //System.out.println(s); //disable here + } + +} diff --git a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java index ce50160..e01ceab 100644 --- a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java +++ b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java @@ -20,11 +20,9 @@ package org.apache.datasketches.quantiles; import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import java.util.Arrays; -import java.util.Random; import org.testng.annotations.Test; @@ -309,93 +307,6 @@ public class UtilTest { // exhaustiveMain(new String[] {"10", "100"}); // } - @Test - public void checkKomologorovSmirnovStatistic1() { - final int k = 256; - final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); - final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); - - final Random rand = new Random(); - - final int n = (3 * k) - 1; - for (int i = 0; i < n; ++i) { - final double x = rand.nextGaussian(); - s1.update(x + 100); - s2.update(x); - } - - assertEquals(Util.computeKSDelta(s1, s2), 1.0, 1.0 + 1E-6); - //println("D = " + Util.computeKSDelta(s1, s2)); - } - - @Test - public void checkKomologorovSmirnovStatistic2() { - final int k = 256; - final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); - final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); - - final Random rand = new Random(); - - final int n = (3 * k) - 1; - for (int i = 0; i < n; ++i) { - final double x = rand.nextGaussian(); - s1.update(x); - s2.update(x); - } - - assertEquals(Util.computeKSDelta(s1, s2), 0, .01); - //println("D = " + Util.computeKSDelta(s1, s2)); - } - - @Test - public void checkKomologorovSmirnovStatistic3() { - final int k = 2048; - final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); - final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); - final double tgtPvalue = .05; - - final Random rand = new Random(); - - final int n = (3 * k) - 1; - for (int i = 0; i < n; ++i) { - final double x = rand.nextGaussian(); - s1.update(x + .05); - s2.update(x); - } - - double D = Util.computeKSDelta(s1, s2); - double thresh = Util.computeKSThreshold(s1, s2, tgtPvalue); - final boolean reject = Util.kolmogorovSmirnovTest(s1, s2, tgtPvalue); - println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh - + "\nNull Hypoth Rejected = " + reject); - assertFalse(reject); - } - - @Test - public void checkKomologorovSmirnovStatistic4() { - final int k = 8192; - final UpdateDoublesSketch s1 = DoublesSketch.builder().setK(k).build(); - final UpdateDoublesSketch s2 = DoublesSketch.builder().setK(k).build(); - final double tgtPvalue = .05; - - final Random rand = new Random(); - - final int n = (3 * k) - 1; - for (int i = 0; i < n; ++i) { - final double x = rand.nextGaussian(); - s1.update(x + .05); - s2.update(x); - } - - double D = Util.computeKSDelta(s1, s2); - double thresh = Util.computeKSThreshold(s1, s2, tgtPvalue); - final boolean reject = Util.kolmogorovSmirnovTest(s1, s2, tgtPvalue); - println("pVal = " + tgtPvalue + "\nK = " + k + "\nD = " + D + "\nTh = " + thresh - + "\nNull Hypoth Rejected = " + reject); - assertTrue(reject); - } - - @Test public void printlnTest() { println("PRINTING: "+this.getClass().getName()); --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
