Github user sryza commented on a diff in the pull request:

    https://github.com/apache/spark/pull/6994#discussion_r34200828
  
    --- Diff: 
mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala ---
    @@ -153,4 +157,101 @@ class HypothesisTestSuite extends SparkFunSuite with 
MLlibTestSparkContext {
           Statistics.chiSqTest(sc.parallelize(continuousFeature, 2))
         }
       }
    +
    +  test("1 sample Kolmogorov-Smirnov test") {
    +    // Create theoretical distributions
    +    val stdNormalDist = new NormalDistribution(0, 1)
    +    val expDist = new ExponentialDistribution(0.6)
    +    val unifDist = new UniformRealDistribution()
    +
    +    // set seeds
    +    val seed = 10L
    +    stdNormalDist.reseedRandomGenerator(seed)
    +    expDist.reseedRandomGenerator(seed)
    +    unifDist.reseedRandomGenerator(seed)
    +
    +    // Sample data from the distributions and parallelize it
    +    val n = 100000
    +    val sampledNorm = sc.parallelize(stdNormalDist.sample(n), 10)
    +    val sampledExp = sc.parallelize(expDist.sample(n), 10)
    +    val sampledUnif = sc.parallelize(unifDist.sample(n), 10)
    +
    +    // Use a apache math commons local KS test to verify calculations
    +    val ksTest = new KolmogorovSmirnovTest()
    +    val pThreshold = 0.05
    +
    +    // Comparing a standard normal sample to a standard normal distribution
    +    val result1 = Statistics.ksTest(sampledNorm, "stdnorm")
    +    val referenceStat1 = ksTest.kolmogorovSmirnovStatistic(stdNormalDist, 
sampledNorm.collect())
    +    val referencePVal1 = 1 - ksTest.cdf(referenceStat1, n)
    +    // Verify vs apache math commons ks test
    +    assert(result1.statistic === referenceStat1)
    +    assert(result1.pValue === referencePVal1)
    +    // Cannot reject null hypothesis
    +    assert(result1.pValue > pThreshold)
    +
    +    // Comparing an exponential sample to a standard normal distribution
    +    val result2 = Statistics.ksTest(sampledExp, "stdnorm")
    +    val referenceStat2 = ksTest.kolmogorovSmirnovStatistic(stdNormalDist, 
sampledExp.collect())
    +    val referencePVal2 = 1 - ksTest.cdf(referenceStat2, n)
    +    // verify vs apache math commons ks test
    +    assert(result2.statistic === referenceStat2)
    +    assert(result2.pValue === referencePVal2)
    +    // reject null hypothesis
    +    assert(result2.pValue < pThreshold)
    +
    +    // Testing the use of a user provided CDF function
    +    // Distribution is not serializable, so will have to create in the 
lambda
    +    val expCDF = (x: Double) => new 
ExponentialDistribution(0.2).cumulativeProbability(x)
    +
    +    // Comparing an exponential sample with mean X to an exponential 
distribution with mean Y
    +    // Where X != Y
    +    val result3 = Statistics.ksTest(sampledExp, expCDF)
    +    val referenceStat3 = ksTest.kolmogorovSmirnovStatistic(new 
ExponentialDistribution(0.2),
    +      sampledExp.collect())
    +    val referencePVal3 = 1 - ksTest.cdf(referenceStat3, 
sampledNorm.count().toInt)
    +    // verify vs apache math commons ks test
    +    assert(result3.statistic === referenceStat3)
    +    assert(result3.pValue === referencePVal3)
    +    // reject null hypothesis
    +    assert(result3.pValue < pThreshold)
    +
    +    /*
    +     Comparing results with R's implementation of Kolmogorov-Smirnov for 1 
sample
    +     > sessionInfo()
    +     R version 3.2.0 (2015-04-16)
    +     Platform: x86_64-apple-darwin13.4.0 (64-bit)
    +     > set.seed(20)
    +     > v <- rnorm(20)
    +     > v
    +      [1]  1.16268529 -0.58592447  1.78546500 -1.33259371 -0.44656677  
0.56960612
    +      [7] -2.88971761 -0.86901834 -0.46170268 -0.55554091 -0.02013537 
-0.15038222
    +     [13] -0.62812676  1.32322085 -1.52135057 -0.43742787  0.97057758  
0.02822264
    +     [19] -0.08578219  0.38921440
    +     > ks.test(v, pnorm, alternative = "two.sided")
    +
    +             One-sample Kolmogorov-Smirnov test
    +
    +     data:  v
    +     D = 0.18874, p-value = 0.4223
    +     alternative hypothesis: two-sided
    +    */
    +
    +    val RKSStat = 0.18874
    +    val RKSPVal = 0.4223
    +    val RData = sc.parallelize(
    +        Array(
    --- End diff --
    
    Indent this back a space.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and would like it, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to