Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/2455#discussion_r18536653
--- Diff: core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala ---
@@ -18,96 +18,547 @@
package org.apache.spark.util.random
import java.util.Random
+import scala.collection.mutable.ArrayBuffer
import cern.jet.random.Poisson
-import org.scalatest.{BeforeAndAfter, FunSuite}
-import org.scalatest.mock.EasyMockSugar
-
-class RandomSamplerSuite extends FunSuite with BeforeAndAfter with EasyMockSugar {
-
- val a = List(1, 2, 3, 4, 5, 6, 7, 8, 9)
-
- var random: Random = _
- var poisson: Poisson = _
-
- before {
- random = mock[Random]
- poisson = mock[Poisson]
- }
-
- test("BernoulliSamplerWithRange") {
- expecting {
- for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
- random.nextDouble().andReturn(x)
- }
- }
- whenExecuting(random) {
- val sampler = new BernoulliSampler[Int](0.25, 0.55)
- sampler.rng = random
- assert(sampler.sample(a.iterator).toList == List(3, 4, 5))
- }
- }
-
- test("BernoulliSamplerWithRangeInverse") {
- expecting {
- for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
- random.nextDouble().andReturn(x)
- }
- }
- whenExecuting(random) {
- val sampler = new BernoulliSampler[Int](0.25, 0.55, true)
- sampler.rng = random
- assert(sampler.sample(a.iterator).toList === List(1, 2, 6, 7, 8, 9))
- }
- }
-
- test("BernoulliSamplerWithRatio") {
- expecting {
- for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
- random.nextDouble().andReturn(x)
- }
- }
- whenExecuting(random) {
- val sampler = new BernoulliSampler[Int](0.35)
- sampler.rng = random
- assert(sampler.sample(a.iterator).toList == List(1, 2, 3))
- }
- }
-
- test("BernoulliSamplerWithComplement") {
- expecting {
- for(x <- Seq(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)) {
- random.nextDouble().andReturn(x)
- }
- }
- whenExecuting(random) {
- val sampler = new BernoulliSampler[Int](0.25, 0.55, true)
- sampler.rng = random
- assert(sampler.sample(a.iterator).toList == List(1, 2, 6, 7, 8, 9))
- }
- }
-
- test("BernoulliSamplerSetSeed") {
- expecting {
- random.setSeed(10L)
- }
- whenExecuting(random) {
- val sampler = new BernoulliSampler[Int](0.2)
- sampler.rng = random
- sampler.setSeed(10L)
- }
- }
-
- test("PoissonSampler") {
- expecting {
- for(x <- Seq(0, 1, 2, 0, 1, 1, 0, 0, 0)) {
- poisson.nextInt().andReturn(x)
- }
- }
- whenExecuting(poisson) {
- val sampler = new PoissonSampler[Int](0.2)
- sampler.rng = poisson
- assert(sampler.sample(a.iterator).toList == List(2, 3, 3, 5, 6))
- }
+import cern.jet.random.engine.DRand
+
+import org.scalatest.{FunSuite, Matchers}
+
+class RandomSamplerSuite extends FunSuite with Matchers {
+ // My statistical testing methodology is to run a Kolmogorov-Smirnov (KS) test
+ // between the random samplers and simple reference samplers (known to work correctly).
+ // The sampling gap sizes between chosen samples should show up as having the same
+ // distributions between test and reference, if things are working properly. That is,
+ // the KS test will fail to strongly reject the null hypothesis that the distributions of
+ // sampling gaps are the same.
+ // There are no actual KS tests implemented for scala (that I can find) - and so what I
+ // have done here is pre-compute "D" - the KS statistic - that corresponds to a "weak"
+ // p-value for a particular sample size. I can then test that my measured KS stats
+ // are less than D. Computing D-values is easy, and implemented below.
+ //
+ // I used the scipy 'kstwobign' distribution to pre-compute my D value:
+ //
+ // def ksdval(q=0.1, n=1000):
+ //     en = np.sqrt(float(n) / 2.0)
+ //     return stats.kstwobign.isf(float(q)) / (en + 0.12 + 0.11 / en)
+ //
+ // When comparing KS stats I take the median of a small number of independent test runs
+ // to compensate for the issue that any sampled statistic will show "false positive" with
+ // some probability. Even when two distributions are the same, they will register as
+ // different 10% of the time at a p-value of 0.1.
+
+ // This D value is the precomputed KS statistic for p-value 0.1, sample size 1000:
+ val sampleSize = 1000
+ val D = 0.0544280747619
+
+ // I'm not a big fan of fixing seeds, but unit testing based on running statistical tests
+ // will always fail with some nonzero probability, so I'll fix the seed to prevent these
+ // tests from generating random failure noise in CI testing, etc.
+ val rngSeed: Random = RandomSampler.rngDefault
+ rngSeed.setSeed(235711)
+ //rngSeed.setSeed(System.nanoTime)
--- End diff --
remove unused code
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]