Github user MLnick commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19715#discussion_r157156113
  
    --- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala ---
    @@ -146,4 +147,258 @@ class QuantileDiscretizerSuite
         val model = discretizer.fit(df)
         assert(model.hasParent)
       }
    +
    +  test("Multiple Columns: Test observed number of buckets and their sizes 
match expected values") {
    +    val spark = this.spark
    +    import spark.implicits._
    +
    +    val datasetSize = 100000
    +    val numBuckets = 5
    +    val data1 = Array.range(1, 100001, 1).map(_.toDouble)
    +    val data2 = Array.range(1, 200000, 2).map(_.toDouble)
    +    val df = data1.zip(data2).toSeq.toDF("input1", "input2")
    +
    +    val discretizer = new QuantileDiscretizer()
    +      .setInputCols(Array("input1", "input2"))
    +      .setOutputCols(Array("result1", "result2"))
    +      .setNumBuckets(numBuckets)
    +    val result = discretizer.fit(df).transform(df)
    +
    +    val relativeError = discretizer.getRelativeError
    +    val isGoodBucket = udf {
    +      (size: Int) => math.abs(size - (datasetSize / numBuckets)) <= (relativeError * datasetSize)
    +    }
    +
    +    for (i <- 1 to 2) {
    +      val observedNumBuckets = result.select("result" + i).distinct.count
    +      assert(observedNumBuckets === numBuckets,
    +        "Observed number of buckets does not equal expected number of 
buckets.")
    +
    +      val numGoodBuckets = result.groupBy("result" + i).count.filter(isGoodBucket($"count")).count
    +      assert(numGoodBuckets === numBuckets,
    +        "Bucket sizes are not within expected relative error tolerance.")
    +    }
    +  }
    +
    +  test("Multiple Columns: Test on data with high proportion of duplicated 
values") {
    +    val spark = this.spark
    +    import spark.implicits._
    +
    +    val numBuckets = 5
    +    val expectedNumBucket = 3
    +    val data1 = Array(1.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 1.0, 3.0)
    +    val data2 = Array(1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 3.0, 2.0, 3.0, 1.0, 2.0)
    +    val df = data1.zip(data2).toSeq.toDF("input1", "input2")
    +    val discretizer = new QuantileDiscretizer()
    +      .setInputCols(Array("input1", "input2"))
    +      .setOutputCols(Array("result1", "result2"))
    +      .setNumBuckets(numBuckets)
    +    val result = discretizer.fit(df).transform(df)
    +    for (i <- 1 to 2) {
    +      val observedNumBuckets = result.select("result" + i).distinct.count
    +      assert(observedNumBuckets == expectedNumBucket,
    +        "Observed number of buckets is not correct." +
    +          s" Expected $expectedNumBucket but found $observedNumBuckets.")
    +    }
    +  }
    +
    +  test("Multiple Columns: Test transform on data with NaN value") {
    +    val spark = this.spark
    +    import spark.implicits._
    +
    +    val numBuckets = 3
    +    val validData1 = Array(-0.9, -0.5, -0.3, 0.0, 0.2, 0.5, 0.9, Double.NaN, Double.NaN, Double.NaN)
    +    val expectedKeep1 = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0)
    +    val expectedSkip1 = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0)
    +    val validData2 = Array(0.2, -0.1, 0.3, 0.0, 0.1, 0.3, 0.5, Double.NaN, Double.NaN, Double.NaN)
    +    val expectedKeep2 = Array(1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0)
    +    val expectedSkip2 = Array(1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0)
    +
    +    val discretizer = new QuantileDiscretizer()
    +      .setInputCols(Array("input1", "input2"))
    +      .setOutputCols(Array("result1", "result2"))
    +      .setNumBuckets(numBuckets)
    +
    +    withClue("QuantileDiscretizer with handleInvalid=error should throw 
exception for NaN values") {
    +      val dataFrame: DataFrame = validData1.zip(validData2).toSeq.toDF("input1", "input2")
    +      intercept[SparkException] {
    +        discretizer.fit(dataFrame).transform(dataFrame).collect()
    +      }
    +    }
    +
    +    List(("keep", expectedKeep1, expectedKeep2), ("skip", expectedSkip1, 
expectedSkip2)).foreach {
    +      case (u, v, w) =>
    +        discretizer.setHandleInvalid(u)
    +        val dataFrame: DataFrame = validData1.zip(validData2).zip(v).zip(w).map {
    +          case (((a, b), c), d) => (a, b, c, d)
    +        }.toSeq.toDF("input1", "input2", "expected1", "expected2")
    +        dataFrame.show
    --- End diff ---
    
    Remove the `show` call here; it's leftover debugging output and shouldn't run in the test.
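    
    For reference, a minimal sketch of what this loop body looks like with the debug call dropped (the assertions that follow are truncated in this hunk, so they're omitted here as well):
    
        List(("keep", expectedKeep1, expectedKeep2), ("skip", expectedSkip1, expectedSkip2)).foreach {
          case (u, v, w) =>
            discretizer.setHandleInvalid(u)
            // Build the expected-output DataFrame directly; no `show` needed,
            // since the assertions that follow are what validate the transform.
            val dataFrame: DataFrame = validData1.zip(validData2).zip(v).zip(w).map {
              case (((a, b), c), d) => (a, b, c, d)
            }.toSeq.toDF("input1", "input2", "expected1", "expected2")
        }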

