Github user MLnick commented on a diff in the pull request:
https://github.com/apache/spark/pull/19715#discussion_r157156113
--- Diff: mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala ---
@@ -146,4 +147,258 @@ class QuantileDiscretizerSuite
val model = discretizer.fit(df)
assert(model.hasParent)
}
+
+  test("Multiple Columns: Test observed number of buckets and their sizes match expected values") {
+    val spark = this.spark
+    import spark.implicits._
+
+    val datasetSize = 100000
+    val numBuckets = 5
+    val data1 = Array.range(1, 100001, 1).map(_.toDouble)
+    val data2 = Array.range(1, 200000, 2).map(_.toDouble)
+    val df = data1.zip(data2).toSeq.toDF("input1", "input2")
+
+    val discretizer = new QuantileDiscretizer()
+      .setInputCols(Array("input1", "input2"))
+      .setOutputCols(Array("result1", "result2"))
+      .setNumBuckets(numBuckets)
+    val result = discretizer.fit(df).transform(df)
+
+    val relativeError = discretizer.getRelativeError
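+    // A bucket is "good" if its size deviates from the ideal uniform size
+    // (datasetSize / numBuckets) by at most relativeError * datasetSize.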
+    val isGoodBucket = udf {
+      (size: Int) => math.abs(size - (datasetSize / numBuckets)) <= (relativeError * datasetSize)
+    }
+
+    for (i <- 1 to 2) {
+      val observedNumBuckets = result.select("result" + i).distinct.count
+      assert(observedNumBuckets === numBuckets,
+        "Observed number of buckets does not equal expected number of buckets.")
+
+      val numGoodBuckets = result.groupBy("result" + i).count.filter(isGoodBucket($"count")).count
+      assert(numGoodBuckets === numBuckets,
+        "Bucket sizes are not within expected relative error tolerance.")
+    }
+  }
+
+  test("Multiple Columns: Test on data with high proportion of duplicated values") {
+    val spark = this.spark
+    import spark.implicits._
+
+    val numBuckets = 5
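+    // Duplicate quantile splits are collapsed, so heavily duplicated data
+    // can yield fewer buckets than requested.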
+    val expectedNumBucket = 3
+    val data1 = Array(1.0, 3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 1.0, 3.0)
+    val data2 = Array(1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 3.0, 2.0, 3.0, 1.0, 2.0)
+    val df = data1.zip(data2).toSeq.toDF("input1", "input2")
+    val discretizer = new QuantileDiscretizer()
+      .setInputCols(Array("input1", "input2"))
+      .setOutputCols(Array("result1", "result2"))
+      .setNumBuckets(numBuckets)
+    val result = discretizer.fit(df).transform(df)
+    for (i <- 1 to 2) {
+      val observedNumBuckets = result.select("result" + i).distinct.count
+      assert(observedNumBuckets == expectedNumBucket,
+        s"Observed number of buckets is not correct." +
+          s" Expected $expectedNumBucket but found $observedNumBuckets.")
+    }
+  }
+
+  test("Multiple Columns: Test transform on data with NaN value") {
+    val spark = this.spark
+    import spark.implicits._
+
+    val numBuckets = 3
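+    // With handleInvalid = "keep", NaN values land in an extra bucket
+    // (index == numBuckets); with "skip", rows containing NaN are dropped.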
+    val validData1 = Array(-0.9, -0.5, -0.3, 0.0, 0.2, 0.5, 0.9, Double.NaN, Double.NaN, Double.NaN)
+    val expectedKeep1 = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0)
+    val expectedSkip1 = Array(0.0, 0.0, 1.0, 1.0, 2.0, 2.0, 2.0)
+    val validData2 = Array(0.2, -0.1, 0.3, 0.0, 0.1, 0.3, 0.5, Double.NaN, Double.NaN, Double.NaN)
+    val expectedKeep2 = Array(1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0, 3.0, 3.0, 3.0)
+    val expectedSkip2 = Array(1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 2.0)
+
+    val discretizer = new QuantileDiscretizer()
+      .setInputCols(Array("input1", "input2"))
+      .setOutputCols(Array("result1", "result2"))
+      .setNumBuckets(numBuckets)
+
+    withClue("QuantileDiscretizer with handleInvalid=error should throw exception for NaN values") {
+      val dataFrame: DataFrame = validData1.zip(validData2).toSeq.toDF("input1", "input2")
+      intercept[SparkException] {
+        discretizer.fit(dataFrame).transform(dataFrame).collect()
+      }
+    }
+
+    List(("keep", expectedKeep1, expectedKeep2), ("skip", expectedSkip1, expectedSkip2)).foreach {
+      case (u, v, w) =>
+        discretizer.setHandleInvalid(u)
+        val dataFrame: DataFrame = validData1.zip(validData2).zip(v).zip(w).map {
+          case (((a, b), c), d) => (a, b, c, d)
+        }.toSeq.toDF("input1", "input2", "expected1", "expected2")
+        dataFrame.show
--- End diff ---
remove the `show` call here
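
For reference, a minimal sketch of what the loop body could assert instead of printing; the `result*` vs. `expected*` comparison is an assumption based on the columns built above (not the PR's actual follow-up code), and it presumes `org.apache.spark.sql.Row` is imported in the suite:

```scala
val result = discretizer.fit(dataFrame).transform(dataFrame)
// Each bucketed value should equal its precomputed expected bucket index.
result.select("result1", "expected1", "result2", "expected2").collect().foreach {
  case Row(r1: Double, e1: Double, r2: Double, e2: Double) =>
    assert(r1 === e1 && r2 === e2,
      "The feature value is not correct after bucketing.")
}
```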
---