This is an automated email from the ASF dual-hosted git repository.
srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 67bd124 [MINOR][TEST] Speed up slow tests in QuantileDiscretizerSuite
67bd124 is described below
commit 67bd124f4ff6b54cf29415aad209cbe199f7bcf5
Author: Sean Owen <[email protected]>
AuthorDate: Sat Apr 13 17:03:23 2019 -0500
[MINOR][TEST] Speed up slow tests in QuantileDiscretizerSuite
## What changes were proposed in this pull request?
This should reduce the total runtime of these tests from about 2 minutes to
about 25 seconds.
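The speedup comes from a smaller test dataset (100000 rows down to 30000) and from caching the transformed result DataFrame, since each check runs several actions (distinct/count, groupBy/count) that would otherwise recompute it. A minimal sketch of the cache-then-unpersist pattern, using a hypothetical toy DataFrame rather than the suite's own data:

```scala
import org.apache.spark.sql.SparkSession

// Assumed setup for this sketch; the test suite gets its session from MLTest.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Cache once so the two actions below reuse the materialized data.
val result = (1 to 1000).map(_.toDouble).toDF("result").cache()
try {
  val numDistinct = result.select("result").distinct().count() // action 1
  val numRows = result.count()                                 // action 2
  assert(numDistinct <= numRows)
} finally {
  result.unpersist() // release cached blocks even if an assertion fails
}
```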
## How was this patch tested?
Existing tests.
Closes #24360 from srowen/SpeedQDS.
Authored-by: Sean Owen <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
---
.../ml/feature/QuantileDiscretizerSuite.scala | 67 ++++++++++++----------
1 file changed, 38 insertions(+), 29 deletions(-)
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
index 82af050..ae086d3 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/QuantileDiscretizerSuite.scala
@@ -29,25 +29,30 @@ class QuantileDiscretizerSuite extends MLTest with DefaultReadWriteTest {
     val spark = this.spark
     import spark.implicits._

-    val datasetSize = 100000
+    val datasetSize = 30000
     val numBuckets = 5
-    val df = sc.parallelize(1 to datasetSize).map(_.toDouble).map(Tuple1.apply).toDF("input")
+    val df = sc.parallelize(1 to datasetSize).map(_.toDouble).toDF("input")

     val discretizer = new QuantileDiscretizer()
       .setInputCol("input")
       .setOutputCol("result")
       .setNumBuckets(numBuckets)
     val model = discretizer.fit(df)
-    testTransformerByGlobalCheckFunc[(Double)](df, model, "result") { rows =>
-      val result = rows.map { r => Tuple1(r.getDouble(0)) }.toDF("result")
-      val observedNumBuckets = result.select("result").distinct.count
-      assert(observedNumBuckets === numBuckets,
-        "Observed number of buckets does not equal expected number of buckets.")
-      val relativeError = discretizer.getRelativeError
-      val numGoodBuckets = result.groupBy("result").count
-        .filter(s"abs(count - ${datasetSize / numBuckets}) <= ${relativeError * datasetSize}").count
-      assert(numGoodBuckets === numBuckets,
-        "Bucket sizes are not within expected relative error tolerance.")
+    testTransformerByGlobalCheckFunc[Double](df, model, "result") { rows =>
+      val result = rows.map(_.getDouble(0)).toDF("result").cache()
+      try {
+        val observedNumBuckets = result.select("result").distinct().count()
+        assert(observedNumBuckets === numBuckets,
+          "Observed number of buckets does not equal expected number of buckets.")
+        val relativeError = discretizer.getRelativeError
+        val numGoodBuckets = result.groupBy("result").count()
+          .filter(s"abs(count - ${datasetSize / numBuckets}) <= ${relativeError * datasetSize}")
+          .count()
+        assert(numGoodBuckets === numBuckets,
+          "Bucket sizes are not within expected relative error tolerance.")
+      } finally {
+        result.unpersist()
+      }
     }
   }

@@ -162,10 +167,10 @@ class QuantileDiscretizerSuite extends MLTest with DefaultReadWriteTest {
     val spark = this.spark
     import spark.implicits._

-    val datasetSize = 100000
+    val datasetSize = 30000
     val numBuckets = 5
-    val data1 = Array.range(1, 100001, 1).map(_.toDouble)
-    val data2 = Array.range(1, 200000, 2).map(_.toDouble)
+    val data1 = Array.range(1, datasetSize + 1, 1).map(_.toDouble)
+    val data2 = Array.range(1, 2 * datasetSize, 2).map(_.toDouble)
     val df = data1.zip(data2).toSeq.toDF("input1", "input2")

     val discretizer = new QuantileDiscretizer()
@@ -175,20 +180,24 @@ class QuantileDiscretizerSuite extends MLTest with DefaultReadWriteTest {
     val model = discretizer.fit(df)
     testTransformerByGlobalCheckFunc[(Double, Double)](df, model, "result1", "result2") { rows =>
       val result =
-        rows.map { r => Tuple2(r.getDouble(0), r.getDouble(1)) }.toDF("result1", "result2")
-      val relativeError = discretizer.getRelativeError
-      for (i <- 1 to 2) {
-        val observedNumBuckets = result.select("result" + i).distinct.count
-        assert(observedNumBuckets === numBuckets,
-          "Observed number of buckets does not equal expected number of buckets.")
-
-        val numGoodBuckets = result
-          .groupBy("result" + i)
-          .count
-          .filter(s"abs(count - ${datasetSize / numBuckets}) <= ${relativeError * datasetSize}")
-          .count
-        assert(numGoodBuckets === numBuckets,
-          "Bucket sizes are not within expected relative error tolerance.")
+        rows.map(r => (r.getDouble(0), r.getDouble(1))).toDF("result1", "result2").cache()
+      try {
+        val relativeError = discretizer.getRelativeError
+        for (i <- 1 to 2) {
+          val observedNumBuckets = result.select("result" + i).distinct().count()
+          assert(observedNumBuckets === numBuckets,
+            "Observed number of buckets does not equal expected number of buckets.")
+
+          val numGoodBuckets = result
+            .groupBy("result" + i)
+            .count()
+            .filter(s"abs(count - ${datasetSize / numBuckets}) <= ${relativeError * datasetSize}")
+            .count()
+          assert(numGoodBuckets === numBuckets,
+            "Bucket sizes are not within expected relative error tolerance.")
+        }
+      } finally {
+        result.unpersist()
       }
     }
   }
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]