This is an automated email from the ASF dual-hosted git repository. huaxingao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 789fce8 [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans 789fce8 is described below commit 789fce8c8b200eba5f94c2d83b4b83e3bfb9a2b1 Author: Ruifeng Zheng <ruife...@foxmail.com> AuthorDate: Wed Jan 19 09:17:25 2022 -0800 [SPARK-37959][ML] Fix the UT of checking norm in KMeans & BiKMeans ### What changes were proposed in this pull request? In `KMeansSuite` and `BisectingKMeansSuite`, there are some unused lines: ``` model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0 ``` For cosine distance, the norm of centering vector should be 1, so the norm checking is meaningful; For euclidean distance, the norm checking is meaningless; ### Why are the changes needed? to enable norm checking for cosine distance, and disable it for euclidean distance ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? updated testsuites Closes #35247 from zhengruifeng/fix_kmeans_ut. Authored-by: Ruifeng Zheng <ruife...@foxmail.com> Signed-off-by: huaxingao <huaxin.ga...@gmail.com> --- .../apache/spark/ml/clustering/BisectingKMeansSuite.scala | 10 +++------- .../scala/org/apache/spark/ml/clustering/KMeansSuite.scala | 14 +++----------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala index 04b20d1..fb6110d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala @@ -186,7 +186,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { assert(predictionsMap(Vectors.dense(-1.0, 1.0)) == predictionsMap(Vectors.dense(-100.0, 90.0))) - model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) + assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6)) } 
test("Comparing with and without weightCol with cosine distance") { @@ -217,7 +217,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { assert(predictionsMap1(Vectors.dense(-1.0, 1.0)) == predictionsMap1(Vectors.dense(-100.0, 90.0))) - model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) + assert(model1.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6)) val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq( (Vectors.dense(1.0, 1.0), 2.0), (Vectors.dense(10.0, 10.0), 2.0), @@ -244,7 +244,7 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { assert(predictionsMap2(Vectors.dense(-1.0, 1.0)) == predictionsMap2(Vectors.dense(-100.0, 90.0))) - model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) + assert(model2.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6)) assert(model1.clusterCenters === model2.clusterCenters) } @@ -284,8 +284,6 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { assert(predictionsMap1(Vectors.dense(10.0, 10.0)) == predictionsMap1(Vectors.dense(10.0, 4.4))) - model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) - val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq( (Vectors.dense(1.0, 1.0), 1.0), (Vectors.dense(10.0, 10.0), 2.0), (Vectors.dense(1.0, 0.5), 2.0), (Vectors.dense(10.0, 4.4), 3.0), @@ -310,8 +308,6 @@ class BisectingKMeansSuite extends MLTest with DefaultReadWriteTest { assert(predictionsMap2(Vectors.dense(10.0, 10.0)) == predictionsMap2(Vectors.dense(10.0, 4.4))) - model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) - assert(model1.clusterCenters(0) === model2.clusterCenters(0)) assert(model1.clusterCenters(1) === model2.clusterCenters(1)) assert(model1.clusterCenters(2) ~== model2.clusterCenters(2) absTol 1e-6) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index 61f4359..7d2a0b8 100644 --- 
a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -186,7 +186,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(predictionsMap(Vectors.dense(-1.0, 1.0)) == predictionsMap(Vectors.dense(-100.0, 90.0))) - model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) + assert(model.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6)) } test("KMeans with cosine distance is not supported for 0-length vectors") { @@ -283,7 +283,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(predictionsMap1(Vectors.dense(-1.0, 1.0)) == predictionsMap1(Vectors.dense(-100.0, 90.0))) - model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) + assert(model1.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6)) val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq( (Vectors.dense(1.0, 1.0), 1.0), @@ -313,7 +313,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(predictionsMap2(Vectors.dense(-1.0, 1.0)) == predictionsMap2(Vectors.dense(-100.0, 90.0))) - model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) + assert(model2.clusterCenters.forall(Vectors.norm(_, 2) ~== 1.0 absTol 1e-6)) // compare if model1 and model2 have the same cluster centers assert(model1.clusterCenters.length === model2.clusterCenters.length) @@ -350,8 +350,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(predictionsMap1(Vectors.dense(9.0, 0.2)) == predictionsMap1(Vectors.dense(9.2, 0.0))) - model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) - // center 1: // total weights in cluster 1: 2.0 + 2.0 + 2.0 = 6.0 // x: 9.0 * (2.0/6.0) + 9.0 * (2.0/6.0) + 9.2 * (2.0/6.0) = 9.066666666666666 @@ -394,8 +392,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes 
assert(predictionsMap2(Vectors.dense(9.0, 0.2)) == predictionsMap2(Vectors.dense(9.2, 0.0))) - model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) - // center 1: // total weights in cluster 1: 2.5 + 1.0 + 2.0 = 5.5 // x: 9.0 * (2.5/5.5) + 9.0 * (1.0/5.5) + 9.2 * (2.0/5.5) = 9.072727272727272 @@ -441,8 +437,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(predictionsMap1(Vectors.dense(-6.0, -6.0)) == predictionsMap1(Vectors.dense(-10.0, -10.0))) - model1.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) - // use same weight, should have the same result as no weight val df2 = spark.createDataFrame(spark.sparkContext.parallelize(Seq( (Vectors.dense(0.1, 0.1), 2.0), @@ -474,8 +468,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(predictionsMap2(Vectors.dense(-6.0, -6.0)) == predictionsMap2(Vectors.dense(-10.0, -10.0))) - model2.clusterCenters.forall(Vectors.norm(_, 2) == 1.0) - assert(model1.clusterCenters === model2.clusterCenters) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org