Github user BryanCutler commented on a diff in the pull request: https://github.com/apache/spark/pull/20520#discussion_r167048075 --- Diff: python/pyspark/ml/tests.py --- @@ -1620,6 +1621,23 @@ def test_kmeans_summary(self): self.assertEqual(s.k, 2) +class KMeansTests(SparkSessionTestCase): + + def test_kmeans_cosine_distance(self): + data = [(Vectors.dense([1.0, 1.0]),), (Vectors.dense([10.0, 10.0]),), + (Vectors.dense([1.0, 0.5]),), (Vectors.dense([10.0, 4.4]),), + (Vectors.dense([-1.0, 1.0]),), (Vectors.dense([-100.0, 90.0]),)] + df = self.spark.createDataFrame(data, ["features"]) + kmeans = KMeans(k=3, seed=1) + kmeans.setDistanceMeasure("cosine") + model = kmeans.fit(df) + result = model.transform(df).rdd.collectAsMap() + self.assertTrue(result[Vectors.dense([1.0, 1.0])] == result[Vectors.dense([10.0, 10.0])]) --- End diff -- It's a little awkward to use `collectAsMap` and compare like this. Why not just do a regular `collect` and compare with `data` above?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org