Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/21195#discussion_r185984500
--- Diff:
mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
---
@@ -182,6 +184,40 @@ class BisectingKMeansSuite
model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
}
+
+ test("BisectingKMeans with Array input") {
+ val featuresColNameD = "array_double_features"
+ val featuresColNameF = "array_float_features"
+ val doubleUDF = udf { (features: Vector) =>
+ val featureArray = Array.fill[Double](features.size)(0.0)
+ features.foreachActive((idx, value) => featureArray(idx) =
value.toFloat)
+ featureArray
+ }
+ val floatUDF = udf { (features: Vector) =>
+ val featureArray = Array.fill[Float](features.size)(0.0f)
+ features.foreachActive((idx, value) => featureArray(idx) =
value.toFloat)
+ featureArray
+ }
+ val newdatasetD = dataset.withColumn(featuresColNameD,
doubleUDF(col("features")))
+ .drop("features")
+ val newdatasetF = dataset.withColumn(featuresColNameF,
floatUDF(col("features")))
+ .drop("features")
+ assert(newdatasetD.schema(featuresColNameD).dataType.equals(new
ArrayType(DoubleType, false)))
+ assert(newdatasetF.schema(featuresColNameF).dataType.equals(new
ArrayType(FloatType, false)))
+
+ val bkmD = new BisectingKMeans()
+ .setK(k).setMaxIter(1).setFeaturesCol(featuresColNameD).setSeed(1)
+ val bkmF = new BisectingKMeans()
+ .setK(k).setMaxIter(1).setFeaturesCol(featuresColNameF).setSeed(1)
+ val modelD = bkmD.fit(newdatasetD)
+ val modelF = bkmF.fit(newdatasetF)
+ val transformedD = modelD.transform(newdatasetD)
+ val transformedF = modelF.transform(newdatasetF)
+ val predictDifference = transformedD.select("prediction")
+ .except(transformedF.select("prediction"))
+ assert(predictDifference.count() == 0)
--- End diff --
This only verifies that it handles `Array[Double]` and `Array[Float]` the
same way; it doesn't guarantee that the result is correct. We can define a
method that takes a dataset, applies one iteration, and returns the cost.
~~~scala
def trainAndComputeCost(dataset: DataFrame): Double = {
val model = new BisectingKMeans()
.setK(k).setMaxIter(1).setSeed(1)
.fit(dataset)
model.computeCost(dataset)
}
val trueCost = trainAndComputeCost(dataset)
val floatArrayCost = trainAndComputeCost(newDatasetF)
assert(floatArrayCost === trueCost)
val doubleArrayCost = trainAndComputeCost(newDatasetD)
assert(doubleArrayCost === trueCost)
~~~
We can map the original dataset to single precision to get an exact match,
or we can test equality within a threshold. See
https://github.com/apache/spark/blob/master/mllib/src/test/scala/org/apache/spark/mllib/util/TestingUtils.scala
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]