Github user mengxr commented on a diff in the pull request:
https://github.com/apache/spark/pull/21195#discussion_r185983646
--- Diff:
mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
---
@@ -182,6 +184,40 @@ class BisectingKMeansSuite
model.clusterCenters.forall(Vectors.norm(_, 2) == 1.0)
}
+
+ test("BisectingKMeans with Array input") {
+ val featuresColNameD = "array_double_features"
+ val featuresColNameF = "array_float_features"
+ val doubleUDF = udf { (features: Vector) =>
+ val featureArray = Array.fill[Double](features.size)(0.0)
+ features.foreachActive((idx, value) => featureArray(idx) =
value.toFloat)
+ featureArray
+ }
+ val floatUDF = udf { (features: Vector) =>
+ val featureArray = Array.fill[Float](features.size)(0.0f)
+ features.foreachActive((idx, value) => featureArray(idx) =
value.toFloat)
+ featureArray
+ }
+ val newdatasetD = dataset.withColumn(featuresColNameD,
doubleUDF(col("features")))
+ .drop("features")
--- End diff --
* It is unnecessary to drop `features`. Alternatively, you can simply replace
the `features` column in place:
~~~scala
val newdatasetD = dataset.withColumn(FEATURES, doubleUDF(col(FEATURES)))
~~~
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]