Github user mgaido91 commented on a diff in the pull request:
https://github.com/apache/spark/pull/19340#discussion_r140861746
--- Diff:
mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---
@@ -546,10 +574,88 @@ object KMeans {
.run(data)
}
+ private[spark] def validateInitMode(initMode: String): Boolean = {
+ initMode match {
+ case KMeans.RANDOM => true
+ case KMeans.K_MEANS_PARALLEL => true
+ case _ => false
+ }
+ }
+ private[spark] def validateDistanceMeasure(distanceMeasure: String):
Boolean = {
+ distanceMeasure match {
+ case DistanceSuite.EUCLIDEAN => true
+ case DistanceSuite.COSINE => true
+ case _ => false
+ }
+ }
+}
+
+/**
+ * A vector with its norm for fast distance computation.
+ *
+ * @see [[org.apache.spark.mllib.clustering.KMeans#fastSquaredDistance]]
+ */
+private[clustering]
+class VectorWithNorm(val vector: Vector, val norm: Double) extends
Serializable {
+
+ def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0))
+
+ def this(array: Array[Double]) = this(Vectors.dense(array))
+
+ /** Converts the vector to a dense vector. */
+ def toDense: VectorWithNorm = new
VectorWithNorm(Vectors.dense(vector.toArray), norm)
+}
+
+
+private[spark] abstract class DistanceSuite extends Serializable {
+
+ /**
+ * Returns the index of the closest center to the given point, as well
as the squared distance.
+ */
+ def findClosest(
+ centers: TraversableOnce[VectorWithNorm],
+ point: VectorWithNorm): (Int, Double)
+
+ /**
+ * Returns the K-means cost of a given point against the given cluster
centers.
+ */
+ def pointCost(
+ centers: TraversableOnce[VectorWithNorm],
+ point: VectorWithNorm): Double =
+ findClosest(centers, point)._2
+
+ /**
+ * Returns whether a center converged or not, given the epsilon
parameter.
+ */
+ def isCenterConverged(
+ oldCenter: VectorWithNorm,
+ newCenter: VectorWithNorm,
+ epsilon: Double): Boolean
+
+}
+
+@Since("2.3.0")
+object DistanceSuite {
--- End diff --
About the name: if you have a better suggestion, I'd be happy to change
it. Maybe `DistanceMeasure`?
This object is not internal because it contains the definitions of the two
constants, which users might need in order to set the right distance measure.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]