Github user mgaido91 commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19340#discussion_r140861746
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---
    @@ -546,10 +574,88 @@ object KMeans {
           .run(data)
       }
     
    +  private[spark] def validateInitMode(initMode: String): Boolean = {
    +    initMode match {
    +      case KMeans.RANDOM => true
    +      case KMeans.K_MEANS_PARALLEL => true
    +      case _ => false
    +    }
    +  }
    +  private[spark] def validateDistanceMeasure(distanceMeasure: String): 
Boolean = {
    +    distanceMeasure match {
    +      case DistanceSuite.EUCLIDEAN => true
    +      case DistanceSuite.COSINE => true
    +      case _ => false
    +    }
    +  }
    +}
    +
    +/**
    + * A vector with its norm for fast distance computation.
    + *
    + * @see [[org.apache.spark.mllib.clustering.KMeans#fastSquaredDistance]]
    + */
    +private[clustering]
    +class VectorWithNorm(val vector: Vector, val norm: Double) extends 
Serializable {
    +
    +  def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0))
    +
    +  def this(array: Array[Double]) = this(Vectors.dense(array))
    +
    +  /** Converts the vector to a dense vector. */
    +  def toDense: VectorWithNorm = new 
VectorWithNorm(Vectors.dense(vector.toArray), norm)
    +}
    +
    +
    +private[spark] abstract class DistanceSuite extends Serializable {
    +
    +  /**
    +   * Returns the index of the closest center to the given point, as well 
as the squared distance.
    +   */
    +  def findClosest(
    +     centers: TraversableOnce[VectorWithNorm],
    +     point: VectorWithNorm): (Int, Double)
    +
    +  /**
    +   * Returns the K-means cost of a given point against the given cluster 
centers.
    +   */
    +  def pointCost(
    +      centers: TraversableOnce[VectorWithNorm],
    +      point: VectorWithNorm): Double =
    +    findClosest(centers, point)._2
    +
    +  /**
    +   * Returns whether a center converged or not, given the epsilon 
parameter.
    +   */
    +  def isCenterConverged(
    +      oldCenter: VectorWithNorm,
    +      newCenter: VectorWithNorm,
    +      epsilon: Double): Boolean
    +
    +}
    +
    +@Since("2.3.0")
    +object DistanceSuite {
    --- End diff --
    
    About the name: if you have a better suggestion, I'd be happy to change 
it. Maybe `DistanceMeasure`?
    This is not internal because it contains the definitions of the two 
constants, which users might need in order to set the right distance measure.


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to