Github user srowen commented on a diff in the pull request:

    https://github.com/apache/spark/pull/19340#discussion_r142001986
  
    --- Diff: mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---
    @@ -546,10 +574,88 @@ object KMeans {
           .run(data)
       }
     
    +  private[spark] def validateInitMode(initMode: String): Boolean = {
    +    initMode match {
    +      case KMeans.RANDOM => true
    +      case KMeans.K_MEANS_PARALLEL => true
    +      case _ => false
    +    }
    +  }
    +  private[spark] def validateDistanceMeasure(distanceMeasure: String): 
Boolean = {
    +    distanceMeasure match {
    +      case DistanceMeasure.EUCLIDEAN => true
    +      case DistanceMeasure.COSINE => true
    +      case _ => false
    +    }
    +  }
    +}
    +
    +/**
    + * A vector with its norm for fast distance computation.
    + *
    + * @see [[org.apache.spark.mllib.clustering.KMeans#fastSquaredDistance]]
    + */
    +private[clustering]
    +class VectorWithNorm(val vector: Vector, val norm: Double) extends 
Serializable {
    +
    +  def this(vector: Vector) = this(vector, Vectors.norm(vector, 2.0))
    +
    +  def this(array: Array[Double]) = this(Vectors.dense(array))
    +
    +  /** Converts the vector to a dense vector. */
    +  def toDense: VectorWithNorm = new 
VectorWithNorm(Vectors.dense(vector.toArray), norm)
    +}
    +
    +
    +private[spark] abstract class DistanceMeasure extends Serializable {
    +
    +  /**
    +   * Returns the index of the closest center to the given point, as well as the squared distance.
    +   */
    +  def findClosest(
    +     centers: TraversableOnce[VectorWithNorm],
    +     point: VectorWithNorm): (Int, Double)
    +
    +  /**
    +   * Returns the K-means cost of a given point against the given cluster centers.
    +   */
    +  def pointCost(
    +      centers: TraversableOnce[VectorWithNorm],
    +      point: VectorWithNorm): Double =
    +    findClosest(centers, point)._2
    +
    +  /**
    +   * Returns whether a center converged or not, given the epsilon parameter.
    +   */
    +  def isCenterConverged(
    --- End diff --
    
    Likewise this always seems to be "distance < epsilon"; does it ever vary?


---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to