[GitHub] spark pull request #15342: [SPARK-11560] [MLLIB] Optimize KMeans implementat...

srowen Mon, 10 Oct 2016 14:01:48 -0700

Github user srowen commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15342#discussion_r82682188
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---
    @@ -258,149 +252,106 @@ class KMeans private (
             }
         }
         val initTimeInSeconds = (System.nanoTime() - initStartTime) / 1e9
    -    logInfo(s"Initialization with $initializationMode took " + 
"%.3f".format(initTimeInSeconds) +
    -      " seconds.")
    +    logInfo(f"Initialization with $initializationMode took 
$initTimeInSeconds%.3f seconds.")
     
    -    val active = Array.fill(numRuns)(true)
    -    val costs = Array.fill(numRuns)(0.0)
    -
    -    var activeRuns = new ArrayBuffer[Int] ++ (0 until numRuns)
    +    var converged = false
    +    var cost = 0.0
         var iteration = 0
     
         val iterationStartTime = System.nanoTime()
     
    -    instr.foreach(_.logNumFeatures(centers(0)(0).vector.size))
    +    instr.foreach(_.logNumFeatures(centers.head.vector.size))
     
    -    // Execute iterations of Lloyd's algorithm until all runs have 
converged
    -    while (iteration < maxIterations && !activeRuns.isEmpty) {
    -      type WeightedPoint = (Vector, Long)
    -      def mergeContribs(x: WeightedPoint, y: WeightedPoint): WeightedPoint 
= {
    -        axpy(1.0, x._1, y._1)
    -        (y._1, x._2 + y._2)
    -      }
    -
    -      val activeCenters = activeRuns.map(r => centers(r)).toArray
    -      val costAccums = activeRuns.map(_ => sc.doubleAccumulator)
    -
    -      val bcActiveCenters = sc.broadcast(activeCenters)
    +    // Execute iterations of Lloyd's algorithm until converged
    +    while (iteration < maxIterations && !converged) {
    +      val costAccum = sc.doubleAccumulator
    +      val bcCenters = sc.broadcast(centers)
     
           // Find the sum and count of points mapping to each center
           val totalContribs = data.mapPartitions { points =>
    -        val thisActiveCenters = bcActiveCenters.value
    -        val runs = thisActiveCenters.length
    -        val k = thisActiveCenters(0).length
    -        val dims = thisActiveCenters(0)(0).vector.size
    +        val thisCenters = bcCenters.value
    +        val dims = thisCenters.head.vector.size
     
    -        val sums = Array.fill(runs, k)(Vectors.zeros(dims))
    -        val counts = Array.fill(runs, k)(0L)
    +        val sums = Array.fill(thisCenters.length)(Vectors.zeros(dims))
    +        val counts = Array.fill(thisCenters.length)(0L)
     
             points.foreach { point =>
    -          (0 until runs).foreach { i =>
    -            val (bestCenter, cost) = 
KMeans.findClosest(thisActiveCenters(i), point)
    -            costAccums(i).add(cost)
    -            val sum = sums(i)(bestCenter)
    -            axpy(1.0, point.vector, sum)
    -            counts(i)(bestCenter) += 1
    -          }
    +          val (bestCenter, cost) = KMeans.findClosest(thisCenters, point)
    +          costAccum.add(cost)
    +          val sum = sums(bestCenter)
    +          axpy(1.0, point.vector, sum)
    +          counts(bestCenter) += 1
             }
     
    -        val contribs = for (i <- 0 until runs; j <- 0 until k) yield {
    -          ((i, j), (sums(i)(j), counts(i)(j)))
    -        }
    -        contribs.iterator
    -      }.reduceByKey(mergeContribs).collectAsMap()
    -
    -      bcActiveCenters.destroy(blocking = false)
    -
    -      // Update the cluster centers and costs for each active run
    -      for ((run, i) <- activeRuns.zipWithIndex) {
    -        var changed = false
    -        var j = 0
    -        while (j < k) {
    -          val (sum, count) = totalContribs((i, j))
    -          if (count != 0) {
    -            scal(1.0 / count, sum)
    -            val newCenter = new VectorWithNorm(sum)
    -            if (KMeans.fastSquaredDistance(newCenter, centers(run)(j)) > 
epsilon * epsilon) {
    -              changed = true
    -            }
    -            centers(run)(j) = newCenter
    -          }
    -          j += 1
    -        }
    -        if (!changed) {
    -          active(run) = false
    -          logInfo("Run " + run + " finished in " + (iteration + 1) + " 
iterations")
    +        counts.indices.filter(counts(_) > 0).map(j => (j, (sums(j), 
counts(j)))).iterator
    +      }.reduceByKey { case ((sum1, count1), (sum2, count2)) =>
    +        axpy(1.0, sum2, sum1)
    +        (sum1, count1 + count2)
    +      }.collectAsMap()
    +
    +      bcCenters.destroy(blocking = false)
    +
    +      // Update the cluster centers and costs
    +      converged = true
    --- End diff --
    
    I don't think it can be done the way you're suggesting; it's not just a 
matter of preference. Usually you could set it with a simple `.forall` call, 
as you're suggesting, but `.forall` stops at the first element that fails the 
predicate, and here we also need the side effect of visiting every element. 
To do both, I think we have to 'unroll' the equivalent logic, and it amounts 
to this.



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes to enable it, or if the feature is enabled but not working,
please contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] spark pull request #15342: [SPARK-11560] [MLLIB] Optimize KMeans implementat...

Reply via email to