Repository: spark Updated Branches: refs/heads/master c402a4a68 -> b8d258003
[MLLIB] set RDD names in ALS This is very useful when debugging & fine tuning jobs with large data sets. Author: Neville Li <[email protected]> Closes #966 from nevillelyh/master and squashes the following commits: 6747764 [Neville Li] [MLLIB] use string interpolation for RDD names 3b15d34 [Neville Li] [MLLIB] set RDD names in ALS Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8d25800 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8d25800 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8d25800 Branch: refs/heads/master Commit: b8d25800393d0208a76813bcd94509ac24a3add5 Parents: c402a4a Author: Neville Li <[email protected]> Authored: Wed Jun 4 01:51:34 2014 -0700 Committer: Xiangrui Meng <[email protected]> Committed: Wed Jun 4 01:51:34 2014 -0700 ---------------------------------------------------------------------- .../org/apache/spark/mllib/recommendation/ALS.scala | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b8d25800/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index cfc3b68..d743bd7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -201,6 +201,10 @@ class ALS private ( val (userInLinks, userOutLinks) = makeLinkRDDs(numBlocks, ratingsByUserBlock, partitioner) val (productInLinks, productOutLinks) = makeLinkRDDs(numBlocks, ratingsByProductBlock, partitioner) + userInLinks.setName("userInLinks") + userOutLinks.setName("userOutLinks") + productInLinks.setName("productInLinks") + productOutLinks.setName("productOutLinks") // Initialize user and product factors randomly, but use a deterministic seed for each // partition so that fault recovery works @@ -225,14 +229,14 @@ class ALS private ( // perform ALS update logInfo("Re-computing I given U (Iteration %d/%d)".format(iter, iterations)) // Persist users because it will be called twice. - users.persist() + users.setName(s"users-$iter").persist() val YtY = Some(sc.broadcast(computeYtY(users))) val previousProducts = products products = updateFeatures(users, userOutLinks, productInLinks, partitioner, rank, lambda, alpha, YtY) previousProducts.unpersist() logInfo("Re-computing U given I (Iteration %d/%d)".format(iter, iterations)) - products.persist() + products.setName(s"products-$iter").persist() val XtX = Some(sc.broadcast(computeYtY(products))) val previousUsers = users users = updateFeatures(products, productOutLinks, userInLinks, partitioner, rank, lambda, @@ -245,22 +249,24 @@ class ALS private ( logInfo("Re-computing I given U (Iteration %d/%d)".format(iter, iterations)) products = updateFeatures(users, userOutLinks, productInLinks, partitioner, rank, lambda, alpha, YtY = None) + products.setName(s"products-$iter") logInfo("Re-computing U given I (Iteration %d/%d)".format(iter, iterations)) users = updateFeatures(products, productOutLinks, userInLinks, partitioner, rank, lambda, alpha, YtY = None) + users.setName(s"users-$iter") } } // The last `products` will be used twice. One to generate the last `users` and the other to // generate `productsOut`. So we cache it for better performance. - products.persist() + products.setName("products").persist() // Flatten and cache the two final RDDs to un-block them val usersOut = unblockFactors(users, userOutLinks) val productsOut = unblockFactors(products, productOutLinks) - usersOut.persist() - productsOut.persist() + usersOut.setName("usersOut").persist() + productsOut.setName("productsOut").persist() // Materialize usersOut and productsOut. usersOut.count()
