Repository: spark Updated Branches: refs/heads/master 6175d6cfe -> d31c618e3
[SPARK-7368] [MLLIB] Add QR decomposition for RowMatrix jira: https://issues.apache.org/jira/browse/SPARK-7368 Add QR decomposition for RowMatrix. I'm not sure what's the blueprint about the distributed Matrix from community and whether this will be a desirable feature , so I sent a prototype for discussion. I'll go on polish the code and provide ut and performance statistics if it's acceptable. The implementation refers to the [paper: https://www.cs.purdue.edu/homes/dgleich/publications/Benson%202013%20-%20direct-tsqr.pdf] Austin R. Benson, David F. Gleich, James Demmel. "Direct QR factorizations for tall-and-skinny matrices in MapReduce architectures", 2013 IEEE International Conference on Big Data, which is a stable algorithm with good scalability. Currently I tried it on a 400000 * 500 rowMatrix (16 partitions) and it can bring down the computation time from 8.8 mins (using breeze.linalg.qr.reduced) to 2.6 mins on a 4 worker cluster. I think there will still be some room for performance improvement. Any trial and suggestion is welcome. Author: Yuhao Yang <[email protected]> Closes #5909 from hhbyyh/qrDecomposition and squashes the following commits: cec797b [Yuhao Yang] remove unnecessary qr 0fb1012 [Yuhao Yang] hierarchy R computing 3fbdb61 [Yuhao Yang] update qr to indirect and add ut 0d913d3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition 39213c3 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition c0fc0c7 [Yuhao Yang] Merge remote-tracking branch 'upstream/master' into qrDecomposition 39b0b22 [Yuhao Yang] initial draft for discussion Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d31c618e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d31c618e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d31c618e Branch: refs/heads/master Commit: d31c618e3c8838f8198556876b9dcbbbf835f7b2 Parents: 6175d6c Author: Yuhao Yang <[email protected]> Authored: Thu Jul 30 07:49:10 2015 -0700 Committer: Xiangrui Meng <[email protected]> Committed: Thu Jul 30 07:49:10 2015 -0700 ---------------------------------------------------------------------- .../linalg/SingularValueDecomposition.scala | 8 ++++ .../mllib/linalg/distributed/RowMatrix.scala | 46 +++++++++++++++++++- .../linalg/distributed/RowMatrixSuite.scala | 17 ++++++++ 3 files changed, 70 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/d31c618e/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala index 9669c36..b416d50 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/SingularValueDecomposition.scala @@ -25,3 +25,11 @@ import org.apache.spark.annotation.Experimental */ @Experimental case class SingularValueDecomposition[UType, VType](U: UType, s: Vector, V: VType) + +/** + * :: Experimental :: + * Represents QR factors. + */ +@Experimental +case class QRDecomposition[UType, VType](Q: UType, R: VType) + http://git-wip-us.apache.org/repos/asf/spark/blob/d31c618e/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 1626da9..bfc90c9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -22,7 +22,7 @@ import java.util.Arrays import scala.collection.mutable.ListBuffer import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, - svd => brzSvd} + svd => brzSvd, MatrixSingularException, inv} import breeze.numerics.{sqrt => brzSqrt} import com.github.fommil.netlib.BLAS.{getInstance => blas} @@ -498,6 +498,50 @@ class RowMatrix( } /** + * Compute QR decomposition for [[RowMatrix]]. The implementation is designed to optimize the QR + * decomposition (factorization) for the [[RowMatrix]] of a tall and skinny shape. + * Reference: + * Paul G. Constantine, David F. Gleich. "Tall and skinny QR factorizations in MapReduce + * architectures" ([[http://dx.doi.org/10.1145/1996092.1996103]]) + * + * @param computeQ whether to computeQ + * @return QRDecomposition(Q, R), Q = null if computeQ = false. + */ + def tallSkinnyQR(computeQ: Boolean = false): QRDecomposition[RowMatrix, Matrix] = { + val col = numCols().toInt + // split rows horizontally into smaller matrices, and compute QR for each of them + val blockQRs = rows.glom().map { partRows => + val bdm = BDM.zeros[Double](partRows.length, col) + var i = 0 + partRows.foreach { row => + bdm(i, ::) := row.toBreeze.t + i += 1 + } + breeze.linalg.qr.reduced(bdm).r + } + + // combine the R part from previous results vertically into a tall matrix + val combinedR = blockQRs.treeReduce{ (r1, r2) => + val stackedR = BDM.vertcat(r1, r2) + breeze.linalg.qr.reduced(stackedR).r + } + val finalR = Matrices.fromBreeze(combinedR.toDenseMatrix) + val finalQ = if (computeQ) { + try { + val invR = inv(combinedR) + this.multiply(Matrices.fromBreeze(invR)) + } catch { + case err: MatrixSingularException => + logWarning("R is not invertible and return Q as null") + null + } + } else { + null + } + QRDecomposition(finalQ, finalR) + } + + /** * Find all similar columns using the DIMSUM sampling algorithm, described in two papers * * http://arxiv.org/abs/1206.2082 http://git-wip-us.apache.org/repos/asf/spark/blob/d31c618e/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index b6cb53d..283ffec 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.mllib.linalg.distributed import scala.util.Random +import breeze.numerics.abs import breeze.linalg.{DenseVector => BDV, DenseMatrix => BDM, norm => brzNorm, svd => brzSvd} import org.apache.spark.SparkFunSuite @@ -238,6 +239,22 @@ class RowMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { } } } + + test("QR Decomposition") { + for (mat <- Seq(denseMat, sparseMat)) { + val result = mat.tallSkinnyQR(true) + val expected = breeze.linalg.qr.reduced(mat.toBreeze()) + val calcQ = result.Q + val calcR = result.R + assert(closeToZero(abs(expected.q) - abs(calcQ.toBreeze()))) + assert(closeToZero(abs(expected.r) - abs(calcR.toBreeze.asInstanceOf[BDM[Double]]))) + assert(closeToZero(calcQ.multiply(calcR).toBreeze - mat.toBreeze())) + // Decomposition without computing Q + val rOnly = mat.tallSkinnyQR(computeQ = false) + assert(rOnly.Q == null) + assert(closeToZero(abs(expected.r) - abs(rOnly.R.toBreeze.asInstanceOf[BDM[Double]]))) + } + } } class RowMatrixClusterSuite extends SparkFunSuite with LocalClusterSparkContext { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
