Author: dlyubimov
Date: Thu Jul 25 23:31:43 2013
New Revision: 1507154

URL: http://svn.apache.org/r1507154
Log:
MAHOUT-1281: Wean distributed SSVD from apache-math dependency, use updated 
Mahout's native eigen solver.

Squashed commit of the following:

commit 7ed5a9af8e01275cccac8a306ca46b41acaa8008
Merge: 808ea77 e211f3b
Author: Dmitriy Lyubimov <[email protected]>
Date:   Thu Jul 25 16:29:14 2013 -0700

    Merge branch 'trunk' into MAHOUT-1281

    Conflicts:
        
core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
        
core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java

commit 808ea77b1f95b10ed56ae086f39e708724670672
Author: Dmitriy Lyubimov <[email protected]>
Date:   Wed Jul 17 23:35:49 2013 -0700

    Illegal like()

commit 7f54f0b0369329b8933b0dcc441117ad0b43675b
Author: Dmitriy Lyubimov <[email protected]>
Date:   Wed Jul 10 13:25:04 2013 -0700

    Fixing Lanczos for the correct order coming out of eigen dec

commit 12d0861ba4e998b45818410d988d89573d5ab996
Author: Dmitriy Lyubimov <[email protected]>
Date:   Wed Jul 10 12:55:09 2013 -0700

    Switching distributed SSVD to use EigenDecomposition. Reversing sort 
procedure for EigenDecomposition.

commit 142ada1dd8e8dae07e340a177e137471e738a7f9
Author: Dmitriy Lyubimov <[email protected]>
Date:   Wed Jul 10 12:54:46 2013 -0700

    Bug fixes in constructor-by-vector

commit ef11cfa02727fb29b2533c0848734809f77f8a3e
Author: Dmitriy Lyubimov <[email protected]>
Date:   Wed Jul 10 11:22:06 2013 -0700

    Switching SSVD uses to UpperTriangular.

commit 3e73a8cd7ba32cb8696d76b93ec287540c710f68
Author: Dmitriy Lyubimov <[email protected]>
Date:   Wed Jul 10 10:55:11 2013 -0700

    Adding test for dense symmetric matrix asserting Eigen decomposition 
equivalent to that over a dense matrix.

commit 6fc530b75215c5ad1c0b5561ff3af724c6e48c6b
Author: Dmitriy Lyubimov <[email protected]>
Date:   Tue Jul 9 18:30:59 2013 -0700

    Moving UpperTriangular matrix to mahout.math; adding DenseSymmetric matrix.

Removed:
    
mahout/trunk/math/src/main/java/org/apache/mahout/math/ssvd/EigenSolverWrapper.java
Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
    
mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
    
mahout/trunk/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
    
mahout/trunk/math/src/test/java/org/apache/mahout/math/decomposer/lanczos/TestLanczosSolver.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java?rev=1507154&r1=1507153&r2=1507154&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDHelper.java
 Thu Jul 25 23:31:43 2013
@@ -35,11 +35,8 @@ import org.apache.mahout.common.iterator
 import org.apache.mahout.common.iterator.sequencefile.PathType;
 import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator;
 import 
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
-import org.apache.mahout.math.DenseVector;
-import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.*;
 import org.apache.mahout.math.UpperTriangular;
-import org.apache.mahout.math.Vector;
-import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.function.Functions;
 
 import com.google.common.collect.Lists;
@@ -218,9 +215,9 @@ public final class SSVDHelper {
    *
    * @return the sum of upper triangular inputs.
    */
-  public static UpperTriangular loadAndSumUpperTriangularMatrices(Path glob, 
Configuration conf) throws IOException {
+  public static DenseSymmetricMatrix 
loadAndSumUpperTriangularMatricesAsSymmetric(Path glob, Configuration conf) 
throws IOException {
     Vector v = loadAndSumUpVectors(glob, conf);
-    return v == null ? null : new UpperTriangular(v);
+    return v == null ? null : new DenseSymmetricMatrix(v);
   }
 
   /**

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java?rev=1507154&r1=1507153&r2=1507154&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/stochasticsvd/SSVDSolver.java
 Thu Jul 25 23:31:43 2013
@@ -17,11 +17,7 @@
 
 package org.apache.mahout.math.hadoop.stochasticsvd;
 
-import java.io.Closeable;
-import java.io.IOException;
-import java.util.Deque;
-import java.util.Random;
-
+import com.google.common.collect.Lists;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -30,18 +26,21 @@ import org.apache.mahout.common.IOUtils;
 import org.apache.mahout.common.RandomUtils;
 import org.apache.mahout.math.*;
 import org.apache.mahout.math.function.Functions;
-import org.apache.mahout.math.ssvd.EigenSolverWrapper;
+import org.apache.mahout.math.solver.EigenDecomposition;
 
-import com.google.common.collect.Lists;
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Deque;
+import java.util.Random;
 
 /**
  * Stochastic SVD solver (API class).
- * <P>
- * 
+ * <p/>
+ * <p/>
  * Implementation details are in my working notes in MAHOUT-376
  * (https://issues.apache.org/jira/browse/MAHOUT-376).
- * <P>
- * 
+ * <p/>
+ * <p/>
  * As of the time of this writing, I don't have benchmarks for this method in
  * comparison to other methods. However, non-hadoop differentiating
  * characteristics of this method are thought to be :
@@ -52,9 +51,9 @@ import com.google.common.collect.Lists;
  * Lanczos unless full rank SVD decomposition is sought.
  * <LI>"more scale" -- can presumably take on larger problems than Lanczos one
  * (not confirmed by benchmark at this time)
- * <P>
- * <P>
- * 
+ * <p/>
+ * <p/>
+ * <p/>
  * Specifically in regards to this implementation, <i>I think</i> couple of
  * other differentiating points are:
  * <LI>no need to specify input matrix height or width in command line, it is
@@ -64,12 +63,12 @@ import com.google.common.collect.Lists;
  * <LI>can request U or V or U<sub>&sigma;</sub>=U* &Sigma;<sup>0.5</sup> or
  * V<sub>&sigma;</sub>=V* &Sigma;<sup>0.5</sup> none of which would require 
pass
  * over input A and these jobs are parallel map-only jobs.
- * <P>
- * <P>
- * 
+ * <p/>
+ * <p/>
+ * <p/>
  * This class is central public API for SSVD solver. The use pattern is as
  * follows:
- * 
+ * <p/>
  * <UL>
  * <LI>create the solver using constructor and supplying computation 
parameters.
  * <LI>set optional parameters thru setter methods.
@@ -78,7 +77,7 @@ import com.google.common.collect.Lists;
  * containing m x k U matrix file(s).
  * <LI> {@link #getVPath()} (if computed) returns the path to the directory
  * containing n x k V matrix file(s).
- * 
+ * <p/>
  * </UL>
  */
 public final class SSVDSolver {
@@ -116,26 +115,18 @@ public final class SSVDSolver {
   /**
    * create new SSVD solver. Required parameters are passed to constructor to
    * ensure they are set. Optional parameters can be set using setters .
-   * <P>
-   * 
-   * @param conf
-   *          hadoop configuration
-   * @param inputPath
-   *          Input path (should be compatible with DistributedRowMatrix as of
-   *          the time of this writing).
-   * @param outputPath
-   *          Output path containing U, V and singular values vector files.
-   * @param ablockRows
-   *          The vertical hight of a q-block (bigger value require more memory
-   *          in mappers+ perhaps larger {@code minSplitSize} values
-   * @param k
-   *          desired rank
-   * @param p
-   *          SSVD oversampling parameter
-   * @param reduceTasks
-   *          Number of reduce tasks (where applicable)
-   * @throws IOException
-   *           when IO condition occurs.
+   * <p/>
+   *
+   * @param conf        hadoop configuration
+   * @param inputPath   Input path (should be compatible with 
DistributedRowMatrix as of
+   *                    the time of this writing).
+   * @param outputPath  Output path containing U, V and singular values vector 
files.
+   * @param ablockRows  The vertical hight of a q-block (bigger value require 
more memory
+   *                    in mappers+ perhaps larger {@code minSplitSize} values
+   * @param k           desired rank
+   * @param p           SSVD oversampling parameter
+   * @param reduceTasks Number of reduce tasks (where applicable)
+   * @throws IOException when IO condition occurs.
    */
   public SSVDSolver(Configuration conf,
                     Path[] inputPath,
@@ -160,7 +151,7 @@ public final class SSVDSolver {
   /**
    * sets q, amount of additional power iterations to increase precision
    * (0..2!). Defaults to 0.
-   * 
+   *
    * @param q
    */
   public void setQ(int q) {
@@ -170,7 +161,6 @@ public final class SSVDSolver {
   /**
    * The setting controlling whether to compute U matrix of low rank SSVD.
    * Default true.
-   * 
    */
   public void setComputeU(boolean val) {
     computeU = val;
@@ -178,16 +168,14 @@ public final class SSVDSolver {
 
   /**
    * Setting controlling whether to compute V matrix of low-rank SSVD.
-   * 
-   * @param val
-   *          true if we want to output V matrix. Default is true.
+   *
+   * @param val true if we want to output V matrix. Default is true.
    */
   public void setComputeV(boolean val) {
     computeV = val;
   }
 
   /**
-   * 
    * @param cUHat whether produce U*Sigma^0.5 as well (default false)
    */
   public void setcUHalfSigma(boolean cUHat) {
@@ -195,7 +183,6 @@ public final class SSVDSolver {
   }
 
   /**
-   * 
    * @param cVHat whether produce V*Sigma^0.5 as well (default false)
    */
   public void setcVHalfSigma(boolean cVHat) {
@@ -203,7 +190,6 @@ public final class SSVDSolver {
   }
 
   /**
-   * 
    * @param cUSigma whether produce U*Sigma output as well (default false)
    */
   public void setcUSigma(boolean cUSigma) {
@@ -221,9 +207,8 @@ public final class SSVDSolver {
    * Sometimes, if requested A blocks become larger than a split, we may need 
to
    * use that to ensure at least k+p rows of A get into a split. This is
    * requirement necessary to obtain orthonormalized Q blocks of SSVD.
-   * 
-   * @param size
-   *          the minimum split size to use
+   *
+   * @param size the minimum split size to use
    */
   public void setMinSplitSize(int size) {
     minSplitSize = size;
@@ -231,7 +216,7 @@ public final class SSVDSolver {
 
   /**
    * This contains k+p singular values resulted from the solver run.
-   * 
+   *
    * @return singlular values (largest to smallest)
    */
   public Vector getSingularValues() {
@@ -240,7 +225,7 @@ public final class SSVDSolver {
 
   /**
    * returns U path (if computation were requested and successful).
-   * 
+   *
    * @return U output hdfs path, or null if computation was not completed for
    *         whatever reason.
    */
@@ -250,7 +235,7 @@ public final class SSVDSolver {
 
   /**
    * return V path ( if computation was requested and successful ) .
-   * 
+   *
    * @return V output hdfs path, or null if computation was not completed for
    *         whatever reason.
    */
@@ -280,7 +265,7 @@ public final class SSVDSolver {
 
   /**
    * if true, driver to clean output folder first if exists.
-   * 
+   *
    * @param overwrite
    */
   public void setOverwrite(boolean overwrite) {
@@ -296,7 +281,7 @@ public final class SSVDSolver {
    * to produce less keys for combining and shuffle and sort therefore somewhat
    * improving running time; but require larger blocks to be formed in RAM (so
    * setting this too high can lead to OOM).
-   * 
+   *
    * @param outerBlockHeight
    */
   public void setOuterBlockHeight(int outerBlockHeight) {
@@ -312,7 +297,7 @@ public final class SSVDSolver {
    * to set it higher than default 200,000 for extremely sparse inputs and when
    * more ram is available. y_i block height and ABt job would occupy approx.
    * abtBlockHeight x (k+p) x sizeof (double) (as dense).
-   * 
+   *
    * @param abtBlockHeight
    */
   public void setAbtBlockHeight(int abtBlockHeight) {
@@ -326,7 +311,7 @@ public final class SSVDSolver {
   /**
    * If this property is true, use DestributedCache mechanism to broadcast some
    * stuff around. May improve efficiency. Default is false.
-   * 
+   *
    * @param broadcast
    */
   public void setBroadcast(boolean broadcast) {
@@ -336,18 +321,17 @@ public final class SSVDSolver {
   /**
    * Optional. Single-vector file path for a vector (aka xi in MAHOUT-817
    * working notes) to be subtracted from each row of input.
-   * <P>
-   * 
+   * <p/>
+   * <p/>
    * Brute force approach would force would turn input into a dense input, 
which
    * is often not very desirable. By supplying this offset to SSVD solver, we
    * can avoid most of that overhead due to increased input density.
-   * <P>
-   * 
+   * <p/>
+   * <p/>
    * The vector size for this offest is n (width of A input). In PCA and R this
    * is known as "column means", but in this case it can be any offset of row
    * vectors of course to propagate into SSVD solution.
-   * <P>
-   * 
+   * <p/>
    */
   public Path getPcaMeanPath() {
     return pcaMeanPath;
@@ -359,9 +343,8 @@ public final class SSVDSolver {
 
   /**
    * run all SSVD jobs.
-   * 
-   * @throws IOException
-   *           if I/O condition occurs.
+   *
+   * @throws IOException if I/O condition occurs.
    */
   public void run() throws IOException {
 
@@ -498,28 +481,21 @@ public final class SSVDSolver {
         sqPath = new Path(btPath, BtJob.OUTPUT_SQ + "-*");
       }
 
-      org.apache.mahout.math.UpperTriangular bbtTriangular =
-        SSVDHelper.loadAndSumUpperTriangularMatrices(new Path(btPath,
-                                                              BtJob.OUTPUT_BBT
-                                                                  + "-*"), 
conf);
+      DenseSymmetricMatrix bbt =
+        SSVDHelper.loadAndSumUpperTriangularMatricesAsSymmetric(new 
Path(btPath,
+                                                                         
BtJob.OUTPUT_BBT
+                                                                           + 
"-*"), conf);
 
       // convert bbt to something our eigensolver could understand
-      assert bbtTriangular.columnSize() == k + p;
+      assert bbt.columnSize() == k + p;
 
       /*
        * we currently use a 3rd party in-core eigensolver. So we need just a
        * dense array representation for it.
        */
-      DenseMatrix bbtSquare = new DenseMatrix(k + p, k + p);
+      Matrix bbtSquare = new DenseMatrix(k + p, k + p);
+      bbtSquare.assign(bbt);
 
-      for (int i = 0; i < k + p; i++) {
-        for (int j = i; j < k + p; j++) {
-          double val = bbtTriangular.getQuick(i, j);
-          bbtSquare.setQuick(i, j, val);
-          bbtSquare.setQuick(j, i, val);
-        }
-      }
-      
       // MAHOUT-817
       if (pcaMeanPath != null) {
         Vector sq = SSVDHelper.loadAndSumUpVectors(sqPath, conf);
@@ -535,16 +511,17 @@ public final class SSVDSolver {
 
       }
 
-      EigenSolverWrapper eigenWrapper = new 
EigenSolverWrapper(SSVDHelper.extractRawData(bbtSquare));
-      Matrix uHat = new DenseMatrix(eigenWrapper.getUHat());
-      svalues = new DenseVector(eigenWrapper.getEigenValues());
+      EigenDecomposition eigen = new EigenDecomposition(bbtSquare);
+
+      Matrix uHat = eigen.getV();
+      svalues = eigen.getRealEigenvalues().clone();
 
       svalues.assign(Functions.SQRT);
 
       // save/redistribute UHat
       fs.mkdirs(uHatPath);
       DistributedRowMatrixWriter.write(uHatPath =
-        new Path(uHatPath, "uhat.seq"), conf, uHat);
+                                         new Path(uHatPath, "uhat.seq"), conf, 
uHat);
 
       // save sigma.
       SSVDHelper.saveVector(svalues,
@@ -570,28 +547,28 @@ public final class SSVDSolver {
       if (cUHalfSigma) {
         uhsjob = new UJob();
         uhsjob.run(conf,
-                 new Path(btPath, BtJob.OUTPUT_Q + "-*"),
-                 uHatPath,
-                 svPath,
-                 uHalfSigmaPath,
-                 k,
-                 reduceTasks,
-                 labelType,
-                 OutputScalingEnum.HALFSIGMA);
+                   new Path(btPath, BtJob.OUTPUT_Q + "-*"),
+                   uHatPath,
+                   svPath,
+                   uHalfSigmaPath,
+                   k,
+                   reduceTasks,
+                   labelType,
+                   OutputScalingEnum.HALFSIGMA);
       }
 
       UJob usjob = null;
       if (cUSigma) {
         usjob = new UJob();
         usjob.run(conf,
-                 new Path(btPath, BtJob.OUTPUT_Q + "-*"),
-                 uHatPath,
-                 svPath,
-                 uSigmaPath,
-                 k,
-                 reduceTasks,
-                 labelType,
-                 OutputScalingEnum.SIGMA);
+                  new Path(btPath, BtJob.OUTPUT_Q + "-*"),
+                  uHatPath,
+                  svPath,
+                  uSigmaPath,
+                  k,
+                  reduceTasks,
+                  labelType,
+                  OutputScalingEnum.SIGMA);
       }
 
       VJob vjob = null;

Modified: 
mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java?rev=1507154&r1=1507153&r2=1507154&view=diff
==============================================================================
--- 
mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
 (original)
+++ 
mahout/trunk/math/src/main/java/org/apache/mahout/math/decomposer/lanczos/LanczosSolver.java
 Thu Jul 25 23:31:43 2013
@@ -154,8 +154,8 @@ public class LanczosSolver {
     startTime(TimingSection.FINAL_EIGEN_CREATE);
     for (int row = 0; row < i; row++) {
       Vector realEigen = null;
-      // the eigenvectors live as columns of V, in reverse order.  Weird but 
true.
-      Vector ejCol = eigenVects.viewColumn(i - row - 1);
+
+      Vector ejCol = eigenVects.viewColumn(row);
       int size = Math.min(ejCol.size(), state.getBasisSize());
       for (int j = 0; j < size; j++) {
         double d = ejCol.get(j);

Modified: 
mahout/trunk/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java?rev=1507154&r1=1507153&r2=1507154&view=diff
==============================================================================
--- 
mahout/trunk/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
 (original)
+++ 
mahout/trunk/math/src/main/java/org/apache/mahout/math/solver/EigenDecomposition.java
 Thu Jul 25 23:31:43 2013
@@ -326,7 +326,7 @@ public class EigenDecomposition {
       int k = i;
       double p = d.getQuick(i);
       for (int j = i + 1; j < n; j++) {
-        if (d.getQuick(j) < p) {
+        if (d.getQuick(j) > p) {
           k = j;
           p = d.getQuick(j);
         }

Modified: 
mahout/trunk/math/src/test/java/org/apache/mahout/math/decomposer/lanczos/TestLanczosSolver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/math/src/test/java/org/apache/mahout/math/decomposer/lanczos/TestLanczosSolver.java?rev=1507154&r1=1507153&r2=1507154&view=diff
==============================================================================
--- 
mahout/trunk/math/src/test/java/org/apache/mahout/math/decomposer/lanczos/TestLanczosSolver.java
 (original)
+++ 
mahout/trunk/math/src/test/java/org/apache/mahout/math/decomposer/lanczos/TestLanczosSolver.java
 Thu Jul 25 23:31:43 2013
@@ -49,12 +49,12 @@ public final class TestLanczosSolver ext
 
     float fractionOfEigensExpectedGood = 0.6f;
     for (int i = 0; i < fractionOfEigensExpectedGood * desiredRank; i++) {
-      double s = state.getSingularValue(desiredRank - i - 1);
-      double e = eigenvalues.get(eigenvalues.size() - i - 1);
+      double s = state.getSingularValue(i);
+      double e = eigenvalues.get(i);
       log.info("{} : L = {}, E = {}", i, s, e);
       assertTrue("Singular value differs from eigenvalue", Math.abs((s-e)/e) < 
ERROR_TOLERANCE);
       Vector v = state.getRightSingularVector(i);
-      Vector v2 = decomposition.getV().viewColumn(eigenvalues.size() - i - 1);
+      Vector v2 = decomposition.getV().viewColumn(i);
       double error = 1 - Math.abs(v.dot(v2)/(v.norm(2) * v2.norm(2)));
       log.info("error: {}", error);
       assertTrue(i + ": 1 - cosAngle = " + error, error < ERROR_TOLERANCE);


Reply via email to