[SYSTEMML-1547] Cache-conscious tsmm post-processing (copy upper, nnz)

This patch improves the performance of tsmm postprocessing including (1)
the copy of the upper triangular matrix to the lower triangle, and (2)
the number of non-zeros computation of the output. We now use a
cache-conscious scheme of copying similar to dense-dense transpose
including the fused maintenance of nnz. 

On a scenario of 10 iterations running outer products of 15K vectors
(with 15K x 15K outputs), this patch improved end-to-end execution time
from 16.2s to 11.4s 

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/117ea480
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/117ea480
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/117ea480

Branch: refs/heads/master
Commit: 117ea480dadbdaf8be4318eddbdf23018ce50544
Parents: b641edc
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Wed Apr 19 18:39:26 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Wed Apr 19 18:46:31 2017 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 60 ++++++++++++++------
 .../runtime/matrix/data/LibMatrixReorg.java     |  2 +-
 2 files changed, 45 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/117ea480/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index d746aa3..5c7c2ef 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -385,8 +385,8 @@ public class LibMatrixMult
                        matrixMultTransposeSelfDense(m1, ret, leftTranspose, 0, 
ret.rlen );
 
                //post-processing
-               copyUpperToLowerTriangle( ret );                
-               ret.recomputeNonZeros();
+               long nnz = copyUpperToLowerTriangle(ret);
+               ret.setNonZeros(nnz);
                ret.examSparsity();     
                
                //System.out.println("TSMM 
("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+")
 in "+time.stop());
@@ -436,8 +436,8 @@ public class LibMatrixMult
                }
                
                //post-processing
-               copyUpperToLowerTriangle( ret );                
-               ret.recomputeNonZeros();
+               long nnz = copyUpperToLowerTriangle(ret);               
+               ret.setNonZeros(nnz);
                ret.examSparsity();     
                
                //System.out.println("TSMM k="+k+" 
("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+")
 in "+time.stop());
@@ -3413,17 +3413,47 @@ public class LibMatrixMult
         * result down to lower triangular matrix once.
         * 
         * @param ret matrix
+        * @return number of non zeros
         */
-       public static void copyUpperToLowerTriangle( MatrixBlock ret )
-       {
-               double[] c = ret.denseBlock;
-               final int m = ret.rlen;
-               final int n = ret.clen;
+       public static long copyUpperToLowerTriangle( MatrixBlock ret )
+       {
+               //ret is guaranteed to be a squared, symmetric matrix
+               if( ret.rlen != ret.clen )
+                       throw new RuntimeException("Invalid non-squared input 
matrix.");
+               
+               final double[] c = ret.denseBlock;
+               final int n = ret.rlen;
+               long nnz = 0;
+               
+               //blocked execution (2x128KB for L2 blocking)
+               final int blocksizeIJ = 128; 
+               
+               //handle blocks on diagonal
+               for( int bi = 0; bi<n; bi+=blocksizeIJ ) {
+                       int bimin = Math.min(bi+blocksizeIJ, n);
+                       for( int i=bi, rix=bi*n; i<bimin; i++, rix+=n ) {
+                               LibMatrixReorg.transposeRow(c, c, rix+bi, 
bi*n+i, n, bimin-bi);
+                               for( int j=rix+i+1; j<rix+bimin; j++ )
+                                       nnz += (c[j] != 0) ? 2 : 0;
+                               nnz++; //for diagonal element
+                       }
+               }
                
-               //copy symmetric values
-               for( int i=0, uix=0; i<m; i++, uix+=n )
-                       for( int j=i+1, lix=j*n+i; j<n; j++, lix+=n )
-                               c[ lix ] = c[ uix+j ];
+               //handle non-diagonal blocks (full block copies)
+               for( int bi = 0; bi<n; bi+=blocksizeIJ ) {
+                       int bimin = Math.min(bi+blocksizeIJ, n);
+                       for( int bj = bi; bj<n; bj+=blocksizeIJ ) 
+                               if( bi != bj ) { //not on diagonal
+                                       int bjmin = Math.min(bj+blocksizeIJ, n);
+                                       for( int i=bi, rix=bi*n; i<bimin; i++, 
rix+=n ) {
+                                               LibMatrixReorg.transposeRow(c, 
c, rix+bj, bj*n+i, n, bjmin-bj);
+                                               for( int j=rix+bj; j<rix+bjmin; 
j++ )
+                                                       nnz += (c[j] != 0) ? 2 
: 0;
+                                       }
+                               }
+               }
+               
+               return nnz;
        }
 
        private static MatrixBlock prepMatrixMultTransposeSelfInput( 
MatrixBlock m1, boolean leftTranspose ) 
@@ -3676,13 +3706,11 @@ public class LibMatrixMult
                }
                
                @Override
-               public Object call() throws DMLRuntimeException
-               {
+               public Object call() throws DMLRuntimeException {
                        if( _m1.sparse )
                                matrixMultTransposeSelfSparse(_m1, _ret, _left, 
_rl, _ru);
                        else
                                matrixMultTransposeSelfDense(_m1, _ret, _left, 
_rl, _ru);
-                       
                        return null;
                }
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/117ea480/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
index dfd19aa..9f45590 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java
@@ -939,7 +939,7 @@ public class LibMatrixReorg
                }
        }
 
-       private static void transposeRow( double[] a, double[] c, int aix, int 
cix, int n2, int len )
+       static void transposeRow( double[] a, double[] c, int aix, int cix, int 
n2, int len )
        {
                final int bn = len%8;
                

Reply via email to