[SYSTEMML-1547] Cache-conscious tsmm post-processing (copy upper, nnz) This patch improves the performance of tsmm postprocessing including (1) the copy of the upper triangular matrix to the lower triangle, and (2) the number of non-zeros computation of the output. We now use a cache-conscious scheme of copying similar to dense-dense transpose including the fused maintenance of nnz.
On a scenario of 10 iterations running outer products of 15K vectors (with 15K x 15K outputs), this patch improved end-to-end execution time from 16.2s to 11.4s Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/117ea480 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/117ea480 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/117ea480 Branch: refs/heads/master Commit: 117ea480dadbdaf8be4318eddbdf23018ce50544 Parents: b641edc Author: Matthias Boehm <mboe...@gmail.com> Authored: Wed Apr 19 18:39:26 2017 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Wed Apr 19 18:46:31 2017 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixMult.java | 60 ++++++++++++++------ .../runtime/matrix/data/LibMatrixReorg.java | 2 +- 2 files changed, 45 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/117ea480/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index d746aa3..5c7c2ef 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -385,8 +385,8 @@ public class LibMatrixMult matrixMultTransposeSelfDense(m1, ret, leftTranspose, 0, ret.rlen ); //post-processing - copyUpperToLowerTriangle( ret ); - ret.recomputeNonZeros(); + long nnz = copyUpperToLowerTriangle(ret); + ret.setNonZeros(nnz); ret.examSparsity(); //System.out.println("TSMM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop()); @@ -436,8 +436,8 @@ public class LibMatrixMult } //post-processing - copyUpperToLowerTriangle( ret ); - ret.recomputeNonZeros(); + long nnz = copyUpperToLowerTriangle(ret); + ret.setNonZeros(nnz); ret.examSparsity(); //System.out.println("TSMM k="+k+" ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+","+leftTranspose+") in "+time.stop()); @@ -3413,17 +3413,47 @@ public class LibMatrixMult * result down to lower triangular matrix once. * * @param ret matrix + * @return number of non zeros */ - public static void copyUpperToLowerTriangle( MatrixBlock ret ) - { - double[] c = ret.denseBlock; - final int m = ret.rlen; - final int n = ret.clen; + public static long copyUpperToLowerTriangle( MatrixBlock ret ) + { + //ret is guaranteed to be a squared, symmetric matrix + if( ret.rlen != ret.clen ) + throw new RuntimeException("Invalid non-squared input matrix."); + + final double[] c = ret.denseBlock; + final int n = ret.rlen; + long nnz = 0; + + //blocked execution (2x128KB for L2 blocking) + final int blocksizeIJ = 128; + + //handle blocks on diagonal + for( int bi = 0; bi<n; bi+=blocksizeIJ ) { + int bimin = Math.min(bi+blocksizeIJ, n); + for( int i=bi, rix=bi*n; i<bimin; i++, rix+=n ) { + LibMatrixReorg.transposeRow(c, c, rix+bi, bi*n+i, n, bimin-bi); + for( int j=rix+i+1; j<rix+bimin; j++ ) + nnz += (c[j] != 0) ? 2 : 0; + nnz++; //for diagonal element + } + } - //copy symmetric values - for( int i=0, uix=0; i<m; i++, uix+=n ) - for( int j=i+1, lix=j*n+i; j<n; j++, lix+=n ) - c[ lix ] = c[ uix+j ]; + //handle non-diagonal blocks (full block copies) + for( int bi = 0; bi<n; bi+=blocksizeIJ ) { + int bimin = Math.min(bi+blocksizeIJ, n); + for( int bj = bi; bj<n; bj+=blocksizeIJ ) + if( bi != bj ) { //not on diagonal + int bjmin = Math.min(bj+blocksizeIJ, n); + for( int i=bi, rix=bi*n; i<bimin; i++, rix+=n ) { + LibMatrixReorg.transposeRow(c, c, rix+bj, bj*n+i, n, bjmin-bj); + for( int j=rix+bj; j<rix+bjmin; j++ ) + nnz += (c[j] != 0) ? 2 : 0; + } + } + } + + return nnz; } private static MatrixBlock prepMatrixMultTransposeSelfInput( MatrixBlock m1, boolean leftTranspose ) @@ -3676,13 +3706,11 @@ public class LibMatrixMult } @Override - public Object call() throws DMLRuntimeException - { + public Object call() throws DMLRuntimeException { if( _m1.sparse ) matrixMultTransposeSelfSparse(_m1, _ret, _left, _rl, _ru); else matrixMultTransposeSelfDense(_m1, _ret, _left, _rl, _ru); - return null; } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/117ea480/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java index dfd19aa..9f45590 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java @@ -939,7 +939,7 @@ public class LibMatrixReorg } } - private static void transposeRow( double[] a, double[] c, int aix, int cix, int n2, int len ) + static void transposeRow( double[] a, double[] c, int aix, int cix, int n2, int len ) { final int bn = len%8;