[SYSTEMML-694] Performance dense-sparse vector transpose, for lstm

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/3841ca88
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/3841ca88
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/3841ca88
Branch: refs/heads/master Commit: 3841ca88e69842ae528b5e329c38b123ddec7ef1 Parents: 7c5b83c Author: Matthias Boehm <[email protected]> Authored: Sat Jul 23 18:07:56 2016 -0700 Committer: Matthias Boehm <[email protected]> Committed: Sat Jul 23 18:07:56 2016 -0700 ---------------------------------------------------------------------- .../runtime/matrix/data/LibMatrixReorg.java | 44 ++++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/3841ca88/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java index c5674bb..e472413 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixReorg.java @@ -859,24 +859,32 @@ public class LibMatrixReorg double[] a = in.getDenseBlock(); SparseBlock c = out.getSparseBlock(); - //blocking according to typical L2 cache sizes - final int blocksizeI = 128; - final int blocksizeJ = 128; - - //blocked execution - for( int bi = 0; bi<m; bi+=blocksizeI ) - for( int bj = 0; bj<n; bj+=blocksizeJ ) - { - int bimin = Math.min(bi+blocksizeI, m); - int bjmin = Math.min(bj+blocksizeJ, n); - //core transpose operation - for( int i=bi; i<bimin; i++ ) - for( int j=bj, aix=i*n+bj; j<bjmin; j++, aix++ ) - { - c.allocate(j, ennz2, n2); - c.append(j, i, a[aix]); - } - } + if( out.rlen == 1 ) //VECTOR-VECTOR + { + c.allocate(0, (int)in.nonZeros); + c.setIndexRange(0, 0, m, a, 0, m); + } + else //general case: MATRIX-MATRIX + { + //blocking according to typical L2 cache sizes + final int blocksizeI = 128; + final int blocksizeJ = 128; + + //blocked execution + for( int bi = 0; 
bi<m; bi+=blocksizeI ) + for( int bj = 0; bj<n; bj+=blocksizeJ ) + { + int bimin = Math.min(bi+blocksizeI, m); + int bjmin = Math.min(bj+blocksizeJ, n); + //core transpose operation + for( int i=bi; i<bimin; i++ ) + for( int j=bj, aix=i*n+bj; j<bjmin; j++, aix++ ) + { + c.allocate(j, ennz2, n2); + c.append(j, i, a[aix]); + } + } + } } /**
