Repository: incubator-systemml
Updated Branches:
  refs/heads/master 27a6a2b2a -> 01d9fdb45


[SYSTEMML-809] Performance sparse-dense wdivmm_left (L2 blocking output)


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/01d9fdb4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/01d9fdb4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/01d9fdb4

Branch: refs/heads/master
Commit: 01d9fdb4552c78d663ebf0dbc3e850e06bc9b820
Parents: 27a6a2b
Author: Matthias Boehm <[email protected]>
Authored: Wed Jul 13 22:40:42 2016 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Thu Jul 14 13:05:48 2016 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/01d9fdb4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index e54abec..c875e45 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -2710,14 +2710,16 @@ public class LibMatrixMult
 
     //approach: iterate over non-zeros of w, selective mm computation
     //blocked over ij, while maintaining front of column indexes, where the
-    //blocksize is chosen such that we reuse each vector on average 8 times.
-    final int blocksizeIJ = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
-    int[] curk = new int[blocksizeIJ];
-    boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeIJ] : null;
-
-    for( int bi = rl; bi < ru; bi+=blocksizeIJ )
+    //blocksize is chosen such that we reuse each Ui/Vj vector on average 8 times,
+    //with custom blocksizeJ for wdivmm_left to avoid LLC misses on output.
+    final int blocksizeI = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
+    final int blocksizeJ = left ? Math.max(8,Math.min(256*1024/(mU.clen*8), blocksizeI)) : blocksizeI;
+    int[] curk = new int[blocksizeI];
+    boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeI] : null;
+
+    for( int bi = rl; bi < ru; bi+=blocksizeI )
     {
-      int bimin = Math.min(ru, bi+blocksizeIJ);
+      int bimin = Math.min(ru, bi+blocksizeI);
       //prepare starting indexes for block row
       for( int i=bi; i<bimin; i++ ) {
         int k = (cl==0||w.isEmpty(i)) ? 0 : w.posFIndexGTE(i,cl);
@@ -2728,9 +2730,9 @@ public class LibMatrixMult
         for( int i=bi; i<bimin; i++ )
           aligned[i-bi] = w.isAligned(i-bi, x);
       //blocked execution over column blocks
-      for( int bj = cl; bj < cu; bj+=blocksizeIJ )
+      for( int bj = cl; bj < cu; bj+=blocksizeJ )
       {
-        int bjmin = Math.min(cu, bj+blocksizeIJ);
+        int bjmin = Math.min(cu, bj+blocksizeJ);
         for( int i=bi, uix=bi*cd; i<bimin; i++, uix+=cd ) {
           if( !w.isEmpty(i) ) {
             int wpos = w.pos(i);
