incubator-systemml git commit: [SYSTEMML-809] Performance sparse-dense wdivmm (multi-level blocking)

mboehm7 Mon, 25 Jul 2016 13:18:51 -0700

Repository: incubator-systemml
Updated Branches:
  refs/heads/master 3841ca88e -> fceb6620e



[SYSTEMML-809] Performance sparse-dense wdivmm (multi-level blocking)

So far we used a best-effort cache blocking to ensure row reuse.
However, with high sparsity and large factors this led to severe cache
thrashing effects. We now use a multi-level cache blocking: best effort
for reuse and subsequent blocking for L2 cache size. This also
generalizes the custom blocking for wdivmm_left. 

The performance improvements on 200k x 200k, sparsity=0.001, rank=100
were substantial for both wdivmm left (2400ms -> 540ms) and wdivmm_right
(1500ms -> 600ms).

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/fceb6620
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/fceb6620
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/fceb6620

Branch: refs/heads/master
Commit: fceb6620e7bc29a097dd7ac6057f963b3a2ff235
Parents: 3841ca8
Author: Matthias Boehm <[email protected]>
Authored: Sun Jul 24 23:30:33 2016 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Sun Jul 24 23:30:33 2016 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 99 +++++++++++---------
 1 file changed, 54 insertions(+), 45 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/fceb6620/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 57ce557..b5fe663 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -68,9 +68,10 @@ import org.apache.sysml.runtime.util.UtilFunctions;
 public class LibMatrixMult 
 {
        //internal configuration
-       public static final boolean LOW_LEVEL_OPTIMIZATION = true;
-       public static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 
MB
+       private static final boolean LOW_LEVEL_OPTIMIZATION = true;
+       private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 
2 MB
        private static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 
MFLOP
+       private static final int L2_CACHESIZE = 256 *1024; //256KB (common size)
        
        private LibMatrixMult() {
                //prevent instantiation via private constructor
@@ -2737,15 +2738,18 @@ public class LibMatrixMult
                //approach: iterate over non-zeros of w, selective mm 
computation
                //blocked over ij, while maintaining front of column indexes, 
where the
                //blocksize is chosen such that we reuse each  Ui/Vj vector on 
average 8 times,
-               //with custom blocksizeJ for wdivmm_left to avoid LLC misses on 
output.
-               final int blocksizeI = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
-               final int blocksizeJ = left ? 
Math.max(8,Math.min(256*1024/(mU.clen*8), blocksizeI)) : blocksizeI; 
-               int[] curk = new int[blocksizeI];               
-               boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeI] : 
null;
+               //we use an additional ij blocking to ensure that Ui/Vj as well 
as the output
+               //in case of wdivmm_left fit into L2 cache (avoid L2/LLC 
misses).
+               final int blocksizeIJ = (int) (8L*mW.rlen*mW.clen/mW.nonZeros);
+               final int blocksizeIJ2 = L2_CACHESIZE / (2*mU.clen*8); //mU 
guaranteed <=1000
                
-               for( int bi = rl; bi < ru; bi+=blocksizeI ) 
+               int[] curk = new int[blocksizeIJ];              
+               boolean[] aligned = (four&&!scalar) ? new boolean[blocksizeIJ] 
: null;
+               
+               //blocked execution over row/column blocks
+               for( int bi=rl; bi<ru; bi+=blocksizeIJ ) 
                {
-                       int bimin = Math.min(ru, bi+blocksizeI);
+                       int bimin = Math.min(ru, bi+blocksizeIJ);
                        //prepare starting indexes for block row
                        for( int i=bi; i<bimin; i++ ) {
                                int k = (cl==0||w.isEmpty(i)) ? 0 : 
w.posFIndexGTE(i,cl);
@@ -2755,47 +2759,52 @@ public class LibMatrixMult
                        if( four && !scalar )
                                for( int i=bi; i<bimin; i++ )
                                        aligned[i-bi] = w.isAligned(i-bi, x);
-                       //blocked execution over column blocks
-                       for( int bj = cl; bj < cu; bj+=blocksizeJ ) 
+                       
+                       for( int bj=cl; bj<cu; bj+=blocksizeIJ )  
                        {
-                               int bjmin = Math.min(cu, bj+blocksizeJ);
-                               for( int i=bi, uix=bi*cd; i<bimin; i++, uix+=cd 
) {
-                                       if( !w.isEmpty(i) ) {
-                                               int wpos = w.pos(i);
-                                               int wlen = w.size(i);
-                                               int[] wix = w.indexes(i);
-                                               double[] wval = w.values(i);    
                        
-                                               
-                                               int k = wpos + curk[i-bi];
-                                               if( basic ) {
-                                                       for( ; k<wpos+wlen && 
wix[k]<bjmin; k++ )
-                                                               
ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
-                                               }
-                                               else if( four ) { //left/right
-                                                       //checking alignment 
per row is ok because early abort if false, 
-                                                       //row nnz likely fit in 
L1/L2 cache, and asymptotically better if aligned
-                                                       if( !scalar && 
w.isAligned(i, x) ) {
-                                                               //O(n) where n 
is nnz in w/x 
-                                                               double[] xvals 
= x.values(i);
-                                                               for( ; 
k<wpos+wlen && wix[k]<bjmin; k++ )
-                                                                       
wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k]*cd, left, scalar, cd);
+                               //blocked execution over row/column blocks for 
L2
+                               for( int bi2=bi, bjmin=Math.min(cu, 
bj+blocksizeIJ); bi2<bimin; bi2+=blocksizeIJ2)                                  
    
+                                       for( int bj2=bj, bimin2=Math.min(bimin, 
bi2+blocksizeIJ2); bj2<bjmin; bj2+=blocksizeIJ2 ) {
+                                                       
+                                               //core wdivmm block matrix mult
+                                               for( int i=bi2, uix=bi2*cd, 
bjmin2=Math.min(bjmin, bj2+blocksizeIJ2); i<bimin2; i++, uix+=cd ) {
+                                                       if( w.isEmpty(i) ) 
continue;
+                                                       
+                                                       int wpos = w.pos(i);
+                                                       int wlen = w.size(i);
+                                                       int[] wix = 
w.indexes(i);
+                                                       double[] wval = 
w.values(i);                            
+                                                       
+                                                       int k = wpos + 
curk[i-bi2];
+                                                       if( basic ) {
+                                                               for( ; 
k<wpos+wlen && wix[k]<bjmin2; k++ )
+                                                                       
ret.appendValue( i, wix[k], wval[k] * dotProduct(u, v, uix, wix[k]*cd, cd));
                                                        }
-                                                       else {
-                                                               //scalar or O(n 
log m) where n/m are nnz in w/x
-                                                               for( ; 
k<wpos+wlen && wix[k]<bjmin; k++ )
-                                                                       if 
(scalar)
-                                                                               
wdivmm(wval[k], eps, u, v, c, uix, wix[k]*cd, left, scalar, cd);
-                                                                       else
-                                                                               
wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k]*cd, left, scalar, cd);
+                                                       else if( four ) { 
//left/right
+                                                               //checking 
alignment per row is ok because early abort if false, 
+                                                               //row nnz 
likely fit in L1/L2 cache, and asymptotically better if aligned
+                                                               if( !scalar && 
w.isAligned(i, x) ) {
+                                                                       //O(n) 
where n is nnz in w/x 
+                                                                       
double[] xvals = x.values(i);
+                                                                       for( ; 
k<wpos+wlen && wix[k]<bjmin2; k++ )
+                                                                               
wdivmm(wval[k], xvals[k], u, v, c, uix, wix[k]*cd, left, scalar, cd);
+                                                               }
+                                                               else {
+                                                                       
//scalar or O(n log m) where n/m are nnz in w/x
+                                                                       for( ; 
k<wpos+wlen && wix[k]<bjmin2; k++ )
+                                                                               
if (scalar)
+                                                                               
        wdivmm(wval[k], eps, u, v, c, uix, wix[k]*cd, left, scalar, cd);
+                                                                               
else
+                                                                               
        wdivmm(wval[k], x.get(i, wix[k]), u, v, c, uix, wix[k]*cd, left, 
scalar, cd);
+                                                               }
                                                        }
+                                                       else { //left/right 
minus default
+                                                               for( ; 
k<wpos+wlen && wix[k]<bjmin2; k++ )
+                                                                       
wdivmm(wval[k], u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
+                                                       }
+                                                       curk[i-bi2] = k - wpos;
                                                }
-                                               else { //left/right minus 
default
-                                                       for( ; k<wpos+wlen && 
wix[k]<bjmin; k++ )
-                                                               wdivmm(wval[k], 
u, v, c, uix, wix[k]*cd, left, mult, minus, cd);
-                                               }
-                                               curk[i-bi] = k - wpos;
                                        }
-                               }
                        }
                }
        }

incubator-systemml git commit: [SYSTEMML-809] Performance sparse-dense wdivmm (multi-level blocking)

Reply via email to