Repository: incubator-systemml
Updated Branches:
  refs/heads/master bcf431331 -> 085009a36


[SYSTEMML-913] Cache-conscious sparse matrix-vector multiplication

This patch makes the sparse-dense matrix-vector multiplication cache
conscious to handle cases with many features (and hence large rhs
vectors) more efficient. On singlenode experiments with 80GB max heap
and 100 iterations, example improvements were as follows:
(1) 100x10M, 0.1: 14.5s -> 10.1s
(2) 1Kx10M, 0.01: 29.5s -> 11.5s
(3) 10Kx10M, 0.001: 31.5s -> 21.5s
(4) 1Kx100M, 0.001: 33.8s -> 25.5s 

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/b304e605
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/b304e605
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/b304e605

Branch: refs/heads/master
Commit: b304e6058001ba195a0c9ddca0ebde24dfcdbdaa
Parents: bcf4313
Author: Matthias Boehm <mbo...@us.ibm.com>
Authored: Wed Sep 14 23:30:53 2016 +0200
Committer: Matthias Boehm <mbo...@us.ibm.com>
Committed: Thu Sep 15 09:41:25 2016 +0200

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java      | 36 ++++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/b304e605/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 9d0dd9b..48054ef 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -1321,24 +1321,48 @@ public class LibMatrixMult
                final int m = m1.rlen;
                final int n = m2.clen;
                final int cd = m2.rlen;
+               final long xsp = (long)m*cd/m1.nonZeros;
 
                if( LOW_LEVEL_OPTIMIZATION )
                {
                        SparseBlock a = m1.sparseBlock;
                        
-                       if( m==1 && n==1 )         //DOT PRODUCT
+                       if( m==1 && n==1 )            //DOT PRODUCT
                        {
                                if( !a.isEmpty(0) ) {
                                        c[0] = dotProduct(a.values(0), b, 
a.indexes(0), a.pos(0), 0, a.size(0));
                                }
                        }
-                       else if( n==1 )            //MATRIX-VECTOR
+                       else if( n==1 && cd<=2*1024 ) //MATRIX-VECTOR (short 
rhs)
                        {
                                for( int i=rl; i<ru; i++ )
                                        if( !a.isEmpty(i) )
                                                c[i] = dotProduct(a.values(i), 
b, a.indexes(i), a.pos(i), 0, a.size(i));                                       
                 
                        }
-                       else if( pm2 && m==1 )     //VECTOR-MATRIX
+                       else if( n==1 )               //MATRIX-VECTOR (tall rhs)
+                       {
+                               final int blocksizeI = 32;
+                               final int blocksizeK = 
(int)Math.max(2*1024,2*1024*xsp/32); //~ 16KB L1  
+                               int[] curk = new int[blocksizeI];
+                               
+                               for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+                                       Arrays.fill(curk, 0); //reset positions
+                                       for( int bk=0, bimin = Math.min(ru, 
bi+blocksizeI); bk<cd; bk+=blocksizeK ) {
+                                               for( int i=bi, bkmin = 
Math.min(bk+blocksizeK, cd); i<bimin; i++) {
+                                                       if( a.isEmpty(i) ) 
continue;
+                                                       int apos = a.pos(i);
+                                                       int alen = a.size(i);
+                                                       int[] aix = 
a.indexes(i);
+                                                       double[] avals = 
a.values(i);                                   
+                                                       int k = curk[i-bi] + 
apos;                                                                      
+                                                       for( ; k<apos+alen && 
aix[k]<bkmin; k++ )
+                                                               c[i] += 
avals[k] * b[aix[k]];
+                                                       curk[i-bi] = k - apos;
+                                               }
+                                       }       
+                               }
+                       }
+                       else if( pm2 && m==1 )        //VECTOR-MATRIX
                        {
                                //parallelization over rows in rhs matrix
                                if( !a.isEmpty(0) ) 
@@ -1357,7 +1381,7 @@ public class LibMatrixMult
                                        }
                                }
                        }
-                       else if( pm2 && m<=16 )    //MATRIX-MATRIX (short lhs) 
+                       else if( pm2 && m<=16 )       //MATRIX-MATRIX (short 
lhs) 
                        {
                                int arlen = a.numRows();
                                for( int i=0, cix=0; i<arlen; i++, cix+=n )
@@ -1388,7 +1412,7 @@ public class LibMatrixMult
                                        }
                                        }
                        }
-                       else if( n<=64 )           //MATRIX-MATRIX (skinny rhs)
+                       else if( n<=64 )              //MATRIX-MATRIX (skinny 
rhs)
                        {
                                //no blocking since b and c fit into cache 
anyway
                                for( int i=rl, cix=rl*n; i<ru; i++, cix+=n ) {
@@ -1408,7 +1432,7 @@ public class LibMatrixMult
                                                aix[k]*n, aix[k+1]*n, 
aix[k+2]*n, aix[k+3]*n, cix, n );
                                }       
                        }
-                       else                       //MATRIX-MATRIX
+                       else                          //MATRIX-MATRIX
                        {                                                       
                                //blocksizes to fit blocks of B (dense) and 
several rows of A/C in common L2 cache size, 
                                //while blocking A/C for L1/L2 yet allowing 
long scans (2 pages) in the inner loop over j

Reply via email to