ultra-sparse inputs

mboehm7 Sat, 28 Oct 2017 01:12:07 -0700

[SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs

This patch improves the performance of codegen outer operations
(especially left mm), for ultra-sparse inputs. On ultra-sparse inputs,
the allocation and maintenance of column indexes can become a
bottleneck. Accordingly, this patch adds a special case for ultra-sparse
matrices.


For the core update rules of ALS-CG, this patch improved the performance
as follows (10 iterations over the amazon review dataset):
Left (t(t(U) %*% (W * (U %*% t(V))))): 125.7s -> 14.2s
Right ((W * (U %*% t(V))) %*% V): 25.2s -> 17.9s


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ede870de
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ede870de
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ede870de

Branch: refs/heads/master
Commit: ede870de8a7b01bd44ac3b6bcfe7f0e86b1c93c8
Parents: 83e01b0
Author: Matthias Boehm <[email protected]>
Authored: Fri Oct 27 22:01:36 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Fri Oct 27 22:01:36 2017 -0700

----------------------------------------------------------------------
 .../runtime/codegen/SpoofOuterProduct.java      | 77 +++++++++++++-------
 1 file changed, 52 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ede870de/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index ac8dc57..26a661a 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 
+import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
@@ -391,7 +392,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        c[0] = sum;
        }
        
-       private void executeSparse(SparseBlock sblock,  double[] u, double[] v, 
double[][] b, double[] scalars,
+       private void executeSparse(SparseBlock sblock, double[] u, double[] v, 
double[][] b, double[] scalars,
                double[] c, int m, int n, int k, long nnz, OutProdType type, 
int rl, int ru, int cl, int cu) 
        {
                boolean left = (_outerProductType== 
OutProdType.LEFT_OUTER_PRODUCT);
@@ -401,37 +402,63 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                //blocksize is chosen such that we reuse each  Ui/Vj vector on 
average 8 times,
                //with custom blocksizeJ for wdivmm_left to avoid LLC misses on 
output.
                final int blocksizeI = (int) (8L*m*n/nnz);
-               final int blocksizeJ = left ? 
Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
-               int[] curk = new int[Math.min(blocksizeI,ru-rl)];
                
-               for( int bi = rl; bi < ru; bi+=blocksizeI ) 
+               if( OptimizerUtils.getSparsity(m, n, nnz) < 
MatrixBlock.ULTRA_SPARSITY_TURN_POINT ) //ultra-sparse
                {
-                       int bimin = Math.min(ru, bi+blocksizeI);
-                       //prepare starting indexes for block row
-                       for( int i=bi; i<bimin; i++ ) {
+                       //for ultra-sparse matrices, we do not allocate the 
index array because
+                       //its allocation and maintenance can dominate the total 
runtime.
+                       
+                       //core wdivmm block matrix mult
+                       for( int i=rl, uix=rl*k; i<ru; i++, uix+=k ) {
+                               if( sblock.isEmpty(i) ) continue;
+                               
+                               int wpos = sblock.pos(i);
+                               int wlen = sblock.size(i);
+                               int[] wix = sblock.indexes(i);
+                               double[] wval = sblock.values(i);
+                               
                                int index = (cl==0||sblock.isEmpty(i)) ? 0 : 
sblock.posFIndexGTE(i,cl);
-                               curk[i-bi] = (index>=0) ? index : n;
+                               index = wpos + ((index>=0) ? index : n);
+                               for( ; index<wpos+wlen && wix[index]<cu; 
index++ ) {
+                                       genexecDense(wval[index], u, uix, v, 
wix[index]*k, b, scalars, c,
+                                               (left ? wix[index]*k : uix), m, 
n, k, i, wix[index]);
+                               }
                        }
+               }
+               else //sparse
+               {
+                       final int blocksizeJ = left ? 
Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
+                       int[] curk = new int[Math.min(blocksizeI,ru-rl)];
                        
-                       //blocked execution over column blocks
-                       for( int bj = cl; bj < cu; bj+=blocksizeJ )
+                       for( int bi = rl; bi < ru; bi+=blocksizeI ) 
                        {
-                               int bjmin = Math.min(cu, bj+blocksizeJ);
-                               //core wdivmm block matrix mult
-                               for( int i=bi, uix=bi*k; i<bimin; i++, uix+=k ) 
{
-                                       if( sblock.isEmpty(i) ) continue;
-                                       
-                                       int wpos = sblock.pos(i);
-                                       int wlen = sblock.size(i);
-                                       int[] wix = sblock.indexes(i);
-                                       double[] wval = sblock.values(i);
-                                       
-                                       int index = wpos + curk[i-bi];
-                                       for( ; index<wpos+wlen && 
wix[index]<bjmin; index++ ) {
-                                               genexecDense(wval[index], u, 
uix, v, wix[index]*k, b, scalars, c,
-                                                       (left ? wix[index]*k : 
uix), m, n, k, i, wix[index]);
+                               int bimin = Math.min(ru, bi+blocksizeI);
+                               //prepare starting indexes for block row
+                               for( int i=bi; i<bimin; i++ ) {
+                                       int index = (cl==0||sblock.isEmpty(i)) 
? 0 : sblock.posFIndexGTE(i,cl);
+                                       curk[i-bi] = (index>=0) ? index : n;
+                               }
+                               
+                               //blocked execution over column blocks
+                               for( int bj = cl; bj < cu; bj+=blocksizeJ )
+                               {
+                                       int bjmin = Math.min(cu, bj+blocksizeJ);
+                                       //core wdivmm block matrix mult
+                                       for( int i=bi, uix=bi*k; i<bimin; i++, 
uix+=k ) {
+                                               if( sblock.isEmpty(i) ) 
continue;
+                                               
+                                               int wpos = sblock.pos(i);
+                                               int wlen = sblock.size(i);
+                                               int[] wix = sblock.indexes(i);
+                                               double[] wval = 
sblock.values(i);
+                                               
+                                               int index = wpos + curk[i-bi];
+                                               for( ; index<wpos+wlen && 
wix[index]<bjmin; index++ ) {
+                                                       
genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c,
+                                                               (left ? 
wix[index]*k : uix), m, n, k, i, wix[index]);
+                                               }
+                                               curk[i-bi] = index - wpos;
                                        }
-                                       curk[i-bi] = index - wpos;
                                }
                        }
                }

[2/3] systemml git commit: [SYSTEMML-1976] Performance codegen outer ops w/ ultra-sparse inputs

Reply via email to