Repository: systemml
Updated Branches:
  refs/heads/master fcfbd3d24 -> a2bf0006f


[SYSTEMML-1841] Performance codegen outer operations (GC, load balance)

This patch improves performance of several variations of codegen outer
operations, especially for sparse and ultra-sparse input data. In
detail, this includes (1) an increased number of tasks for better load
balance, and (2) partition-aware sizes of thread-local temporary arrays
for cache-conscious sparse implementations. 

For example on the ultra-sparse Amazon Books review dataset (8,026,324 x
2,330,066, nnz=22,507,155) and a 100 iterations of sum(X * U%*%t(V)) and 
(X * U%*%t(V))%*%V this patch improved performance from 63s to 32s and
from 124s to 70s, respectively.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/86edac16
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/86edac16
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/86edac16

Branch: refs/heads/master
Commit: 86edac16516e4330a85f65ceaba87dd942205971
Parents: fcfbd3d
Author: Matthias Boehm <[email protected]>
Authored: Tue Aug 15 17:49:37 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Tue Aug 15 17:49:37 2017 -0700

----------------------------------------------------------------------
 .../runtime/codegen/SpoofOuterProduct.java      | 22 ++++++++++++--------
 1 file changed, 13 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/86edac16/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
index 9148dd4..c49201a 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java
@@ -35,6 +35,7 @@ import org.apache.sysml.runtime.instructions.cp.ScalarObject;
 import org.apache.sysml.runtime.matrix.data.IJV;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 public abstract class SpoofOuterProduct extends SpoofOperator
 {
@@ -128,9 +129,11 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        ArrayList<ParOuterProdAggTask> tasks = new 
ArrayList<ParOuterProdAggTask>();                    
                        //create tasks (for wdivmm-left, parallelization over 
columns;
                        //for wdivmm-right, parallelization over rows; both 
ensure disjoint results)
-                       int blklen = (int)(Math.ceil((double)m/numThreads));
-                       for( int i=0; i<numThreads & i*blklen<m; i++ )
-                               tasks.add(new 
ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, m, n, k, 
_outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n));
+                       int numThreads2 = 
UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+                       int blklen = (int)(Math.ceil((double)m/numThreads2));
+                       for( int i=0; i<numThreads2 & i*blklen<m; i++ )
+                               tasks.add(new 
ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, 
+                                       m, n, k, _outerProductType, i*blklen, 
Math.min((i+1)*blklen,m), 0, n));
                        //execute tasks
                        List<Future<Double>> taskret = pool.invokeAll(tasks);
                        pool.shutdown();
@@ -292,8 +295,9 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                        }
                        else { //right or cell-wise
                                //parallelize over row partitions
-                               int blklen = 
(int)(Math.ceil((double)m/numThreads));
-                               for( int i=0; i<numThreads & i*blklen<m; i++ )
+                               int numThreads2 = 
UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+                               int blklen = 
(int)(Math.ceil((double)m/numThreads2));
+                               for( int i=0; i<numThreads2 & i*blklen<m; i++ )
                                        tasks.add(new ParExecTask(a, ab[0], 
ab[1], b, scalars, out, m, n, k, _outerProductType, i*blklen, 
Math.min((i+1)*blklen,m), 0, n));
                        }
                        List<Future<Long>> taskret = pool.invokeAll(tasks);
@@ -378,7 +382,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                //with custom blocksizeJ for wdivmm_left to avoid LLC misses on 
output.
                final int blocksizeI = (int) (8L*m*n/nnz);
                final int blocksizeJ = left ? 
Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI;
-               int[] curk = new int[blocksizeI];
+               int[] curk = new int[Math.min(blocksizeI,ru-rl)];
                
                for( int bi = rl; bi < ru; bi+=blocksizeI ) 
                {
@@ -404,8 +408,8 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
 
                                        int index = wpos + curk[i-bi];
                                        for( ; index<wpos+wlen && 
wix[index]<bjmin; index++ ) {
-                                               genexecDense( wval[index], u, 
uix, v, wix[index]*k, b, scalars, c, 
-                                                               (left ? 
wix[index]*k : uix), m, n, k, i, wix[index] );
+                                               genexecDense(wval[index], u, 
uix, v, wix[index]*k, b, scalars, c, 
+                                                       (left ? wix[index]*k : 
uix), m, n, k, i, wix[index]);
                                        }
                                        curk[i-bi] = index - wpos;
                                }
@@ -417,7 +421,7 @@ public abstract class SpoofOuterProduct extends 
SpoofOperator
                MatrixBlock out, int m, int n, int k, long nnz, OutProdType 
type, int rl, int ru, int cl, int cu ) 
        {
                final int blocksizeIJ = (int) (8L*m*n/nnz); 
-               int[] curk = new int[blocksizeIJ];                      
+               int[] curk = new int[Math.min(blocksizeIJ, ru-rl)];             
        
                
                if( !out.isInSparseFormat() ) //DENSE
                {

Reply via email to