Repository: systemml Updated Branches: refs/heads/master fcfbd3d24 -> a2bf0006f
[SYSTEMML-1841] Performance codegen outer operations (GC, load balance) This patch improves performance of several variations of codegen outer operations, especially for sparse and ultra-sparse input data. In detail, this includes (1) an increased number of tasks for better load balance, and (2) partition-aware sizes of thread-local temporary arrays for cache-conscious sparse implementations. For example on the ultra-sparse Amazon Books review dataset (8,026,324 x 2,330,066, nnz=22,507,155) and a 100 iterations of sum(X * U%*%t(V)) and (X * U%*%t(V))%*%V this patch improved performance from 63s to 32s and from 124s to 70s, respectively. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/86edac16 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/86edac16 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/86edac16 Branch: refs/heads/master Commit: 86edac16516e4330a85f65ceaba87dd942205971 Parents: fcfbd3d Author: Matthias Boehm <[email protected]> Authored: Tue Aug 15 17:49:37 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Tue Aug 15 17:49:37 2017 -0700 ---------------------------------------------------------------------- .../runtime/codegen/SpoofOuterProduct.java | 22 ++++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/86edac16/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java index 9148dd4..c49201a 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofOuterProduct.java @@ -35,6 +35,7 @@ import org.apache.sysml.runtime.instructions.cp.ScalarObject; import org.apache.sysml.runtime.matrix.data.IJV; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.SparseBlock; +import org.apache.sysml.runtime.util.UtilFunctions; public abstract class SpoofOuterProduct extends SpoofOperator { @@ -128,9 +129,11 @@ public abstract class SpoofOuterProduct extends SpoofOperator ArrayList<ParOuterProdAggTask> tasks = new ArrayList<ParOuterProdAggTask>(); //create tasks (for wdivmm-left, parallelization over columns; //for wdivmm-right, parallelization over rows; both ensure disjoint results) - int blklen = (int)(Math.ceil((double)m/numThreads)); - for( int i=0; i<numThreads & i*blklen<m; i++ ) - tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n)); + int numThreads2 = UtilFunctions.roundToNext(Math.min(8*k,m/32), k); + int blklen = (int)(Math.ceil((double)m/numThreads2)); + for( int i=0; i<numThreads2 & i*blklen<m; i++ ) + tasks.add(new ParOuterProdAggTask(inputs.get(0), ab[0], ab[1], b, scalars, + m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n)); //execute tasks List<Future<Double>> taskret = pool.invokeAll(tasks); pool.shutdown(); @@ -292,8 +295,9 @@ public abstract class SpoofOuterProduct extends SpoofOperator } else { //right or cell-wise //parallelize over row partitions - int blklen = (int)(Math.ceil((double)m/numThreads)); - for( int i=0; i<numThreads & i*blklen<m; i++ ) + int numThreads2 = UtilFunctions.roundToNext(Math.min(8*k,m/32), k); + int blklen = (int)(Math.ceil((double)m/numThreads2)); + for( int i=0; i<numThreads2 & i*blklen<m; i++ ) tasks.add(new ParExecTask(a, ab[0], ab[1], b, scalars, out, m, n, k, _outerProductType, i*blklen, Math.min((i+1)*blklen,m), 0, n)); } List<Future<Long>> taskret = pool.invokeAll(tasks); @@ -378,7 +382,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator //with custom blocksizeJ for wdivmm_left to avoid LLC misses on output. final int blocksizeI = (int) (8L*m*n/nnz); final int blocksizeJ = left ? Math.max(8,Math.min(L2_CACHESIZE/(k*8), blocksizeI)) : blocksizeI; - int[] curk = new int[blocksizeI]; + int[] curk = new int[Math.min(blocksizeI,ru-rl)]; for( int bi = rl; bi < ru; bi+=blocksizeI ) { @@ -404,8 +408,8 @@ public abstract class SpoofOuterProduct extends SpoofOperator int index = wpos + curk[i-bi]; for( ; index<wpos+wlen && wix[index]<bjmin; index++ ) { - genexecDense( wval[index], u, uix, v, wix[index]*k, b, scalars, c, - (left ? wix[index]*k : uix), m, n, k, i, wix[index] ); + genexecDense(wval[index], u, uix, v, wix[index]*k, b, scalars, c, + (left ? wix[index]*k : uix), m, n, k, i, wix[index]); } curk[i-bi] = index - wpos; } @@ -417,7 +421,7 @@ public abstract class SpoofOuterProduct extends SpoofOperator MatrixBlock out, int m, int n, int k, long nnz, OutProdType type, int rl, int ru, int cl, int cu ) { final int blocksizeIJ = (int) (8L*m*n/nnz); - int[] curk = new int[blocksizeIJ]; + int[] curk = new int[Math.min(blocksizeIJ, ru-rl)]; if( !out.isInSparseFormat() ) //DENSE {
