Repository: systemml Updated Branches: refs/heads/master 4e3ebcaeb -> 62a1b75ba
[SYSTEMML-1767] Performance codegen rowwise template w/ column agg This patch makes the codegen row-wise template consistent with the mmchain operation in terms of its condition to fall back to single-threaded operations if the temporary memory for partial aggregations exceeds the internal threshold. On a scenario with 2M sparse features, this patch improved performance by 20x because it avoids unnecessary L3 cache thrashing. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/62a1b75b Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/62a1b75b Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/62a1b75b Branch: refs/heads/master Commit: 62a1b75baf5d3ae3225ca4126e5a3ea93aa86a0f Parents: 4e3ebca Author: Matthias Boehm <mboe...@gmail.com> Authored: Wed Jul 12 19:25:52 2017 -0700 Committer: Matthias Boehm <mboe...@gmail.com> Committed: Wed Jul 12 19:26:12 2017 -0700 ---------------------------------------------------------------------- .../hops/codegen/template/TemplateRow.java | 1 + .../sysml/runtime/codegen/SpoofRowwise.java | 20 +++++++++++++------- .../runtime/matrix/data/LibMatrixMult.java | 9 ++++++--- 3 files changed, 20 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java index 0bc0380..5cb016c 100644 --- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java +++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java @@ -320,6 +320,7 @@ public class TemplateRow extends TemplateBase //special case for cbind with zeros CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID()); out = new CNodeUnary(cdata1, UnaryType.CBIND0); + inHops.remove(hop.getInput().get(1)); //rm 0-matrix } else if(hop instanceof BinaryOp) { http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java index 13536d3..dc6baff 100644 --- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java +++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java @@ -131,7 +131,7 @@ public abstract class SpoofRowwise extends SpoofOperator double[] scalars = prepInputScalars(scalarObjects); //setup thread-local memory if necessary - if( allocTmp ) + if( allocTmp &&_reqVectMem > 0 ) LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n, n2); //core sequential execute @@ -144,7 +144,7 @@ public abstract class SpoofRowwise extends SpoofOperator executeSparse(a.getSparseBlock(), b, scalars, c, n, 0, m); //post-processing - if( allocTmp ) + if( allocTmp &&_reqVectMem > 0 ) LibSpoofPrimitives.cleanupThreadLocalMemory(); out.recomputeNonZeros(); out.examSparsity(); @@ -155,7 +155,8 @@ public abstract class SpoofRowwise extends SpoofOperator throws DMLRuntimeException { //redirect to serial execution - if( k <= 1 || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) { + if( k <= 1 || (_type.isColumnAgg() && !LibMatrixMult.checkParColumnAgg(inputs.get(0), k, false)) + || (long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD ) { execute(inputs, scalarObjects, out); return; } @@ -320,7 +321,8 @@ public abstract class SpoofRowwise extends SpoofOperator public double[] call() throws DMLRuntimeException { //allocate vector intermediates and partial output - 
LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2); + if( _reqVectMem > 0 ) + LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2); double[] c = new double[(_clen2>0)?_clen*_clen2 : _clen]; if( _a instanceof CompressedMatrixBlock ) @@ -330,7 +332,8 @@ public abstract class SpoofRowwise extends SpoofOperator else executeSparse(_a.getSparseBlock(), _b, _scalars, c, _clen, _rl, _ru); - LibSpoofPrimitives.cleanupThreadLocalMemory(); + if( _reqVectMem > 0 ) + LibSpoofPrimitives.cleanupThreadLocalMemory(); return c; } } @@ -363,7 +366,8 @@ public abstract class SpoofRowwise extends SpoofOperator @Override public Long call() throws DMLRuntimeException { //allocate vector intermediates - LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2); + if( _reqVectMem > 0 ) + LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2); if( _a instanceof CompressedMatrixBlock ) executeCompressed((CompressedMatrixBlock)_a, _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru); @@ -371,7 +375,9 @@ public abstract class SpoofRowwise extends SpoofOperator executeDense(_a.getDenseBlock(), _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru); else executeSparse(_a.getSparseBlock(), _b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru); - LibSpoofPrimitives.cleanupThreadLocalMemory(); + + if( _reqVectMem > 0 ) + LibSpoofPrimitives.cleanupThreadLocalMemory(); //maintain nnz for row partition return _c.recomputeNonZeros(_rl, _ru-1, 0, _c.getNumColumns()-1); http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index da3b12b..30e7d3d 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -309,9 +309,7 @@ public class LibMatrixMult //check too high additional memory requirements (fallback to sequential) //check too small workload in terms of flops (fallback to sequential too) - if( 8L * mV.rlen * k > MEM_OVERHEAD_THRESHOLD - || 4L * mX.rlen * mX.clen < PAR_MINFLOP_THRESHOLD) - { + if( !checkParColumnAgg(mX, k, true) ) { matrixMultChain(mX, mV, mW, ret, ct); return; } @@ -3531,6 +3529,11 @@ public class LibMatrixMult return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 && m2clen < 64 && 8*m2rlen*m2clen < L2_CACHESIZE; } + + public static boolean checkParColumnAgg(MatrixBlock m1, int k, boolean inclFLOPs) { + return (8L * m1.clen * k <= MEM_OVERHEAD_THRESHOLD + && (!inclFLOPs || 4L * m1.rlen * m1.clen >= PAR_MINFLOP_THRESHOLD)); + } private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) { //parallelize over rows in rhs matrix if number of rows in lhs/output is very small