Repository: systemml
Updated Branches:
  refs/heads/master 4e3ebcaeb -> 62a1b75ba


[SYSTEMML-1767] Performance codegen rowwise template w/ column agg

This patch makes the codegen row-wise template consistent with the
mmchain operation in terms of its condition to fall back to
single-threaded operations if the temporary memory for partial
aggregations exceeds the internal threshold. On a scenario with 2M sparse
features, this patch improved performance by 20x because it avoids
unnecessary L3 cache thrashing.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/62a1b75b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/62a1b75b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/62a1b75b

Branch: refs/heads/master
Commit: 62a1b75baf5d3ae3225ca4126e5a3ea93aa86a0f
Parents: 4e3ebca
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Wed Jul 12 19:25:52 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Wed Jul 12 19:26:12 2017 -0700

----------------------------------------------------------------------
 .../hops/codegen/template/TemplateRow.java      |  1 +
 .../sysml/runtime/codegen/SpoofRowwise.java     | 20 +++++++++++++-------
 .../runtime/matrix/data/LibMatrixMult.java      |  9 ++++++---
 3 files changed, 20 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
index 0bc0380..5cb016c 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -320,6 +320,7 @@ public class TemplateRow extends TemplateBase
                        //special case for cbind with zeros
                        CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
                        out = new CNodeUnary(cdata1, UnaryType.CBIND0);
+                       inHops.remove(hop.getInput().get(1)); //rm 0-matrix
                }
                else if(hop instanceof BinaryOp)
                {

http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
index 13536d3..dc6baff 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -131,7 +131,7 @@ public abstract class SpoofRowwise extends SpoofOperator
                double[] scalars = prepInputScalars(scalarObjects);
                
                //setup thread-local memory if necessary
-               if( allocTmp )
+               if( allocTmp &&_reqVectMem > 0 )
                        LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
n, n2);
                
                //core sequential execute
@@ -144,7 +144,7 @@ public abstract class SpoofRowwise extends SpoofOperator
                        executeSparse(a.getSparseBlock(), b, scalars, c, n, 0, 
m);
                
                //post-processing
-               if( allocTmp )
+               if( allocTmp &&_reqVectMem > 0 )
                        LibSpoofPrimitives.cleanupThreadLocalMemory();
                out.recomputeNonZeros();
                out.examSparsity();
@@ -155,7 +155,8 @@ public abstract class SpoofRowwise extends SpoofOperator
                throws DMLRuntimeException
        {
                //redirect to serial execution
-               if( k <= 1 || 
(long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD
 ) {
+               if( k <= 1 || (_type.isColumnAgg() && 
!LibMatrixMult.checkParColumnAgg(inputs.get(0), k, false))
+                       || 
(long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD
 ) {
                        execute(inputs, scalarObjects, out);
                        return;
                }
@@ -320,7 +321,8 @@ public abstract class SpoofRowwise extends SpoofOperator
                public double[] call() throws DMLRuntimeException {
                        
                        //allocate vector intermediates and partial output
-                       LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
_clen, _clen2);
+                       if( _reqVectMem > 0 )
+                               
LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
                        double[] c = new double[(_clen2>0)?_clen*_clen2 : 
_clen];
                        
                        if( _a instanceof CompressedMatrixBlock )
@@ -330,7 +332,8 @@ public abstract class SpoofRowwise extends SpoofOperator
                        else
                                executeSparse(_a.getSparseBlock(), _b, 
_scalars, c, _clen, _rl, _ru);
                        
-                       LibSpoofPrimitives.cleanupThreadLocalMemory();
+                       if( _reqVectMem > 0 )
+                               LibSpoofPrimitives.cleanupThreadLocalMemory();
                        return c;
                }
        }
@@ -363,7 +366,8 @@ public abstract class SpoofRowwise extends SpoofOperator
                @Override
                public Long call() throws DMLRuntimeException {
                        //allocate vector intermediates
-                       LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
_clen, _clen2);
+                       if( _reqVectMem > 0 )
+                               
LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, _clen, _clen2);
                        
                        if( _a instanceof CompressedMatrixBlock )
                                executeCompressed((CompressedMatrixBlock)_a, 
_b, _scalars, _c.getDenseBlock(), _clen, _rl, _ru);
@@ -371,7 +375,9 @@ public abstract class SpoofRowwise extends SpoofOperator
                                executeDense(_a.getDenseBlock(), _b, _scalars, 
_c.getDenseBlock(), _clen, _rl, _ru);
                        else
                                executeSparse(_a.getSparseBlock(), _b, 
_scalars, _c.getDenseBlock(), _clen, _rl, _ru);
-                       LibSpoofPrimitives.cleanupThreadLocalMemory();
+                       
+                       if( _reqVectMem > 0 )
+                               LibSpoofPrimitives.cleanupThreadLocalMemory();
                        
                        //maintain nnz for row partition
                        return _c.recomputeNonZeros(_rl, _ru-1, 0, 
_c.getNumColumns()-1);

http://git-wip-us.apache.org/repos/asf/systemml/blob/62a1b75b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index da3b12b..30e7d3d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -309,9 +309,7 @@ public class LibMatrixMult
                
                //check too high additional memory requirements (fallback to 
sequential)
                //check too small workload in terms of flops (fallback to 
sequential too)
-               if( 8L * mV.rlen * k > MEM_OVERHEAD_THRESHOLD 
-                       || 4L * mX.rlen * mX.clen < PAR_MINFLOP_THRESHOLD) 
-               { 
+               if( !checkParColumnAgg(mX, k, true) ) { 
                        matrixMultChain(mX, mV, mW, ret, ct);
                        return;
                }
@@ -3531,6 +3529,11 @@ public class LibMatrixMult
                return m1rlen > m2clen && m2rlen > m2clen && m2clen > 1 
                        && m2clen < 64 && 8*m2rlen*m2clen < L2_CACHESIZE;
        }
+       
+       public static boolean checkParColumnAgg(MatrixBlock m1, int k, boolean 
inclFLOPs) {
+               return (8L * m1.clen * k <= MEM_OVERHEAD_THRESHOLD 
+                       && (!inclFLOPs || 4L * m1.rlen * m1.clen >= 
PAR_MINFLOP_THRESHOLD));
+       }
 
        private static boolean checkParMatrixMultRightInputRows( MatrixBlock 
m1, MatrixBlock m2, int k ) {
                //parallelize over rows in rhs matrix if number of rows in 
lhs/output is very small

Reply via email to