Repository: systemml
Updated Branches:
  refs/heads/master ea2a6e491 -> e106966a9


[SYSTEMML-1958] Performance conv2d (nnz maintenance, bias add)

This patch makes some additional minor performance improvements to the
conv2d operations:

1) Per-thread (multi-threaded) nnz maintenance to exploit temporal
locality and reduce the serial fraction for better utilization.

2) Removed unnecessary nnz maintenance of matrix mult outputs, which are
anyway consumed under the assumption of dense outputs.

3) Bias add per row instead of once per thread-local output in order to
exploit temporal locality. This also replaces the actual bias add
computation with a more efficient implementation.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cf95849f
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cf95849f
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cf95849f

Branch: refs/heads/master
Commit: cf95849f147f6e73c61cf6b5a132fe172506bc2b
Parents: ea2a6e4
Author: Matthias Boehm <[email protected]>
Authored: Thu Oct 12 17:38:51 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Thu Oct 12 20:46:43 2017 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 13 ++++----
 .../matrix/data/LibMatrixDNNConv2dHelper.java   | 31 +++++++++++---------
 .../runtime/matrix/data/LibMatrixDNNHelper.java | 19 ++++--------
 .../runtime/matrix/data/LibMatrixMult.java      | 29 +++++++++++++-----
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  4 +++
 5 files changed, 57 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 40192de..3f67b1a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -164,10 +164,10 @@ public class LibMatrixDNN {
                if(isEligibleForConv2dSparse(params))
                        Statistics.numNativeSparseConv2dCalls.increment();
                
-               execute(LibMatrixDNNHelper.getConv2dWorkers(params), params);
+               long nnz = execute(LibMatrixDNNHelper.getConv2dWorkers(params), 
params);
                
                //post-processing: maintain nnz
-               outputBlock.recomputeNonZeros(); 
+               outputBlock.setNonZeros(nnz);
                outputBlock.examSparsity();
        }
        
@@ -552,14 +552,15 @@ public class LibMatrixDNN {
         * @param params convolution parameters
         * @throws DMLRuntimeException if the error occurs
         */
-       private static void execute(ArrayList<Callable<Long>> tasks, 
ConvolutionParameters params) throws DMLRuntimeException {
+       private static long execute(ArrayList<Callable<Long>> tasks, 
ConvolutionParameters params) throws DMLRuntimeException {
                int k = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               long lnnz = 0;
                try {
                        if(k == 1) {
                                // Single-threaded execution when called in 
parfor
                                // this avoid unnecessary creation of 
threadpool.
                                for(Callable<Long> task : tasks) {
-                                       task.call();
+                                       lnnz += task.call();
                                }
                        }
                        else {
@@ -567,12 +568,14 @@ public class LibMatrixDNN {
                                List<Future<Long>> taskret = 
pool.invokeAll(tasks);
                                pool.shutdown();
                                for( Future<Long> task : taskret )
-                                       task.get();
+                                       lnnz += task.get();
                        }
                } 
                catch (Exception e) {
                        throw new DMLRuntimeException("Error while executing 
multi-threaded tasks", e);
                }
+               
+               return lnnz;
        }
        
        static boolean 
isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {

http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
index 1036af7..876996f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java
@@ -70,16 +70,17 @@ public class LibMatrixDNNConv2dHelper {
                                        // Add the matrix matMultOutBlock of 
shape [K X PQ] to params.output.denseBlock + destPos
                                        add(matMultOutBlock, 
_params.output.getDenseBlock(), n*K*PQ, K, PQ);
                                }
-                       }
-                       if(_params.bias != null) {
-                               // bias is always converted to dense format
-                               LibMatrixDNNHelper.addBias(_rl, _ru, 
_params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
+                               // Add bias to current row if necessary, always 
dense
+                               if(_params.bias != null)
+                                       LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
                        }
                        if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
                                
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
                        }
-                       return 0L;
+                       
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
                }
                
                // Copy the matrix src of shape [K X PQ] to 
params.output.denseBlock + destPos
@@ -105,9 +106,7 @@ public class LibMatrixDNNConv2dHelper {
                                        }
                                }
                                else {
-                                       for(int i = 0; i < K * PQ; i++) {
-                                               dest[destPos+i] += 
src.denseBlock[i];
-                                       }
+                                       LibMatrixMult.vectAdd(src.denseBlock, 
dest, 0, destPos, K*PQ);
                                }
                        }
                }
@@ -151,16 +150,19 @@ public class LibMatrixDNNConv2dHelper {
                                
                                // Copy the matrix matMultOutBlock of shape [K 
X PQ] to params.output.denseBlock + destPos
                                partialCopy1(outMM, 
_params.output.getDenseBlock(), n*K*PQ, K, PQ);
+                               
+                               // Add bias to current row if necessary, always 
dense
+                               if(_params.bias != null)
+                                       LibMatrixDNNHelper.addBias(n, 
_params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
                        }
-                       if(_params.bias != null) {
-                               // bias is always converted to dense format
-                               LibMatrixDNNHelper.addBias(_rl, _ru, 
_params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ);
-                       }
+                       
                        if(DMLScript.STATISTICS && 
LibMatrixDNN.DISPLAY_STATISTICS) {
                                
LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1);
                                
LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2);
                        }
-                       return 0L;
+                       
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
                }
                
                // Copy the matrix src of shape [K X PQ] to 
params.output.denseBlock + destPos
@@ -216,7 +218,8 @@ public class LibMatrixDNNConv2dHelper {
                                        System.arraycopy(temp, 0, 
_params.output.denseBlock, n*KPQ, KPQ);
                                }
                        }
-                       return 0L;
+                       //multi-threaded nnz maintenance of current working set
+                       return _params.output.recomputeNonZeros(_rl, _ru-1);
                }
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
index ff932fa..276a78e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java
@@ -308,6 +308,7 @@ public class LibMatrixDNNHelper {
                        prepNonZerosForMatrixMult(m1, recomputeNNZM1);
                        prepNonZerosForMatrixMult(m2, recomputeNNZM2);
                        LibMatrixMult.matrixMult(m1, m2, ret, false);
+                       ret.setNonZeros((long)ret.rlen*ret.clen);
                }
                else {
                        ret.sparse = false;
@@ -319,17 +320,9 @@ public class LibMatrixDNNHelper {
                }
        }
        
-       static void addBias(int _rl, int _ru, double [] outputArr, double [] 
biasArr, int K, int PQ) {
-               // double [] biasArr = _params.bias.getDenseBlock();
-               
-               int index = _rl*K*PQ;
-               for(int n = _rl; n < _ru; n++) {
-                       for(int k = 0; k < K; k++) {
-                               for(int pq = 0; pq < PQ; pq++, index++) {
-                                       outputArr[index] += biasArr[k];
-                               }
-                       }
-               }
+       static void addBias(int r, double [] out, double [] bias, int K, int 
PQ) {
+               for(int k=0, cix=r*K*PQ; k<K; k++, cix+=PQ)
+                       LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ);
        }
        
        /**
@@ -555,8 +548,8 @@ public class LibMatrixDNNHelper {
                //non-zeros are not evaluated for dense matrix multiplies
                //so we simply need to ensure the block is not marked empty 
                if( !mb.isInSparseFormat() )
-                       mb.setNonZeros(mb.getNumRows() * mb.getNumColumns());
+                       mb.setNonZeros((long)mb.getNumRows() * 
mb.getNumColumns());
                else
-                       mb.recomputeNonZeros(); 
+                       mb.recomputeNonZeros();
        }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 181ff98..190d807 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -95,13 +95,13 @@ public class LibMatrixMult
         * @param m1 first matrix
         * @param m2 second matrix
         * @param ret result matrix
-        * @param examSparsity if false, sparsity examination is disabled
+        * @param maintainNnz if false, nnzs are not recomputed and evaluated
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, boolean examSparsity) 
+       public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, boolean maintainNnz) 
                        throws DMLRuntimeException
        {       
-               matrixMult(m1, m2, ret, 0, m1.rlen, examSparsity);
+               matrixMult(m1, m2, ret, 0, m1.rlen, maintainNnz);
        }
        
        public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, int rl, int ru) 
@@ -110,7 +110,7 @@ public class LibMatrixMult
                matrixMult(m1, m2, ret, rl, ru, true);
        }
        
-       public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, int rl, int ru, boolean examSparsity) 
+       public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, int rl, int ru, boolean maintainNnz) 
                throws DMLRuntimeException
        {
                //check inputs / outputs
@@ -146,10 +146,11 @@ public class LibMatrixMult
                        matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2);
                
                //post-processing: nnz/representation
-               if( !ret.sparse )
-                       ret.recomputeNonZeros();
-               if(examSparsity)
+               if( maintainNnz ) {
+                       if( !ret.sparse )
+                               ret.recomputeNonZeros();
                        ret.examSparsity();
+               }
                
                //System.out.println("MM 
("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x"
 +
                //              
"("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+")
 in "+time.stop());
@@ -3282,6 +3283,20 @@ public class LibMatrixMult
                        c[ ci+7 ] += a1[ ai+7 ] + a2[ ai+7 ] + a3[ ai+7 ] + a4[ 
ai+7 ];
                }
        }
+       
+       public static void vectAddInPlace(double aval, double[] c, final int 
ci, final int len) {
+               final int bn = len%8;
+               //rest, not aligned to 8-blocks
+               for( int j = ci; j < ci+bn; j++)
+                       c[ j ] += aval;
+               //unrolled 8-block  (for better instruction-level parallelism)
+               for( int j = ci+bn; j < ci+len; j+=8) {
+                       c[ j+0 ] += aval; c[ j+1 ] += aval; 
+                       c[ j+2 ] += aval; c[ j+3 ] += aval;
+                       c[ j+4 ] += aval; c[ j+5 ] += aval;
+                       c[ j+6 ] += aval; c[ j+7 ] += aval;
+               }
+       }
 
        private static void vectSubtract( double[] a, double[] c, int ai, int 
ci, final int len )
        {

http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index 822c6f9..432b18a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -1217,6 +1217,10 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                return nonZeros;
        }
        
+       public long recomputeNonZeros(int rl, int ru) {
+               return recomputeNonZeros(rl, ru, 0, clen-1);
+       }
+       
        /**
         * Recomputes the number of non-zero values of a specified 
         * range of the matrix block. NOTE: This call does not materialize

Reply via email to