Repository: systemml Updated Branches: refs/heads/master ea2a6e491 -> e106966a9
[SYSTEMML-1958] Performance conv2d (nnz maintenance, bias add) This patch makes some additional minor performance improvements to the conv2d operations: 1) Multi-threaded nnz maintenance, where each thread counts the non-zeros of its own working set, to exploit temporal locality and reduce the serial fraction for better utilization. 2) Removed unnecessary nnz maintenance of matrix mult outputs, which are consumed under the assumption of dense outputs anyway. 3) Bias add per row instead of once per thread-local output in order to exploit temporal locality. This also replaces the actual bias add computation with a more efficient implementation. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cf95849f Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cf95849f Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cf95849f Branch: refs/heads/master Commit: cf95849f147f6e73c61cf6b5a132fe172506bc2b Parents: ea2a6e4 Author: Matthias Boehm <[email protected]> Authored: Thu Oct 12 17:38:51 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Thu Oct 12 20:46:43 2017 -0700 ---------------------------------------------------------------------- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 13 ++++---- .../matrix/data/LibMatrixDNNConv2dHelper.java | 31 +++++++++++--------- .../runtime/matrix/data/LibMatrixDNNHelper.java | 19 ++++-------- .../runtime/matrix/data/LibMatrixMult.java | 29 +++++++++++++----- .../sysml/runtime/matrix/data/MatrixBlock.java | 4 +++ 5 files changed, 57 insertions(+), 39 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index 40192de..3f67b1a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -164,10 +164,10 @@ public class LibMatrixDNN { if(isEligibleForConv2dSparse(params)) Statistics.numNativeSparseConv2dCalls.increment(); - execute(LibMatrixDNNHelper.getConv2dWorkers(params), params); + long nnz = execute(LibMatrixDNNHelper.getConv2dWorkers(params), params); //post-processing: maintain nnz - outputBlock.recomputeNonZeros(); + outputBlock.setNonZeros(nnz); outputBlock.examSparsity(); } @@ -552,14 +552,15 @@ public class LibMatrixDNN { * @param params convolution parameters * @throws DMLRuntimeException if the error occurs */ - private static void execute(ArrayList<Callable<Long>> tasks, ConvolutionParameters params) throws DMLRuntimeException { + private static long execute(ArrayList<Callable<Long>> tasks, ConvolutionParameters params) throws DMLRuntimeException { int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + long lnnz = 0; try { if(k == 1) { // Single-threaded execution when called in parfor // this avoid unnecessary creation of threadpool. 
for(Callable<Long> task : tasks) { - task.call(); + lnnz += task.call(); } } else { @@ -567,12 +568,14 @@ public class LibMatrixDNN { List<Future<Long>> taskret = pool.invokeAll(tasks); pool.shutdown(); for( Future<Long> task : taskret ) - task.get(); + lnnz += task.get(); } } catch (Exception e) { throw new DMLRuntimeException("Error while executing multi-threaded tasks", e); } + + return lnnz; } static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) { http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java index 1036af7..876996f 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java @@ -70,16 +70,17 @@ public class LibMatrixDNNConv2dHelper { // Add the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos add(matMultOutBlock, _params.output.getDenseBlock(), n*K*PQ, K, PQ); } - } - if(_params.bias != null) { - // bias is always converted to dense format - LibMatrixDNNHelper.addBias(_rl, _ru, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); + // Add bias to current row if necessary, always dense + if(_params.bias != null) + LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); } if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); } - return 0L; + + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); } // Copy the 
matrix src of shape [K X PQ] to params.output.denseBlock + destPos @@ -105,9 +106,7 @@ public class LibMatrixDNNConv2dHelper { } } else { - for(int i = 0; i < K * PQ; i++) { - dest[destPos+i] += src.denseBlock[i]; - } + LibMatrixMult.vectAdd(src.denseBlock, dest, 0, destPos, K*PQ); } } } @@ -151,16 +150,19 @@ public class LibMatrixDNNConv2dHelper { // Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos partialCopy1(outMM, _params.output.getDenseBlock(), n*K*PQ, K, PQ); + + // Add bias to current row if necessary, always dense + if(_params.bias != null) + LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); } - if(_params.bias != null) { - // bias is always converted to dense format - LibMatrixDNNHelper.addBias(_rl, _ru, _params.output.getDenseBlock(), _params.bias.getDenseBlock(), K, PQ); - } + if(DMLScript.STATISTICS && LibMatrixDNN.DISPLAY_STATISTICS) { LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); } - return 0L; + + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); } // Copy the matrix src of shape [K X PQ] to params.output.denseBlock + destPos @@ -216,7 +218,8 @@ public class LibMatrixDNNConv2dHelper { System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ); } } - return 0L; + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); } } } http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java index ff932fa..276a78e 100644 --- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java @@ -308,6 +308,7 @@ public class LibMatrixDNNHelper { prepNonZerosForMatrixMult(m1, recomputeNNZM1); prepNonZerosForMatrixMult(m2, recomputeNNZM2); LibMatrixMult.matrixMult(m1, m2, ret, false); + ret.setNonZeros((long)ret.rlen*ret.clen); } else { ret.sparse = false; @@ -319,17 +320,9 @@ public class LibMatrixDNNHelper { } } - static void addBias(int _rl, int _ru, double [] outputArr, double [] biasArr, int K, int PQ) { - // double [] biasArr = _params.bias.getDenseBlock(); - - int index = _rl*K*PQ; - for(int n = _rl; n < _ru; n++) { - for(int k = 0; k < K; k++) { - for(int pq = 0; pq < PQ; pq++, index++) { - outputArr[index] += biasArr[k]; - } - } - } + static void addBias(int r, double [] out, double [] bias, int K, int PQ) { + for(int k=0, cix=r*K*PQ; k<K; k++, cix+=PQ) + LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ); } /** @@ -555,8 +548,8 @@ public class LibMatrixDNNHelper { //non-zeros are not evaluated for dense matrix multiplies //so we simply need to ensure the block is not marked empty if( !mb.isInSparseFormat() ) - mb.setNonZeros(mb.getNumRows() * mb.getNumColumns()); + mb.setNonZeros((long)mb.getNumRows() * mb.getNumColumns()); else - mb.recomputeNonZeros(); + mb.recomputeNonZeros(); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index 181ff98..190d807 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -95,13 +95,13 @@ public class LibMatrixMult * @param m1 first matrix * 
@param m2 second matrix * @param ret result matrix - * @param examSparsity if false, sparsity examination is disabled + * @param maintainNnz if false, nnzs are not recomputed and evaluated * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean examSparsity) + public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean maintainNnz) throws DMLRuntimeException { - matrixMult(m1, m2, ret, 0, m1.rlen, examSparsity); + matrixMult(m1, m2, ret, 0, m1.rlen, maintainNnz); } public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) @@ -110,7 +110,7 @@ public class LibMatrixMult matrixMult(m1, m2, ret, rl, ru, true); } - public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) + public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean maintainNnz) throws DMLRuntimeException { //check inputs / outputs @@ -146,10 +146,11 @@ public class LibMatrixMult matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2); //post-processing: nnz/representation - if( !ret.sparse ) - ret.recomputeNonZeros(); - if(examSparsity) + if( maintainNnz ) { + if( !ret.sparse ) + ret.recomputeNonZeros(); ret.examSparsity(); + } //System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" + // "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop()); @@ -3282,6 +3283,20 @@ public class LibMatrixMult c[ ci+7 ] += a1[ ai+7 ] + a2[ ai+7 ] + a3[ ai+7 ] + a4[ ai+7 ]; } } + + public static void vectAddInPlace(double aval, double[] c, final int ci, final int len) { + final int bn = len%8; + //rest, not aligned to 8-blocks + for( int j = ci; j < ci+bn; j++) + c[ j ] += aval; + //unrolled 8-block (for better instruction-level parallelism) + for( int j = 
ci+bn; j < ci+len; j+=8) { + c[ j+0 ] += aval; c[ j+1 ] += aval; + c[ j+2 ] += aval; c[ j+3 ] += aval; + c[ j+4 ] += aval; c[ j+5 ] += aval; + c[ j+6 ] += aval; c[ j+7 ] += aval; + } + } private static void vectSubtract( double[] a, double[] c, int ai, int ci, final int len ) { http://git-wip-us.apache.org/repos/asf/systemml/blob/cf95849f/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index 822c6f9..432b18a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -1217,6 +1217,10 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab return nonZeros; } + public long recomputeNonZeros(int rl, int ru) { + return recomputeNonZeros(rl, ru, 0, clen-1); + } + /** * Recomputes the number of non-zero values of a specified * range of the matrix block. NOTE: This call does not materialize
