Repository: incubator-systemml
Updated Branches:
  refs/heads/master 2ebf885a6 -> ab45af17c
[SYSTEMML-769] Minor improvement for dense-dense conv2d and added statistics method for performance debugging

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/ab45af17
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/ab45af17
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/ab45af17

Branch: refs/heads/master
Commit: ab45af17c3ff54a77262a318c5d0be084384b8f7
Parents: 2ebf885
Author: Niketan Pansare <[email protected]>
Authored: Sat Jul 9 09:33:52 2016 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Sat Jul 9 09:33:52 2016 -0700

----------------------------------------------------------------------
 .../sysml/runtime/matrix/data/LibMatrixDNN.java | 144 ++++++++++++-------
 1 file changed, 94 insertions(+), 50 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/ab45af17/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index 26e2b8b..3014b49 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -32,11 +32,11 @@ import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 
-
 public class LibMatrixDNN {
 
 	protected static final Log LOG = LogFactory.getLog(LibMatrixDNN.class.getName());
@@ -77,6 +77,44 @@ public class LibMatrixDNN {
int maxCommonIndexS; } + private static AtomicLong conv2dSparseCount = new AtomicLong(0); + private static AtomicLong conv2dDenseCount = new AtomicLong(0); + private static AtomicLong conv2dBwdFilterSparseCount = new AtomicLong(0); + private static AtomicLong conv2dBwdFilterDenseCount = new AtomicLong(0); + private static AtomicLong conv2dBwdDataSparseCount = new AtomicLong(0); + private static AtomicLong conv2dBwdDataDenseCount = new AtomicLong(0); + private static AtomicLong im2colSparseCount = new AtomicLong(0); + private static AtomicLong im2colDenseCount = new AtomicLong(0); + private static AtomicLong maxPoolBwdSparseCount = new AtomicLong(0); + private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0); + public static void appendStatistics(StringBuilder sb) { + sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" + + conv2dDenseCount.get() + "/" + + conv2dBwdFilterDenseCount.get() + "/" + + conv2dBwdDataDenseCount.get() + "/" + + im2colDenseCount.get() + "/" + + maxPoolBwdDenseCount.get() + ".\n"); + sb.append("LibMatrixDNN sparse count (conv/bwdF/bwdD/im2col/maxBwd):\t" + + conv2dSparseCount.get() + "/" + + conv2dBwdFilterSparseCount.get() + "/" + + conv2dBwdDataSparseCount.get() + "/" + + im2colSparseCount.get() + "/" + + maxPoolBwdSparseCount.get() + ".\n"); + } + public static void resetStatistics() { + conv2dDenseCount.set(0); + conv2dBwdFilterDenseCount.set(0); + conv2dBwdDataDenseCount.set(0); + im2colDenseCount.set(0); + maxPoolBwdDenseCount.set(0); + + conv2dSparseCount.set(0); + conv2dBwdFilterSparseCount.set(0); + conv2dBwdDataSparseCount.set(0); + im2colSparseCount.set(0); + maxPoolBwdSparseCount.set(0); + } + public static class ConvolutionParameters { public int N; public int C; public int H; public int W; public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w; @@ -169,6 +207,15 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Only positive 
strides supported"); } + if(DMLScript.STATISTICS) { + if(input.isInSparseFormat() || dout.isInSparseFormat()) { + conv2dBwdFilterSparseCount.addAndGet(1); + } + else { + conv2dBwdFilterDenseCount.addAndGet(1); + } + } + int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { for (int c = 0; c < params.C; c++) { @@ -366,6 +413,15 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Incorrect input to conv2d"); } + if(DMLScript.STATISTICS) { + if(input.isInSparseFormat() || filter.isInSparseFormat()) { + conv2dSparseCount.addAndGet(1); + } + else { + conv2dDenseCount.addAndGet(1); + } + } + params.tmpData = new TemporaryConvolutionData(); if(input.isInSparseFormat()) { params.tmpData.minIndexArrR = new int[params.H]; @@ -433,6 +489,15 @@ public class LibMatrixDNN { if(dout.getNumColumns() != params.C*params.P*params.Q || dout.getNumRows() != params.N) { throw new DMLRuntimeException("Incorrect dout dimensions in maxpooling_backward:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q); } + + if(DMLScript.STATISTICS) { + if(input.isInSparseFormat() || dout.isInSparseFormat()) { + maxPoolBwdSparseCount.addAndGet(1); + } + else { + maxPoolBwdDenseCount.addAndGet(1); + } + } int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { @@ -455,41 +520,10 @@ public class LibMatrixDNN { int [] minIndexArrS = params.tmpData.minIndexArrS; int [] maxIndexArrS = params.tmpData.maxIndexArrS; - int minCommonIndexS = params.tmpData.minCommonIndexS; - int maxCommonIndexS = params.tmpData.maxCommonIndexS; - + final int minCommonIndexS = params.tmpData.minCommonIndexS; + final int maxCommonIndexS = params.tmpData.maxCommonIndexS; - int minS = 0; - if(params.S >= 4) { - minS = params.S - params.S % 4; - for (int n = n1; n < n2; n++) { - for (int 
c = 0; c < params.C; c++) { - for (int r = 0; r < params.R; r++) { - final int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S; - for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) { - final int h = p*params.stride_h + r - params.pad_h; - final int inputOffSet = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W - params.pad_w; - final int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q; - // ------------------------------------------------------------------------ - // Efficient striding with vectorization - for (int q = minCommonIndexS; q < maxCommonIndexS; q++) { - final int wOffset = inputOffSet + q*params.stride_w; - final int outOffsetWithQ = outputOffset + q; - for (int s = 0; s < minS; s += 4) { - final int inOffsetWithS = wOffset + s; - final int filterOffsetWithS = filterOffset + s; - outputArray[outOffsetWithQ] += inputArray[inOffsetWithS]*filterArray[filterOffsetWithS] - + inputArray[inOffsetWithS+1]*filterArray[filterOffsetWithS+1] - + inputArray[inOffsetWithS+2]*filterArray[filterOffsetWithS+2] - + inputArray[inOffsetWithS+3]*filterArray[filterOffsetWithS+3]; - } - } - // ------------------------------------------------------------------------ - } - } - } - } - } + final int minS = (params.S >= 4) ? 
(params.S - params.S % 4) : 0; for (int n = n1; n < n2; n++) { for (int c = 0; c < params.C; c++) { @@ -499,28 +533,28 @@ public class LibMatrixDNN { final int h = p*params.stride_h + r - params.pad_h; final int inputOffSet = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W - params.pad_w; final int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q; - // ------------------------------------------------------------------------ - // Efficient striding + for (int q = minCommonIndexS; q < maxCommonIndexS; q++) { final int wOffset = inputOffSet + q*params.stride_w; + // ------------------------------------------------------------------------ + // Efficient striding with vectorization + final int outOffsetWithQ = outputOffset + q; + for (int s = 0; s < minS; s += 4) { + final int inOffsetWithS = wOffset + s; + final int filterOffsetWithS = filterOffset + s; + outputArray[outOffsetWithQ] += inputArray[inOffsetWithS]*filterArray[filterOffsetWithS] + + inputArray[inOffsetWithS+1]*filterArray[filterOffsetWithS+1] + + inputArray[inOffsetWithS+2]*filterArray[filterOffsetWithS+2] + + inputArray[inOffsetWithS+3]*filterArray[filterOffsetWithS+3]; + } + // ------------------------------------------------------------------------ + // Efficient striding without vectorization for (int s = minS; s < params.S; s++) { outputArray[outputOffset + q] += inputArray[wOffset + s]*filterArray[filterOffset + s]; } + // ------------------------------------------------------------------------ } // ------------------------------------------------------------------------ - } - } - } - - - for (int c = 0; c < params.C; c++) { - for (int r = 0; r < params.R; r++) { - final int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S; - for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) { - final int h = p*params.stride_h + r - params.pad_h; - final int inputOffSet = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W - 
params.pad_w; - final int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q; - // ------------------------------------------------------------------------ // Inefficient striding for (int s = 0; s < params.S; s++) { for (int q = minIndexArrS[s]; q < minCommonIndexS; q++) { @@ -1032,6 +1066,16 @@ public class LibMatrixDNN { params.output = outputBlock; params.outputNNZ.set(0); + + if(DMLScript.STATISTICS) { + if(input.isInSparseFormat()) { + im2colSparseCount.addAndGet(1); + } + else { + im2colDenseCount.addAndGet(1); + } + } + int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { for (int n = 0; n < params.N; n++) { // Do following for all images
