Repository: incubator-systemml Updated Branches: refs/heads/master 1570ee0e1 -> efbfd987c
[SYSTEMML-769] Adding a memoryless loop-based convolution operator that is useful in low-memory situations. The initial set of experiments on Lenet has shown that there is a performance degradation due to Cache RLS for the im2col approach with a large number of images and a small memory budget (as an example: with a 20g JVM, the overhead is an additional 1000 seconds). This degradation is not observed when we increase the memory budget (as an example: with an 80g JVM). Additional features: 1. Moved setting of nnz from instruction to LibMatrixDNN 2. Improved the degree of parallelism of ConvTask TODOs: - Improve the performance of LibMatrixDNN.doConv2d_Backward_Filter. Since this was not among the top 10 heavy hitters, I have left this for a subsequent PR. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/efbfd987 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/efbfd987 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/efbfd987 Branch: refs/heads/master Commit: efbfd987c9d27aefdfa45525bd09c4c8b40482df Parents: 1570ee0 Author: Niketan Pansare <[email protected]> Authored: Wed Jul 6 11:50:47 2016 -0700 Committer: Niketan Pansare <[email protected]> Committed: Wed Jul 6 11:50:47 2016 -0700 ---------------------------------------------------------------------- .../org/apache/sysml/hops/ConvolutionOp.java | 5 +- .../instructions/CPInstructionParser.java | 4 +- .../cp/ConvolutionCPInstruction.java | 26 +- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 321 ++++++++++++++++++- .../sysml/runtime/util/ConvolutionUtils.java | 42 ++- .../functions/tensor/Conv2DBackwardTest.java | 56 +++- .../functions/tensor/Conv2DTest.java | 58 +++- 7 files changed, 473 insertions(+), 39 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/hops/ConvolutionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java index 3ec597b..3da2cd2 100644 --- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java +++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java @@ -39,6 +39,8 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop private Hop.ConvOp op; private int _maxNumThreads = -1; //-1 for unlimited + + public static boolean FORCE_NON_IM2COL = false; private ConvolutionOp() { //default constructor for clone @@ -173,8 +175,9 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop } Lop in = inputs.get(0).constructLops(); + int numThreads = et == ExecType.CP ? OptimizerUtils.getConstrainedNumThreads(primaryOp.getMaxNumThreads()) : 1; ConvolutionTransform transform1 = new ConvolutionTransform( in, - HopsConv2Lops.get(op), primaryOp.getDataType(), primaryOp.getValueType(), et, 1); + HopsConv2Lops.get(op), primaryOp.getDataType(), primaryOp.getValueType(), et, numThreads); // setOutputDimensions(transform1); transform1.getOutputParameters().setDimensions( http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java index 23ca5ef..f3a1621 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java @@ -220,7 +220,9 @@ public class CPInstructionParser extends InstructionParser 
String2CPInstructionType.put( "col2im" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "maxpooling" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "maxpooling_backward" , CPINSTRUCTION_TYPE.Convolution); - + String2CPInstructionType.put( "conv2d" , CPINSTRUCTION_TYPE.Convolution); + String2CPInstructionType.put( "conv2d_backward_filter" , CPINSTRUCTION_TYPE.Convolution); + // Quaternary instruction opcodes String2CPInstructionType.put( "wsloss" , CPINSTRUCTION_TYPE.Quaternary); String2CPInstructionType.put( "wsigmoid", CPINSTRUCTION_TYPE.Quaternary); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 1030071..15b043f 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -116,7 +116,9 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { padding, input_shape, filter_shape, k); } else if (opcode.equalsIgnoreCase("pooling_backward_reshape") - || opcode.equalsIgnoreCase("maxpooling_backward")) { + || opcode.equalsIgnoreCase("maxpooling_backward") + || opcode.equalsIgnoreCase("conv2d") + || opcode.equalsIgnoreCase("conv2d_backward_filter")) { InstructionUtils.checkNumFields(parts, 16); // dout, stride1, stride2, padding1, padding2 // input_shape1, input_shape2, input_shape3, input_shape4, @@ -191,7 +193,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, C * R * S, N * P * Q, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.im2col(matBlock, 
outputBlock, params); - outputBlock.setNonZeros(params.outputNNZ.get()); } else if (instOpcode.equalsIgnoreCase("reshape_col")) { checkHeightWidth(ec, params); @@ -200,7 +201,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, N, K * P * Q, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.reshape_col(matBlock, outputBlock, params); - outputBlock.setNonZeros(matBlock.getNonZeros()); // As number of non-zeros doesnot change for reshape_col } else if (instOpcode.equalsIgnoreCase("rotate180")) { checkHeightWidth(ec, params); @@ -208,7 +208,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, N * P * Q, K, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.rotate180(matBlock, outputBlock, params); - outputBlock.setNonZeros(matBlock.getNonZeros()); // As number of non-zeros doesnot change for rotate180 } else if (instOpcode.equalsIgnoreCase("col2im")) { checkHeightWidth(ec, params); @@ -216,7 +215,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { // needs to be zeroed-out outputBlock = getDenseOutputBlock(ec, N, C * H * W, false); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); - LibMatrixDNN.col2im(matBlock, outputBlock, params); // No efficient nnz computation, so setting it to -1 + LibMatrixDNN.col2im(matBlock, outputBlock, params); } else if (instOpcode.equalsIgnoreCase("maxpooling")) { // Is eligible for REUSE_NONZEROED_OUTPUT but cannot guarantee that previous output has been rmvar-ed @@ -224,7 +223,6 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, N, C*P*Q, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.maxpooling(matBlock, outputBlock, params); - outputBlock.setNonZeros(params.outputNNZ.get()); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { MatrixBlock dout = 
ec.getMatrixInput(_in2.getName()); @@ -232,7 +230,21 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { // without somewhat expensive HashMap checks outputBlock = getDenseOutputBlock(ec, N, C*H*W, false); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); - LibMatrixDNN.maxpooling_backward(matBlock, dout, outputBlock, params); // No efficient nnz computation, so setting it to -1 + LibMatrixDNN.maxpooling_backward(matBlock, dout, outputBlock, params); + ec.releaseMatrixInput(_in2.getName()); + } + else if (instOpcode.equalsIgnoreCase("conv2d")) { + MatrixBlock filter = ec.getMatrixInput(_in2.getName()); + outputBlock = getDenseOutputBlock(ec, N, K*P*Q, false); + params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); + LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); + ec.releaseMatrixInput(_in2.getName()); + } + else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) { + MatrixBlock filter = ec.getMatrixInput(_in2.getName()); + outputBlock = getDenseOutputBlock(ec, K, C*R*S, false); + params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); + LibMatrixDNN.conv2d_backward_filter(matBlock, filter, outputBlock, params); ec.releaseMatrixInput(_in2.getName()); } else { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index 0565136..0f6b646 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -20,16 +20,20 @@ package org.apache.sysml.runtime.matrix.data; import java.lang.ref.SoftReference; import java.util.ArrayList; +import java.util.List; import java.util.concurrent.Callable; import 
java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.Future; import java.util.concurrent.atomic.AtomicLong; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.util.ConvolutionUtils; + public class LibMatrixDNN { public static final boolean ALLOW_MULTI_THREADED_OPS = true; @@ -56,10 +60,17 @@ public class LibMatrixDNN { } enum TaskType { - ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, MaxPooling_Backward + ReshapeCol, Rotate180, Im2Col, Col2Im, MaxPooling_Forward, MaxPooling_Backward, LoopBasedConv2d } public static final int TASK_SIZE = 64; // to take care of extremely small tasks + public static class TemporaryConvolutionData { + public int [] minIndexArrR; + public int [] minIndexArrS; + public int [] maxIndexArrR; + public int [] maxIndexArrS; + } + public static class ConvolutionParameters { public int N; public int C; public int H; public int W; public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w; @@ -70,6 +81,8 @@ public class LibMatrixDNN { MatrixBlock input1; MatrixBlock input2; MatrixBlock output; boolean reuseNonZeroedOutput = false; + public TemporaryConvolutionData tmpData; + private int convertToInt(long val) throws DMLRuntimeException { if( val > Integer.MAX_VALUE ) { throw new DMLRuntimeException("The value for ConvolutionParameters is too large:" + val); @@ -138,6 +151,146 @@ public class LibMatrixDNN { } } + public static void conv2d_backward_filter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = input; + params.input2 = dout; + params.output = outputBlock; + if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || + 
dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) { + throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter"); + } + + int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { + for (int c = 0; c < params.C; c++) { + for (int k = 0; k < params.K; k++) { + for (int r = 0; r < params.R; r++) { + for (int s = 0; s < params.S; s++) { + doConv2d_Backward_Filter(k, c, r, s, params); + } + } + } + } + } + else { + ArrayList<ConvBackwardFilterTask> tasks = new ArrayList<ConvBackwardFilterTask>(); + for (int c = 0; c < params.C; c++) { + for (int k = 0; k < params.K; k++) { + for (int r = 0; r < params.R; r++) { + for (int s = 0; s < params.S; s++) { + tasks.add(new ConvBackwardFilterTask(k, c, r, s, params)); + } + } + } + } + ExecutorService pool = Executors.newFixedThreadPool( Math.min(constrainedNumThreads, tasks.size()) ); + List<Future<Object>> taskret; + try { + taskret = pool.invokeAll(tasks); + pool.shutdown(); + for( Future<Object> task : taskret ) + task.get(); + } catch (InterruptedException e) { + throw new DMLRuntimeException("Error while executing multi-threaded ConvBackwardFilterTask", e); + } catch (ExecutionException e) { + throw new DMLRuntimeException("Error while executing multi-threaded ConvBackwardFilterTask", e); + } + } + } + + public static void doConv2d_Backward_Filter(int k, int c, int r, int s, ConvolutionParameters params) { + double [] inputArray = null; + if (!params.input1.isInSparseFormat()) + inputArray = params.input1.getDenseBlock(); + double [] doutArray = null; + if (!params.input2.isInSparseFormat()) + doutArray = params.input2.getDenseBlock(); + double [] outputArray = params.output.getDenseBlock(); + + long outputVal = 0; + for (int n = 0; n < params.N; n++) { + for (int p = 0; p < params.P; p++) { + for (int q = 0; q < params.Q; q++) { + double doutVal = 0; + if(doutArray != null) + 
doutVal = doutArray[n*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q]; + else + doutVal = params.input2.quickGetValue(n, k*params.P*params.Q + p*params.Q + q); + if(doutVal != 0) { + // TODO: Improve the performance by striding + for (int h = 0; h < params.H; h++) { + for (int w = 0; w < params.W; w++) { + if(h == p*params.stride_h + r - params.pad_h && + w == q*params.stride_w + s - params.pad_w) { + if(inputArray != null) + outputVal += doutVal*inputArray[n*params.C*params.H*params.W + c*params.H*params.W + h*params.W+w]; + else + outputVal += doutVal*params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w); + } + } + } + } + } + } + } + outputArray[k*params.C*params.R*params.S + c*params.R*params.S + r*params.S + s] = outputVal; + } + + private static class ConvBackwardFilterTask implements Callable<Object> { + int k; int c; int r; int s; + ConvolutionParameters params; + public ConvBackwardFilterTask(int k, int c, int r, int s, ConvolutionParameters params) { + this.k = k; + this.c = c; + this.r = r; + this.s = s; + this.params = params; + } + + @Override + public Object call() throws Exception { + doConv2d_Backward_Filter(k, c, r, s, params); + return null; + } + + } + + public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = input; + params.input2 = filter; + params.output = outputBlock; + + if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || + filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S) { + throw new DMLRuntimeException("Incorrect input to conv2d"); + } + + params.tmpData = new TemporaryConvolutionData(); + params.tmpData.minIndexArrR = new int[params.R]; + params.tmpData.maxIndexArrR = new int[params.R]; + params.tmpData.minIndexArrS = new int[params.S]; + params.tmpData.maxIndexArrS = new int[params.S]; + for (int r = 0; r < 
params.R; r++) { + params.tmpData.minIndexArrR[r] = getMinPQ(params.pad_h, r, params.stride_h); + params.tmpData.maxIndexArrR[r] = getMaxPQ(params.pad_h, r, params.stride_h, params.P, params.H); + } + for (int s = 0; s < params.S; s++) { + params.tmpData.minIndexArrS[s] = getMinPQ(params.pad_w, s, params.stride_w); + params.tmpData.maxIndexArrS[s] = getMaxPQ(params.pad_w, s, params.stride_w, params.Q, params.W); + } + + int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { + for (int n = 0; n < params.N; n++) { + for (int k = 0; k < params.K; k++) { + doLoopBasedConv2d(n, k, params); + } + } + } + else + runParallelConvTask(constrainedNumThreads, params.K, TaskType.LoopBasedConv2d, params); + } + public static void maxpooling_backward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = input; params.input2 = dout; @@ -163,6 +316,122 @@ public class LibMatrixDNN { } } + /** + * This is essentially memory-less operation and can be used when the memory pressure is extremely high. 
+ * @param n + * @param k + * @param params + */ + private static void doLoopBasedConv2d(int n, int k, ConvolutionParameters params) { + double [] inputArray = null; + if (!params.input1.isInSparseFormat()) + inputArray = params.input1.getDenseBlock(); + double [] filterArray = null; + if (!params.input2.isInSparseFormat()) + filterArray = params.input2.getDenseBlock(); + double [] outputArray = params.output.getDenseBlock(); + + int outputOffset = n*params.K*params.P*params.Q + k*params.P*params.Q; + + int [] minIndexArrR = params.tmpData.minIndexArrR; + int [] maxIndexArrR = params.tmpData.maxIndexArrR; + int [] minIndexArrS = params.tmpData.minIndexArrS; + int [] maxIndexArrS = params.tmpData.maxIndexArrS; + + if(inputArray != null && filterArray != null) { + for (int c = 0; c < params.C; c++) { + for (int r = 0; r < params.R; r++) { + int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S; + for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) { + for (int s = 0; s < params.S; s++) { + double filterVal = filterArray[filterOffset + s]; + if(filterVal != 0) { + int h = p*params.stride_h + r - params.pad_h; + for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) { + int w = q*params.stride_w + s - params.pad_w; + outputArray[outputOffset + p*params.Q + q] += denseConvMultiply(inputArray, filterVal, params, n, c, h, w); + } + } + } + } + } + } + } + else if(inputArray != null && filterArray == null) { + for (int c = 0; c < params.C; c++) { + for (int r = 0; r < params.R; r++) { + for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) { + for (int s = 0; s < params.S; s++) { + double filterVal = params.input2.quickGetValue(k, c*params.R*params.S + r*params.S + s); + if(filterVal != 0) { + int h = p*params.stride_h + r - params.pad_h; + for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) { + int w = q*params.stride_w + s - params.pad_w; + outputArray[outputOffset + p*params.Q + q] += denseConvMultiply(inputArray, filterVal, 
params, n, c, h, w); + } + } + } + } + } + } + } + else if(inputArray == null && filterArray != null) { + for (int c = 0; c < params.C; c++) { + for (int r = 0; r < params.R; r++) { + int filterOffset = k*params.C*params.R*params.S + c*params.R*params.S + r*params.S; + for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) { + for (int s = 0; s < params.S; s++) { + double filterVal = filterArray[filterOffset + s]; + if(filterVal != 0) { + int h = p*params.stride_h + r - params.pad_h; + for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) { + int w = q*params.stride_w + s - params.pad_w; + outputArray[outputOffset + p*params.Q + q] += sparseConvMultiply(inputArray, filterVal, params, n, c, h, w); + } + } + } + } + } + } + } + else if(inputArray == null && filterArray == null) { + for (int c = 0; c < params.C; c++) { + for (int r = 0; r < params.R; r++) { + for (int p = minIndexArrR[r]; p < maxIndexArrR[r]; p++) { + for (int s = 0; s < params.S; s++) { + double filterVal = params.input2.quickGetValue(k, c*params.R*params.S + r*params.S + s); + if(filterVal != 0) { + int h = p*params.stride_h + r - params.pad_h; + for (int q = minIndexArrS[s]; q < maxIndexArrS[s]; q++) { + int w = q*params.stride_w + s - params.pad_w; + outputArray[outputOffset + p*params.Q + q] += sparseConvMultiply(inputArray, filterVal, params, n, c, h, w); + } + } + } + } + } + } + } + } + + private static int getMinPQ(int pad, int filterSize, int stride) { + return Math.max(0, (int)Math.ceil(((double)(pad - filterSize))/stride)); + } + + private static int getMaxPQ(int pad, int filterSize, int stride, int outputSize, int inputSize) { + return Math.min(outputSize, (int)Math.ceil(((double)(inputSize + pad - filterSize)) / stride)); + } + + private static double denseConvMultiply(double [] inputArray, double filterVal, ConvolutionParameters params, + int n, int c, int h, int w) { + return inputArray[n*params.C*params.H*params.W + c*params.H*params.W + h*params.W+w]*filterVal; + } + + private 
static double sparseConvMultiply(double [] inputArray, double filterVal, ConvolutionParameters params, + int n, int c, int h, int w) { + return params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w)*filterVal; + } + private static void doPoolingBackward(int n, int c, ConvolutionParameters params) { double [] inputArray = null; if (!params.input1.isInSparseFormat()) @@ -234,7 +503,8 @@ public class LibMatrixDNN { } else { runParallelConvTask(constrainedNumThreads, params.C, TaskType.MaxPooling_Forward, params); - } + } + outputBlock.setNonZeros(params.outputNNZ.get()); } private static void doPooling(int n, int c, ConvolutionParameters params) { @@ -291,6 +561,7 @@ public class LibMatrixDNN { else { runParallelConvTask(constrainedNumThreads, 1, TaskType.Rotate180, params); } + outputBlock.setNonZeros(input.getNonZeros()); // As number of non-zeros doesnot change for rotate180 } private static void doRotate180(int n, ConvolutionParameters params) { @@ -332,27 +603,44 @@ public class LibMatrixDNN { else { runParallelConvTask(constrainedNumThreads, 1, TaskType.ReshapeCol, params); } - + outputBlock.setNonZeros(input.getNonZeros()); // As number of non-zeros doesnot change for reshape_col } private static void runParallelConvTask(int constrainedNumThreads, int Z, TaskType type, ConvolutionParameters params) throws DMLRuntimeException { - ArrayList<ConvTask> tasks = new ArrayList<ConvTask>(); - // Total number of compute units available: constrainedNumThreads // Static task allocation. 
TODO: Do this in dynamic way + int taskSize = TASK_SIZE; + while(true) { + if(params.N * Math.ceil(Z/taskSize) > constrainedNumThreads || taskSize == 1) { + doRunParallelConvTask(constrainedNumThreads, Z, type, params, taskSize); + return; + } + taskSize = Math.max(taskSize/2, 1); + } + } + + private static void doRunParallelConvTask(int constrainedNumThreads, int Z, TaskType type, ConvolutionParameters params, int taskSize) throws DMLRuntimeException { + ArrayList<ConvTask> tasks = new ArrayList<ConvTask>(); + for (int n = 0; n < params.N; n++) { - for (int z = 0; z < Z; z += TASK_SIZE) { - tasks.add(new ConvTask(n, n+1, z, Math.min(Z, z+TASK_SIZE), type, params)); + for (int z = 0; z < Z; z += taskSize) { + tasks.add(new ConvTask(n, n+1, z, Math.min(Z, z+taskSize), type, params)); } } ExecutorService pool = Executors.newFixedThreadPool( Math.min(constrainedNumThreads, tasks.size()) ); + List<Future<Object>> taskret; try { - pool.invokeAll(tasks); + taskret = pool.invokeAll(tasks); + pool.shutdown(); + for( Future<Object> task : taskret ) + task.get(); } catch (InterruptedException e) { throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e); - } - pool.shutdown(); + } catch (ExecutionException e) { + throw new DMLRuntimeException("Error while executing multi-threaded " + type.name(), e); + } + } private static class ConvTask implements Callable<Object> { @@ -369,7 +657,7 @@ public class LibMatrixDNN { } @Override - public Object call() throws Exception { + public Object call() throws DMLRuntimeException { switch(type) { case ReshapeCol: for (int n = n1; n < n2; n++) { @@ -409,8 +697,15 @@ public class LibMatrixDNN { } } break; + case LoopBasedConv2d: + for (int n = n1; n < n2; n++) { + for (int z = z1; z < z2; z++) { + LibMatrixDNN.doLoopBasedConv2d(n, z, params); + } + } + break; default: - throw new RuntimeException("Unsupported ConvTask:" + type.name()); + throw new DMLRuntimeException("Unsupported ConvTask:" + type.name()); } 
return null; } @@ -457,6 +752,7 @@ public class LibMatrixDNN { else { runParallelConvTask(constrainedNumThreads, params.C, TaskType.Im2Col, params); } + outputBlock.setNonZeros(params.outputNNZ.get()); } // Converts a matrix of dimension (CRS, NPQ) to a 4D tensor (N, C, H, W) @@ -524,6 +820,7 @@ public class LibMatrixDNN { } + private static void doIm2colOverInputPath_NCHW(int n, int c, ConvolutionParameters params) { double [] inputArray = null; if (!params.input1.isInSparseFormat()) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java index 9def65a..0e469c2 100644 --- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java +++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java @@ -73,11 +73,22 @@ public class ConvolutionUtils { return false; } + // Simple heuristic that prefers im2col for non-test/non-validation cases. 
+ private static boolean preferIm2Col(ExecType et, long N, long K, long C, long R, long S, long P, long Q) throws HopsException { + if(et == ExecType.CP && ConvolutionOp.FORCE_NON_IM2COL) { + return false; + } + else if(et == ExecType.CP && N < 256 ) { + return true; // Prefer im2col to non-test/non-validation + } + return false; + } + public static Lop constructConvolutionBackwardFilterLops(Hop currentHop) throws HopsException, LopsException { - ExecType et = ExecType.CP; + ExecType et = ExecType.CP; // TODO: Check memory estimates if(DMLScript.USE_ACCELERATOR) et = ExecType.GPU; // TODO: Add memory estimate checks - else + else if(et == ExecType.MR || et == ExecType.SPARK) return null; if(currentHop != null && isTranspose(currentHop)) { @@ -96,10 +107,27 @@ public class ConvolutionUtils { } // K, C * R * S - long K = currentHop.computeSizeInformation(inputs.get(10)); + long N = currentHop.computeSizeInformation(inputs.get(6)); long C = currentHop.computeSizeInformation(inputs.get(7)); + long H = currentHop.computeSizeInformation(inputs.get(8)); + long W = currentHop.computeSizeInformation(inputs.get(9)); + long K = currentHop.computeSizeInformation(inputs.get(10)); long R = currentHop.computeSizeInformation(inputs.get(12)); long S = currentHop.computeSizeInformation(inputs.get(13)); + long stride_h = currentHop.computeSizeInformation(inputs.get(2)); + long stride_w = currentHop.computeSizeInformation(inputs.get(3)); + long pad_h = currentHop.computeSizeInformation(inputs.get(4)); + long pad_w = currentHop.computeSizeInformation(inputs.get(5)); + long P = -1; long Q = -1; + if(H > 0 && R > 0 && stride_h > 0 && pad_h > 0) + P = ConvolutionUtils.getP(H, R, stride_h, pad_h); + if(W > 0 && S > 0 && stride_w > 0 && pad_w > 0) + Q = ConvolutionUtils.getQ(W, S, stride_w, pad_w); + + if(preferIm2Col(et, N, K, C, R, S, P, Q)) { + return null; + } + long rlen = K; long clen = ConvolutionOp.getExtractedVal(C, R, S); return ConvolutionOp.constructFusedConvolutionLops(et, 
inputs, ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, (ConvolutionOp) x_col, rlen, clen); @@ -112,7 +140,7 @@ public class ConvolutionUtils { public static Lop constructConvolutionLops(Hop currentHop, ExecType et) throws HopsException, LopsException { if(DMLScript.USE_ACCELERATOR) et = ExecType.GPU; // TODO: Add memory estimate checks - else + else if(et == ExecType.MR || et == ExecType.SPARK) return null; if(currentHop != null && isConvolutionOp(currentHop, ConvOp.RESHAPE_COL)) { @@ -131,6 +159,7 @@ public class ConvolutionUtils { // N, K * P * Q long N = currentHop.computeSizeInformation(inputs.get(6)); + long C = currentHop.computeSizeInformation(inputs.get(7)); long H = currentHop.computeSizeInformation(inputs.get(8)); long W = currentHop.computeSizeInformation(inputs.get(9)); long K = currentHop.computeSizeInformation(inputs.get(10)); @@ -145,6 +174,11 @@ public class ConvolutionUtils { P = ConvolutionUtils.getP(H, R, stride_h, pad_h); if(W > 0 && S > 0 && stride_w > 0 && pad_w > 0) Q = ConvolutionUtils.getQ(W, S, stride_w, pad_w); + + if(preferIm2Col(et, N, K, C, R, S, P, Q)) { + return null; + } + long rlen = N; long clen = ConvolutionOp.getExtractedVal(K, P, Q); return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D, (ConvolutionOp) x_col, rlen, clen); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java index fd674d9..c213b55 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DBackwardTest.java @@ -22,6 +22,7 @@ import java.util.HashMap; import 
org.apache.sysml.api.DMLScript; import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; +import org.apache.sysml.hops.ConvolutionOp; import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex; import org.apache.sysml.runtime.util.ConvolutionUtils; @@ -49,35 +50,70 @@ public class Conv2DBackwardTest extends AutomatedTestBase public void testConv2DBackwardFilterDense1() { int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0; - runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DBackwardFilterDense2() { int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0; - runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DBackwardFilterDense3() { int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1; - runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DBackwardFilterDense4() { - int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 1; - runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride 
= 1; int pad = 1; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DBackwardFilterDense5() { - int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 3; int pad = 1; - runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + int numImg = 3; int imgSize = 10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); + } + + @Test + public void testConv2DBackwardFilterDense6() + { + int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 1; int filterSize = 2; int stride = 1; int pad = 0; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DBackwardFilterDense7() + { + int numImg = 3; int imgSize = 3; int numChannels = 3; int numFilters = 4; int filterSize = 2; int stride = 1; int pad = 0; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DBackwardFilterDense8() + { + int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DBackwardFilterDense9() + { + int numImg = 3; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 5; int stride = 1; int pad = 1; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DBackwardFilterDense10() + { + int numImg = 3; int imgSize = 
10; int numChannels = 2; int numFilters = 3; int filterSize = 5; int stride = 3; int pad = 2; + runConv2DBackwardFilterTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); } /** @@ -86,12 +122,13 @@ public class Conv2DBackwardTest extends AutomatedTestBase * @param sparse */ public void runConv2DBackwardFilterTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, - int filterSize, int stride, int pad) + int filterSize, int stride, int pad, boolean forceNonIm2Col) { RUNTIME_PLATFORM oldRTP = rtplatform; boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG; - + boolean oldForceNonIm2col = ConvolutionOp.FORCE_NON_IM2COL; + ConvolutionOp.FORCE_NON_IM2COL = forceNonIm2Col; try { TestConfiguration config = getTestConfiguration(TEST_NAME); @@ -139,6 +176,7 @@ public class Conv2DBackwardTest extends AutomatedTestBase { rtplatform = oldRTP; DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; + ConvolutionOp.FORCE_NON_IM2COL = oldForceNonIm2col; } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/efbfd987/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java index 3737801..8b87372 100644 --- a/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java +++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/Conv2DTest.java @@ -22,6 +22,7 @@ import java.util.HashMap; import org.apache.sysml.api.DMLScript; import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM; +import org.apache.sysml.hops.ConvolutionOp; import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex; import org.apache.sysml.test.integration.AutomatedTestBase; @@ -47,42 
+48,88 @@ public class Conv2DTest extends AutomatedTestBase public void testConv2DDense1() { int numImg = 5; int imgSize = 3; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0; - runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DDense2() { int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0; - runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DDense3() { int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1; - runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } @Test public void testConv2DDense4() { int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1; - runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad); + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); } + @Test + public void testConv2DDense5() + { + int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2; + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, false); + } + + @Test + public void testConv2DDense6() + { + int numImg = 5; int imgSize = 3; int numChannels = 3; int numFilters = 6; int filterSize = 2; int stride = 1; int pad = 0; + runConv2DTest(ExecType.CP, 
imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DDense7() + { + int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 0; + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DDense8() + { + int numImg = 1; int imgSize = 10; int numChannels = 4; int numFilters = 3; int filterSize = 4; int stride = 2; int pad = 1; + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DDense9() + { + int numImg = 3; int imgSize = 10; int numChannels = 1; int numFilters = 3; int filterSize = 2; int stride = 2; int pad = 1; + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + @Test + public void testConv2DDense10() + { + int numImg = 3; int imgSize = 8; int numChannels = 2; int numFilters = 3; int filterSize = 3; int stride = 1; int pad = 2; + runConv2DTest(ExecType.CP, imgSize, numImg, numChannels, numFilters, filterSize, stride, pad, true); + } + + /** * * @param et * @param sparse */ public void runConv2DTest( ExecType et, int imgSize, int numImg, int numChannels, int numFilters, - int filterSize, int stride, int pad) + int filterSize, int stride, int pad, boolean FORCE_NON_IM2COL) { RUNTIME_PLATFORM oldRTP = rtplatform; boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG; + boolean oldForceNonIm2col = ConvolutionOp.FORCE_NON_IM2COL; + ConvolutionOp.FORCE_NON_IM2COL = FORCE_NON_IM2COL; + try { TestConfiguration config = getTestConfiguration(TEST_NAME); @@ -128,6 +175,7 @@ public class Conv2DTest extends AutomatedTestBase { rtplatform = oldRTP; DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; + ConvolutionOp.FORCE_NON_IM2COL = oldForceNonIm2col; } } }
