Repository: incubator-systemml Updated Branches: refs/heads/master d79dea926 -> 623779912
[SYSTEMML-540] Improved performance of col2im and removed unnecessary r' Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/62377991 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/62377991 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/62377991 Branch: refs/heads/master Commit: 623779912745fb69194b3ba08ec0f67adaef5eb3 Parents: d79dea9 Author: Niketan Pansare <[email protected]> Authored: Sun Aug 14 10:11:07 2016 -0700 Committer: Niketan Pansare <[email protected]> Committed: Sun Aug 14 10:11:07 2016 -0700 ---------------------------------------------------------------------- .../org/apache/sysml/parser/DMLTranslator.java | 4 +- .../cp/ConvolutionCPInstruction.java | 6 +- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 182 +++++++++---------- .../sysml/runtime/util/ConvolutionUtils.java | 60 +++--- 4 files changed, 129 insertions(+), 123 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/62377991/src/main/java/org/apache/sysml/parser/DMLTranslator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java b/src/main/java/org/apache/sysml/parser/DMLTranslator.java index a3b8b52..b5bb7c3 100644 --- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java +++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java @@ -2863,9 +2863,9 @@ public class DMLTranslator Hop dout_reshaped = new ConvolutionOp(dout.getName(), dout.getDataType(), dout.getValueType(), Hop.ConvOp.ROTATE180, inHops1); Hop temp1 = new AggBinaryOp("temp" + target.getName(), target.getDataType(), target.getValueType(), OpOp2.MULT, AggOp.SUM, dout_reshaped, filter); - Hop temp2 = new ReorgOp("tempTranspose" + target.getName(), target.getDataType(), target.getValueType(), Hop.ReOrgOp.TRANSPOSE, temp1); + // Hop temp2 = new ReorgOp("tempTranspose" + target.getName(), target.getDataType(), target.getValueType(), Hop.ReOrgOp.TRANSPOSE, temp1); - ArrayList<Hop> inHops2 = getALHopsForConvOp(temp2, source, 2, hops); + ArrayList<Hop> inHops2 = getALHopsForConvOp(temp1, source, 2, hops); currBuiltinOp = new ConvolutionOp(target.getName(), target.getDataType(), target.getValueType(), Hop.ConvOp.COL2IM, inHops2); setBlockSizeAndRefreshSizeInfo(filter, currBuiltinOp); break; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/62377991/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 8324ff2..4b04eca 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -309,13 +309,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { private void checkInputDimensionForIm2col(MatrixBlock matBlock, ConvolutionParameters params) throws DMLRuntimeException { if((params.N != matBlock.getNumRows() || params.C*params.H*params.W != matBlock.getNumColumns())) { - throw new DMLRuntimeException("Incorrect input shape in conv2d"); + throw new DMLRuntimeException("Incorrect input shape in im2col"); } } private void checkInputDimensionForCol2im(MatrixBlock matBlock, ConvolutionParameters params) throws DMLRuntimeException { - if((params.C*params.R*params.S != matBlock.getNumRows() || params.N*params.P*params.Q != matBlock.getNumColumns())) { - throw new DMLRuntimeException("Incorrect input shape in conv2d_backward_data"); + if((params.N*params.P*params.Q != matBlock.getNumRows() || params.C*params.R*params.S != matBlock.getNumColumns())) { + throw new DMLRuntimeException("Incorrect input shape in col2im"); } } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/62377991/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index e657d18..59a6a47 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -411,11 +411,10 @@ public class LibMatrixDNN { return partialRetBlock; } - private static void computeTensorIndexes(int i, int j, int [] ret, int N, int C, int H, int W) throws DMLRuntimeException { - ret[0] = i; - ret[1] = j / (H*W); - ret[2] = (j - ret[1]*(H*W))/W; - ret[3] = j % W; + private static void computeTensorIndexes(int j, int [] ret, int H, int W) throws DMLRuntimeException { + ret[0] = j / (H*W); + ret[1] = (j - ret[0]*(H*W))/W; + ret[2] = j % W; } public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { @@ -575,14 +574,14 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Incorrect usage: Call optimized versions"); Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n+1); - int [] tensorIndexes = new int[4]; + int [] tensorIndexes = new int[3]; while(iter.hasNext()) { IJV ijv = iter.next(); - computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.P, params.Q); - int c = tensorIndexes[1]; - int p = tensorIndexes[2]; - int q = tensorIndexes[3]; + computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q); + int c = tensorIndexes[0]; + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; int start_index_h = p * params.stride_h - params.pad_h; @@ -597,14 +596,14 @@ public class LibMatrixDNN { private static void doPoolingBackwardDenseSparse(int n, double [] inputArray, MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { Iterator<IJV> iter = dout.sparseBlock.getIterator(n, n+1); - int [] tensorIndexes = new int[4]; + int [] tensorIndexes = new int[3]; while(iter.hasNext()) { IJV ijv = iter.next(); - computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.P, params.Q); - int c = tensorIndexes[1]; - int p = tensorIndexes[2]; - int q = tensorIndexes[3]; + computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q); + int c = tensorIndexes[0]; + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; int start_index_h = p * params.stride_h - params.pad_h; @@ -640,7 +639,7 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Incorrect usage: Only sparse format supported"); Iterator<IJV> iter = input.sparseBlock.getIterator(n, n+1); - int [] tensorIndexes = new int[4]; + int [] tensorIndexes = new int[3]; int start_index_w = Math.max(q * params.stride_w - params.pad_w, 0); int end_index_w = Math.min(start_index_w + params.S, params.W); @@ -653,11 +652,11 @@ public class LibMatrixDNN { double currDoutVal = -1; while(iter.hasNext()) { IJV ijv = iter.next(); - computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.C, params.H, params.W); - if(c != tensorIndexes[1]) + computeTensorIndexes(ijv.getJ(), tensorIndexes, params.H, params.W); + if(c != tensorIndexes[0]) continue; - int h = tensorIndexes[2]; - int w = tensorIndexes[3]; + int h = tensorIndexes[1]; + int w = tensorIndexes[2]; if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) { currDoutVal = ijv.getV(); if(maxVal < currDoutVal) { @@ -807,13 +806,13 @@ public class LibMatrixDNN { Arrays.fill(outputArray, 0); Iterator<IJV> iter = input.sparseBlock.getIterator(inputN, inputN+1); - int [] tensorIndexes = new int[4]; + int [] tensorIndexes = new int[3]; while(iter.hasNext()) { IJV ijv = iter.next(); - computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.N, params.K, params.P, params.Q); - int k = tensorIndexes[1]; - int p = tensorIndexes[2]; - int q = tensorIndexes[3]; + computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q); + int k = tensorIndexes[0]; + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = ijv.getV(); } } @@ -961,9 +960,7 @@ public class LibMatrixDNN { break; case Col2Im: for (int n = n1; n < n2; n++) { - for (int z = z1; z < z2; z++) { - doCol2imOverInputPath_NCHW(n, z, params); - } + doCol2imOverMultipleImages(n, params); } break; case MaxPooling_Forward: @@ -1082,20 +1079,18 @@ public class LibMatrixDNN { warnSingleThreaded(); // Sequential col2im for (int n = 0; n < params.N; n++) { // Do following for all images - for (int c = 0; c < params.C; c++) { // Since format is NCHW - doCol2imOverInputPath_NCHW(n, c, params); - } + doCol2imOverMultipleImages(n, params); } } else { // Parallel col2im - runConvTask(constrainedNumThreads, params.C, TaskType.Col2Im, params); + runConvTask(constrainedNumThreads, 1, TaskType.Col2Im, params); } } // Converts input: PQ X CRS matrix and writes to 1 X CHW - private static void doCol2imOverSingleImage(int n, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { + private static void doCol2imOverSingleImage(int outputN, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { if(input.rlen != params.P*params.Q || input.clen != params.C*params.R*params.S) { throw new DMLRuntimeException("Incorrect input dimensions"); } @@ -1109,45 +1104,64 @@ public class LibMatrixDNN { if(!input.isInSparseFormat()) { double [] inputArray = input.getDenseBlock(); - doCol2IMDenseInput(n, inputArray, outputArray, params); + doCol2IMDenseInput(0, outputN, inputArray, outputArray, params); } else { - doCol2IMSparseInput(n, input.getSparseBlockIterator(), outputArray, params); + doCol2IMSparseInput(0, outputN, input.getSparseBlockIterator(), outputArray, params); } } - private static void doCol2IMSparseInput(int n, Iterator<IJV> inputIter, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { - int [] tensorIndexes = new int[4]; + private static void doCol2IMSparseInput(int inputN, int outputN, Iterator<IJV> inputIter, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { + int [] tensorIndexes = new int[3]; + while(inputIter.hasNext()) { IJV ijv = inputIter.next(); - computeTensorIndexes(ijv.getI(), ijv.getJ(), tensorIndexes, params.P*params.Q, params.C, params.R, params.S); - int c = tensorIndexes[1]; - int r = tensorIndexes[2]; - int s = tensorIndexes[3]; - int p = ijv.getI() / params.Q; - int q = ijv.getI() % params.Q; + computeTensorIndexes(ijv.getJ(), tensorIndexes, params.R, params.S); + int c = tensorIndexes[0]; + int r = tensorIndexes[1]; + int s = tensorIndexes[2]; + computeTensorIndexes(ijv.getI(), tensorIndexes, params.P, params.Q); + int p = tensorIndexes[1]; + int q = tensorIndexes[2]; + if(inputN != tensorIndexes[0]) { + throw new DMLRuntimeException("Incorrect tensor indexes: " + inputN + " != " + tensorIndexes[0] + " <" + p + " " + q + " " + ijv.getI() + params.P + " " + params.Q + ">"); + } int h = p*params.stride_h + r - params.pad_h; int w = q*params.stride_w + s - params.pad_w; if(h >= 0 && h < params.H && w >= 0 && w < params.W) { - int outIndex = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w; + int outIndex = outputN*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w; outputArray[outIndex] += ijv.getV(); } } } - private static void doCol2IMDenseInput(int n, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { - for (int c = 0; c < params.C; c++) { - for (int r = 0; r < params.R; r++) { // Get an input patch of size R X S - for (int s = 0; s < params.S; s++) { - for (int p = 0; p < params.P; p++) { - for (int q = 0; q < params.Q; q++) { - int inputIndex = (p*params.Q + q)*params.C*params.R*params.S + c*params.R*params.S + r*params.S + s; - int h = p*params.stride_h + r - params.pad_h; - int w = q*params.stride_w + s - params.pad_w; - if(h >= 0 && h < params.H && w >= 0 && w < params.W) { - int outIndex = n*params.C*params.H*params.W + c*params.H*params.W + h*params.W + w; - outputArray[outIndex] += inputArray[inputIndex]; - } + // Converts input: PQ X CRS matrix and writes to 1 X CHW if inputN == 0 + // Or converts input: NPQ X CRS matrix and writes to N X CHW + private static void doCol2IMDenseInput(int inputN, int outputN, double [] inputArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { + final int outputNOffset = outputN*params.C*params.H*params.W; + for (int p = 0; p < params.P; p++) { + // h = p*params.stride_h + r - params.pad_h + // = r + hOffset + // Based on restrictions: h >= 0 and r >= 0 and h < params.H and r < params.R, we get + // max(0, - hOffset) <= r < min(params.R, params.H - hOffset) + final int hOffset = p*params.stride_h - params.pad_h; + final int rStart = Math.max(0, - hOffset); + final int rEnd = Math.min(params.R, params.H - hOffset); + for (int q = 0; q < params.Q; q++) { + // Using the same logic as above on following: + // w = q*params.stride_w + s - params.pad_w + final int wOffset = q*params.stride_w - params.pad_w; + final int sStart = Math.max(0, - wOffset); + final int sEnd = Math.min(params.S, params.W - wOffset); + final int tempOffset = (inputN*params.P*params.Q + p*params.Q + q)*params.C*params.R*params.S; + for (int c = 0; c < params.C; c++) { + final int outOffset = outputNOffset + c*params.H*params.W; + final int inputOffset = tempOffset + c*params.R*params.S; + for (int r = rStart; r < rEnd; r++) { + for (int s = sStart; s < sEnd; s++) { + int inputIndex = inputOffset + r*params.S + s; + int outIndex = outOffset + (hOffset + r)*params.W + wOffset + s; + outputArray[outIndex] += inputArray[inputIndex]; } } } @@ -1155,48 +1169,28 @@ public class LibMatrixDNN { } } - private static void doCol2imOverInputPath_NCHW(int n, int c, ConvolutionParameters params) { - double [] inputArray = null; - if (!params.input1.isInSparseFormat()) - inputArray = params.input1.getDenseBlock(); + // NPQ X CRS + private static void doCol2imOverMultipleImages(int n, ConvolutionParameters params) throws DMLRuntimeException { + MatrixBlock input = params.input1; + + if(input.rlen != params.N*params.P*params.Q || input.clen != params.C*params.R*params.S) { + throw new DMLRuntimeException("Incorrect input dimensions"); + } + double [] outputArray = null; if (!params.output.isInSparseFormat()) outputArray = params.output.getDenseBlock(); - - for (int r = 0; r < params.R; r++) { // Get an input patch of size R X S - for (int s = 0; s < params.S; s++) { - int localIndex = ((c*params.R*params.S*params.N + r*params.S*params.N + s*params.N + n)*params.P*params.Q); - - int input_row = r - params.pad_h; - // And copy it to outputArray[i] (taking care of padding & striding) - for (int p = params.P; p > 0; p--) { - if (input_row >= 0 && input_row < params.H) { - int input_col = s - params.pad_w; - for (int q = params.Q; q > 0; q--, localIndex++) { - if (input_col >= 0 && input_col < params.W) { - // Copy from [channel c, height input_row, width input_col] - int index = n*params.C*params.H*params.W + c*params.H*params.W + input_row*params.W + input_col; - if (inputArray != null) { - outputArray[index] += inputArray[localIndex]; - } - else { - // TODO: Optimize for sparse input - // Note: localIndex = row*N*P*Q + col - int row = localIndex / (params.N*params.P*params.Q); - int col = localIndex % (params.N*params.P*params.Q); - outputArray[index] += params.input1.quickGetValue(row, col); - } - } - input_col += params.stride_w; - } - } else { - localIndex += params.Q; - } - input_row += params.stride_h; - } - } + else { + throw new DMLRuntimeException("Only dense output is implemented"); } + if(!input.isInSparseFormat()) { + double [] inputArray = input.getDenseBlock(); + doCol2IMDenseInput(n, n, inputArray, outputArray, params); + } + else { + doCol2IMSparseInput(n, n, input.getSparseBlockIterator(n*params.P*params.Q, (n+1)*params.P*params.Q), outputArray, params); + } } private static long doIm2colOverInputPath_NCHW(int n, int c, ConvolutionParameters params) throws DMLRuntimeException { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/62377991/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java index 7a83278..ac19816 100644 --- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java +++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java @@ -196,33 +196,45 @@ public class ConvolutionUtils { return null; if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) { - Hop temp = currentHop.getInput().get(0); - if(temp != null && isTranspose(temp)) { - Hop matMult = temp.getInput().get(0); - if(matMult != null && isMatMult(matMult)) { - Hop rotate180 = matMult.getInput().get(0); - Hop filter = matMult.getInput().get(1); - if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) { - ArrayList<Hop> inputs = new ArrayList<Hop>(); - inputs.add(filter); - inputs.add(rotate180.getInput().get(0)); - for(int i = 1; i < rotate180.getInput().size(); i++) { - inputs.add(rotate180.getInput().get(i)); - } - - // N, C * H * W - long N = currentHop.computeSizeInformation(inputs.get(6)); - long C = currentHop.computeSizeInformation(inputs.get(7)); - long H = currentHop.computeSizeInformation(inputs.get(8)); - long W = currentHop.computeSizeInformation(inputs.get(9)); - long rlen = N; - long clen = ConvolutionOp.getExtractedVal(C, H, W); - return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen); - - + Hop matMult = currentHop.getInput().get(0); + if(matMult != null && isMatMult(matMult)) { + Hop rotate180 = matMult.getInput().get(0); + Hop filter = matMult.getInput().get(1); + if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) { + ArrayList<Hop> inputs = new ArrayList<Hop>(); + inputs.add(filter); + inputs.add(rotate180.getInput().get(0)); + for(int i = 1; i < rotate180.getInput().size(); i++) { + inputs.add(rotate180.getInput().get(i)); + } + + // N, C * H * W + long N = currentHop.computeSizeInformation(inputs.get(6)); + long C = currentHop.computeSizeInformation(inputs.get(7)); + long H = currentHop.computeSizeInformation(inputs.get(8)); + long W = currentHop.computeSizeInformation(inputs.get(9)); + long K = currentHop.computeSizeInformation(inputs.get(10)); + long R = currentHop.computeSizeInformation(inputs.get(12)); + long S = currentHop.computeSizeInformation(inputs.get(13)); + long stride_h = currentHop.computeSizeInformation(inputs.get(2)); + long stride_w = currentHop.computeSizeInformation(inputs.get(3)); + long pad_h = currentHop.computeSizeInformation(inputs.get(4)); + long pad_w = currentHop.computeSizeInformation(inputs.get(5)); + long P = -1; long Q = -1; + if(H > 0 && R > 0 && stride_h > 0 && pad_h > 0) + P = ConvolutionUtils.getP(H, R, stride_h, pad_h); + if(W > 0 && S > 0 && stride_w > 0 && pad_w > 0) + Q = ConvolutionUtils.getQ(W, S, stride_w, pad_w); + + if(preferIm2Col(et, N, K, C, R, S, P, Q)) { + return null; } + long rlen = N; + long clen = ConvolutionOp.getExtractedVal(C, H, W); + return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen); } } + } return null;
