[MINOR] Refactoring and cleanup of CP convolution operations

This patch refactors the convolution operations to remove unnecessary and unused code, as a preparation step for the support of large dense blocks.
Furthermore, this also includes a fix for special cases of sparse-dense matrix multiplications over large dense blocks. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/45eec2d2 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/45eec2d2 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/45eec2d2 Branch: refs/heads/master Commit: 45eec2d258a5239413df8071716011aaabd4d28a Parents: 20b1b5a Author: Matthias Boehm <[email protected]> Authored: Wed Jan 10 19:03:07 2018 -0800 Committer: Matthias Boehm <[email protected]> Committed: Thu Jan 11 11:45:09 2018 -0800 ---------------------------------------------------------------------- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 281 ++++---- .../runtime/matrix/data/LibMatrixDNNConv2d.java | 653 +++++++++++++++++++ .../LibMatrixDNNConv2dBackwardDataHelper.java | 114 ---- .../LibMatrixDNNConv2dBackwardFilterHelper.java | 206 ------ .../matrix/data/LibMatrixDNNConv2dHelper.java | 307 --------- .../runtime/matrix/data/LibMatrixDNNHelper.java | 526 +-------------- .../runtime/matrix/data/LibMatrixDNNIm2Col.java | 351 ++++++++++ .../matrix/data/LibMatrixDNNIm2ColHelper.java | 419 ------------ .../matrix/data/LibMatrixDNNPooling.java | 532 +++++++++++++++ .../data/LibMatrixDNNPoolingBackwardHelper.java | 299 --------- .../matrix/data/LibMatrixDNNPoolingHelper.java | 170 ----- .../runtime/matrix/data/LibMatrixDNNRelu.java | 89 +++ .../matrix/data/LibMatrixDNNRotate180.java | 109 ++++ .../data/LibMatrixDNNRotate180Helper.java | 110 ---- .../runtime/matrix/data/LibMatrixMult.java | 2 +- 15 files changed, 1865 insertions(+), 2303 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index ca38db3..e8a88d8 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -31,8 +31,6 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.instructions.InstructionUtils; -import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; /* @@ -51,7 +49,7 @@ import org.apache.sysml.runtime.util.ConvolutionUtils; * followed by the above mentioned functions are as follows: * execute(LibMatrixDNNHelper.get__Workers(params), params); * 3. LibMatrixDNN's execute() method ensures the creation and shutdown of the ExecutorService. - * 4. LibMatrixDNNHelper.get__Workers creates appropriate workers based on the runtime characteristics of + * 4. LibMatrixDNN__.getWorkers creates appropriate workers based on the runtime characteristics of * the input data (for example: input activations, filter, dout, ...). For code maintenance, these workers * are placed in the respective LibMatrixDNN__Helper files. * 5. The above mentioned workers may also use additional workers such as im2col and rotate180. 
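[Editorial note: the class comment above describes the overall control flow: factory methods create Callable workers over row partitions of the input, and execute() manages the ExecutorService and nnz aggregation. The following is a minimal, self-contained sketch of that pattern with hypothetical names (RangeWorker, getWorkers, execute); it is not the SystemML implementation, whose actual workers live in LibMatrixDNNConv2d, LibMatrixDNNPooling, etc., and operate on MatrixBlock row ranges.]

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public class WorkerPatternSketch {
      // A worker processes the row range [rl, ru) and returns its nnz count.
      static class RangeWorker implements Callable<Long> {
        final int _rl, _ru;
        RangeWorker(int rl, int ru) { _rl = rl; _ru = ru; }
        @Override public Long call() {
          long nnz = 0;
          for (int n = _rl; n < _ru; n++) {
            // process row n (e.g., im2col followed by a matrix multiply)
            nnz += 1; // placeholder for per-row nnz maintenance
          }
          return nnz;
        }
      }

      // Factory: partition N rows into roughly 2*k tasks for load balance,
      // mirroring the taskSize = ceil(N / k / 2) scheme used by getConv2dWorkers.
      static List<Callable<Long>> getWorkers(int N, int k) {
        List<Callable<Long>> tasks = new ArrayList<>();
        int taskSize = (int) Math.ceil((double) N / k / 2);
        for (int i = 0; i * taskSize < N; i++)
          tasks.add(new RangeWorker(i * taskSize, Math.min((i + 1) * taskSize, N)));
        return tasks;
      }

      // Execute all tasks, aggregate nnz, and always shut the pool down.
      static long execute(List<Callable<Long>> tasks, int k) throws Exception {
        ExecutorService pool = Executors.newFixedThreadPool(k);
        try {
          long nnz = 0;
          for (Future<Long> f : pool.invokeAll(tasks))
            nnz += f.get();
          return nnz;
        } finally {
          pool.shutdown();
        }
      }

      public static void main(String[] args) throws Exception {
        System.out.println(execute(getWorkers(64, 8), 8)); // prints 64
      }
    }

The returned aggregate is then used for the "post-processing: maintain nnz" steps visible throughout the patch (setNonZeros followed by examSparsity).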
@@ -129,18 +127,7 @@ public class LibMatrixDNN { loopedConvBwdDataMatMultTime.set(0); loopedConvBwdDataCol2ImTime.set(0); } - - // Commonly used operators - static BinaryOperator _binaryElementWiseAddition = null; - static BinaryOperator _binaryElementWiseMultiplication = null; - static { - try { - _binaryElementWiseAddition = InstructionUtils.parseBinaryOperator("+"); - _binaryElementWiseMultiplication = InstructionUtils.parseBinaryOperator("*"); - } catch (DMLRuntimeException e) { - throw new RuntimeException("ERROR initializing LibMatrixDNN", e); - } - } + // ------------------------------------------------------------------------------------------------ /** @@ -154,11 +141,10 @@ public class LibMatrixDNN { */ public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params); - if(params.bias != null && params.bias.isInSparseFormat()) params.bias.sparseToDense(); // Since bias is extremely small array - long nnz = execute(LibMatrixDNNHelper.getConv2dWorkers(params), params); + long nnz = execute(LibMatrixDNNConv2d.getConv2dWorkers(params), params); //post-processing: maintain nnz outputBlock.setNonZeros(nnz); @@ -177,7 +163,7 @@ public class LibMatrixDNN { public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { checkInputsConv2dBackwardData(filter, dout, outputBlock, params); - long nnz = execute(LibMatrixDNNHelper.getConv2dBackwardDataWorkers(params), params); + long nnz = execute(LibMatrixDNNConv2d.getConv2dBackwardDataWorkers(params), params); //post-processing: maintain nnz outputBlock.setNonZeros(nnz); @@ -196,99 +182,34 @@ public class LibMatrixDNN { public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { checkInputsConv2dBackwardFilter(input, dout, outputBlock, params); - execute(LibMatrixDNNHelper.getConv2dBackwardFilterWorkers(params), params); + execute(LibMatrixDNNConv2d.getConv2dBackwardFilterWorkers(params), params); //post-processing: maintain nnz outputBlock.recomputeNonZeros(); outputBlock.examSparsity(); } - - private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException { - if(lhs != rhs) - throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs); - } - private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException { - if(lhs != (rhs1*rhs2*rhs3)) - throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3); - } - - static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { - params.input1 = filter; - params.input2 = dout; - params.output = outputBlock; - checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != " - + "number of filters in filter_shape", filter.getNumRows(), params.K); - checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != " - + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); - checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != " - + "batch size in input_shape", 
dout.getNumRows(), params.N); - checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != " - + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q); - if(params.stride_h <= 0 || params.stride_w <= 0) - throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); - - if(DMLScript.FINEGRAINED_STATISTICS) { - if(filter.isInSparseFormat() || dout.isInSparseFormat()) { - conv2dBwdDataSparseCount.addAndGet(1); - } - else { - conv2dBwdDataDenseCount.addAndGet(1); - } - } - } - - static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + public static void maxpooling(MatrixBlock input, MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = input; - params.input2 = dout; - params.output = outputBlock; - checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != " - + "batch size in input_shape", input.getNumRows(), params.N); - checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != " - + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W); - checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != " - + "batch size in input_shape", dout.getNumRows(), params.N); - checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != " - + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q); - if(params.stride_h <= 0 || params.stride_w <= 0) - throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + params.output = output; - if(DMLScript.FINEGRAINED_STATISTICS) { - if(input.isInSparseFormat() || dout.isInSparseFormat()) { - conv2dBwdFilterSparseCount.addAndGet(1); - } - else { - conv2dBwdFilterDenseCount.addAndGet(1); - } + if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) { + throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " + + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W); } - } - - static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { - params.input1 = input; - params.input2 = filter; - params.output = outputBlock; - checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != " - + "number of filters in filter_shape", filter.getNumRows(), params.K); - checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != " - + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); - checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != " - + "batch size in input_shape", input.getNumRows(), params.N); - checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != " - + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W); - if(params.stride_h <= 0 || params.stride_w <= 0) - throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + 
//materialize indexes unless basic case with stride=1 and pad=0 + if( !params.isStride1Pad0() || input.sparse ) + fillIndexesArray(params); - if(DMLScript.FINEGRAINED_STATISTICS) { - if(input.isInSparseFormat() || filter.isInSparseFormat()) { - conv2dSparseCount.addAndGet(1); - } - else { - conv2dDenseCount.addAndGet(1); - } - } + long nnz = execute(LibMatrixDNNPooling.getMaxPoolingWorkers(params), params); + + // post-processing: maintain nnz + output.setNonZeros(nnz); + output.examSparsity(); } + /** * This method computes the backpropogation errors for previous layer of maxpooling operation * @@ -325,7 +246,7 @@ public class LibMatrixDNN { if( !(params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) ) fillIndexesArray(params); //not needed for sparse-dense - long nnz = execute(LibMatrixDNNHelper.getMaxPoolingBackwardWorkers(params, performReluBackward), params); + long nnz = execute(LibMatrixDNNPooling.getMaxPoolingBackwardWorkers(params, performReluBackward), params); //post-processing: maintain nnz outputBlock.setNonZeros(nnz); @@ -333,29 +254,6 @@ public class LibMatrixDNN { } /** - * This method computes start and end indexes required for max_pool and max_pool_backward operations. - * This speeds up the performance of max_pool and max_pool_backward - * - * @param params parameters required for max_pool and max_pool_backward operations - */ - private static void fillIndexesArray(ConvolutionParameters params) { - params.start_indexes_h = new int[params.P]; - params.end_indexes_h = new int[params.P]; - params.start_indexes_w = new int[params.Q]; - params.end_indexes_w = new int[params.Q]; - for( int p=0, ix=-params.pad_h; p < params.P; p++, ix+=params.stride_h ) { - // Note: We do not treat pad as zero - params.start_indexes_h[p] = Math.max(ix, 0); - params.end_indexes_h[p] = Math.min(ix+params.R, params.H); - } - for( int q=0, ix=-params.pad_w; q < params.Q; q++, ix+=params.stride_w) { - // Note: We do not treat pad as zero - params.start_indexes_w[q] = Math.max(ix, 0); - params.end_indexes_w[q] = Math.min(ix+params.S, params.W); - } - } - - /** * This method computes the backpropagation errors for previous layer of relu operation * * @param input input matrix @@ -375,7 +273,7 @@ public class LibMatrixDNN { input.getNumRows() + " != " + dout.getNumRows() + " || " + input.getNumColumns() + " != " + dout.getNumColumns()); } - execute(LibMatrixDNNHelper.getReluBackwardWorkers(params), params); + execute(LibMatrixDNNRelu.getReluBackwardWorkers(params), params); // post-processing: maintain nnz outputBlock.recomputeNonZeros(); @@ -503,7 +401,7 @@ public class LibMatrixDNN { } //post-processing: maintain nnz - params.output.recomputeNonZeros(); + params.output.recomputeNonZeros(); params.output.examSparsity(); } else { @@ -511,26 +409,6 @@ public class LibMatrixDNN { } } - public static void maxpooling(MatrixBlock input, MatrixBlock output, ConvolutionParameters params) throws DMLRuntimeException { - params.input1 = input; - params.output = output; - - if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) { - throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " - + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W); - } - - //materialize indexes unless basic case with stride=1 and pad=0 - if( !params.isStride1Pad0() || input.sparse ) - fillIndexesArray(params); - - long nnz = execute(LibMatrixDNNHelper.getMaxPoolingWorkers(params), params); - - // post-processing: 
maintain nnz - output.setNonZeros(nnz); - output.examSparsity(); - } - /** * Executes the tasks in parallel using java's ExecutorService. * @@ -564,18 +442,111 @@ public class LibMatrixDNN { return lnnz; } - static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) { - // NativeHelper.conv2dBackwardFilterSparseDense only if input is sparse. - // dout converted to dense if sparse. - return params.enableNative && params.input1.isInSparseFormat(); + private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException { + if(lhs != rhs) + throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs); + } + private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException { + if(lhs != (rhs1*rhs2*rhs3)) + throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3); + } + + static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = filter; + params.input2 = dout; + params.output = outputBlock; + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != " + + "number of filters in filter_shape", filter.getNumRows(), params.K); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != " + + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != " + + "batch size in input_shape", dout.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != " + + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q); + if(params.stride_h <= 0 || params.stride_w <= 0) + throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + + if(DMLScript.FINEGRAINED_STATISTICS) { + if(filter.isInSparseFormat() || dout.isInSparseFormat()) { + conv2dBwdDataSparseCount.addAndGet(1); + } + else { + conv2dBwdDataDenseCount.addAndGet(1); + } + } + } + + static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = input; + params.input2 = dout; + params.output = outputBlock; + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != " + + "batch size in input_shape", input.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != " + + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W); + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != " + + "batch size in input_shape", dout.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != " + + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q); + if(params.stride_h <= 0 || params.stride_w <= 0) + throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + + if(DMLScript.FINEGRAINED_STATISTICS) { + 
if(input.isInSparseFormat() || dout.isInSparseFormat()) { + conv2dBwdFilterSparseCount.addAndGet(1); + } + else { + conv2dBwdFilterDenseCount.addAndGet(1); + } + } } - static boolean isEligibleForConv2dSparse(ConvolutionParameters params) { - // NativeHelper.conv2dSparse only if filter is dense and input is sparse - return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat(); + + static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = input; + params.input2 = filter; + params.output = outputBlock; + + checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != " + + "number of filters in filter_shape", filter.getNumRows(), params.K); + checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != " + + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); + checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != " + + "batch size in input_shape", input.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != " + + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W); + if(params.stride_h <= 0 || params.stride_w <= 0) + throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + + if(DMLScript.FINEGRAINED_STATISTICS) { + if(input.isInSparseFormat() || filter.isInSparseFormat()) { + conv2dSparseCount.addAndGet(1); + } + else { + conv2dDenseCount.addAndGet(1); + } + } } - static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) { - // NativeHelper.conv2dBackwardDataDense only if filter is dense. - // dout converted to dense if sparse. - return params.enableNative && !params.input1.isInSparseFormat(); + + /** + * This method computes start and end indexes required for max_pool and max_pool_backward operations. 
+ * This speeds up the performance of max_pool and max_pool_backward + * + * @param params parameters required for max_pool and max_pool_backward operations + */ + private static void fillIndexesArray(ConvolutionParameters params) { + params.start_indexes_h = new int[params.P]; + params.end_indexes_h = new int[params.P]; + params.start_indexes_w = new int[params.Q]; + params.end_indexes_w = new int[params.Q]; + for( int p=0, ix=-params.pad_h; p < params.P; p++, ix+=params.stride_h ) { + // Note: We do not treat pad as zero + params.start_indexes_h[p] = Math.max(ix, 0); + params.end_indexes_h[p] = Math.min(ix+params.R, params.H); + } + for( int q=0, ix=-params.pad_w; q < params.Q; q++, ix+=params.stride_w) { + // Note: We do not treat pad as zero + params.start_indexes_w[q] = Math.max(ix, 0); + params.end_indexes_w[q] = Math.min(ix+params.S, params.W); + } } } http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java new file mode 100644 index 0000000..92ae8a3 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2d.java @@ -0,0 +1,653 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sysml.runtime.matrix.data; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.concurrent.Callable; + +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2Col.Im2colWorker; +import org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180.Rotate180Worker; +import org.apache.sysml.utils.NativeHelper; +import org.apache.sysml.utils.Statistics; + +/** + * This class contains the set of operators used for performing conv2d + */ +public class LibMatrixDNNConv2d +{ + /** + * Factory method that returns list of callable tasks for performing conv2d + * + * @param params convolution parameters + * @return list of callable tasks for performing conv2d + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getConv2dWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<>(); + + // Try to create twice as many tasks as threads for improved load balance + // (due to constant-sized intermediates, GC works well, so the overhead per task is small) + int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k / 2)); + + MatrixBlock in1 = params.input1; + boolean isEmptyDenseInput = !in1.isInSparseFormat() && in1.denseBlock == null; + boolean isTransPref = in1.sparse && !params.input2.sparse && + MatrixBlock.evalSparseFormatInMemory(in1.clen, in1.rlen, in1.nonZeros); + boolean applyNative = isEligibleForConv2dSparse(params) + && !(!isEmptyDenseInput && isTransPref); + if( applyNative ) + Statistics.numNativeSparseConv2dCalls.increment(); + + //transpose filter once for efficient sparse-dense multiplies in LoopedIm2ColConv2dTransAllChan + //in order to share the temporary object and its creation costs across threads + if( !applyNative && !isEmptyDenseInput && isTransPref ) { + params.input2 = LibMatrixReorg.transpose(params.input2, + new MatrixBlock(params.input2.clen, params.input2.rlen, false), k); + } + + for(int i = 0; i*taskSize < params.N; i++) { + //note: we prefer the java backend for sparse inputs because the native + //implementation simply converts the sparse input into dense rows + if( applyNative ) + ret.add(new SparseNativeConv2d(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput && isTransPref) + ret.add(new LoopedIm2ColConv2dTransAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput) + ret.add(new LoopedIm2ColConv2dAllChan(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + throw new DMLRuntimeException("Unsupported operator"); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing conv2d backward filter + * + * @param params convolution parameters + * @return list of callable tasks for performing conv2d backward filter + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getConv2dBackwardFilterWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<>(); + // Try to create as many tasks as threads. + // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate + // data structures such as im2col blocks. 
+ int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + + boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || + (!params.input2.isInSparseFormat() && params.input2.denseBlock == null); + boolean applyNative = isEligibleForConv2dBackwardFilterSparseDense(params) + && !params.input2.isInSparseFormat(); + if( applyNative ) + Statistics.numNativeSparseConv2dBwdFilterCalls.increment(); + + for(int i = 0; i*taskSize < params.N; i++) { + //note: we prefer the java backend for sparse filters because the native + //implementation simply rotates the sparse filters into dense rows + if( applyNative ) + ret.add(new SparseNativeConv2dBackwardFilterDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if( params.input2.sparse && params.input1.getSparsity() > params.input2.getSparsity() ) + ret.add(new Conv2dBackwardFilterTrans(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput) + ret.add(new Conv2dBackwardFilter(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + throw new DMLRuntimeException("Unsupported operator"); + } + return ret; + } + + /** + * Factory method that returns list of callable tasks for performing conv2d backward data + * + * @param params convolution parameters + * @return list of callable tasks for performing conv2d backward data + * @throws DMLRuntimeException if error occurs + */ + public static ArrayList<Callable<Long>> getConv2dBackwardDataWorkers(ConvolutionParameters params) throws DMLRuntimeException { + ArrayList<Callable<Long>> ret = new ArrayList<>(); + + // Try to create as many tasks as threads. + // Creating more tasks will help in tail, but would have additional overhead of maintaining the intermediate + // data structures such as im2col blocks. 
+ int k = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + int taskSize = (int)(Math.ceil((double)params.N / k)); + + boolean isEmptyDenseInput = (!params.input1.isInSparseFormat() && params.input1.denseBlock == null) || + (!params.input2.isInSparseFormat() && params.input2.denseBlock == null); + boolean applyNative = isEligibleForConv2dBackwardDataDense(params) + && !params.input2.isInSparseFormat(); + if( applyNative ) + Statistics.numNativeSparseConv2dBwdDataCalls.increment(); + + for(int i = 0; i*taskSize < params.N; i++) { + //note: we prefer the java backend for sparse filters because the native + //implementation simply converts the sparse filters into dense rows + if( applyNative ) + ret.add(new SparseNativeConv2dBackwardDataDense(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else if(!isEmptyDenseInput) + ret.add(new Conv2dBackwardData(i*taskSize, Math.min((i+1)*taskSize, params.N), params)); + else + throw new DMLRuntimeException("Unsupported operator"); + } + + return ret; + } + + /** + * Performs convolution via: partialCopy1(filter %*% im2col(input)) = output + */ + private static class LoopedIm2ColConv2dAllChan implements Callable<Long> + { + protected final int _rl, _ru; + protected final ConvolutionParameters _params; + + public LoopedIm2ColConv2dAllChan(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + final int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; + MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false); + MatrixBlock outMM = new MatrixBlock(K, PQ, false); + Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, outIm2col, _params, false); + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // im2col(input) => _im2ColOutBlock + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + im2ColWorker.execute(n); + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + + // filter %*% _im2ColOutBlock => matMultOutBlock + outMM.reset(outMM.rlen, outMM.clen, false); + LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, false, true, _params); + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + + if(DMLScript.FINEGRAINED_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + + // Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos + partialCopy1(outMM, _params.output.getDenseBlockValues(), n*K*PQ, K, PQ); + + // Add bias to current row if necessary, always dense + if(_params.bias != null) + addBias(n, _params.output.getDenseBlockValues(), + _params.bias.getDenseBlockValues(), K, PQ); + } + + if(DMLScript.FINEGRAINED_STATISTICS) { + LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); + LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); + } + + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + + // Copy the matrix src of shape [K X PQ] to params.output.denseBlock + destPos + private static void partialCopy1(MatrixBlock src, double [] dest, int destPos, int K, int PQ) { + // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware. 
+ // This is not required in Native implementation + if( src.isEmptyBlock() ) + return; + if(src.isInSparseFormat()) { + SparseBlock sblock = src.sparseBlock; + for(int k = 0; k < src.getNumRows(); k++) { + if( sblock.isEmpty(k) ) continue; + int apos = sblock.pos(k); + int alen = sblock.size(k); + int[] aix = sblock.indexes(k); + double[] avals = sblock.values(k); + int desPosK = destPos + k*PQ; + for(int j = apos; j < apos+alen; j++) + dest[desPosK+aix[j]] = avals[j]; + } + } + else + System.arraycopy(src.getDenseBlockValues(), 0, dest, destPos, K * PQ); + } + } + + /** + * This implementation is similar to LoopedIm2ColConv2dAllChan, except for using a + * sparse-dense matrix multiplication with t(t(Xi) %*% t(F)) instead of a + * dense-sparse matrix multiplication with Xi %*% F. + * + * NOTE: this implementation assumes that the filter is passed in transposed form + * in order to share this temporary matrix (and its creation cost) across threads. + */ + private static class LoopedIm2ColConv2dTransAllChan extends LoopedIm2ColConv2dAllChan + { + public LoopedIm2ColConv2dTransAllChan(int rl, int ru, ConvolutionParameters params) { + super(rl, ru, params); + } + + @Override + public Long call() throws Exception { + final int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; + MatrixBlock outIm2col = new MatrixBlock(PQ, CRS, false); + MatrixBlock outMM = new MatrixBlock(PQ, K, false); + Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, outIm2col, _params, true); + + for(int n = _rl; n < _ru; n++) { + // im2col(input) => _im2ColOutBlock + im2ColWorker.execute(n); + + // t(_im2ColOutBlock) %*% t(filter) => t(matMultOutBlock) + outMM.reset(outMM.rlen, outMM.clen, false); + LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, false, false, _params); + + // Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos + partialCopyTrans(outMM, _params.output, n*K*PQ, K, PQ); + + // Add bias to current row if necessary, always dense + if(_params.bias != null) + addBias(n, _params.output.getDenseBlockValues(), + _params.bias.getDenseBlockValues(), K, PQ); + } + + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + + private static void partialCopyTrans(MatrixBlock src, MatrixBlock dest, int destPos, int K, int PQ) { + if( src.isEmptyBlock() ) + return; + //copy src into its destination row w/ piggybacked transpose + //src is [PQ x K] -> [K x PQ] -> [1 x KPQ] + if(src.isInSparseFormat()) { + SparseBlock sblock = src.sparseBlock; + double[] c = dest.getDenseBlockValues(); + for(int i = 0; i < src.getNumRows(); i++) { + if( sblock.isEmpty(i) ) continue; + int apos = sblock.pos(i); + int alen = sblock.size(i); + int[] aix = sblock.indexes(i); + double[] avals = sblock.values(i); + int desPosK = destPos + i; + for(int j = apos; j < apos+alen; j++) + c[desPosK+aix[j]*PQ] = avals[j]; + } + } + else { + double[] a = src.getDenseBlockValues(); + double[] c = dest.getDenseBlockValues(); + final int blocksizeIJ = 128; //128KB for L2 + //cache-conscious blocked execution + for( int bi = 0; bi < PQ; bi+=blocksizeIJ ) + for( int bj = 0; bj < K; bj+=blocksizeIJ ) { + int bimin = Math.min(bi+blocksizeIJ, PQ); + int bjmin = Math.min(bj+blocksizeIJ, K); + //core transpose operation + for(int i=bi, aix=bi*K+bj, cix=bj*PQ+bi; i<bimin; i++, aix+=K, cix++) + LibMatrixReorg.transposeRow(a, c, aix, destPos+cix, PQ, bjmin-bj); + } + } + } + } + + /** + * This operator 
is used only if native is enabled, filter is dense and input is sparse + */ + private static class SparseNativeConv2d implements Callable<Long> + { + public final int _rl, _ru; + private final ConvolutionParameters _params; + public SparseNativeConv2d(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int KPQ = _params.K*_params.P*_params.Q; + double[] temp = new double[KPQ]; + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.getSparseBlock().isEmpty(n) ) { + int apos = _params.input1.getSparseBlock().pos(n); + int alen = _params.input1.getSparseBlock().size(n); + int[] aix = _params.input1.getSparseBlock().indexes(n); + double[] avals = _params.input1.getSparseBlock().values(n); + NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlockValues(), temp, + 1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, + _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + System.arraycopy(temp, 0, _params.output.getDenseBlockValues(), n*KPQ, KPQ); + } + } + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + } + + // BACKWARD DATA + + /** + * This operator is used only if native is enabled and filter is sparse. + * dout is converted into dense if sparse. + */ + private static class SparseNativeConv2dBackwardDataDense implements Callable<Long> + { + public final int _rl, _ru; + private final ConvolutionParameters _params; + public SparseNativeConv2dBackwardDataDense(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int CHW = _params.C*_params.H*_params.W; + double [] ret = new double[CHW]; + double [] filterArr = _params.input1.getDenseBlockValues(); + double [] dout_n = new double[_params.P*_params.Q*_params.K]; + for(int n = _rl; n < _ru; n++) { + getRowInDenseFormat(_params.input2, n, dout_n); + if(n > _rl) + Arrays.fill(ret, 0); + NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, + _params.C, _params.H, _params.W, _params.K, + _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + System.arraycopy(ret, 0, _params.output.getDenseBlockValues(), n*CHW, CHW); + } + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + } + + /** + * General conv2d backward data operator + */ + private static class Conv2dBackwardData implements Callable<Long> { + + public final int _rl, _ru; + private final ConvolutionParameters _params; + public Conv2dBackwardData(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S; + MatrixBlock filter = _params.input1; + MatrixBlock dout = _params.input2; + MatrixBlock outRotate = new MatrixBlock(PQ, K, dout.sparse); + MatrixBlock outMM = new MatrixBlock(PQ, CRS, false); + outRotate.allocateBlock(); + LibMatrixDNNRotate180.Rotate180Worker rotate180Worker = + LibMatrixDNNRotate180.Rotate180Worker.getWorker( dout, outRotate, _params, true, false); + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // rotate180(dout[n,]) => dout_reshaped + rotate180Worker.execute(n, 0); + // dout_reshaped %*% filter => temp 
+ long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + outMM.reset(PQ, CRS, false); + LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, !outRotate.sparse, false, _params); + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + // col2im(temp) => output[n,] + LibMatrixDNNIm2Col.doCol2imOverSingleImage(n, outMM, _params); + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + + if(DMLScript.FINEGRAINED_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + } + if(DMLScript.FINEGRAINED_STATISTICS) { + LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1); + LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2); + } + + //multi-threaded nnz maintenance of current working set + return _params.output.recomputeNonZeros(_rl, _ru-1); + } + } + + //BACKWARD FILTER + + /** + * This operator is used only if native is enabled and input is sparse. + * dout is converted into dense if sparse. + */ + private static class SparseNativeConv2dBackwardFilterDense implements Callable<Long> + { + public final int _rl, _ru; + private final ConvolutionParameters _params; + public SparseNativeConv2dBackwardFilterDense(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int CRS = _params.C*_params.R*_params.S, PQ = _params.P*_params.Q, K = _params.K; + MatrixBlock dout_n = new MatrixBlock(PQ, K, false); + dout_n.allocateBlock(); + LibMatrixDNNRotate180.Rotate180Worker rotate180Worker = + LibMatrixDNNRotate180.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true, false); + double [] ldout_n = dout_n.getDenseBlockValues(); + double [] partRet = new double[CRS*_params.K]; //CRS x K + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.getSparseBlock().isEmpty(n) ) { + // rotate180(dout[n,]) => dout_n + rotate180Worker.execute(n, 0); + + int apos = _params.input1.getSparseBlock().pos(n); + int alen = _params.input1.getSparseBlock().size(n); + int[] aix = _params.input1.getSparseBlock().indexes(n); + double[] avals = _params.input1.getSparseBlock().values(n); + NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, + ldout_n, partRet, 1, _params.C, _params.H, _params.W, _params.K, + _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + } + } + inplaceTransAdd(partRet, _params); + return 0L; + } + } + + /** + * General conv2d backward data operator + */ + private static class Conv2dBackwardFilter implements Callable<Long> { + private final int _rl, _ru; + private final ConvolutionParameters _params; + + public Conv2dBackwardFilter(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; + MatrixBlock dout = _params.input2; + MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false); + MatrixBlock outRotate = new MatrixBlock(PQ, K, dout.sparse); + MatrixBlock outMM = new MatrixBlock(CRS, K, false); + outRotate.allocateBlock(); + + Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, false); + Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, outRotate, _params, true, false); + double [] partRet = new double[CRS*_params.K]; + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // rotate180(dout[n,]) => 
dout_reshaped + rotate180Worker.execute(n, 0); + + // im2col(input) => _im2ColOutBlock + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + im2ColWorker.execute(n); + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + + outMM.reset(CRS, K, false); + LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, !im2ColOutBlock.sparse, !outRotate.sparse, _params); + long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + + if( !outMM.isEmptyBlock() ) //accumulate row results + LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS); + + if(DMLScript.FINEGRAINED_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + } + inplaceTransAdd(partRet, _params); + if(DMLScript.FINEGRAINED_STATISTICS) { + LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); + LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); + } + return 0L; + } + } + + private static class Conv2dBackwardFilterTrans implements Callable<Long> { + private final int _rl, _ru; + private final ConvolutionParameters _params; + + public Conv2dBackwardFilterTrans(int rl, int ru, ConvolutionParameters params) { + _rl = rl; _ru = ru; + _params = params; + } + + @Override + public Long call() throws Exception { + int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; + MatrixBlock dout = _params.input2; + MatrixBlock im2ColOutBlock = new MatrixBlock(PQ, CRS, false).allocateBlock(); + MatrixBlock outRotate = new MatrixBlock(K, PQ, dout.sparse).allocateBlock(); + MatrixBlock outMM = new MatrixBlock(K, CRS, false).allocateBlock(); + + Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true); + Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, outRotate, _params, true, true); + double [] partRet = new double[CRS*_params.K]; + long time1 = 0; long time2 = 0; + for(int n = _rl; n < _ru; n++) { + // rotate180(dout[n,]) => dout_reshaped + rotate180Worker.execute(n, 0); + + // im2col(input) => _im2ColOutBlock + long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + im2ColWorker.execute(n); + long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; + + outMM.reset(K, CRS, false); + //Timing time = new Timing(true); + LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, + outMM, !outRotate.sparse, !im2ColOutBlock.sparse, _params); + long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; + + if( !outMM.isEmptyBlock() ) //accumulate row results + LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS); + + if(DMLScript.FINEGRAINED_STATISTICS) { + time1 += t2 - t1; + time2 += t3 - t2; + } + } + //no need to transpose because t(t(out)) cancel out + inplaceAdd(partRet, _params); + if(DMLScript.FINEGRAINED_STATISTICS) { + LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); + LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); + } + return 0L; + } + } + + private static void inplaceAdd(double[] a, ConvolutionParameters params) { + synchronized (params.output.denseBlock) { + LibMatrixMult.vectAdd(a, params.output.getDenseBlockValues(), 0, 0, a.length); + } + } + + private static void inplaceTransAdd(double[] a, ConvolutionParameters params) { + synchronized (params.output.denseBlock) { + // Perform transposed addition: output of size [K, CRS] += input of size [CRS,K] + double [] c = params.output.getDenseBlockValues(); + final int CRS = params.C*params.R*params.S, K = params.K; + final int blocksizeIJ = 128; //L2 cache + + //cache-conscious blocked execution + for( int bi=0; bi<CRS; bi+=blocksizeIJ ) + for( int bj=0; bj<K; bj+=blocksizeIJ ) { + int bimin = Math.min(bi+blocksizeIJ, CRS); + int bjmin = Math.min(bj+blocksizeIJ, K); + //core transpose add operation + for(int i=bi, aix=bi*K; i<bimin; i++, aix+=K) + for(int j=bj, cix=i+bj*CRS; j<bjmin; j++, cix+=CRS) + c[cix] += a[aix+j]; + } + } + } + + private static void getRowInDenseFormat(MatrixBlock input, int n, double [] ret) throws DMLRuntimeException { + if(input.getNumColumns() != ret.length) { + throw new DMLRuntimeException("Invalid parameters"); + } + // Use temporary array to avoid binary search + if(input.isInSparseFormat()) { + Arrays.fill(ret, 0); + if( !input.sparseBlock.isEmpty(n) ) { + int apos = input.sparseBlock.pos(n); + int alen = input.sparseBlock.size(n); + int[] aix = input.sparseBlock.indexes(n); + double[] avals = input.sparseBlock.values(n); + for(int j=apos; j<apos+alen; j++) + ret[ aix[j] ] = avals[j]; + } + } + else { + System.arraycopy(input.getDenseBlockValues(), + n*input.getNumColumns(), ret, 0, input.getNumColumns()); + } + } + + private static void addBias(int r, double [] out, double [] bias, int K, int PQ) { + for(int k=0, cix=r*K*PQ; k<K; k++, cix+=PQ) + LibMatrixMult.vectAddInPlace(bias[k], out, cix, PQ); + } + + private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) { + // NativeHelper.conv2dBackwardFilterSparseDense only if input is sparse. + // dout converted to dense if sparse. + return params.enableNative && params.input1.isInSparseFormat(); + } + + private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) { + // NativeHelper.conv2dSparse only if filter is dense and input is sparse + return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat(); + } + + private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) { + // NativeHelper.conv2dBackwardDataDense only if filter is dense. + // dout converted to dense if sparse. 
+ return params.enableNative && !params.input1.isInSparseFormat(); + } +} http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java deleted file mode 100644 index 03dfa93..0000000 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardDataHelper.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sysml.runtime.matrix.data; - -import java.util.Arrays; -import java.util.concurrent.Callable; - -import org.apache.sysml.api.DMLScript; -import org.apache.sysml.utils.NativeHelper; - -/** - * This class contains the set of operators used for performing conv2d backward data - */ -public class LibMatrixDNNConv2dBackwardDataHelper { - - /** - * This operator is used only if native is enabled and filter is sparse. - * dout is converted into dense if sparse. 
- */ - public static class SparseNativeConv2dBackwardDataDense implements Callable<Long> - { - public int _rl; public int _ru; - private final ConvolutionParameters _params; - public SparseNativeConv2dBackwardDataDense(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - int CHW = _params.C*_params.H*_params.W; - double [] ret = new double[CHW]; - double [] filterArr = _params.input1.getDenseBlockValues(); - double [] dout_n = new double[_params.P*_params.Q*_params.K]; - for(int n = _rl; n < _ru; n++) { - LibMatrixDNNHelper.getRowInDenseFormat(_params.input2, n, dout_n); - if(n > _rl) - Arrays.fill(ret, 0); - NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, - _params.C, _params.H, _params.W, _params.K, - _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); - System.arraycopy(ret, 0, _params.output.getDenseBlockValues(), n*CHW, CHW); - } - //multi-threaded nnz maintenance of current working set - return _params.output.recomputeNonZeros(_rl, _ru-1); - } - } - - /** - * General conv2d backward data operator - */ - public static class Conv2dBackwardData implements Callable<Long> { - - public int _rl; public int _ru; - private final ConvolutionParameters _params; - public Conv2dBackwardData(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - int PQ = _params.P*_params.Q; int K = _params.K; int CRS = _params.C*_params.R*_params.S; - MatrixBlock filter = _params.input1; - MatrixBlock dout = _params.input2; - MatrixBlock outRotate = new MatrixBlock(PQ, K, dout.sparse); - MatrixBlock outMM = new MatrixBlock(PQ, CRS, false); - outRotate.allocateBlock(); - LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = - LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( dout, outRotate, _params, true, false); - long time1 = 0; long time2 = 0; - for(int n = _rl; n < _ru; n++) { - // rotate180(dout[n,]) => dout_reshaped - rotate180Worker.execute(n, 0); - // dout_reshaped %*% filter => temp - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - outMM.reset(PQ, CRS, false); - LibMatrixDNNHelper.singleThreadedMatMult(outRotate, filter, outMM, !outRotate.sparse, false, _params); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - // col2im(temp) => output[n,] - LibMatrixDNNHelper.doCol2imOverSingleImage(n, outMM, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; - - if(DMLScript.FINEGRAINED_STATISTICS) { - time1 += t2 - t1; - time2 += t3 - t2; - } - } - if(DMLScript.FINEGRAINED_STATISTICS) { - LibMatrixDNN.loopedConvBwdDataMatMultTime.addAndGet(time1); - LibMatrixDNN.loopedConvBwdDataCol2ImTime.addAndGet(time2); - } - - //multi-threaded nnz maintenance of current working set - return _params.output.recomputeNonZeros(_rl, _ru-1); - } - } -} http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java deleted file mode 100644 index f30916c..0000000 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dBackwardFilterHelper.java +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sysml.runtime.matrix.data; - -import java.util.concurrent.Callable; - -import org.apache.sysml.api.DMLScript; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNRotate180Helper.Rotate180Worker; -import org.apache.sysml.utils.NativeHelper; - -public class LibMatrixDNNConv2dBackwardFilterHelper { - - /** - * This operator is used only if native is enabled and input is sparse. - * dout is converted into dense if sparse. 
- */ - public static class SparseNativeConv2dBackwardFilterDense implements Callable<Long> - { - - public int _rl; public int _ru; - private final ConvolutionParameters _params; - public SparseNativeConv2dBackwardFilterDense(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - int CRS = _params.C*_params.R*_params.S, PQ = _params.P*_params.Q, K = _params.K; - MatrixBlock dout_n = new MatrixBlock(PQ, K, false); - dout_n.allocateBlock(); - LibMatrixDNNRotate180Helper.Rotate180Worker rotate180Worker = - LibMatrixDNNRotate180Helper.Rotate180Worker.getWorker( _params.input2, dout_n, _params, true, false); - double [] ldout_n = dout_n.getDenseBlockValues(); - double [] partRet = new double[CRS*_params.K]; //CRS x K - for(int n = _rl; n < _ru; n++) { - if( !_params.input1.getSparseBlock().isEmpty(n) ) { - // rotate180(dout[n,]) => dout_n - rotate180Worker.execute(n, 0); - - int apos = _params.input1.getSparseBlock().pos(n); - int alen = _params.input1.getSparseBlock().size(n); - int[] aix = _params.input1.getSparseBlock().indexes(n); - double[] avals = _params.input1.getSparseBlock().values(n); - NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, - ldout_n, partRet, 1, _params.C, _params.H, _params.W, _params.K, - _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); - } - } - inplaceTransAdd(partRet, _params); - return 0L; - } - } - - /** - * General conv2d backward data operator - */ - public static class Conv2dBackwardFilter implements Callable<Long> { - private final int _rl, _ru; - private final ConvolutionParameters _params; - - public Conv2dBackwardFilter(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; - MatrixBlock dout = _params.input2; - MatrixBlock im2ColOutBlock = new MatrixBlock(CRS, PQ, false); - MatrixBlock outRotate = new MatrixBlock(PQ, K, dout.sparse); - MatrixBlock outMM = new MatrixBlock(CRS, K, false); - outRotate.allocateBlock(); - - Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, false); - Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, outRotate, _params, true, false); - double [] partRet = new double[CRS*_params.K]; - long time1 = 0; long time2 = 0; - for(int n = _rl; n < _ru; n++) { - // rotate180(dout[n,]) => dout_reshaped - rotate180Worker.execute(n, 0); - - // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - im2ColWorker.execute(n); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - - outMM.reset(CRS, K, false); - LibMatrixDNNHelper.singleThreadedMatMult(im2ColOutBlock, outRotate, outMM, !im2ColOutBlock.sparse, !outRotate.sparse, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; - - if( !outMM.isEmptyBlock() ) //accumulate row results - LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS); - - if(DMLScript.FINEGRAINED_STATISTICS) { - time1 += t2 - t1; - time2 += t3 - t2; - } - } - inplaceTransAdd(partRet, _params); - if(DMLScript.FINEGRAINED_STATISTICS) { - LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); - LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); - } - return 0L; - } - } - - public static class Conv2dBackwardFilterTrans implements Callable<Long> { - private final int _rl, _ru; - private final ConvolutionParameters _params; - - public Conv2dBackwardFilterTrans(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; - MatrixBlock dout = _params.input2; - MatrixBlock im2ColOutBlock = new MatrixBlock(PQ, CRS, false).allocateBlock(); - MatrixBlock outRotate = new MatrixBlock(K, PQ, dout.sparse).allocateBlock(); - MatrixBlock outMM = new MatrixBlock(K, CRS, false).allocateBlock(); - - Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, true, true); - Rotate180Worker rotate180Worker = Rotate180Worker.getWorker( dout, outRotate, _params, true, true); - double [] partRet = new double[CRS*_params.K]; - long time1 = 0; long time2 = 0; - for(int n = _rl; n < _ru; n++) { - // rotate180(dout[n,]) => dout_reshaped - rotate180Worker.execute(n, 0); - - // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - im2ColWorker.execute(n); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - - outMM.reset(K, CRS, false); - //Timing time = new Timing(true); - LibMatrixDNNHelper.singleThreadedMatMult(outRotate, im2ColOutBlock, - outMM, !outRotate.sparse, !im2ColOutBlock.sparse, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; - - if( !outMM.isEmptyBlock() ) //accumulate row results - LibMatrixMult.vectAdd(outMM.getDenseBlockValues(), partRet, 0, 0, K*CRS); - - if(DMLScript.FINEGRAINED_STATISTICS) { - time1 += t2 - t1; - time2 += t3 - t2; - } - } - //no need to transpose because t(t(out)) cancels out - inplaceAdd(partRet, _params); - if(DMLScript.FINEGRAINED_STATISTICS) { - LibMatrixDNN.loopedConvBwdFilterIm2ColTime.addAndGet(time1); - LibMatrixDNN.loopedConvBwdFilterMatMultTime.addAndGet(time2); - } - return 0L; - } - } - - private static void inplaceAdd(double[] a, ConvolutionParameters params) { - synchronized (params.output.denseBlock) { - LibMatrixMult.vectAdd(a, params.output.getDenseBlockValues(), 0, 0, a.length); - } - } - - private static void inplaceTransAdd(double[] a, ConvolutionParameters params) { - synchronized (params.output.denseBlock) { - // Perform transposed addition: output of size [K, CRS] += input of size [CRS,K] - double [] c = params.output.getDenseBlockValues(); - final int CRS = params.C*params.R*params.S, K = params.K; - final int blocksizeIJ = 128; //L2 cache - - //cache-conscious blocked execution - for( int bi=0; bi<CRS; bi+=blocksizeIJ ) - for( int bj=0; bj<K; bj+=blocksizeIJ ) { - int bimin = Math.min(bi+blocksizeIJ, CRS); - int bjmin = Math.min(bj+blocksizeIJ, K); - //core transpose add operation - for(int i=bi, aix=bi*K; i<bimin; i++, aix+=K) - for(int j=bj, cix=i+bj*CRS; j<bjmin; j++, cix+=CRS) - c[cix] += a[aix+j]; - } - } - } -} http://git-wip-us.apache.org/repos/asf/systemml/blob/45eec2d2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java deleted file mode 100644 index 3699512..0000000 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNConv2dHelper.java +++ /dev/null @@ -1,307 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sysml.runtime.matrix.data; - -import java.util.ArrayList; -import java.util.concurrent.Callable; - -import org.apache.sysml.api.DMLScript; -import org.apache.sysml.runtime.matrix.data.LibMatrixDNNIm2ColHelper.Im2colWorker; -import org.apache.sysml.utils.NativeHelper; - -/** - * This class contains the set of operators used for performing conv2d - */ -public class LibMatrixDNNConv2dHelper { - - /** - * Performs convolution via: add(filter %*% im2col(input)) = output. - * This operator has less memory pressure than LoopedIm2ColConv2dAllChan. 
- */ - public static class LoopedIm2ColConv2dOneChan implements Callable<Long> - { - protected final int _rl, _ru; - protected final ConvolutionParameters _params; - protected final ArrayList<MatrixBlock> _filters; - - public LoopedIm2ColConv2dOneChan(int rl, int ru, ConvolutionParameters params, ArrayList<MatrixBlock> filters) { - _rl = rl; _ru = ru; - _params = params; - _filters = filters; - } - - @Override - public Long call() throws Exception { - int PQ = _params.P*_params.Q; int K = _params.K; - int RS = _params.R*_params.S; - MatrixBlock im2ColOutBlock = new MatrixBlock(RS, PQ, false); - Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, im2ColOutBlock, _params, false, false); - long time1 = 0; long time2 = 0; - for(int n = _rl; n < _ru; n++) { - for(int c = 0; c < _params.C; c++) { - // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - im2ColWorker.execute(n, c); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - - // filter %*% _im2ColOutBlock => matMultOutBlock - MatrixBlock matMultOutBlock = new MatrixBlock(K, PQ, false); - LibMatrixDNNHelper.singleThreadedMatMult(_filters.get(c), im2ColOutBlock, matMultOutBlock, false, true, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - - if(DMLScript.FINEGRAINED_STATISTICS) { - time1 += t2 - t1; - time2 += t3 - t2; - } - - // Add the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos - add(matMultOutBlock, _params.output.getDenseBlockValues(), n*K*PQ, K, PQ); - } - // Add bias to current row if necessary, always dense - if(_params.bias != null) - LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlockValues(), - _params.bias.getDenseBlockValues(), K, PQ); - } - if(DMLScript.FINEGRAINED_STATISTICS) { - LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); - LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); - } - - //multi-threaded nnz maintenance of current working set - return _params.output.recomputeNonZeros(_rl, _ru-1); - } - - // Copy the matrix src of shape [K X PQ] to params.output.denseBlock + destPos - private static void add(MatrixBlock src, double [] dest, int destPos, int K, int PQ) { - // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware. 
- // This is not required in Native implementation - if(!src.isEmptyBlock()) { - if(src.isInSparseFormat()) { - // Copy the sparse matrix matMultOutBlock of shape [K X PQ] to - // params.output.denseBlock + destPos - for(int k = 0; k < src.getNumRows(); k++) { - if( !src.sparseBlock.isEmpty(k) ) { - int apos = src.sparseBlock.pos(k); - int alen = src.sparseBlock.size(k); - int[] aix = src.sparseBlock.indexes(k); - double[] avals = src.sparseBlock.values(k); - int desPosK = destPos + k*PQ; - for(int j = apos; j < apos+alen; j++) { - int pqIndex = aix[j]; - dest[desPosK + pqIndex ] += avals[j]; - } - } - } - } - else { - LibMatrixMult.vectAdd(src.getDenseBlockValues(), dest, 0, destPos, K*PQ); - } - } - } - } - - /** - * Performs convolution via: partialCopy1(filter %*% im2col(input)) = output - */ - public static class LoopedIm2ColConv2dAllChan implements Callable<Long> - { - protected final int _rl, _ru; - protected final ConvolutionParameters _params; - - public LoopedIm2ColConv2dAllChan(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - final int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; - MatrixBlock outIm2col = new MatrixBlock(CRS, PQ, false); - MatrixBlock outMM = new MatrixBlock(K, PQ, false); - Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, outIm2col, _params, true, false); - long time1 = 0; long time2 = 0; - for(int n = _rl; n < _ru; n++) { - // im2col(input) => _im2ColOutBlock - long t1 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - im2ColWorker.execute(n); - long t2 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - - // filter %*% _im2ColOutBlock => matMultOutBlock - outMM.reset(outMM.rlen, outMM.clen, false); - LibMatrixDNNHelper.singleThreadedMatMult(_params.input2, outIm2col, outMM, false, true, _params); - long t3 = DMLScript.FINEGRAINED_STATISTICS ? System.nanoTime() : 0; - - if(DMLScript.FINEGRAINED_STATISTICS) { - time1 += t2 - t1; - time2 += t3 - t2; - } - - // Copy the matrix matMultOutBlock of shape [K X PQ] to params.output.denseBlock + destPos - partialCopy1(outMM, _params.output.getDenseBlockValues(), n*K*PQ, K, PQ); - - // Add bias to current row if necessary, always dense - if(_params.bias != null) - LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlockValues(), - _params.bias.getDenseBlockValues(), K, PQ); - } - - if(DMLScript.FINEGRAINED_STATISTICS) { - LibMatrixDNN.loopedConvIm2ColTime.addAndGet(time1); - LibMatrixDNN.loopedConvMatMultTime.addAndGet(time2); - } - - //multi-threaded nnz maintenance of current working set - return _params.output.recomputeNonZeros(_rl, _ru-1); - } - - // Copy the matrix src of shape [K X PQ] to params.output.denseBlock + destPos - private static void partialCopy1(MatrixBlock src, double [] dest, int destPos, int K, int PQ) { - // Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware. 
- // This is not required in Native implementation - if( src.isEmptyBlock() ) - return; - if(src.isInSparseFormat()) { - SparseBlock sblock = src.sparseBlock; - for(int k = 0; k < src.getNumRows(); k++) { - if( sblock.isEmpty(k) ) continue; - int apos = sblock.pos(k); - int alen = sblock.size(k); - int[] aix = sblock.indexes(k); - double[] avals = sblock.values(k); - int desPosK = destPos + k*PQ; - for(int j = apos; j < apos+alen; j++) - dest[desPosK+aix[j]] = avals[j]; - } - } - else - System.arraycopy(src.getDenseBlockValues(), 0, dest, destPos, K * PQ); - } - } - - /** - * This implementation is similar to LoopedIm2ColConv2dAllChan, except for using a - * sparse-dense matrix multiplication with t(t(Xi) %*% t(F)) instead of a - * dense-sparse matrix multiplication with F %*% Xi. - * - * NOTE: this implementation assumes that the filter is passed in transposed form - * in order to share this temporary matrix (and its creation cost) across threads. - */ - public static class LoopedIm2ColConv2dTransAllChan extends LoopedIm2ColConv2dAllChan - { - public LoopedIm2ColConv2dTransAllChan(int rl, int ru, ConvolutionParameters params) { - super(rl, ru, params); - } - - @Override - public Long call() throws Exception { - final int PQ = _params.P*_params.Q, K = _params.K, CRS = _params.C*_params.R*_params.S; - MatrixBlock outIm2col = new MatrixBlock(PQ, CRS, false); - MatrixBlock outMM = new MatrixBlock(PQ, K, false); - Im2colWorker im2ColWorker = Im2colWorker.getWorker( _params.input1, outIm2col, _params, true, true); - - for(int n = _rl; n < _ru; n++) { - // im2col(input) => _im2ColOutBlock - im2ColWorker.execute(n); - - // t(_im2ColOutBlock) %*% t(filter) => t(matMultOutBlock) - outMM.reset(outMM.rlen, outMM.clen, false); - LibMatrixDNNHelper.singleThreadedMatMult(outIm2col, _params.input2, outMM, false, false, _params); - - // Copy the matrix matMultOutBlock of shape [PQ X K] to params.output.denseBlock + destPos w/ piggybacked transpose - partialCopyTrans(outMM, _params.output, n*K*PQ, K, PQ); - - // Add bias to current row if necessary, always dense - if(_params.bias != null) - LibMatrixDNNHelper.addBias(n, _params.output.getDenseBlockValues(), - _params.bias.getDenseBlockValues(), K, PQ); - } - - //multi-threaded nnz maintenance of current working set - return _params.output.recomputeNonZeros(_rl, _ru-1); - } - - private static void partialCopyTrans(MatrixBlock src, MatrixBlock dest, int destPos, int K, int PQ) { - if( src.isEmptyBlock() ) - return; - //copy src into its destination row w/ piggybacked transpose - //src is [PQ x K] -> [K x PQ] -> [1 x KPQ] - if(src.isInSparseFormat()) { - SparseBlock sblock = src.sparseBlock; - double[] c = dest.getDenseBlockValues(); - for(int i = 0; i < src.getNumRows(); i++) { - if( sblock.isEmpty(i) ) continue; - int apos = sblock.pos(i); - int alen = sblock.size(i); - int[] aix = sblock.indexes(i); - double[] avals = sblock.values(i); - int desPosK = destPos + i; - for(int j = apos; j < apos+alen; j++) - c[desPosK+aix[j]*PQ] = avals[j]; - } - } - else { - double[] a = src.getDenseBlockValues(); - double[] c = dest.getDenseBlockValues(); - final int blocksizeIJ = 128; //128KB for L2 - //cache-conscious blocked execution - for( int bi = 0; bi < PQ; bi+=blocksizeIJ ) - for( int bj = 0; bj < K; bj+=blocksizeIJ ) { - int bimin = Math.min(bi+blocksizeIJ, PQ); - int bjmin = Math.min(bj+blocksizeIJ, K); - //core transpose operation - for(int i=bi, aix=bi*K+bj, cix=bj*PQ+bi; i<bimin; i++, aix+=K, cix++) - LibMatrixReorg.transposeRow(a, c, aix, destPos+cix, PQ, bjmin-bj); - } - } - } - } - 
- /** - * This operator is used only if native is enabled, filter is dense and input is sparse - */ - public static class SparseNativeConv2d implements Callable<Long> - { - public int _rl; public int _ru; - private final ConvolutionParameters _params; - public SparseNativeConv2d(int rl, int ru, ConvolutionParameters params) { - _rl = rl; _ru = ru; - _params = params; - } - - @Override - public Long call() throws Exception { - int KPQ = _params.K*_params.P*_params.Q; - double[] temp = new double[KPQ]; - for(int n = _rl; n < _ru; n++) { - if( !_params.input1.getSparseBlock().isEmpty(n) ) { - int apos = _params.input1.getSparseBlock().pos(n); - int alen = _params.input1.getSparseBlock().size(n); - int[] aix = _params.input1.getSparseBlock().indexes(n); - double[] avals = _params.input1.getSparseBlock().values(n); - NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlockValues(), temp, - 1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, - _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); - System.arraycopy(temp, 0, _params.output.getDenseBlockValues(), n*KPQ, KPQ); - } - } - //multi-threaded nnz maintenance of current working set - return _params.output.recomputeNonZeros(_rl, _ru-1); - } - } -}
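----------------------------------------------------------------------
Editor's note on the removed inplaceTransAdd: the backward-filter workers accumulate a thread-local partial gradient of shape [CRS x K] and fold it into the shared [K x CRS] output under a lock. The following minimal sketch restates that cache-blocked transpose-add in isolation, assuming row-major double[] layouts; the standalone signature and the name transAdd are introduced here for illustration only, while the 128-wide tiling matches the deleted code.

// Accumulate the row-major [CRS x K] array a into the row-major [K x CRS]
// array c, i.e., c[j,i] += a[i,j]. Processing 128x128 tiles bounds the
// stride-CRS writes into c so both tiles stay resident in the L2 cache.
static void transAdd(double[] a, double[] c, int CRS, int K) {
  final int B = 128; // tile size, chosen for L2-cache residency
  for(int bi = 0; bi < CRS; bi += B)
    for(int bj = 0; bj < K; bj += B) {
      int bimin = Math.min(bi + B, CRS);
      int bjmin = Math.min(bj + B, K);
      for(int i = bi; i < bimin; i++)    // row i of a
        for(int j = bj; j < bjmin; j++)  // column j of a, row j of c
          c[j*CRS + i] += a[i*K + j];
    }
}

Without the tiling, each consecutive write into c jumps CRS elements and touches a new cache line once CRS exceeds the cache capacity; with it, a small tile of c is reused across B consecutive columns of a.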

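----------------------------------------------------------------------
Editor's note on the looped im2col operators: each conv2d worker above reduces the convolution of image n to a single matrix multiplication, filter [K x CRS] %*% im2col(Xn) [CRS x PQ] = out [K x PQ]. The self-contained sketch below demonstrates the idea for a toy single-channel, stride-1, unpadded case; all class and method names are illustrative and not part of the SystemML API.

public class Im2ColConvSketch {
  // im2col: unroll all RxS patches of an HxW image into a [R*S x P*Q]
  // matrix, where P = H-R+1 and Q = W-S+1 are the output dimensions.
  static double[][] im2col(double[] in, int H, int W, int R, int S) {
    int P = H - R + 1, Q = W - S + 1;
    double[][] out = new double[R*S][P*Q];
    for(int p = 0; p < P; p++)
      for(int q = 0; q < Q; q++)
        for(int r = 0; r < R; r++)
          for(int s = 0; s < S; s++)
            out[r*S + s][p*Q + q] = in[(p+r)*W + (q+s)];
    return out;
  }

  // conv2d as one matrix multiply:
  // filter [K x R*S] %*% im2col(input) [R*S x P*Q] = output [K x P*Q]
  static double[][] conv2d(double[] in, int H, int W, double[][] filter, int R, int S) {
    double[][] col = im2col(in, H, W, R, S);
    int K = filter.length, RS = R*S, PQ = col[0].length;
    double[][] out = new double[K][PQ];
    for(int k = 0; k < K; k++)
      for(int i = 0; i < RS; i++) {
        double f = filter[k][i];
        if( f == 0 ) continue; // skip zero filter weights
        for(int j = 0; j < PQ; j++)
          out[k][j] += f * col[i][j];
      }
    return out;
  }

  public static void main(String[] args) {
    double[] in = {1,2,3, 4,5,6, 7,8,9}; // 3x3 image, row-major
    double[][] filter = {{1,0, 0,1}};    // one 2x2 filter (K=1)
    double[][] out = conv2d(in, 3, 3, filter, 2, 2);
    for(double v : out[0])
      System.out.print(v + " "); // prints: 6.0 8.0 12.0 14.0
  }
}

The transposed variant (LoopedIm2ColConv2dTransAllChan) instead computes t(im2col(Xn)) %*% t(filter) and transposes while copying into the output row, which turns the dense-sparse multiply into a sparse-dense one when the input, and hence the patch matrix, is sparse.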