http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index f839990..c794932 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -21,15 +21,20 @@ package org.apache.sysml.runtime.instructions.cp; import java.util.ArrayList; import java.util.Arrays; + +import org.apache.sysml.conf.ConfigurationManager; +import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.matrix.data.ConvolutionParameters; import org.apache.sysml.runtime.matrix.data.LibMatrixDNN; +import org.apache.sysml.runtime.matrix.data.LibMatrixNative; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; +import org.apache.sysml.utils.NativeHelper; public class ConvolutionCPInstruction extends UnaryCPInstruction { @@ -131,7 +136,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction return new ConvolutionCPInstruction(in, out, opcode, str, stride, padding, input_shape, filter_shape, k); } - else if (opcode.equalsIgnoreCase("maxpooling_backward") + else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward") || opcode.equalsIgnoreCase("conv2d") || 
opcode.equalsIgnoreCase("conv2d_backward_filter") || opcode.equalsIgnoreCase("conv2d_backward_data")) { @@ -288,6 +293,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction ec.setMatrixOutput(getOutputVariableName(), outputBlock); } + // Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true + // This increases the number of native calls. For example:the cases where filter is sparse but input is dense + private boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException { + long numElems = filter.getNumRows()*filter.getNumColumns(); + // if filter is less than 10 MB in dense format (which handles almost all the cases). + // In fact, using threshold of 1 MB is still sufficient for common CNNs. + if(filter.isInSparseFormat() && numElems < 10e+6) + filter.sparseToDense(); + return filter.isInSparseFormat(); + } + @Override public void processInstruction(ExecutionContext ec) @@ -326,6 +342,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w); ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads); + params.enableNative = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS) && NativeHelper.isNativeLibraryLoaded(); if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) { if(matBlock.isEmptyBlock()) { outputBlock = new MatrixBlock(N, C*P*Q, true); @@ -337,14 +354,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction LibMatrixDNN.maxpooling(matBlock, outputBlock, params); } } - else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { + else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) { MatrixBlock dout = ec.getMatrixInput(_in2.getName()); if(matBlock.isEmptyBlock() || dout.isEmptyBlock()) { outputBlock = new MatrixBlock(N, C*H*W, 
true); } else { outputBlock = getDenseOutputBlock(N, C*H*W); - LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params); + if(instOpcode.equalsIgnoreCase("maxpooling_backward")) + LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, false); + else + LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, true); } ec.releaseMatrixInput(_in2.getName()); } @@ -355,7 +375,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction } else { outputBlock = getDenseOutputBlock(N, K*P*Q); - LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); + if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat()) + LibMatrixNative.conv2d(matBlock, filter, outputBlock, params); + else + LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); } ec.releaseMatrixInput(_in2.getName()); } @@ -367,9 +390,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction } else { outputBlock = getDenseOutputBlock(N, K*P*Q); - if(!bias.isEmptyBlock()) + if(!bias.isEmptyBlock()) { params.bias = bias; - LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); + } + if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat()) + LibMatrixNative.conv2d(matBlock, filter, outputBlock, params); + else + LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); } ec.releaseMatrixInput(_in3.getName()); ec.releaseMatrixInput(_in2.getName()); @@ -381,7 +408,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction } else { outputBlock = getDenseOutputBlock(K, C*R*S); - LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params); + if(params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat()) + LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params); + else + LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params); } ec.releaseMatrixInput(_in2.getName()); } @@ -392,7 +422,10 @@ public class 
ConvolutionCPInstruction extends UnaryCPInstruction } else { outputBlock = getDenseOutputBlock(N, C * H * W); - LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params); + if(params.enableNative && !isFilterSparse(matBlock) && !dout.isInSparseFormat()) + LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params); + else + LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params); } ec.releaseMatrixInput(_in2.getName()); }
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index 497b182..9d4cd1f 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -141,7 +141,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride, padding, input_shape, filter_shape); } - else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) { + else if (opcode.equalsIgnoreCase("maxpooling")) { InstructionUtils.checkNumFields(parts, 14); CPOperand in1 = new CPOperand(parts[1]); CPOperand out = new CPOperand(parts[14]); @@ -303,7 +303,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } - else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) { + else if (instOpcode.equalsIgnoreCase("maxpooling")) { MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); if(image.getNumRows() != N || image.getNumColumns() != C*H*W) @@ -315,9 +315,6 @@ public class ConvolutionGPUInstruction extends GPUInstruction if(instOpcode.equalsIgnoreCase("maxpooling")) LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - else - LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), 
image, out, N, C, H, W, - K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName()); @@ -341,7 +338,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); - if (!( instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) ) + if ( !instOpcode.equalsIgnoreCase("maxpooling") ) ec.releaseMatrixInputForGPUInstruction(_input2.getName()); if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java index b485233..f687cc0 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java @@ -25,6 +25,8 @@ import java.util.Iterator; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.broadcast.Broadcast; +import org.apache.sysml.conf.ConfigurationManager; +import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; @@ -41,6 +43,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; import org.apache.sysml.runtime.matrix.data.ConvolutionParameters; import org.apache.sysml.runtime.matrix.data.InputInfo; import 
org.apache.sysml.runtime.matrix.data.LibMatrixDNN; +import org.apache.sysml.runtime.matrix.data.LibMatrixNative; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.MatrixIndexes; import org.apache.sysml.runtime.matrix.data.OutputInfo; @@ -281,7 +284,8 @@ public class ConvolutionSPInstruction extends UnarySPInstruction { int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w); ConvolutionParameters params = new ConvolutionParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1); - JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows()), true); + boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); + JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true); //put output RDD handle into symbol table sec.setRDDHandleForVariable(output.getName(), out); @@ -316,15 +320,16 @@ public class ConvolutionSPInstruction extends UnarySPInstruction { Broadcast<MatrixBlock> filterBroadcast = null; Broadcast<MatrixBlock> biasBroadcast = null; ConvolutionParameters params = null; - String instOpcode = null; + String instOpcode = null; boolean enableNative; long numRows = 0; public RDDConv2dMapMMFunction(Broadcast<MatrixBlock> filterBroadcast, - ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows) { + ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows, boolean enableNativeBLAS) { this.filterBroadcast = filterBroadcast; this.params = params; this.instOpcode = instOpcode; this.biasBroadcast = biasBroadcast; this.numRows = numRows; + this.enableNative = enableNativeBLAS; } private MatrixBlock 
processRectangularBlock(MatrixBlock matBlock) throws Exception { @@ -336,7 +341,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction { } else { outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q); - LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); + if(enableNative) + LibMatrixNative.conv2d(matBlock, filter, outputBlock, params); + else + LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); } } else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) { @@ -349,7 +357,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction { outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q); if(!bias.isEmptyBlock()) params.bias = bias; - LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); + if(enableNative) + LibMatrixNative.conv2d(matBlock, filter, outputBlock, params); + else + LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params); } } else if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java index 3f0437f..11e74ca 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java @@ -34,7 +34,7 @@ public class ConvolutionParameters implements Serializable { public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w; public int P; public int Q; public int numThreads; - + public boolean enableNative = false; public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock 
output; public MatrixBlock bias; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 9ddd87a..56360f8 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -1043,41 +1043,6 @@ public class LibMatrixCUDA { } /** - * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...) - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param image image as matrix object - * @param outputBlock output matrix - * @param N batch size - * @param C number of channels - * @param H height of image - * @param W width of image - * @param K number of filters - * @param R height of filter - * @param S width of filter - * @param pad_h vertical padding - * @param pad_w horizontal padding - * @param stride_h horizontal stride - * @param stride_w vertical stride - * @param P (H - R + 1 + 2*pad_h)/stride_h - * @param Q (W - S + 1 + 2*pad_w)/stride_w - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image, - MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, - int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, - int Q) throws DMLRuntimeException { - LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx); - cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W); - long size = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE; - Pointer tmp = gCtx.allocate(size); - performCuDNNReLU(gCtx, instName, 
image, tmp, srcTensorDesc); - //cudaDeviceSynchronize; // It seemed like the cudnn operation in performCuDNNReLU was being done aysnchronously, this adds the neccesary sync - performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - gCtx.cudaFreeHelper(tmp); - } - - /** * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...) * This method computes the backpropogation errors for previous layer of maxpooling operation * @param gCtx a valid {@link GPUContext} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index cd88b95..6a43917 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -36,6 +36,8 @@ import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; +import org.apache.sysml.utils.NativeHelper; +import org.apache.sysml.utils.Statistics; /** * This class allows users to invoke deep learning related operations @@ -57,10 +59,10 @@ public class LibMatrixDNN { public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in stats output private enum TaskType { - MaxPooling_Forward, MaxPooling_Backward, + MaxPooling_Forward, MaxPooling_Backward, MaxPooling_Relu_Backward, // Alternate approaches that we tried but the performance was unsatisfactory be included: direct, non-looped im2col LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData, - BiasAdd, 
ReluBackward, BiasMultiply + ReluBackward } // ------------------------------------------------------------------------------------------------ @@ -141,6 +143,30 @@ public class LibMatrixDNN { // ------------------------------------------------------------------------------------------------ /** + * This method performs convolution (i.e. cross-correlation) operation on input + * + * @param input input batch + * @param filter filter + * @param outputBlock output of convolution + * @param params convolution parameters + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params); + + if(params.bias != null && params.bias.isInSparseFormat()) + params.bias.sparseToDense(); // Since bias is extremely small array + + if(isEligibleForConv2dSparse(params)) + Statistics.numNativeLibMatrixDNNCalls.increment(); + + runConvTask(TaskType.LoopedIm2ColConv2d, params); + + //post-processing: maintain nnz + outputBlock.recomputeNonZeros(); + } + + /** * This method computes the backpropogation errors for previous layer of convolution operation * * @param filter filter used in conv2d @@ -150,25 +176,10 @@ public class LibMatrixDNN { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { - params.input1 = filter; - params.input2 = dout; - params.output = outputBlock; - if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S || - dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) { - throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter"); - } - if(params.stride_h <= 0 || params.stride_w <= 0) { - throw new 
DMLRuntimeException("Only positive strides supported"); - } + checkInputsConv2dBackwardData(filter, dout, outputBlock, params); - if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { - if(filter.isInSparseFormat() || dout.isInSparseFormat()) { - conv2dBwdDataSparseCount.addAndGet(1); - } - else { - conv2dBwdDataDenseCount.addAndGet(1); - } - } + if(isEligibleForConv2dBackwardDataDense(params)) + Statistics.numNativeLibMatrixDNNCalls.increment(); runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params); @@ -186,17 +197,71 @@ public class LibMatrixDNN { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { - params.input1 = input; + checkInputsConv2dBackwardFilter(input, dout, outputBlock, params); + + if(isEligibleForConv2dBackwardFilterSparseDense(params)) + Statistics.numNativeLibMatrixDNNCalls.increment(); + + runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params); + + //post-processing: maintain nnz + outputBlock.recomputeNonZeros(); + } + + + private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException { + if(lhs != rhs) + throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs); + } + private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException { + if(lhs != (rhs1*rhs2*rhs3)) + throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3); + } + + static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = filter; params.input2 = dout; params.output = outputBlock; - if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || - dout.getNumRows() != params.N || dout.getNumColumns() != 
params.K*params.P*params.Q) { - throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter"); - } - if(params.stride_h <= 0 || params.stride_w <= 0) { - throw new DMLRuntimeException("Only positive strides supported"); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != " + + "number of filters in filter_shape", filter.getNumRows(), params.K); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != " + + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != " + + "batch size in input_shape", dout.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != " + + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q); + if(params.stride_h <= 0 || params.stride_w <= 0) + throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + + if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { + if(filter.isInSparseFormat() || dout.isInSparseFormat()) { + conv2dBwdDataSparseCount.addAndGet(1); + } + else { + conv2dBwdDataDenseCount.addAndGet(1); + } } + int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + if (!(ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1)) + params.numThreads = 1; + } + + static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + params.input1 = input; + params.input2 = dout; + params.output = outputBlock; + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != " + + "batch size in input_shape", input.getNumRows(), 
params.N); + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != " + + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W); + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != " + + "batch size in input_shape", dout.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != " + + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q); + if(params.stride_h <= 0 || params.stride_w <= 0) + throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); + if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { if(input.isInSparseFormat() || dout.isInSparseFormat()) { conv2dBwdFilterSparseCount.addAndGet(1); @@ -205,11 +270,6 @@ public class LibMatrixDNN { conv2dBwdFilterDenseCount.addAndGet(1); } } - - runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params); - - //post-processing: maintain nnz - outputBlock.recomputeNonZeros(); } /** @@ -252,11 +312,10 @@ public class LibMatrixDNN { MatrixBlock filter = params.input1; MatrixBlock dout = params.input2; doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true); - dout_reshaped.recomputeNonZeros(); MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false); long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; - LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false); + singleThreadedMatMult(dout_reshaped, filter, temp, true, false, params); long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; doCol2imOverSingleImage(n, temp, params); long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0 ; @@ -270,15 +329,13 @@ public class LibMatrixDNN { MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params, double [] tempIm2ColArr) throws DMLRuntimeException { long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; doIm2col(n, im2ColOutBlock, params, tempIm2ColArr); - im2ColOutBlock.recomputeNonZeros(); long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true); - dout_reshaped.recomputeNonZeros(); MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false); long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; - LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false); + singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, params); long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ; if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { loopedConvBwdFilterMatMultTime.addAndGet(t4-t3); @@ -298,15 +355,21 @@ public class LibMatrixDNN { ret[2] = j % W; } - public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { params.input1 = input; params.input2 = filter; params.output = outputBlock; - if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || - filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S) { - throw new DMLRuntimeException("Incorrect input to conv2d: " + input.getNumRows()); - } + checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != " + + "number of filters in filter_shape", filter.getNumRows(), params.K); + 
checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != " + + "channels*filter_height*filter_height in filter_shape", filter.getNumColumns(), params.C, params.R, params.S); + checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != " + + "batch size in input_shape", input.getNumRows(), params.N); + checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != " + + "channels*input_height*input_height in input_shape", input.getNumColumns(), params.C, params.H, params.W); + if(params.stride_h <= 0 || params.stride_w <= 0) + throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w); if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { if(input.isInSparseFormat() || filter.isInSparseFormat()) { @@ -316,21 +379,36 @@ public class LibMatrixDNN { conv2dDenseCount.addAndGet(1); } } - - runConvTask(TaskType.LoopedIm2ColConv2d, params); - - //post-processing: maintain nnz - outputBlock.recomputeNonZeros(); + } + + // Single-threaded matrix multiplication + private static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, + boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException { + if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) { + if(recomputeNNZM1) + m1.recomputeNonZeros(); + if(recomputeNNZM2) + m2.recomputeNonZeros(); + LibMatrixMult.matrixMult(m1, m2, ret, false); + } + else { + ret.sparse = false; + if(ret.getDenseBlock() == null) + ret.allocateDenseBlock(); + NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, + ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1); + ret.recomputeNonZeros(); + } } private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params, double [] temp) throws DMLRuntimeException { long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? 
System.nanoTime() : 0; doIm2col(n, im2ColOutBlock, params, temp); - im2ColOutBlock.recomputeNonZeros(); long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false); - LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false); + singleThreadedMatMult(params.input2, im2ColOutBlock, matMultOutBlock, false, true, params); + long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0; if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { @@ -380,9 +458,11 @@ public class LibMatrixDNN { * @param dout dout matrix * @param outputBlock output matrix * @param params convolution parameters + * @param performReluBackward perform ReLU backward * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, + ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { params.input1 = input; params.input2 = dout; params.output = outputBlock; @@ -407,7 +487,10 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Sparse maxpooling_backward is not supported"); fillIndexesArray(params); - runConvTask(TaskType.MaxPooling_Backward, params); + if(performReluBackward) + runConvTask(TaskType.MaxPooling_Relu_Backward, params); + else + runConvTask(TaskType.MaxPooling_Backward, params); //post-processing: maintain nnz outputBlock.recomputeNonZeros(); @@ -440,7 +523,7 @@ public class LibMatrixDNN { } } - private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException { + private static void doPoolingBackward(int n, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { double [] inputArray = null; 
if (!params.input1.isInSparseFormat()) inputArray = params.input1.getDenseBlock(); @@ -455,19 +538,19 @@ public class LibMatrixDNN { if(inputArray != null) { if(doutArray != null) - doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params); + doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params, performReluBackward); else - doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params); + doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params, performReluBackward); } else { if(doutArray != null) - doPoolingBackwardSparseDense(n, doutArray, outputArray, params); + doPoolingBackwardSparseDense(n, doutArray, outputArray, params, performReluBackward); else - doPoolingBackwardSparseSparse(n, outputArray, params); + doPoolingBackwardSparseSparse(n, outputArray, params, performReluBackward); } } - private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { + private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { if (!params.input1.isInSparseFormat()) throw new DMLRuntimeException("Incorrect usage: Call optimized versions"); @@ -477,7 +560,7 @@ public class LibMatrixDNN { double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q + p * params.Q + q]; if(inVal != 0) { final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params); + int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward); if(maxIndex != -1) outputArray[maxIndex] += inVal; } @@ -486,7 +569,7 @@ public class LibMatrixDNN { } } - private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws 
DMLRuntimeException { + private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { if (!params.input1.isInSparseFormat()) throw new DMLRuntimeException("Incorrect usage: Call optimized versions"); @@ -502,7 +585,7 @@ public class LibMatrixDNN { int p = tensorIndexes[1]; int q = tensorIndexes[2]; final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params); + int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward); if(maxIndex != -1) outputArray[maxIndex] += avals[j]; } @@ -510,7 +593,7 @@ public class LibMatrixDNN { } private static void doPoolingBackwardDenseSparse(int n, double [] inputArray, - MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException { + MatrixBlock dout, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { if( !dout.sparseBlock.isEmpty(n) ) { int [] tensorIndexes = new int[3]; int apos = dout.sparseBlock.pos(n); @@ -523,7 +606,7 @@ public class LibMatrixDNN { int p = tensorIndexes[1]; int q = tensorIndexes[2]; final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; - int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params); + int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward); if(maxIndex != -1) outputArray[maxIndex] += avals[j]; } @@ -531,14 +614,14 @@ public class LibMatrixDNN { } private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray, - double [] outputArray, ConvolutionParameters params) { + double [] outputArray, ConvolutionParameters params, boolean performReluBackward) { for (int c = 0; c < params.C; c++) { final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; final 
int outputOffset = n*params.C*params.P*params.Q + c*params.P*params.Q; for (int p = 0; p < params.P; p++) { for (int q = 0; q < params.Q; q++) { - int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params); + int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward); if(maxIndex != -1) outputArray[maxIndex] += doutArray[outputOffset + p * params.Q + q]; } @@ -556,10 +639,11 @@ public class LibMatrixDNN { * @param c number of channels * @param input input matrix * @param params convolution parameters + * @param performReluBackward perform ReLU on input * @return index of the cell with maximum value * @throws DMLRuntimeException if error occurs */ - private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException { + private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException { if(!input.isInSparseFormat()) throw new DMLRuntimeException("Incorrect usage: Only sparse format supported"); @@ -591,9 +675,10 @@ public class LibMatrixDNN { int h = tensorIndexes[1]; int w = tensorIndexes[2]; if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) { - if(maxVal < avals[j]) { + double val = performReluBackward && avals[j] < 0 ? 
0 : avals[j]; + if(maxVal < val) { maxIndex = inputOffset + h*params.W + w; - maxVal = avals[j]; + maxVal = val; } } } @@ -612,9 +697,10 @@ public class LibMatrixDNN { * @param inputOffset offset to be used for input index * @param inputArray input array * @param params convolution parameters + * @param performReluBackward perform ReLU backward * @return index of cell with maximum value */ - private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params) { + private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) { int start_index_h = params.start_indexes_h[p]; int end_index_h = params.end_indexes_h[p]; int start_index_w = params.start_indexes_w[q]; @@ -632,6 +718,7 @@ public class LibMatrixDNN { for (int h = start_index_h; h < end_index_h; h++) { for (int w = start_index_w; w < end_index_w; w++) { currDoutVal = inputArray[inputOffset + h*params.W + w]; + currDoutVal = performReluBackward && currDoutVal < 0 ? 
0 : currDoutVal; if(maxVal < currDoutVal) { maxIndex = inputOffset + h*params.W + w; maxVal = currDoutVal; @@ -894,11 +981,15 @@ public class LibMatrixDNN { private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params, ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks, ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) { + boolean isEligibleForConv2dSparse = (type == TaskType.LoopedIm2ColConv2d) && isEligibleForConv2dSparse(params); + boolean isEligibleForConv2dBackwardFilterSparseDense = (type == TaskType.LoopedIm2ColConv2dBwdFilter) && isEligibleForConv2dBackwardFilterSparseDense(params) ; for(int i = 0; i < poolSize; i++) { if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) { - MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false); - im2ColOutBlock.allocateDenseBlock(); - im2ColOutBlocks.add(im2ColOutBlock); + if(!isEligibleForConv2dSparse && !isEligibleForConv2dBackwardFilterSparseDense) { + MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false); + im2ColOutBlock.allocateDenseBlock(); + im2ColOutBlocks.add(im2ColOutBlock); + } } if(type == TaskType.LoopedIm2ColConv2dBwdFilter) { @@ -962,6 +1053,21 @@ public class LibMatrixDNN { } // ---------------------------------------------------------------------------------------------------------------- + private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) { + // NativeHelper.conv2dBackwardFilterSparseDense only if filter is sparse. + // dout converted to dense if sparse. 
+ return params.enableNative && params.input1.isInSparseFormat(); + } + private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) { + // NativeHelper.conv2dSparse only if filter is dense and input is sparse + return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat(); + } + private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) { + // NativeHelper.conv2dBackwardDataDense only if filter is dense. + // dout converted to dense if sparse. + return params.enableNative && !params.input1.isInSparseFormat(); + } + /** * The ConvTask allows the convolution operations (such s conv2d, conv2d_backward, maxpooling, etc) * to be executed in multi-thread manner. @@ -1001,55 +1107,108 @@ public class LibMatrixDNN { break; case MaxPooling_Backward: for(int n = _rl; n < _ru; n++) - doPoolingBackward(n, _params); - break; - case BiasAdd: - { - double [] dest = _params.output.getDenseBlock(); - ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, - _rl, _ru, _binaryElementWiseAddition); + doPoolingBackward(n, _params, false); break; - } - case BiasMultiply: - { - double [] dest = _params.output.getDenseBlock(); - ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, - _rl, _ru, _binaryElementWiseMultiplication); + case MaxPooling_Relu_Backward: + for(int n = _rl; n < _ru; n++) + doPoolingBackward(n, _params, true); break; - } case ReluBackward: lnnz = doReluBackward(_params, _rl, _ru); break; case LoopedIm2ColConv2d: { - MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove(); - double [] temp = _params.input1.isInSparseFormat() ? 
new double[_params.input1.getNumColumns()] : null; - for(int n = _rl; n < _ru; n++) - doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp); - _im2ColOutBlocks.add(im2ColOutBlock); - if(_params.bias != null) - ConvolutionUtils.binaryBiasOperationInPlace(_params.bias, _params.output.getDenseBlock(), _params.K, - _params.P*_params.Q, _rl, _ru, _binaryElementWiseAddition); + if(isEligibleForConv2dSparse(_params)) { + // NativeHelper.conv2dSparse only if filter is dense and input is sparse + int KPQ = _params.K*_params.P*_params.Q; + double[] temp = new double[KPQ]; + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.getSparseBlock().isEmpty(n) ) { + int apos = _params.input1.getSparseBlock().pos(n); + int alen = _params.input1.getSparseBlock().size(n); + int[] aix = _params.input1.getSparseBlock().indexes(n); + double[] avals = _params.input1.getSparseBlock().values(n); + NativeHelper.conv2dSparse(apos, alen, aix, avals, _params.input2.getDenseBlock(), temp, + 1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, + _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ); + } + } + } + else { + // In all other cases, perform im2col in Java + matmult (either native or java). + MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove(); + double [] temp = _params.input1.isInSparseFormat() ? 
new double[_params.input1.getNumColumns()] : null; + for(int n = _rl; n < _ru; n++) + doLoopedIm2ColConv2d(n, im2ColOutBlock, _params, temp); + _im2ColOutBlocks.add(im2ColOutBlock); + } + if(_params.bias != null) { + // bias is always converted to dense format + double [] biasArr = _params.bias.getDenseBlock(); + int PQ = _params.P*_params.Q; + int index = _rl*_params.K*PQ; + for(int n = _rl; n < _ru; n++) { + for(int k = 0; k < _params.K; k++) { + for(int pq = 0; pq < PQ; pq++, index++) { + _params.output.denseBlock[index] += biasArr[k]; + } + } + } + } break; } case LoopedIm2ColConv2dBwdFilter: { - MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove(); MatrixBlock partialRetBlock = _partialRetBlocks.remove(); MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove(); - double [] temp = _params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null; - for(int n = _rl; n < _ru; n++) - partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp); - _im2ColOutBlocks.add(im2ColOutBlock); - _partialRetBlocks.add(partialRetBlock); + if(isEligibleForConv2dBackwardFilterSparseDense(_params)) { + double [] dout_n = doutReshapedBlock.getDenseBlock(); + for(int n = _rl; n < _ru; n++) { + if( !_params.input1.getSparseBlock().isEmpty(n) ) { + doRotate180(n, 0, _params.input2, dout_n, _params, true); + int apos = _params.input1.getSparseBlock().pos(n); + int alen = _params.input1.getSparseBlock().size(n); + int[] aix = _params.input1.getSparseBlock().indexes(n); + double[] avals = _params.input1.getSparseBlock().values(n); + NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, + dout_n, partialRetBlock.getDenseBlock(), 1, _params.C, _params.H, _params.W, _params.K, + _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + } + } + } + else { + MatrixBlock im2ColOutBlock = _im2ColOutBlocks.remove(); + double [] temp = 
_params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] : null; + for(int n = _rl; n < _ru; n++) + partialRetBlock = doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, partialRetBlock, _params, temp); + _im2ColOutBlocks.add(im2ColOutBlock); + } _doutReshapedBlocks.add(doutReshapedBlock); + _partialRetBlocks.add(partialRetBlock); break; } case LoopedIm2ColConv2dBwdData: { MatrixBlock doutReshapedBlock = _doutReshapedBlocks.remove(); - for(int n = _rl; n < _ru; n++) - doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params); + if(isEligibleForConv2dBackwardDataDense(_params)) { + int CHW = _params.C*_params.H*_params.W; + double [] ret = new double[CHW]; + double [] filterArr = _params.input1.getDenseBlock(); + for(int n = _rl; n < _ru; n++) { + double [] dout_n = getRowInDenseFormat(_params.input2, n, doutReshapedBlock.getDenseBlock()); + if(n > _rl) + Arrays.fill(ret, 0); + NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, + _params.C, _params.H, _params.W, _params.K, + _params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, _params.Q, 1); + System.arraycopy(ret, 0, _params.output.getDenseBlock(), n*CHW, CHW); + } + } + else { + for(int n = _rl; n < _ru; n++) + doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params); + } _doutReshapedBlocks.add(doutReshapedBlock); break; } @@ -1192,16 +1351,24 @@ public class LibMatrixDNN { } // Returns the row of matrix in dense format - private static double [] getRowInDenseFormat(MatrixBlock input, int n, double [] temp) { + private static double [] getRowInDenseFormat(MatrixBlock input, int n, double [] temp) throws DMLRuntimeException { + if(input.getNumColumns() != temp.length) { + throw new DMLRuntimeException("Invalid parameters"); + } // Use temporary array to avoid binary search - Arrays.fill(temp, 0); - if( !input.sparseBlock.isEmpty(n) ) { - int apos = input.sparseBlock.pos(n); - int alen = input.sparseBlock.size(n); - 
int[] aix = input.sparseBlock.indexes(n); - double[] avals = input.sparseBlock.values(n); - for(int j=apos; j<apos+alen; j++) - temp[ aix[j] ] = avals[j]; + if(input.isInSparseFormat()) { + Arrays.fill(temp, 0); + if( !input.sparseBlock.isEmpty(n) ) { + int apos = input.sparseBlock.pos(n); + int alen = input.sparseBlock.size(n); + int[] aix = input.sparseBlock.indexes(n); + double[] avals = input.sparseBlock.values(n); + for(int j=apos; j<apos+alen; j++) + temp[ aix[j] ] = avals[j]; + } + } + else { + System.arraycopy(input.getDenseBlock(), n*input.getNumColumns(), temp, 0, input.getNumColumns()); } return temp; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java index 5c7c2ef..983ce53 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java @@ -17,7 +17,6 @@ * under the License. 
*/ - package org.apache.sysml.runtime.matrix.data; import java.util.ArrayList; @@ -123,7 +122,7 @@ public class LibMatrixMult { matrixMult(m1, m2, ret, rl, ru, true); } - + public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) throws DMLRuntimeException { @@ -132,9 +131,9 @@ public class LibMatrixMult ret.examSparsity(); //turn empty dense into sparse return; } - + //Timing time = new Timing(true); - + //pre-processing: output allocation boolean tm2 = checkPrepMatrixMultRightInput(m1,m2); m2 = prepMatrixMultRightInput(m1, m2); @@ -166,6 +165,7 @@ public class LibMatrixMult if(examSparsity) ret.examSparsity(); + //System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" + // "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop()); } @@ -188,7 +188,7 @@ public class LibMatrixMult ret.examSparsity(); //turn empty dense into sparse return; } - + //check too high additional vector-matrix memory requirements (fallback to sequential) //check too small workload in terms of flops (fallback to sequential too) if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse()) @@ -247,6 +247,7 @@ public class LibMatrixMult throw new DMLRuntimeException(ex); } + //post-processing (nnz maintained in parallel) ret.examSparsity(); @@ -3960,4 +3961,4 @@ public class LibMatrixMult return _ret.recomputeNonZeros(_rl, _ru-1, 0, _ret.getNumColumns()-1); } } -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java new file mode 100644 index 0000000..4b12596 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.matrix.data; + +import org.apache.sysml.hops.OptimizerUtils; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.utils.NativeHelper; +import org.apache.sysml.utils.Statistics; + +public class LibMatrixNative { + + // We could encapsulate heuristics in this function + // For now, we only consider matrix-vector operation to be memory bound + private static boolean isMatMultMemoryBound(int m1Rlen, int m1Clen, int m2Clen) { + return m1Rlen == 1 || m1Clen == 1 || m2Clen == 1; + } + + /** + * Performs matrix multiplication using native library if BLAS is available or else falls back to + * Java BLAS. 
+ * + * @param m1 lhs matrix block + * @param m2 rhs matrix block + * @param ret output matrix block + * @param k number of threads + * @throws DMLRuntimeException if error occurs + */ + public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) throws DMLRuntimeException { + matrixMult(m1, m2, ret, k, true); + } + + public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k, boolean examSparsity) throws DMLRuntimeException { + // Sanity check: + k = k <= 0 ? NativeHelper.getMaxNumThreads() : k; + + // check inputs / outputs + if (m1.isEmptyBlock() || m2.isEmptyBlock()) { + ret.setNonZeros(0); + if(examSparsity) + ret.examSparsity(); // turn empty dense into sparse + return; + } + if (NativeHelper.isNativeLibraryLoaded() && + !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) { + ret.sparse = false; + ret.allocateDenseBlock(); + if (NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, + ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) { + Statistics.numNativeLibMatrixMultCalls.increment(); + ret.recomputeNonZeros(); + // post-processing (nnz maintained in parallel) + if(examSparsity) + ret.examSparsity(); + return; + } else { + // Else fall back to Java + Statistics.incrementNativeFailuresCounter(); + } + } + if (k == 1) + LibMatrixMult.matrixMult(m1, m2, ret, examSparsity); + else + LibMatrixMult.matrixMult(m1, m2, ret, k); + } + + /** + * This method performs convolution (i.e. 
cross-correlation) operation on input + * + * @param input input batch + * @param filter filter + * @param outputBlock output of convolution + * @param params convolution parameters + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params); + params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; + if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) { + setNumThreads(params); + if(params.bias == null) { + if(NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, + params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, + params.P, params.Q, params.numThreads)) { + Statistics.numNativeLibMatrixDNNCalls.increment(); + // post-processing: maintain nnz + outputBlock.recomputeNonZeros(); + return; + } + else { + // Fall back to Java when failures + Statistics.incrementNativeFailuresCounter(); + } + } + else { + if(params.bias.isInSparseFormat()) + params.bias.sparseToDense(); // Bias matrix is usually extremely small + if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, + params.N, params.C, params.H, params.W, + params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, + params.P, params.Q, params.numThreads)) { + Statistics.numNativeLibMatrixDNNCalls.increment(); + // post-processing: maintain nnz + outputBlock.recomputeNonZeros(); + return; + } + else { + // Fall back to Java when failures + Statistics.incrementNativeFailuresCounter(); + } + } + } + + // Fall back to Java when failures or sparse + LibMatrixDNN.conv2d(input, filter, outputBlock, 
params); + } + + private static void setNumThreads(ConvolutionParameters params) { + params.numThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); + if (!(params.isOutputThreadSafe() && params.numThreads > 1)) + params.numThreads = 1; + } + + /** + * This method computes the backpropogation errors for filter of convolution operation + * + * @param input input image + * @param dout errors from next layer + * @param outputBlock output errors + * @param params convolution parameters + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + LibMatrixDNN.checkInputsConv2dBackwardFilter(input, dout, outputBlock, params); + params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; + if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) { + setNumThreads(params); + if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, + params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, + params.P, params.Q, params.numThreads)) { + Statistics.numNativeLibMatrixDNNCalls.increment(); + // post-processing: maintain nnz + outputBlock.recomputeNonZeros(); + return; + } + else { + // Fall back to Java when failures + Statistics.incrementNativeFailuresCounter(); + } + } + // Fall back to Java when failures or sparse + LibMatrixDNN.conv2dBackwardFilter(input, dout, outputBlock, params); + } + + /** + * This method computes the backpropogation errors for previous layer of convolution operation + * + * @param filter filter used in conv2d + * @param dout errors from next layer + * @param outputBlock output errors + * @param params convolution parameters + * @throws DMLRuntimeException if 
DMLRuntimeException occurs + */ + public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException { + LibMatrixDNN.checkInputsConv2dBackwardData(filter, dout, outputBlock, params); + params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; + if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) { + setNumThreads(params); + if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, + params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, + params.P, params.Q, params.numThreads)) { + Statistics.numNativeLibMatrixDNNCalls.increment(); + // post-processing: maintain nnz + outputBlock.recomputeNonZeros(); + return; + } + else { + // Fall back to Java when failures + Statistics.incrementNativeFailuresCounter(); + } + } + // Fall back to Java when failures or sparse + LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params); + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index c458b07..8fac947 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -35,6 +35,7 @@ import java.util.stream.LongStream; import org.apache.commons.math3.random.Well1024a; import org.apache.hadoop.io.DataInputBuffer; import org.apache.sysml.conf.ConfigurationManager; +import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.Hop.OpOp2; import 
org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.lops.MMTSJ.MMTSJType; @@ -4873,15 +4874,26 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab return sum_wt; } + + public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, + MatrixValue result, AggregateBinaryOperator op ) throws DMLRuntimeException + { + boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); + return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS); + } public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, - MatrixValue result, AggregateBinaryOperator op ) - throws DMLRuntimeException + MatrixValue result, AggregateBinaryOperator op, boolean enableNativeBLAS ) throws DMLRuntimeException { - return aggregateBinaryOperations(m1Value, m2Value, result, op); + return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS); + } + + public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) throws DMLRuntimeException { + boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); + return aggregateBinaryOperations(m1Value, m2Value, result, op, enableNativeBLAS); } - public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) + public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op, boolean nativeMatMult) throws DMLRuntimeException { //check input types, dimensions, configuration @@ -4907,7 +4919,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab ret.reset(rl, cl, sp.sparse, sp.estimatedNonZeros); //compute matrix 
multiplication (only supported binary aggregate operation) - if( op.getNumThreads() > 1 ) + if( nativeMatMult ) + LibMatrixNative.matrixMult(m1, m2, ret, op.getNumThreads()); + else if( op.getNumThreads() > 1 ) LibMatrixMult.matrixMult(m1, m2, ret, op.getNumThreads()); else LibMatrixMult.matrixMult(m1, m2, ret); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java index b988546..8e51405 100644 --- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java +++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java @@ -22,6 +22,8 @@ package org.apache.sysml.runtime.util; import java.util.Arrays; import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.functionobjects.Multiply; +import org.apache.sysml.runtime.functionobjects.Plus; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.operators.BinaryOperator; import org.apache.sysml.runtime.matrix.operators.ScalarOperator; @@ -58,12 +60,20 @@ public class ConvolutionUtils { } return ret; } + - // Performs dest[destPos ...] <- src[src_rl:src_ru, ] - //Assumes that dest is zeroed-out before calling - public static void copy(MatrixBlock src, double [] dest, int destPos, int destNumCols, int src_rl, int src_ru) { + // Performs dest[destPos...] 
op= thatValue[src_rl:src_ru,] + public static void binaryOperationInPlace(MatrixBlock src, double [] dest, + int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException { if(src.isInSparseFormat()) { - if(!src.isEmptyBlock()) { + if(src.isEmptyBlock() && op.fn == Plus.getPlusFnObject()) { + // Do nothing: Inplace addition by zero + } + else if(src.isEmptyBlock() && op.fn == Multiply.getMultiplyFnObject()) { + // Inplace multiplication by zero + Arrays.fill(dest, destPos, destPos + (src_ru-src_rl)*destNumCols, 0); + } + else if(op.fn == Plus.getPlusFnObject()) { for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) { if( !src.getSparseBlock().isEmpty(i) ) { int apos = src.getSparseBlock().pos(i); @@ -71,37 +81,54 @@ public class ConvolutionUtils { int[] aix = src.getSparseBlock().indexes(i); double[] avals = src.getSparseBlock().values(i); for(int j = apos; j < apos+alen; j++) { - dest[ cix+aix[j] ] = avals[j]; + dest[ cix+aix[j] ] += avals[j]; } } } } - } - else { - System.arraycopy(src.getDenseBlock(), src_rl*src.getNumColumns(), dest, destPos, (src_ru-src_rl)*src.getNumColumns()); - } - } - - // Performs dest[destPos...] 
op= thatValue[src_rl:src_ru,] - public static void binaryOperationInPlace(MatrixBlock src, double [] dest, - int destPos, int destNumCols, int src_rl, int src_ru, BinaryOperator op) throws DMLRuntimeException { - if(src.isInSparseFormat()) { - for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) { - if( !src.getSparseBlock().isEmpty(i) ) { - int apos = src.getSparseBlock().pos(i); - int alen = src.getSparseBlock().size(i); - int[] aix = src.getSparseBlock().indexes(i); - double[] avals = src.getSparseBlock().values(i); - for(int j = apos; j < apos+alen; j++) { - dest[ cix+aix[j] ] = op.fn.execute(dest[ cix+aix[j] ], avals[j]); + else if(op.fn == Multiply.getMultiplyFnObject()) { + // Unsafe operation: positions that are zero in src must be zeroed in dest as well + for(int i = src_rl, cix = destPos; i < src_ru; i++, cix += destNumCols) { + if( !src.getSparseBlock().isEmpty(i) ) { + int apos = src.getSparseBlock().pos(i); + int alen = src.getSparseBlock().size(i); + int[] aix = src.getSparseBlock().indexes(i); + double[] avals = src.getSparseBlock().values(i); + int prevDestIndex = 0; + for(int j = apos; j < apos+alen; j++) { + // Multiplication by zero: clear the gap between the previous non-zero and aix[j] in this row. + // Both bounds are offset by cix (start of row i in dest). Assumption: aix is sorted. + Arrays.fill(dest, cix+prevDestIndex, cix+aix[j], 0); + prevDestIndex = aix[j]+1; + dest[ cix+aix[j] ] *= avals[j]; + } + // Clear the tail of the row after the last non-zero + Arrays.fill(dest, cix+prevDestIndex, cix+destNumCols, 0); + } + else { + // Entire source row is zero, so the entire destination row becomes zero + Arrays.fill(dest, cix, cix + destNumCols, 0); } } } + else { + // As operation could be safe or unsafe. This will be caught at development time. 
+ throw new DMLRuntimeException("Unimplemented sparse operation"); + } } else { double [] inputArr = src.getDenseBlock(); - for(int i = destPos; i < src_ru*destNumCols; i++) { - dest[i] = op.fn.execute(dest[i], inputArr[i]); + if(op.fn == Plus.getPlusFnObject()) { + for(int i = destPos; i < src_ru*destNumCols; i++) { + dest[i] += inputArr[i]; + } + } + else if(op.fn == Multiply.getMultiplyFnObject()) { + for(int i = destPos; i < src_ru*destNumCols; i++) { + dest[i] *= inputArr[i]; + } + } + else { + for(int i = destPos; i < src_ru*destNumCols; i++) { + dest[i] = op.fn.execute(dest[i], inputArr[i]); + } } } } @@ -130,45 +157,6 @@ public class ConvolutionUtils { } } - // dest (of size N x KPQ) = input (of size N x KPQ) op bias (of size K x 1) - public static void binaryBiasOperations(MatrixBlock input, MatrixBlock bias, double [] dest, - int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException { - copy(input, dest, rl*K*PQ, K*PQ, rl, ru); - binaryBiasOperationInPlace(bias, dest, K, PQ, rl, ru, op); - } - - // dest (of size N x KPQ) op= bias (of size K x 1) - public static void binaryBiasOperationInPlace(MatrixBlock bias, double [] dest, - int K, int PQ, int rl, int ru, BinaryOperator op) throws DMLRuntimeException { - // bias.getNumColumns() == 1 checked outside - if(!bias.isInSparseFormat()) { - double [] biasArr = bias.getDenseBlock(); - int index = rl*K*PQ; - for(int n = rl; n < ru; n++) { - for(int k = 0; k < K; k++) { - for(int pq = 0; pq < PQ; pq++, index++) { - dest[index] = op.fn.execute(dest[index], biasArr[k]); - } - } - } - } - else { - for(int k = 0; k < K; k++) { - if( !bias.getSparseBlock().isEmpty(k) ) { - int apos = bias.getSparseBlock().pos(k); - double[] avals = bias.getSparseBlock().values(k); - double val = avals[apos]; - for(int n = rl; n < ru; n++) { - int index = n*K*PQ + k*PQ; - for(int pq = 0; pq < PQ; pq++, index++) { - dest[index] = op.fn.execute(dest[index], val); - } - } - } - } - } - } - public static void 
fillBias(MatrixBlock bias, double [] outputArray, int src_rl, int src_ru, int N, int K, int PQ) throws DMLRuntimeException { // bias.getNumColumns() == 1 checked outside if(bias.isInSparseFormat()) { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java new file mode 100644 index 0000000..c1b4c3e --- /dev/null +++ b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java @@ -0,0 +1,27 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sysml.utils; + + +/** + * Exposes a native (JNI) method for setting an environment variable. + * NativeHelper calls it to set MKL_THREADING_LAYER=GNU before loading gomp and mkl_rt. + */ +public class EnvironmentHelper { + /** + * Sets the environment variable {@code key} to {@code value} via native code. + * NOTE(review): the native implementation is not visible here; presumably it requires the + * preload_systemml library to be loaded first (NativeHelper loads it before calling this) - confirm. + * + * @param key name of the environment variable + * @param value value to assign to the variable + */ + public static native void setEnv(String key, String value); +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/NativeHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java new file mode 100644 index 0000000..2b997ed --- /dev/null +++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java @@ -0,0 +1,259 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysml.utils; + +import java.io.IOException; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.HashMap; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.File; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.SystemUtils; +import org.apache.sysml.conf.ConfigurationManager; +import org.apache.sysml.conf.DMLConfig; +import org.apache.sysml.hops.OptimizerUtils; + +/** + * This class helps in loading native library. + * By default, it first tries to load Intel MKL, else tries to load OpenBLAS. + */ +public class NativeHelper { + private static boolean isSystemMLLoaded = false; + private static final Log LOG = LogFactory.getLog(NativeHelper.class.getName()); + private static HashMap<String, String> supportedArchitectures = new HashMap<String, String>(); + public static String blasType; + private static int maxNumThreads = -1; + private static boolean setMaxNumThreads = false; + static { + // Note: we only support 64 bit Java on x86 and AMD machine + supportedArchitectures.put("x86_64", "x86_64"); + supportedArchitectures.put("amd64", "x86_64"); + } + + private static boolean attemptedLoading = false; + + // Performing loading in a method instead of a static block will throw a detailed stack trace in case of fatal errors + private static void init() { + // Only Linux supported for BLAS + if(!SystemUtils.IS_OS_LINUX) + return; + + // attemptedLoading variable ensures that we don't try to load SystemML and other dependencies + // again and again especially in the parfor (hence the double-checking with synchronized). + if(!attemptedLoading) { + DMLConfig dmlConfig = ConfigurationManager.getDMLConfig(); + String userSpecifiedBLAS = System.getenv("SYSTEMML_BLAS"); + userSpecifiedBLAS = (userSpecifiedBLAS == null) ? 
"" : userSpecifiedBLAS.trim().toLowerCase(); + // ------------------------------------------------------------------------------------- + // We allow BLAS to be enabled or disabled or explicitly selected in one of the two ways: + // 1. DML Configuration: native.blas (boolean flag) + // 2. Environment variable: SYSTEMML_BLAS (can be set to mkl, openblas or none) + // The option 1 will be removed in later SystemML versions. + // The option 2 is useful for two reasons: + // - Developer testing of different BLAS + // - Provides fine-grained control. Certain machines could use mkl while others use openblas, etc. + boolean enabledViaConfig = (dmlConfig == null) ? true : dmlConfig.getBooleanValue(DMLConfig.NATIVE_BLAS); + boolean enabledViaEnvironmentVariable = userSpecifiedBLAS.equals("") || userSpecifiedBLAS.equals("mkl") || userSpecifiedBLAS.equals("openblas"); + + if(enabledViaConfig && enabledViaEnvironmentVariable) { + long start = System.nanoTime(); + if(!supportedArchitectures.containsKey(SystemUtils.OS_ARCH)) { + LOG.warn("Unsupported architecture for native BLAS:" + SystemUtils.OS_ARCH); + return; + } + synchronized(NativeHelper.class) { + if(!attemptedLoading) { + // ----------------------------------------------------------------------------- + // ============================================================================= + // By default, we will native.blas=true and we will attempt to load MKL first. + // If MKL is not enabled then we try to load OpenBLAS. + // If both MKL and OpenBLAS are not available we fall back to Java BLAS. + if(userSpecifiedBLAS.equalsIgnoreCase("")) { + blasType = isMKLAvailable() ? "mkl" : isOpenBLASAvailable() ? "openblas" : null; + if(blasType == null) + LOG.warn("Unable to load either MKL or OpenBLAS"); + } + else if(userSpecifiedBLAS.equalsIgnoreCase("mkl")) { + blasType = isMKLAvailable() ? 
"mkl" : null; + if(blasType == null) + LOG.warn("Unable to load MKL"); + } + else if(userSpecifiedBLAS.equalsIgnoreCase("openblas")) { + blasType = isOpenBLASAvailable() ? "openblas" : null; + if(blasType == null) + LOG.warn("Unable to load OpenBLAS"); + } + else { + LOG.warn("Unsupported BLAS:" + userSpecifiedBLAS); + } + // ============================================================================= + if(blasType != null && loadLibraryHelper("libsystemml_" + blasType + "-Linux-x86_64.so")) { + LOG.info("Using native blas: " + blasType); + isSystemMLLoaded = true; + } + } + } + double timeToLoadInMilliseconds = (System.nanoTime()-start)*1e-6; + if(timeToLoadInMilliseconds > 100) + LOG.warn("Time to load native blas: " + timeToLoadInMilliseconds + " milliseconds."); + } + else { + if(enabledViaConfig) + LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the configuration 'native.blas'."); + else + LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the environment variable 'SYSTEMML_BLAS=" + userSpecifiedBLAS + "'."); + } + attemptedLoading = true; + } + } + + public static boolean isNativeLibraryLoaded() { + init(); + if(maxNumThreads == -1) + maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1); + if(isSystemMLLoaded && !setMaxNumThreads && maxNumThreads != -1) { + // This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI as each has different tradeoffs. + // In current implementation, we always use GetPrimitiveArrayCritical as it has proven to be fastest. + // We can revisit this decision later and hence I would not recommend removing this method. 
+ setMaxNumThreads(maxNumThreads); + setMaxNumThreads = true; + } + return isSystemMLLoaded; + } + + // Returns the cached thread limit, initializing it from OptimizerUtils.getConstrainedNumThreads(-1) on first use. + public static int getMaxNumThreads() { + if(maxNumThreads == -1) + maxNumThreads = OptimizerUtils.getConstrainedNumThreads(-1); + return maxNumThreads; + } + + + // Loads the preload helper library, sets MKL_THREADING_LAYER=GNU, then loads gomp and mkl_rt. + // Returns false as soon as any of these steps fails. + private static boolean isMKLAvailable() { + // ------------------------------------------------------------ + // Set environment variable MKL_THREADING_LAYER to GNU on Linux for performance + if(!loadLibraryHelper("libpreload_systemml-Linux-x86_64.so")) { + LOG.warn("Unable to load preload_systemml (required for loading MKL-enabled SystemML library)"); + return false; + } + // The most reliable way in my investigation to ensure that MKL runs smoothly with OpenMP (used by conv2d*) + // is setting the environment variable MKL_THREADING_LAYER to GNU + EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU"); + if(!loadBLAS("gomp", "gomp required for loading MKL-enabled SystemML library")) + return false; + + // ------------------------------------------------------------ + return loadBLAS("mkl_rt", null); + } + + // Loads gomp and then openblas; returns false if either cannot be loaded. + private static boolean isOpenBLASAvailable() { + if(!loadBLAS("gomp", "gomp required for loading OpenBLAS-enabled SystemML library")) + return false; + return loadBLAS("openblas", null); + } + + // Calls System.loadLibrary(blas). On UnsatisfiedLinkError it logs a warning + // (including optionalMsg when it is non-null) and returns false instead of propagating the error. + private static boolean loadBLAS(String blas, String optionalMsg) { + try { + System.loadLibrary(blas); + return true; + } + catch (UnsatisfiedLinkError e) { + if(optionalMsg != null) + LOG.warn("Unable to load " + blas + "(" + optionalMsg + "):" + e.getMessage()); + else + LOG.warn("Unable to load " + blas + ":" + e.getMessage()); + return false; + } + } + + // Copies the classpath resource /lib/<path> to a temporary file and loads it with System.load. + // Returns false if the resource is missing or an IOException occurs while copying. + private static boolean loadLibraryHelper(String path) { + InputStream in = null; OutputStream out = null; + try { + // This logic is needed because Java does not allow loading a library directly from a resource file. 
+ in = NativeHelper.class.getResourceAsStream("/lib/"+path); + if(in != null) { + File temp = File.createTempFile(path, ""); + temp.deleteOnExit(); + out = FileUtils.openOutputStream(temp); + IOUtils.copy(in, out); + in.close(); in = null; + out.close(); out = null; + System.load(temp.getAbsolutePath()); + return true; + } + else + LOG.warn("No lib available in the jar:" + path); + } catch(IOException e) { + LOG.warn("Unable to load library " + path + " from resource:" + e.getMessage()); + } finally { + if(out != null) + try { + out.close(); + } catch (IOException e) {} + if(in != null) + try { + in.close(); + } catch (IOException e) {} + } + return false; + } + + // TODO: Add pmm, wsloss, mmchain, etc. + public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads); + private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads); + + // ---------------------------------------------------------------------------------------------------------------- + // LibMatrixDNN operations: + // N = number of images, C = number of channels, H = image height, W = image width + // K = number of filters, R = filter height, S = filter width + // TODO: case not handled: sparse filters (which will only be executed in Java). Since filters are relatively smaller, this is a low priority. 
+ + // Called by ConvolutionCPInstruction if both input and filter are dense + public static native boolean conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + public static native boolean conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + // Called by ConvolutionCPInstruction if both input and filter are dense + public static native boolean conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + // If both filter and dout are dense, then called by ConvolutionCPInstruction + // Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse. + public static native boolean conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + + // Currently only supported with numThreads = 1 and sparse input + // Called by LibMatrixDNN's thread if input is sparse. dout[n] is converted to dense if sparse. 
+ public static native boolean conv2dBackwardFilterSparseDense(int apos, int alen, int[] aix, double[] avals, double [] rotatedDoutPtr, double [] ret, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + // Called by LibMatrixDNN's thread if input is sparse and filter is dense + public static native boolean conv2dSparse(int apos, int alen, int[] aix, double[] avals, double [] filter, double [] ret, int N, int C, int H, int W, + int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); + // ---------------------------------------------------------------------------------------------------------------- + + // This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI as each has different tradeoffs. + // In current implementation, we always use GetPrimitiveArrayCritical as it has proven to be fastest. + // We can revisit this decision later and hence I would not recommend removing this method. 
+ private static native void setMaxNumThreads(int numThreads); +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/Statistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java index 097f58e..97888cb 100644 --- a/src/main/java/org/apache/sysml/utils/Statistics.java +++ b/src/main/java/org/apache/sysml/utils/Statistics.java @@ -112,6 +112,17 @@ public class Statistics return numExecutedMRJobs.longValue(); } + private static LongAdder numNativeFailures = new LongAdder(); + public static LongAdder numNativeLibMatrixMultCalls = new LongAdder(); + public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder(); + public static void incrementNativeFailuresCounter() { + numNativeFailures.increment(); + // This is very rare and am not sure it is possible at all. Our initial experiments never encountered this case. + // Note: all the native calls have a fallback to Java; so if the user wants she can recompile SystemML by + // commenting this exception and everything should work fine. + throw new RuntimeException("Unexpected ERROR: OOM caused during JNI transfer. Please disable native BLAS by setting enviroment variable: SYSTEMML_BLAS=none"); + } + public static void incrementNoOfExecutedMRJobs() { numExecutedMRJobs.increment(); } @@ -366,6 +377,9 @@ public class Statistics resetCPHeavyHitters(); GPUStatistics.reset(); + numNativeLibMatrixMultCalls.reset(); + numNativeLibMatrixDNNCalls.reset(); + numNativeFailures.reset(); LibMatrixDNN.resetStatistics(); } @@ -621,6 +635,11 @@ public class Statistics //show extended caching/compilation statistics if( DMLScript.STATISTICS ) { + if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 || + numNativeLibMatrixDNNCalls.longValue() > 0)) { + String blas = NativeHelper.blasType != null ? 
NativeHelper.blasType : ""; + sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue() + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n"); + } sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + CacheStatistics.displayHits() + ".\n"); sb.append("Cache writes (WB, FS, HDFS):\t" + CacheStatistics.displayWrites() + ".\n"); sb.append("Cache times (ACQr/m, RLS, EXP):\t" + CacheStatistics.displayTime() + " sec.\n");