Repository: incubator-systemml Updated Branches: refs/heads/master 0490fec93 -> 5baac2d62
[HOTFIX] Disabling GPU fused relu & maxpooling operator because of bug - Fixed the timer that counts the number of times memory chunks are zero-ed out - Some minor code refactoring Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/5baac2d6 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/5baac2d6 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/5baac2d6 Branch: refs/heads/master Commit: 5baac2d62f64026ff82b9d674b909bc4b80800b0 Parents: 0490fec Author: Nakul Jindal <[email protected]> Authored: Mon Mar 13 15:40:08 2017 -0700 Committer: Nakul Jindal <[email protected]> Committed: Wed Mar 15 15:31:17 2017 -0700 ---------------------------------------------------------------------- .../org/apache/sysml/hops/ConvolutionOp.java | 12 +++++++---- .../gpu/ConvolutionGPUInstruction.java | 9 +++++++-- .../instructions/gpu/context/JCudaObject.java | 2 +- .../runtime/matrix/data/LibMatrixCUDA.java | 21 ++++++++++---------- 4 files changed, 26 insertions(+), 18 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/hops/ConvolutionOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java index 943ff96..9483b2c 100644 --- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java +++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java @@ -19,15 +19,13 @@ package org.apache.sysml.hops; -import java.util.ArrayList; - import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.Hop.MultiThreadedHop; import org.apache.sysml.lops.ConvolutionTransform; import org.apache.sysml.lops.ConvolutionTransform.OperationTypes; import org.apache.sysml.lops.Lop; -import org.apache.sysml.lops.LopsException; import org.apache.sysml.lops.LopProperties.ExecType; +import org.apache.sysml.lops.LopsException; import org.apache.sysml.lops.ReBlock; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; @@ -35,6 +33,8 @@ import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.ConvolutionParameters; +import java.util.ArrayList; + public class ConvolutionOp extends Hop implements MultiThreadedHop { private Hop.ConvOp op; @@ -179,7 +179,11 @@ public class ConvolutionOp extends Hop implements MultiThreadedHop ArrayList<Hop> inputs1 = inputs; int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); OperationTypes lopOp = HopsConv2Lops.get(op); - if(op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) { + + // The fused relu_maxpooling is being disabled for now on the GPU + // There is a bug in LibMatrixCUDA#reluMaxpooling + // which we need to understand before enabling this by removing the "et != ExecType.GPU" guard. + if(op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0)) && et != ExecType.GPU) { in = inputs.get(0).getInput().get(0).constructLops(); lopOp = OperationTypes.RELU_MAX_POOLING; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index daf3c58..7460d6b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -18,8 +18,6 @@ */ package org.apache.sysml.runtime.instructions.gpu; -import java.util.ArrayList; - import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; @@ -31,6 +29,8 @@ import org.apache.sysml.runtime.matrix.operators.ReorgOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; import org.apache.sysml.utils.GPUStatistics; +import java.util.ArrayList; + public class ConvolutionGPUInstruction extends GPUInstruction { private CPOperand _input1; @@ -337,8 +337,13 @@ public class ConvolutionGPUInstruction extends GPUInstruction // release inputs/outputs ec.releaseMatrixInputForGPUInstruction(_input1.getName()); + if (!( instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) ) ec.releaseMatrixInputForGPUInstruction(_input2.getName()); + + if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) + ec.releaseMatrixInputForGPUInstruction(_input3.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java index c980b20..47ff197 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java @@ -1188,7 +1188,7 @@ public class JCudaObject extends GPUObject { // Set all elements to 0 since newly allocated space will contain garbage if (DMLScript.STATISTICS) t1 = System.nanoTime(); cudaMemset(A, 0, size); - if (DMLScript.STATISTICS) end = System.nanoTime() - t1; + if (DMLScript.STATISTICS) end = System.nanoTime(); if (instructionName != null && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instructionName, GPUInstruction.MISC_TIMER_SET_ZERO, end - t1); if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1); if (DMLScript.STATISTICS) GPUStatistics.cudaMemSet0Count.getAndAdd(1); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index a3b9168..1a08396 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -648,7 +648,9 @@ public class LibMatrixCUDA { ((JCudaObject)image.getGPUObject()).sparseToDense(instName); } Pointer x = ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; - performMaxpooling(instName, x, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + Pointer y = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; + + performMaxpooling(instName, x, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } /** @@ -678,17 +680,17 @@ public class LibMatrixCUDA { if(isInSparseFormat(image)) { ((JCudaObject)image.getGPUObject()).sparseToDense(instName); } + long size = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE; Pointer x = ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; - //MatrixObject temp = new MatrixObject(image); - //temp.getGPUObject().acquireDeviceModifyDense(); - Pointer y = ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; - performReLU(instName, x, y, N, C, H, W); - performMaxpooling(instName, y, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); - //((JCudaObject)temp.getGPUObject()).clearData(); // deallocate the temporary data + Pointer y = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; + Pointer tmp = allocate(size); + performReLU(instName, x, tmp, N, C, H, W); + performMaxpooling(instName, tmp, y, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); + cudaFreeHelper(tmp); } private static void performMaxpooling(String instName, Pointer x, - MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, + Pointer y, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { @@ -706,9 +708,6 @@ public class LibMatrixCUDA { xDesc = allocateTensorDescriptor(N, C, H, W); poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w); - // Allocate data - Pointer y = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; - alpha = pointerTo(1.0); beta = pointerTo(0.0f);
