Repository: incubator-systemml
Updated Branches:
  refs/heads/master 0490fec93 -> 5baac2d62


[HOTFIX] Disabling GPU fused relu & maxpooling operator because of bug

- Fixed the timer that measures the time spent zeroing out newly
  allocated memory chunks (the start time was being subtracted twice)
- Some minor code refactoring


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/5baac2d6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/5baac2d6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/5baac2d6

Branch: refs/heads/master
Commit: 5baac2d62f64026ff82b9d674b909bc4b80800b0
Parents: 0490fec
Author: Nakul Jindal <[email protected]>
Authored: Mon Mar 13 15:40:08 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Wed Mar 15 15:31:17 2017 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/hops/ConvolutionOp.java    | 12 +++++++----
 .../gpu/ConvolutionGPUInstruction.java          |  9 +++++++--
 .../instructions/gpu/context/JCudaObject.java   |  2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 21 ++++++++++----------
 4 files changed, 26 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 943ff96..9483b2c 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -19,15 +19,13 @@
 
 package org.apache.sysml.hops;
 
-import java.util.ArrayList;
-
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.MultiThreadedHop;
 import org.apache.sysml.lops.ConvolutionTransform;
 import org.apache.sysml.lops.ConvolutionTransform.OperationTypes;
 import org.apache.sysml.lops.Lop;
-import org.apache.sysml.lops.LopsException;
 import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.lops.LopsException;
 import org.apache.sysml.lops.ReBlock;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
@@ -35,6 +33,8 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 
+import java.util.ArrayList;
+
 public class ConvolutionOp extends Hop  implements MultiThreadedHop
 {      
        private Hop.ConvOp op;
@@ -179,7 +179,11 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                ArrayList<Hop> inputs1 = inputs;
                int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
                OperationTypes lopOp = HopsConv2Lops.get(op);
-               if(op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0))) {
+
+               // The fused relu_maxpooling is being disabled for now on the GPU
+               // There is a bug in LibMatrixCUDA#reluMaxpooling
+               // which we need to understand before enabling this by removing the "et != ExecType.GPU" guard.
+               if(op == ConvOp.MAX_POOLING && isInputReLU(inputs.get(0)) && et 
!= ExecType.GPU) {
                        in = inputs.get(0).getInput().get(0).constructLops();
                        lopOp = OperationTypes.RELU_MAX_POOLING;
                }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index daf3c58..7460d6b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -18,8 +18,6 @@
  */
 package org.apache.sysml.runtime.instructions.gpu;
 
-import java.util.ArrayList;
-
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
@@ -31,6 +29,8 @@ import 
org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
 import org.apache.sysml.utils.GPUStatistics;
 
+import java.util.ArrayList;
+
 public class ConvolutionGPUInstruction extends GPUInstruction 
 {
        private CPOperand _input1; 
@@ -337,8 +337,13 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
+
                if (!( instOpcode.equalsIgnoreCase("maxpooling") || 
instOpcode.equalsIgnoreCase("relu_maxpooling")) )
                        
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+
+               if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))
+                       
ec.releaseMatrixInputForGPUInstruction(_input3.getName());
+
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index c980b20..47ff197 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -1188,7 +1188,7 @@ public class JCudaObject extends GPUObject {
                        // Set all elements to 0 since newly allocated space 
will contain garbage
                        if (DMLScript.STATISTICS) t1 = System.nanoTime();
                        cudaMemset(A, 0, size);
-                       if (DMLScript.STATISTICS) end = System.nanoTime() - t1;
+                       if (DMLScript.STATISTICS) end = System.nanoTime();
                        if (instructionName != null && 
GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instructionName, 
GPUInstruction.MISC_TIMER_SET_ZERO, end - t1);
                        if (DMLScript.STATISTICS) 
GPUStatistics.cudaMemSet0Time.getAndAdd(end - t1);
                        if (DMLScript.STATISTICS) 
GPUStatistics.cudaMemSet0Count.getAndAdd(1);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5baac2d6/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index a3b9168..1a08396 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -648,7 +648,9 @@ public class LibMatrixCUDA {
                        
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
                }
                Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
-               performMaxpooling(instName, x, outputBlock, N, C, H, W, K, R, 
S, pad_h, pad_w, stride_h, stride_w, P, Q);
+               Pointer y = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+
+               performMaxpooling(instName, x, y, N, C, H, W, K, R, S, pad_h, 
pad_w, stride_h, stride_w, P, Q);
        }
        
        /**
@@ -678,17 +680,17 @@ public class LibMatrixCUDA {
                if(isInSparseFormat(image)) {
                        
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
                }
+               long size  = image.getNumRows() * image.getNumColumns() * 
Sizeof.DOUBLE;
                Pointer x = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
-               //MatrixObject temp = new MatrixObject(image);
-               //temp.getGPUObject().acquireDeviceModifyDense();
-               Pointer y = 
((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
-               performReLU(instName, x, y, N, C, H, W);
-               performMaxpooling(instName, y, outputBlock, N, C, H, W, K, R, 
S, pad_h, pad_w, stride_h, stride_w, P, Q);
-               //((JCudaObject)temp.getGPUObject()).clearData(); // deallocate 
the temporary data
+               Pointer y = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
+               Pointer tmp = allocate(size);
+               performReLU(instName, x, tmp, N, C, H, W);
+               performMaxpooling(instName, tmp, y, N, C, H, W, K, R, S, pad_h, 
pad_w, stride_h, stride_w, P, Q);
+               cudaFreeHelper(tmp);
        }
        
        private static void performMaxpooling(String instName, Pointer x,
-                                                                               
                MatrixObject outputBlock, int N, int C, int H, int W, int K, 
int R,
+                                                                               
                Pointer y, int N, int C, int H, int W, int K, int R,
                                                                                
                int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
                                                                                
                int Q) throws DMLRuntimeException {
                
@@ -706,9 +708,6 @@ public class LibMatrixCUDA {
                        xDesc = allocateTensorDescriptor(N, C, H, W);
                        poolingDesc = allocatePoolingDescriptor(R, S, pad_h, 
pad_w, stride_h, stride_w);
 
-                       // Allocate data
-                       Pointer y = 
((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
-
                        alpha = pointerTo(1.0);
                        beta = pointerTo(0.0f);
 

Reply via email to