[SYSTEMML-1625] GPU Unit Tests (and GPU row/col variance bug fix)

- Documented random matrix generation
- GPU unit test using MLContext. Compares CPU output to GPU
- Pseudo-unit tests for GPU implementations of
  unary ops, unary aggregate ops, transpose, elementwise ops,
  matrix multiplication ops, builtin ops & NN ops
- Fixed crucial bug in col/row var
- gpuTests profile for GPU tests (mvn verify -PgpuTests)
- Updated intellij style for import order

Closes #513


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/772fb588
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/772fb588
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/772fb588

Branch: refs/heads/master
Commit: 772fb588324916e4225bb6e1970ca6a8f87eb414
Parents: ceeec4b
Author: Nakul Jindal <naku...@gmail.com>
Authored: Wed May 31 21:54:13 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Wed May 31 21:54:13 2017 -0700

----------------------------------------------------------------------
 dev/code-style/systemml-style-intellij.xml      |  18 +
 pom.xml                                         |  10 +
 .../apache/sysml/api/ScriptExecutorUtils.java   |   1 +
 .../context/ExecutionContext.java               |   3 +
 .../instructions/GPUInstructionParser.java      | 120 +++--
 .../instructions/gpu/context/GPUContext.java    | 118 +++--
 .../instructions/gpu/context/GPUObject.java     |  55 +-
 .../instructions/gpu/context/JCudaKernels.java  |   3 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 112 ++--
 .../runtime/matrix/data/LibMatrixDatagen.java   |  78 +--
 .../matrix/data/RandomMatrixGenerator.java      | 123 ++++-
 .../sysml/test/gpu/AggregateUnaryOpTests.java   | 133 +++++
 .../apache/sysml/test/gpu/BinaryOpTests.java    |  85 ++++
 .../org/apache/sysml/test/gpu/GPUTests.java     | 250 +++++++++
 .../gpu/MatrixMatrixElementWiseOpTests.java     | 271 ++++++++++
 .../test/gpu/MatrixMultiplicationOpTest.java    | 190 +++++++
 .../sysml/test/gpu/NeuralNetworkOpTests.java    | 508 +++++++++++++++++++
 .../org/apache/sysml/test/gpu/ReorgOpTests.java |  70 +++
 .../gpu/ScalarMatrixElementwiseOpTests.java     | 131 +++++
 .../org/apache/sysml/test/gpu/UnaryOpTests.java | 113 +++++
 .../apache/sysml/test/gpu/UnaryOpTestsBase.java | 106 ++++
 .../test/integration/gpu/ZPackageSuite.java     |  46 ++
 22 files changed, 2308 insertions(+), 236 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/dev/code-style/systemml-style-intellij.xml
----------------------------------------------------------------------
diff --git a/dev/code-style/systemml-style-intellij.xml 
b/dev/code-style/systemml-style-intellij.xml
index 248c600..1ad3209 100644
--- a/dev/code-style/systemml-style-intellij.xml
+++ b/dev/code-style/systemml-style-intellij.xml
@@ -16,7 +16,25 @@
  * specific language governing permissions and limitations
  * under the License.
 -->
+
 <code_scheme name="SystemML Format">
+  <option name="CLASS_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
+  <option name="NAMES_COUNT_TO_USE_IMPORT_ON_DEMAND" value="999" />
+  <option name="IMPORT_LAYOUT_TABLE">
+    <value>
+      <package name="" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="java" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="javax" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="org" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="com" withSubpackages="true" static="false" />
+      <emptyLine />
+      <package name="" withSubpackages="true" static="true" />
+    </value>
+  </option>
   <codeStyleSettings language="JAVA">
     <option name="KEEP_LINE_BREAKS" value="false" />
     <option name="KEEP_FIRST_COLUMN_COMMENT" value="false" />

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 5ce5576..99e2dec 100644
--- a/pom.xml
+++ b/pom.xml
@@ -436,6 +436,7 @@
                                                
<exclude>**/slowtest/**</exclude>
                                                
<exclude>**/integration/**</exclude>
                                                
<exclude>**/test/unit/**</exclude>
+                                               
<exclude>**/test/gpu/**</exclude>
                                        </excludes>
 
                                </configuration>
@@ -478,6 +479,7 @@
                                                
-Djava.awt.headless=true</argLine>
 
                                        <includes>
+                                               
<include>${gpuTestsPath}</include> <!-- Path for GPU integration tests, enabled 
for gpuTests profile -->
                                                
<include>**/integration/applications/**/*Suite.java</include>
                                                
<include>**/integration/conversion/*Suite.java</include>
                                                
<include>**/integration/functions/data/*Suite.java</include>
@@ -896,6 +898,14 @@
                        </build>
                </profile>
 
+               <!-- profile to enable running tests on the GPU -->
+               <profile>
+                       <id>gpuTests</id>
+                       <properties>
+                               
<gpuTestsPath>**/integration/gpu/**/*Suite.java</gpuTestsPath>
+                       </properties>
+               </profile>
+
                <profile>
                        <!-- Can be used to ignore doclint javadoc issues -->
                        <id>ignore-doclint</id>

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index f582c36..674a011 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -94,6 +94,7 @@ public class ScriptExecutorUtils {
                        rtprog.execute(ec);
                } finally { // ensure cleanup/shutdown
                        if (DMLScript.USE_ACCELERATOR && ec.getGPUContext() != 
null) {
+                               ec.getGPUContext().clearTemporaryMemory();
                                GPUContextPool.returnToPool(ec.getGPUContext());
                        }
                        if (dmlconf.getBooleanValue(DMLConfig.CODEGEN))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 35b4cd1..735f394 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -271,6 +271,9 @@ public class ExecutionContext {
                MatrixObject mo = getMatrixObject(varName);
                if( mo.getGPUObject(getGPUContext()) == null ) {
                        GPUObject newGObj = getGPUContext().createGPUObject(mo);
+                       // The lock is added here for an output block
+                       // so that any block currently in use is not 
deallocated by eviction on the GPU
+                       newGObj.addLock();
                        mo.setGPUObject(getGPUContext(), newGObj);
                }
                return mo;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 443d0eb..e0bcd1b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -42,81 +42,79 @@ public class GPUInstructionParser  extends InstructionParser
                // Neural Network Operators
                String2GPUInstructionType.put( "relu_backward",          
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d",                 
GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "conv2d_bias_add",               
  GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "conv2d_bias_add",        
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_filter", 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_data",   
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "maxpooling",             
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "maxpooling_backward",    
GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "bias_add",                      
 GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "bias_multiply",                 
         GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "bias_add",               
GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "bias_multiply",          
GPUINSTRUCTION_TYPE.Convolution);
 
                // Matrix Multiply Operators
-               String2GPUInstructionType.put( "ba+*",                   
GPUINSTRUCTION_TYPE.AggregateBinary);
-               String2GPUInstructionType.put( "tsmm",                   
GPUINSTRUCTION_TYPE.MMTSJ);
+               String2GPUInstructionType.put( "ba+*", 
GPUINSTRUCTION_TYPE.AggregateBinary);
+               String2GPUInstructionType.put( "tsmm", 
GPUINSTRUCTION_TYPE.MMTSJ);
 
                // Reorg/Transpose
-               String2GPUInstructionType.put( "r'",                     
GPUINSTRUCTION_TYPE.Reorg);
+               String2GPUInstructionType.put( "r'",   
GPUINSTRUCTION_TYPE.Reorg);
        
                // Binary Cellwise
-               String2GPUInstructionType.put( "+"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "-"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "*"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "/"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "%%"   , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "%/%"  , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "^"    , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "1-*"  , 
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
-               String2GPUInstructionType.put( "^2"   , 
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
-               String2GPUInstructionType.put( "*2"   , 
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
-               String2GPUInstructionType.put( "-nz"  , 
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
-               String2GPUInstructionType.put( "+*"   , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
-               String2GPUInstructionType.put( "-*"   , 
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "+",    
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "-",    
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "*",    
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "/",    
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               //String2GPUInstructionType.put( "%%",   
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               //String2GPUInstructionType.put( "%/%",  
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "^",    
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "1-*",  
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+               String2GPUInstructionType.put( "^2",   
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special ^ case
+               String2GPUInstructionType.put( "*2",   
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special * case
+               String2GPUInstructionType.put( "-nz",  
GPUINSTRUCTION_TYPE.ArithmeticBinary); //special - case
+               String2GPUInstructionType.put( "+*",   
GPUINSTRUCTION_TYPE.ArithmeticBinary);
+               String2GPUInstructionType.put( "-*",   
GPUINSTRUCTION_TYPE.ArithmeticBinary);
                
-               // Builtin functions
-               String2GPUInstructionType.put( "sel+"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "exp"    , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "log"    , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "abs"    , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "sqrt"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "round"  , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "floor"  , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "ceil"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "sin"    , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "cos"    , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "tan"    , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "asin"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "acos"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "atan"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-               String2GPUInstructionType.put( "sign"   , 
GPUINSTRUCTION_TYPE.BuiltinUnary);
-
-
-
-               String2GPUInstructionType.put( "solve"  , 
GPUINSTRUCTION_TYPE.BuiltinBinary);
+               // Unary Builtin functions
+               String2GPUInstructionType.put( "sel+",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "exp",   
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "log",   
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "abs",   
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "sqrt",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "round", 
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "floor", 
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "ceil",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "sin",   
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "cos",   
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "tan",   
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "asin",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "acos",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "atan",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
+               String2GPUInstructionType.put( "sign",  
GPUINSTRUCTION_TYPE.BuiltinUnary);
 
+               // Binary Builtin functions
+               String2GPUInstructionType.put( "solve", 
GPUINSTRUCTION_TYPE.BuiltinBinary);
 
                // Aggregate Unary
-               String2GPUInstructionType.put( "ua+"             , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
-               String2GPUInstructionType.put( "uak+"      , 
GPUINSTRUCTION_TYPE.AggregateUnary);       // Sum
-               String2GPUInstructionType.put( "uar+"            , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
-               String2GPUInstructionType.put( "uark+"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
-               String2GPUInstructionType.put( "uac+"            , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
-               String2GPUInstructionType.put( "uack+"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
-               String2GPUInstructionType.put( "ua*"             , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication
-               String2GPUInstructionType.put( "uamean"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
-               String2GPUInstructionType.put( "uarmean" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
-               String2GPUInstructionType.put( "uacmean" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
-               String2GPUInstructionType.put( "uamax"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Max
-               String2GPUInstructionType.put( "uarmax"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
-               String2GPUInstructionType.put( "uacmax"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
-               String2GPUInstructionType.put( "uamin"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Min
-               String2GPUInstructionType.put( "uarmin"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
-               String2GPUInstructionType.put( "uacmin"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
-               String2GPUInstructionType.put( "uasqk+"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares
-               String2GPUInstructionType.put( "uarsqk+" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares
-               String2GPUInstructionType.put( "uacsqk+" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares
-               String2GPUInstructionType.put( "uavar"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Variance
-               String2GPUInstructionType.put( "uarvar"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance
-               String2GPUInstructionType.put( "uacvar"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance
+               String2GPUInstructionType.put( "ua+"     , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+               String2GPUInstructionType.put( "uak+"    , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+               String2GPUInstructionType.put( "uar+"    , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+               String2GPUInstructionType.put( "uark+"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+               String2GPUInstructionType.put( "uac+"    , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+               String2GPUInstructionType.put( "uack+"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+               String2GPUInstructionType.put( "ua*"     , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Multiplication
+               String2GPUInstructionType.put( "uamean"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
+               String2GPUInstructionType.put( "uarmean" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
+               String2GPUInstructionType.put( "uacmean" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
+               String2GPUInstructionType.put( "uamax"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Max
+               String2GPUInstructionType.put( "uarmax"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
+               String2GPUInstructionType.put( "uacmax"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
+               String2GPUInstructionType.put( "uamin"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Min
+               String2GPUInstructionType.put( "uarmin"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
+               String2GPUInstructionType.put( "uacmin"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
+               String2GPUInstructionType.put( "uasqk+"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Sum of Squares
+               String2GPUInstructionType.put( "uarsqk+" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum of Squares
+               String2GPUInstructionType.put( "uacsqk+" , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum of Squares
+               String2GPUInstructionType.put( "uavar"   , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Variance
+               String2GPUInstructionType.put( "uarvar"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Row Variance
+               String2GPUInstructionType.put( "uacvar"  , 
GPUINSTRUCTION_TYPE.AggregateUnary); // Col Variance
        }
        
        public static GPUInstruction parseSingleInstruction (String str ) 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 673601f..89a2b67 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -18,25 +18,14 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import static jcuda.jcublas.JCublas2.cublasCreate;
-import static jcuda.jcublas.JCublas2.cublasDestroy;
-import static jcuda.jcudnn.JCudnn.cudnnCreate;
-import static jcuda.jcudnn.JCudnn.cudnnDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
-import static jcuda.jcusparse.JCusparse.cusparseCreate;
-import static jcuda.jcusparse.JCusparse.cusparseDestroy;
-import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
-import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
-
-import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMalloc;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
-import static jcuda.runtime.JCuda.cudaMemset;
-import static jcuda.runtime.JCuda.cudaSetDevice;
-import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
+import jcuda.Pointer;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcusolver.cusolverDnHandle;
+import jcuda.jcusolver.cusolverSpHandle;
+import jcuda.jcusparse.cusparseHandle;
+import jcuda.runtime.JCuda;
+import jcuda.runtime.cudaDeviceProp;
 
 import java.util.ArrayList;
 import java.util.Collections;
@@ -56,14 +45,24 @@ import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.utils.GPUStatistics;
 import org.apache.sysml.utils.LRUCacheMap;
 
-import jcuda.Pointer;
-import jcuda.jcublas.cublasHandle;
-import jcuda.jcudnn.cudnnHandle;
-import jcuda.jcusolver.cusolverDnHandle;
-import jcuda.jcusolver.cusolverSpHandle;
-import jcuda.jcusparse.cusparseHandle;
-import jcuda.runtime.JCuda;
-import jcuda.runtime.cudaDeviceProp;
+import static jcuda.jcublas.JCublas2.cublasCreate;
+import static jcuda.jcublas.JCublas2.cublasDestroy;
+import static jcuda.jcudnn.JCudnn.cudnnCreate;
+import static jcuda.jcudnn.JCudnn.cudnnDestroy;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnCreate;
+import static jcuda.jcusolver.JCusolverDn.cusolverDnDestroy;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpCreate;
+import static jcuda.jcusolver.JCusolverSp.cusolverSpDestroy;
+import static jcuda.jcusparse.JCusparse.cusparseCreate;
+import static jcuda.jcusparse.JCusparse.cusparseDestroy;
+import static jcuda.runtime.JCuda.cudaDeviceScheduleBlockingSync;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaGetDeviceCount;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.cudaMemset;
+import static jcuda.runtime.JCuda.cudaSetDevice;
+import static jcuda.runtime.JCuda.cudaSetDeviceFlags;
 
 /**
  * Represents a context per GPU accessible through the same JVM
@@ -159,6 +158,7 @@ public class GPUContext {
 
   }
 
+  @SuppressWarnings("unused")
   public int getDeviceNum() {
     return deviceNum;
   }
@@ -174,6 +174,7 @@ public class GPUContext {
     cudaSetDevice(deviceNum);
   }
 
+  @SuppressWarnings("unused")
   public static int cudaGetDevice() {
     int[] device = new int[1];
     JCuda.cudaGetDevice(device);
@@ -288,6 +289,9 @@ public class GPUContext {
    * @param eager           true if to be done eagerly
    */
   public void cudaFreeHelper(String instructionName, final Pointer toFree, 
boolean eager) {
+       Pointer dummy = new Pointer();
+       if (toFree == dummy) // trying to free a null pointer
+               return;
     long t0 = 0;
     assert cudaBlockSizeMap.containsKey(toFree) : "ERROR : Internal state 
corrupted, cache block size map is not aware of a block it trying to free up";
     long size = cudaBlockSizeMap.get(toFree);
@@ -382,14 +386,14 @@ public class GPUContext {
       return;
 
     if (allocatedGPUObjects.size() == 0) {
-      throw new DMLRuntimeException("There is not enough memory on device for 
this matrix!");
+      throw new DMLRuntimeException("There is not enough memory on device for 
this matrix, request (" + neededSize + ")");
     }
 
     Collections.sort(allocatedGPUObjects, new Comparator<GPUObject>() {
       @Override
       public int compare(GPUObject p1, GPUObject p2) {
-        long p1Val = p1.readLocks.get();
-        long p2Val = p2.readLocks.get();
+        long p1Val = p1.locks.get();
+        long p2Val = p2.locks.get();
 
         if (p1Val > 0 && p2Val > 0) {
           // Both are locked, so don't sort
@@ -426,8 +430,8 @@ public class GPUContext {
 
     while (neededSize > getAvailableMemory() && allocatedGPUObjects.size() > 
0) {
       GPUObject toBeRemoved = 
allocatedGPUObjects.get(allocatedGPUObjects.size() - 1);
-      if (toBeRemoved.readLocks.get() > 0) {
-        throw new DMLRuntimeException("There is not enough memory on device 
for this matrix!");
+      if (toBeRemoved.locks.get() > 0) {
+        throw new DMLRuntimeException("There is not enough memory on device 
for this matrix, request (" + neededSize + ")");
       }
       if (toBeRemoved.dirty) {
         toBeRemoved.copyFromDeviceToHost();
@@ -546,6 +550,7 @@ public class GPUContext {
    * @return the shared memory per block
    * @throws DMLRuntimeException ?
    */
+  @SuppressWarnings("unused")
   public long getMaxSharedMemory() throws DMLRuntimeException {
     cudaDeviceProp deviceProp = getGPUProperties();
     return deviceProp.sharedMemPerBlock;
@@ -588,10 +593,10 @@ public class GPUContext {
 
   /**
    * Destroys this GPUContext object
-   * This method MUST BE called so that the GPU is available to be used again
    *
    * @throws DMLRuntimeException if error
    */
+  @SuppressWarnings("unused")
   public void destroy() throws DMLRuntimeException {
     LOG.trace("GPU : this context was destroyed, this = " + this.toString());
     clearMemory();
@@ -608,14 +613,51 @@ public class GPUContext {
 
   /**
    * Clears all memory used by this {@link GPUContext}
-   * Be careful to ensure that no memory is currently being used before 
invoking this
+   * Be careful to ensure that no memory is currently being used in the 
temporary memory before invoking this
+   * If memory is being used between MLContext invocations, they are pointed 
to by a {@link GPUObject} instance
+   * which would be part of the {@link MatrixObject}. The cleanup of that 
{@link MatrixObject} instance will
+   * cause the memory associated with that block on the GPU to be freed up.
    * @throws DMLRuntimeException ?
    */
   public void clearMemory() throws DMLRuntimeException {
-    while (allocatedGPUObjects.isEmpty()) {
+    clearTemporaryMemory();
+    while (!allocatedGPUObjects.isEmpty()) {
       GPUObject o = allocatedGPUObjects.get(0);
-      o.clearData();
+      if (o.isDirty()){
+        LOG.warn("Attempted to free GPU Memory when a block[" + o + "] is 
still on GPU memory, copying it back to host.");
+        o.acquireHostRead();
+      }
+      o.clearData(true);
     }
+    allocatedGPUObjects.clear();
+  }
+
+  /**
+   * Clears up the memory used to optimize cudaMalloc/cudaFree calls
+   */
+  public void clearTemporaryMemory() {
+    // To record the cuda block sizes needed by allocatedGPUObjects, others 
are cleared up.
+    HashMap<Pointer, Long> tmpCudaBlockSizeMap = new HashMap<>();
+         for (GPUObject o : allocatedGPUObjects) {
+                 if (o.isSparse()) {
+                         CSRPointer p = o.getSparseMatrixCudaPointer();
+                         if (p.rowPtr != null && 
cudaBlockSizeMap.containsKey(p.rowPtr)) {
+                                 tmpCudaBlockSizeMap.put(p.rowPtr, 
cudaBlockSizeMap.get(p.rowPtr));
+                         }
+                         if (p.colInd != null && 
cudaBlockSizeMap.containsKey(p.colInd)) {
+                                 tmpCudaBlockSizeMap.put(p.colInd, 
cudaBlockSizeMap.get(p.colInd));
+                         }
+                         if (p.val != null && 
cudaBlockSizeMap.containsKey(p.val)) {
+                                 tmpCudaBlockSizeMap.put(p.val, 
cudaBlockSizeMap.get(p.val));
+                         }
+
+                 } else {
+                         Pointer p = o.getJcudaDenseMatrixPtr();
+                         tmpCudaBlockSizeMap.put(p, cudaBlockSizeMap.get(p));
+                 }
+         }
+
+    // garbage collect all temporarily allocated spaces
     for (LinkedList<Pointer> l : freeCUDASpaceMap.values()) {
       for (Pointer p : l) {
         cudaFreeHelper(p, true);
@@ -623,7 +665,9 @@ public class GPUContext {
     }
     cudaBlockSizeMap.clear();
     freeCUDASpaceMap.clear();
-    allocatedGPUObjects.clear();
+
+    // Restore only those entries for which there are still blocks on the GPU
+    cudaBlockSizeMap.putAll(tmpCudaBlockSizeMap);
   }
 
   @Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index be3cc09..0ed34c5 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -86,8 +86,8 @@ public class GPUObject {
        /** whether the block attached to this {@link GPUContext} is dirty on 
the device and needs to be copied back to host */
        protected boolean dirty = false;
 
-       /** number of read locks on this object */
-       protected AtomicInteger readLocks = new AtomicInteger(0);
+       /** number of read/write locks on this object (this GPUObject is being 
used in a current instruction) */
+       protected AtomicInteger locks = new AtomicInteger(0);
 
        /** Timestamp, needed by {@link GPUContext#evict(long)} */
        AtomicLong timestamp = new AtomicLong(0);
@@ -112,7 +112,7 @@ public class GPUObject {
             that.allocateTensorDescriptor(me.tensorShape[0], 
me.tensorShape[1], me.tensorShape[2], me.tensorShape[3]);
         }
                that.dirty = me.dirty;
-               that.readLocks = new AtomicInteger(me.readLocks.get());
+               that.locks = new AtomicInteger(me.locks.get());
                that.timestamp = new AtomicLong(me.timestamp.get());
                that.isSparse = me.isSparse;
 
@@ -126,7 +126,7 @@ public class GPUObject {
                        cudaMemcpy(that.jcudaDenseMatrixPtr, 
me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice);
                }
 
-               if (me.jcudaSparseMatrixPtr != null){
+               if (me.getJcudaSparseMatrixPtr() != null){
                        long rows = mat.getNumRows();
                        that.jcudaSparseMatrixPtr = 
me.jcudaSparseMatrixPtr.clone((int)rows);
                }
@@ -265,7 +265,6 @@ public class GPUObject {
 
        /**
         * Convenience method to directly set the sparse matrix on GPU
-        * Make sure to call {@link #addReadLock()} after this to set 
appropriate state, if you are not sure what you are doing.
         * Needed for operations like {@link 
JCusparse#cusparseDcsrgemm(cusparseHandle, int, int, int, int, int, 
cusparseMatDescr, int, Pointer, Pointer, Pointer, cusparseMatDescr, int, 
Pointer, Pointer, Pointer, cusparseMatDescr, Pointer, Pointer, Pointer)}
         * @param sparseMatrixPtr CSR (compressed sparse row) pointer
         * 
@@ -278,11 +277,11 @@ public class GPUObject {
                        cudaFreeHelper(getJcudaDenseMatrixPtr());
                        jcudaDenseMatrixPtr = null;
                }
+               getGPUContext().recordBlockUsage(this);
        }
 
        /**
         * Convenience method to directly set the dense matrix pointer on GPU
-        * Make sure to call {@link #addReadLock()} after this to set 
appropriate state, if you are not sure what you are doing.
         *
         * @param densePtr dense pointer
         * @throws DMLRuntimeException ?
@@ -294,6 +293,7 @@ public class GPUObject {
                        getJcudaSparseMatrixPtr().deallocate();
                        jcudaSparseMatrixPtr = null;
                }
+               getGPUContext().recordBlockUsage(this);
        }
 
        /**
@@ -491,7 +491,6 @@ public class GPUObject {
        public void allocateSparseAndEmpty() throws DMLRuntimeException{
                LOG.trace("GPU : allocate sparse and empty block on " + this + 
", GPUContext=" + getGPUContext());
                
setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(getGPUContext(), 0, 
mat.getNumRows()));
-               addReadLock();
        }
 
        /**
@@ -508,7 +507,6 @@ public class GPUObject {
                int numElems = toIntExact(rows * cols);
                long size = getDoubleSizeOf(numElems);
                setDenseMatrixCudaPointer(allocate(size));
-               addReadLock();
                // The "fill" kernel is called which treats the matrix 
"jcudaDensePtr" like a vector and fills it with value "v"
                // If the fill value is 0, no need to call the special kernel, 
the allocate memsets the allocated region to 0
                if (v != 0)
@@ -535,9 +533,8 @@ public class GPUObject {
                        LOG.trace("GPU : in acquireDeviceRead, data is not 
allocated, copying from host, on " + this + ", GPUContext=" + getGPUContext());
                        copyFromHostToDevice();
                        transferred = true;
-               } else {
-                       addReadLock();
                }
+               addLock();
                if(!isAllocated())
                        throw new DMLRuntimeException("Expected device data to 
be allocated");
                return transferred;
@@ -552,7 +549,6 @@ public class GPUObject {
                        // Dense block, size = numRows * numCols
                        allocateDenseMatrixOnDevice();
                        allocated = true;
-                       getGPUContext().recordBlockUsage(this);
                }
                dirty = true;
                if(!isAllocated())
@@ -569,8 +565,6 @@ public class GPUObject {
                        mat.setDirty(true);
                        allocateSparseMatrixOnDevice();
                        allocated = true;
-                       getGPUContext().recordBlockUsage(this);
-
                }
                dirty = true;
                if(!isAllocated())
@@ -578,8 +572,8 @@ public class GPUObject {
                return allocated;
        }
 
-       public void addReadLock() {
-               readLocks.addAndGet(1);
+       public void addLock() {
+               locks.addAndGet(1);
        }
 
        /**
@@ -606,11 +600,13 @@ public class GPUObject {
         * Updates the locks depending on the eviction policy selected
         * @throws DMLRuntimeException if there is no locked GPU Object or if 
could not obtain a {@link GPUContext}
         */
-       private void updateReleaseLocks() throws DMLRuntimeException {
-               if (readLocks.addAndGet(-1) < 0) {
-                       throw new CacheException("Redundant release of GPU 
object");
+       private void updateReleaseLocks(int l) throws DMLRuntimeException {
+               int newLocks = locks.addAndGet(l);
+               if (newLocks < 0) {
+                       throw new CacheException("Internal state error : 
Invalid number of locks on a GPUObject");
                }
-               LOG.trace("GPU : updateReleaseLocks, new number of read locks 
is " + readLocks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
+
+               LOG.trace("GPU : updateReleaseLocks, new number of locks is " + 
locks.get() + ", on " + this + ", GPUContext=" + getGPUContext());
                GPUContext.EvictionPolicy evictionPolicy = 
getGPUContext().evictionPolicy;
                switch (evictionPolicy){
                        case LRU : timestamp.set(System.nanoTime()); break;
@@ -625,7 +621,8 @@ public class GPUObject {
         * @throws DMLRuntimeException if data is not allocated or if there is 
no locked GPU Object or if could not obtain a {@link GPUContext}
         */
        public void releaseInput() throws DMLRuntimeException {
-               updateReleaseLocks();
+               // A read lock is a positive quantity, therefore when the lock 
is freed, a negative 1 is added
+               updateReleaseLocks(-1);
                if(!isAllocated())
                        throw new CacheException("Attempting to release an 
input before allocating it");
        }
@@ -635,7 +632,8 @@ public class GPUObject {
         * @throws DMLRuntimeException if data is not allocated or if there is 
no locked GPU Object or if could not obtain a {@link GPUContext}
         */
        public void releaseOutput() throws DMLRuntimeException {
-               updateReleaseLocks();
+               // A write lock is a negative quantity, therefore when the lock 
is freed, a positive number is added
+               updateReleaseLocks(1);
                dirty = true;
                if(!isAllocated())
                        throw new CacheException("Attempting to release an 
output before allocating it");
@@ -651,7 +649,6 @@ public class GPUObject {
                long size = getDoubleSizeOf(rows * cols);
                Pointer tmp = allocate(size);
                setDenseMatrixCudaPointer(tmp);
-               addReadLock();
        }
 
        void allocateSparseMatrixOnDevice() throws DMLRuntimeException {
@@ -660,10 +657,9 @@ public class GPUObject {
                long rows = mat.getNumRows();
                long nnz = mat.getNnz();
                assert rows > 0 : "Internal error - invalid number of rows when 
allocating a sparse matrix";
-               assert nnz > 0 : "Internal error - invalid number of non zeroes 
when allocating a sparse matrix";
+               assert nnz >= 0 : "Internal error - invalid number of non 
zeroes when allocating a sparse matrix";
                CSRPointer tmp = CSRPointer.allocateEmpty(getGPUContext(), nnz, 
rows);
                setSparseMatrixCudaPointer(tmp);
-               addReadLock();
        }
 
        void deallocateMemoryOnDevice(boolean eager) throws DMLRuntimeException 
{
@@ -680,7 +676,8 @@ public class GPUObject {
                        cudnnDestroyTensorDescriptor(tensorDescriptor);
                        tensorDescriptor = null;
                }
-               readLocks.set(0);
+               locks.set(0);
+               getGPUContext().removeRecordedUsage(this);
        }
 
        protected long getSizeOnDevice() throws DMLRuntimeException {
@@ -754,8 +751,8 @@ public class GPUObject {
                                colInd = csrBlock.indexes();
                                values = csrBlock.values();
                        }
+
                        allocateSparseMatrixOnDevice();
-                       getGPUContext().recordBlockUsage(this);
 
                        if(copyToDevice) {
                                
CSRPointer.copyToDevice(getJcudaSparseMatrixPtr(), tmp.getNumRows(), 
tmp.getNonZeros(), rowPtr, colInd, values);
@@ -773,7 +770,6 @@ public class GPUObject {
 
                        // Copy dense block
                        allocateDenseMatrixOnDevice();
-                       getGPUContext().recordBlockUsage(this);
 
                        cudaMemcpy(getJcudaDenseMatrixPtr(), Pointer.to(data), 
getDoubleSizeOf(mat.getNumRows()*mat.getNumColumns()), cudaMemcpyHostToDevice);
                }
@@ -862,9 +858,8 @@ public class GPUObject {
         * @throws CacheException ?
         */
        public void clearData(boolean eager) throws DMLRuntimeException {
-               getGPUContext().removeRecordedUsage(this);
                deallocateMemoryOnDevice(eager);
-
+               getGPUContext().removeRecordedUsage(this);
        }
 
        /** 
@@ -896,7 +891,7 @@ public class GPUObject {
                final StringBuilder sb = new StringBuilder("GPUObject{");
                sb.append(", 
tensorShape=").append(Arrays.toString(tensorShape));
                sb.append(", dirty=").append(dirty);
-               sb.append(", readLocks=").append(readLocks);
+               sb.append(", locks=").append(locks);
                sb.append(", sparse? ").append(isSparse);
                sb.append(", 
dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]");
                sb.append('}');

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
index ac11df9..4d06831 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaKernels.java
@@ -34,6 +34,7 @@ import jcuda.Pointer;
 import jcuda.driver.CUfunction;
 import jcuda.driver.CUmodule;
 import jcuda.driver.CUresult;
+import jcuda.runtime.JCuda;
 
 /**
  * Utility class that allows LibMatrixCUDA as well as GPUObject to invoke 
custom CUDA kernels.
@@ -111,7 +112,7 @@ public class JCudaKernels {
                                config.gridDimX, config.gridDimY, 
config.gridDimZ, 
                                config.blockDimX, config.blockDimY, 
config.blockDimZ, 
                                config.sharedMemBytes, config.stream, 
Pointer.to(kernelParams), null));
-               //JCuda.cudaDeviceSynchronize();
+        JCuda.cudaDeviceSynchronize();
        }
        
     public static void checkResult(int cuResult) throws DMLRuntimeException {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index b023159..7990fef 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -235,8 +235,8 @@ public class LibMatrixCUDA {
 
 
        private static int CONVOLUTION_PREFERENCE = 
cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-       
-       private static Pointer _one; 
+
+       private static Pointer _one;
        private static Pointer _zero;
        /**
         * Convenience method to get a pointer to value '1.0' on device. 
Instead of allocating and deallocating it for every kernel invocation.
@@ -258,7 +258,7 @@ public class LibMatrixCUDA {
                }
                return _zero;
        }
-       
+
        /**
         * Convenience method to get tensor descriptor from underlying GPUObject
         * @param gCtx   a valid {@link GPUContext}
@@ -272,7 +272,7 @@ public class LibMatrixCUDA {
         */
        private static cudnnTensorDescriptor 
allocateTensorDescriptor(GPUContext gCtx, MatrixObject mat, int N, int C, int 
H, int W) throws DMLRuntimeException {
                if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
-                       throw new DMLRuntimeException("Mismatch 
descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N 
+                       throw new DMLRuntimeException("Mismatch 
descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
                                        + " || " + mat.getNumColumns() + " != " 
+ (C*H*W));
                }
                return mat.getGPUObject(gCtx).allocateTensorDescriptor(N, C, H, 
W);
@@ -293,7 +293,7 @@ public class LibMatrixCUDA {
                cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_DOUBLE, N, C, H, W);
                return tensorDescriptor;
        }
-       
+
        /**
         * Convenience method to get jcudaDenseMatrixPtr. This method 
explicitly converts sparse to dense format, so use it judiciously.
         * @param gCtx a valid {@link GPUContext}
@@ -308,7 +308,7 @@ public class LibMatrixCUDA {
                }
                return getDensePointer(gCtx, image, instName);
        }
-       
+
        /**
         * Convenience method to get jcudaDenseMatrixPtr. This method 
explicitly converts sparse to dense format, so use it judiciously.
         * @param gCtx a valid {@link GPUContext}
@@ -339,15 +339,15 @@ public class LibMatrixCUDA {
                }
                return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
        }
-       
+
        /**
         * Convenience method for checking the status of CuDNN kernel.
-        * 
+        *
         * @param status status returned by CuDNN
         * @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS
         */
        private static void checkStatus(int status) throws DMLRuntimeException {
-               if(status != cudnnStatus.CUDNN_STATUS_SUCCESS) 
+               if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
                        throw new DMLRuntimeException("Error status returned by 
CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
        }
 
@@ -383,7 +383,7 @@ public class LibMatrixCUDA {
                //cudaDeviceSynchronize;
                biasAdd(gCtx, instName, outputBlock, bias, outputBlock);
        }
-       
+
        public static void conv2d(GPUContext gCtx, String instName, 
MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int 
C, int H, int W,
                                                                                
                                int K, int R, int S, int pad_h, int pad_w, int 
stride_h, int stride_w, int P, int Q)
                                        throws DMLRuntimeException {
@@ -530,7 +530,7 @@ public class LibMatrixCUDA {
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
 
        }
-       
+
        /**
         * Performs the operation corresponding to the DML script:
         * ones = matrix(1, rows=1, cols=Hout*Wout)
@@ -635,7 +635,7 @@ public class LibMatrixCUDA {
                        throw new DMLRuntimeException("Incorrect dimensions for 
running variance");
                }
        }
-       
+
        /**
         * Performs the forward BatchNormalization layer computation for 
inference
         * @param gCtx   a valid {@link GPUContext}
@@ -650,21 +650,21 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if error occurs
         */
        public static void batchNormalizationForwardInference(GPUContext gCtx, 
String instName, MatrixObject image,
-                       MatrixObject scale, MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar, 
+                       MatrixObject scale, MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar,
                        MatrixObject ret, double epsilon) throws 
DMLRuntimeException {
         LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" 
+ gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-               
+
                int N = (int) image.getNumRows();
                int C = (int) scale.getNumColumns();
                long CHW = image.getNumColumns();
                validateBatchNormalizationDimensions(scale, bias, runningMean, 
runningVar, C);
-               
+
                // Allocate descriptors
                cudnnTensorDescriptor nCHWDescriptor = 
allocateNCHWDescriptors(gCtx, N, C, CHW,
                                new MatrixObject[] {image},  new MatrixObject[] 
{ret});
                cudnnTensorDescriptor scaleTensorDesc = 
allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-               
+
                // Get underlying dense pointer
                Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
                Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
@@ -672,13 +672,13 @@ public class LibMatrixCUDA {
                Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
                Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, 
true, instName);
                Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, 
instName);
-               
+
                
checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, 
one(), zero(),
                                nCHWDescriptor, imagePtr, nCHWDescriptor, 
retPtr,
                        scaleTensorDesc, scalePtr, biasPtr,
                        runningMeanPtr, runningVarPtr, epsilon));
        }
-       
+
        /**
         * Performs the forward BatchNormalization layer computation for 
training
         * @param gCtx   a valid {@link GPUContext}
@@ -696,21 +696,21 @@ public class LibMatrixCUDA {
         * @throws DMLRuntimeException if error occurs
         */
        public static void batchNormalizationForwardTraining(GPUContext gCtx, 
String instName, MatrixObject image,
-                       MatrixObject scale,  MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar, 
+                       MatrixObject scale,  MatrixObject bias, MatrixObject 
runningMean, MatrixObject runningVar,
                        MatrixObject ret, MatrixObject retRunningMean, 
MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) 
throws DMLRuntimeException {
         LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" 
+ gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-               
+
                int N = (int) image.getNumRows();
                int C = (int) scale.getNumColumns();
                long CHW = image.getNumColumns();
                validateBatchNormalizationDimensions(scale, bias, runningMean, 
runningVar, C);
-               
+
                // Allocate descriptors
                cudnnTensorDescriptor nCHWDescriptor = 
allocateNCHWDescriptors(gCtx, N, C, CHW,
                                new MatrixObject[] {image},  new MatrixObject[] 
{ret});
                cudnnTensorDescriptor scaleTensorDesc = 
allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-               
+
                // Get underlying dense pointer
                Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
                Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
@@ -718,20 +718,20 @@ public class LibMatrixCUDA {
                Pointer scalePtr = getDensePointer(gCtx, scale, true, instName);
                Pointer runningMeanPtr = getDensePointer(gCtx, runningMean, 
true, instName);
                Pointer runningVarPtr = getDensePointer(gCtx, runningVar, true, 
instName);
-               
+
                // To allow for copy-on-write
                Pointer retRunningMeanPtr = getDensePointer(gCtx, 
retRunningMean, true, instName);
                Pointer retRunningVarPtr = getDensePointer(gCtx, retRunningVar, 
true, instName);
                cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * 
Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
                cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, 
cudaMemcpyDeviceToDevice);
-               
+
                // ignoring resultSaveMean and resultSaveVariance as it 
requires state management
                
checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, 
one(), zero(),
                                nCHWDescriptor, imagePtr, nCHWDescriptor, 
retPtr,
                        scaleTensorDesc, scalePtr, biasPtr, 
exponentialAverageFactor,
                        retRunningMeanPtr, retRunningVarPtr, epsilon, new 
Pointer(), new Pointer()));
        }
-       
+
        /**
         * Convenient utility for batch normalization that returns a NCHW 
descriptor
         * @param gCtx a valid {@link GPUContext}
@@ -776,8 +776,8 @@ public class LibMatrixCUDA {
                        H = HW; W = 1; // If not known
                        double potentialH = Math.sqrt(HW);
                        if(potentialH == ((int) potentialH)) {
-                               H = (int) potentialH; 
-                               W = H; 
+                               H = (int) potentialH;
+                               W = H;
                        }
                        // We are not sure about H and W, hence don't allocate 
them.
                        ret = new cudnnTensorDescriptor();
@@ -786,7 +786,7 @@ public class LibMatrixCUDA {
                }
                return ret;
        }
-       
+
        /**
         * This method computes the backpropagation errors for image, scale and 
bias of batch normalization layer
         * @param gCtx   a valid {@link GPUContext}
@@ -805,16 +805,16 @@ public class LibMatrixCUDA {
                        double epsilon) throws DMLRuntimeException {
         LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx);
         int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
-               
+
                int N = (int) image.getNumRows();
                int C = (int) scale.getNumColumns();
                long CHW = image.getNumColumns();
-               
+
                // Allocate descriptors
                cudnnTensorDescriptor nCHWDescriptor = 
allocateNCHWDescriptors(gCtx, N, C, CHW,
                                new MatrixObject[] {image, dout},  new 
MatrixObject[] {ret});
                cudnnTensorDescriptor scaleTensorDesc = 
allocateTensorDescriptor(gCtx, scale, 1, C, 1, 1);
-               
+
                // Get underlying dense pointer
                Pointer imagePtr = getDensePointer(gCtx, image, true, instName);
                Pointer doutPtr = getDensePointer(gCtx, dout, true, instName);
@@ -822,7 +822,7 @@ public class LibMatrixCUDA {
                Pointer retPtr = getDensePointer(gCtx, ret, true, instName);
                Pointer retScalePtr = getDensePointer(gCtx, retScale, true, 
instName);
                Pointer retBiasPtr = getDensePointer(gCtx, retBias, true, 
instName);
-               
+
                // ignoring resultSaveMean and resultSaveVariance as it 
requires state management
                
checkStatus(cudnnBatchNormalizationBackward(getCudnnHandle(gCtx), mode,  one(), 
zero(), one(), zero(),
                                nCHWDescriptor,  imagePtr, nCHWDescriptor, 
doutPtr, nCHWDescriptor, retPtr,
@@ -915,7 +915,7 @@ public class LibMatrixCUDA {
                }
        }
 
-       private static long numDoublesIn2GB = 125000000;
+       private static long numDoublesIn2GB = 268435456;
 
        /**
         * This method computes the backpropogation errors for previous layer 
of convolution operation
@@ -961,7 +961,7 @@ public class LibMatrixCUDA {
                        Pointer w = getDensePointer(gCtx, filter, true, 
instName);
                        Pointer dy = getDensePointer(gCtx, dout, true, 
instName);
                        Pointer dx = getDensePointer(gCtx, output, true, 
instName);
-                       
+
                        int padding [] = { pad_h, pad_w };
                        int strides [] = { stride_h, stride_w };
                        convDesc = allocateConvolutionDescriptor(padding, 
strides);
@@ -999,7 +999,7 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
-       
+
        /**
         * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
         * @param gCtx   a valid {@link GPUContext}
@@ -1029,7 +1029,7 @@ public class LibMatrixCUDA {
                cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, 
image, N, C, H, W);
                performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, 
H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
        }
-       
+
        public static void performMaxpooling(GPUContext gCtx, String instName, 
Pointer x, cudnnTensorDescriptor xDesc,
                        MatrixObject outputBlock, int N, int C, int H, int W, 
int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
@@ -1064,7 +1064,7 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
                }
        }
-       
+
        /**
         * Performs maxpoolingBackward on GPU by exploiting 
cudnnPoolingBackward(...)
         * This method computes the backpropogation errors for previous layer 
of maxpooling operation
@@ -1149,13 +1149,13 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
                }
        }
-       
+
        private static void performCuDNNReLU(GPUContext gCtx, String instName, 
MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws 
DMLRuntimeException {
                long t0=0;
                try {
             LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx);
             cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
-                       
+
                        Pointer srcData = getDensePointer(gCtx, in, true, 
instName);
                        cudnnActivationDescriptor activationDescriptor = new 
cudnnActivationDescriptor();
                        cudnnCreateActivationDescriptor(activationDescriptor);
@@ -1175,7 +1175,7 @@ public class LibMatrixCUDA {
                        if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
                }
        }
-       
+
 
        /**
         * Performs the relu operation on the GPU.
@@ -1580,7 +1580,6 @@ public class LibMatrixCUDA {
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - 
t1);
 
                output.getGPUObject(gCtx).setDenseMatrixCudaPointer(C_dense);
-               output.getGPUObject(gCtx).addReadLock();
        }
 
        /**
@@ -1671,7 +1670,6 @@ public class LibMatrixCUDA {
                if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, 
GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
 
                output.getGPUObject(gCtx).setSparseMatrixCudaPointer(C);
-               output.getGPUObject(gCtx).addReadLock();
 
                if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
                cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, 
k,
@@ -2079,7 +2077,7 @@ public class LibMatrixCUDA {
                                                reduceRow(gCtx, instName, 
"reduce_row_sum", tmp2, tmpRow, rlen, clen);
 
                                                ScalarOperator divideOp = new 
RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
-                                               matrixScalarOp(gCtx, instName, 
tmpRow, clen - 1, rlen, clen, out, divideOp);
+                                               matrixScalarOp(gCtx, instName, 
tmpRow, clen - 1, rlen, 1, out, divideOp);
 
                                                gCtx.cudaFreeHelper(instName, 
tmpRow);
 
@@ -2097,7 +2095,7 @@ public class LibMatrixCUDA {
                                                reduceCol(gCtx, instName, 
"reduce_col_sum", tmp2, tmpCol, rlen, clen);
 
                                                ScalarOperator divideOp = new 
RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
-                                               matrixScalarOp(gCtx, instName, 
tmpCol, rlen - 1, rlen, clen, out, divideOp);
+                                               matrixScalarOp(gCtx, instName, 
tmpCol, rlen - 1, 1, clen, out, divideOp);
 
                                                gCtx.cudaFreeHelper(instName, 
tmpCol);
 
@@ -2348,14 +2346,21 @@ public class LibMatrixCUDA {
                                else if(op.fn instanceof Power) {
                                        setOutputToConstant(ec, gCtx, instName, 
1.0, outputName);
                                }
-                               else if(op.fn instanceof Divide && 
isSparseAndEmpty(gCtx, in)) {
-                                       setOutputToConstant(ec, gCtx, instName, 
Double.NaN, outputName);
-                               }
-                               else if(op.fn instanceof Divide) {
-                                       //For division, IEEE 754 defines x/0.0 
as INFINITY and 0.0/0.0 as NaN.
-                                       compareAndSet(ec, gCtx, instName, in, 
outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, 
Double.POSITIVE_INFINITY);
-                               }
-                               else {
+                // TODO:
+                // x/0.0 is either +Infinity or -Infinity according to Java.
+                // In the context of a matrix, different elements of the matrix
+                // could have different values.
+                // If the IEEE 754 standard defines otherwise, this logic needs
+                // to be re-enabled and the Java computation logic for divide 
by zero
+                // needs to be revisited
+                //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, 
in)) {
+                //     setOutputToConstant(ec, gCtx, instName, Double.NaN, 
outputName);
+                //}
+                //else if(op.fn instanceof Divide) {
+                //     //For division, IEEE 754 defines x/0.0 as INFINITY and 
0.0/0.0 as NaN.
+                //     compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 
1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
+                //}
+                else {
                                        // TODO: Potential to optimize
                                        matrixScalarOp(ec, gCtx, instName, in, 
outputName, isInputTransposed, op);
                                }
@@ -2790,7 +2795,6 @@ public class LibMatrixCUDA {
                        CSRPointer B = 
in2.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 
                        ec.allocateGPUMatrixObject(outputName);
-                       out.getGPUObject(gCtx).addReadLock();
 
                        if (in1 == in2 && isLeftTransposed == true && 
isLeftTransposed == isRightTransposed) {
                                // Special case for transpose
@@ -3160,7 +3164,7 @@ public class LibMatrixCUDA {
                MatrixObject out = ec.getMatrixObject(outputName);
                getDenseMatrixOutputForGPUInstruction(ec, instName, 
outputName);        // Allocated the dense output matrix
                Pointer C = getDensePointer(gCtx, out, instName);
-               
+
                long t1=0, t2=0;
                if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() 
== in2.getNumColumns()) {
             LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
index fb62c41..bb0a9a8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDatagen.java
@@ -45,9 +45,6 @@ public class LibMatrixDatagen
 {
        private static final Log LOG = 
LogFactory.getLog(LibMatrixDatagen.class.getName());
        private static final long PAR_NUMCELL_THRESHOLD = 512*1024; //Min 500k 
elements
-       public static final String RAND_PDF_UNIFORM = "uniform";
-       public static final String RAND_PDF_NORMAL = "normal";
-       public static final String RAND_PDF_POISSON = "poisson";
        
        private static IDSequence _seqRandInput = new IDSequence(); 
        
@@ -55,9 +52,9 @@ public class LibMatrixDatagen
                //prevent instantiation via private constructor
        }
 
-       public static boolean isShortcutRandOperation( double min, double max, 
double sparsity, String pdf )
+       public static boolean isShortcutRandOperation( double min, double max, 
double sparsity, RandomMatrixGenerator.PDF pdf )
        {
-               return pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)
+               return pdf == RandomMatrixGenerator.PDF.UNIFORM
                           && (  ( min == 0.0 && max == 0.0 ) //all zeros
                                   ||( sparsity==1.0d && min == max )); //equal 
values
        }
@@ -137,27 +134,31 @@ public class LibMatrixDatagen
                }
        }
 
-    public static RandomMatrixGenerator createRandomMatrixGenerator(String 
pdf, int r, int c, int rpb, int cpb, double sp, double min, double max, String 
distParams) 
+    public static RandomMatrixGenerator createRandomMatrixGenerator(String 
pdfStr, int r, int c, int rpb, int cpb, double sp, double min, double max, 
String distParams)
        throws DMLRuntimeException
     {
-       RandomMatrixGenerator rgen = null;
-       
-       if ( pdf.equalsIgnoreCase(RAND_PDF_UNIFORM))
-               rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, 
max);
-       else if ( pdf.equalsIgnoreCase(RAND_PDF_NORMAL))
-               rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp);
-       else if ( pdf.equalsIgnoreCase(RAND_PDF_POISSON))
-       {
-               double mean = Double.NaN;
-               try {
-                       mean = Double.parseDouble(distParams);
-               } catch(NumberFormatException e) {
-                       throw new DMLRuntimeException("Failed to parse Poisson 
distribution parameter: " + distParams);
-               }
-               rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, sp, min, 
max, mean);
-       }
-       else
-               throw new DMLRuntimeException("Unsupported probability 
distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", 
\"normal\", or \"poisson\"");
+               RandomMatrixGenerator.PDF pdf = 
RandomMatrixGenerator.PDF.valueOf(pdfStr.toUpperCase());
+               RandomMatrixGenerator rgen = null;
+       switch (pdf) {
+               case UNIFORM:
+                       rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, 
sp, min, max);
+                       break;
+               case NORMAL:
+                       rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, 
sp);
+                       break;
+               case POISSON:
+                       double mean = Double.NaN;
+                       try {
+                               mean = Double.parseDouble(distParams);
+                       } catch (NumberFormatException e) {
+                               throw new DMLRuntimeException("Failed to parse 
Poisson distribution parameter: " + distParams);
+                       }
+                       rgen = new RandomMatrixGenerator(pdf, r, c, rpb, cpb, 
sp, min, max, mean);
+                       break;
+               default:
+                       throw new DMLRuntimeException("Unsupported probability 
distribution \"" + pdf + "\" in rand() -- it must be one of \"uniform\", 
\"normal\", or \"poisson\"");
+
+               }
        return rgen;
     }
        
@@ -202,11 +203,11 @@ public class LibMatrixDatagen
                 * (max-min)*prng.nextDouble() is still valid. This is done 
primarily to
                 * share the same code across different distributions.
                 */
-               double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? 
rgen._min : 0;
-               double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? 
rgen._max : 1;
+               double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._min : 0;
+               double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._max : 1;
                
                // Special case shortcuts for efficiency
-               if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) {
+               if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) {
                        if ( min == 0.0 && max == 0.0 ) { //all zeros
                                out.reset(rows, cols, true);
                                return;
@@ -288,8 +289,8 @@ public class LibMatrixDatagen
                 * (max-min)*prng.nextDouble() is still valid. This is done 
primarily to
                 * share the same code across different distributions.
                 */
-               double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? 
rgen._min : 0;
-               double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? 
rgen._max : 1;
+               double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._min : 0;
+               double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._max : 1;
                
                //determine the sparsity of output matrix (multi-threaded 
always invoked from CP):
                //estimated NNZ is for entire matrix (nnz=0, if 0 initialized)
@@ -304,7 +305,7 @@ public class LibMatrixDatagen
                }
 
                //special case shortcuts for efficiency
-               if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) {
+               if ( rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM) {
                        if ( min == 0.0 && max == 0.0 ) { //all zeros
                                out.reset(rows, cols, false);
                                return;
@@ -497,8 +498,8 @@ public class LibMatrixDatagen
                int cpb = rgen._colsPerBlock;
                double sparsity = rgen._sparsity;
                PRNGenerator valuePRNG = rgen._valuePRNG;
-               double min = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? 
rgen._min : 0;
-               double max = rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM) ? 
rgen._max : 1;
+               double min = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._min : 0;
+               double max = rgen._pdf == RandomMatrixGenerator.PDF.UNIFORM ? 
rgen._max : 1;
                double range = max - min;
                int clen = out.clen;
                int estimatedNNzsPerRow = out.estimatedNNzsPerRow;
@@ -510,14 +511,19 @@ public class LibMatrixDatagen
 
                // Setup Pseudo Random Number Generator for cell values based 
on 'pdf'.
                if (valuePRNG == null) {
-                       if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_UNIFORM)) 
+                       switch (rgen._pdf) {
+                       case UNIFORM:
                                valuePRNG = new UniformPRNGenerator();
-                       else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_NORMAL))
+                               break;
+                       case NORMAL:
                                valuePRNG = new NormalPRNGenerator();
-                       else if ( rgen._pdf.equalsIgnoreCase(RAND_PDF_POISSON))
+                               break;
+                       case POISSON:
                                valuePRNG = new PoissonPRNGenerator();
-                       else
+                               break;
+                       default:
                                throw new DMLRuntimeException("Unsupported 
distribution function for Rand: " + rgen._pdf);
+                       }
                }
                
                // loop through row-block indices

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
index 1dc818c..b4c6c95 100644
--- 
a/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
+++ 
b/src/main/java/org/apache/sysml/runtime/matrix/data/RandomMatrixGenerator.java
@@ -26,8 +26,15 @@ import org.apache.sysml.runtime.util.PoissonPRNGenerator;
 import org.apache.sysml.runtime.util.UniformPRNGenerator;
 
 public class RandomMatrixGenerator {
-       
-       String _pdf;
+
+       /**
+        * Types of probability density functions
+        */
+       enum PDF {
+               NORMAL, UNIFORM, POISSON
+       }
+
+       PDF _pdf;
        int _rows, _cols, _rowsPerBlock, _colsPerBlock;
        double _sparsity, _mean; 
        double _min, _max; 
@@ -36,25 +43,76 @@ public class RandomMatrixGenerator {
 
        public RandomMatrixGenerator() 
        {
-               _pdf = "";
+               _pdf = PDF.UNIFORM;
                _rows = _cols = _rowsPerBlock = _colsPerBlock = -1;
                _sparsity = 0.0;
                _min = _max = Double.NaN;
                _valuePRNG = null;
                _mean = 1.0;
        }
-       
-       public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int 
cpb, double sp) throws DMLRuntimeException 
+
+       /**
+        * Instantiates a Random number generator
+        * @param pdf    probability density function
+        * @param r      number of rows
+        * @param c      number of columns
+        * @param rpb    rows per block
+        * @param cpb    columns per block
+        * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+        * @throws DMLRuntimeException if error
+        */
+       public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, 
double sp) throws DMLRuntimeException
        {
                this(pdf, r, c, rpb, cpb, sp, Double.NaN, Double.NaN);
        }
-       
-       public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int 
cpb, double sp, double min, double max) throws DMLRuntimeException 
+
+       /**
+        * Instantiates a Random number generator
+        * @param pdfStr probability density function name, one of "uniform", 
"normal" or "poisson"
+        * @param r      number of rows
+        * @param c      number of columns
+        * @param rpb    rows per block
+        * @param cpb    columns per block
+        * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+        * @param min    minimum of range of random numbers
+        * @param max    maximum of range of random numbers
+        * @throws DMLRuntimeException if error
+        */
+       public RandomMatrixGenerator(String pdfStr, int r, int c, int rpb, int 
cpb, double sp, double min, double max) throws DMLRuntimeException
+       {
+               init(PDF.valueOf(pdfStr.toUpperCase()), r, c, rpb, cpb, sp, 
min, max);
+       }
+
+       /**
+        * Instantiates a Random number generator
+        * @param pdf    probability density function
+        * @param r      number of rows
+        * @param c      number of columns
+        * @param rpb    rows per block
+        * @param cpb    columns per block
+        * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+        * @param min    minimum of range of random numbers
+        * @param max    maximum of range of random numbers
+        * @throws DMLRuntimeException if error
+        */
+       public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, 
double sp, double min, double max) throws DMLRuntimeException
        {
                init(pdf, r, c, rpb, cpb, sp, min, max);
        }
-       
-       public void init(String pdf, int r, int c, int rpb, int cpb, double sp, 
double min, double max) throws DMLRuntimeException 
+
+       /**
+        * Initializes internal data structures. Called by Constructor
+        * @param pdf    probability density function
+        * @param r      number of rows
+        * @param c      number of columns
+        * @param rpb    rows per block
+        * @param cpb    columns per block
+        * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+        * @param min    minimum of range of random numbers
+        * @param max    maximum of range of random numbers
+        * @throws DMLRuntimeException if error
+        */
+       public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, 
double min, double max) throws DMLRuntimeException
        {
                _pdf = pdf;
                _rows = r;
@@ -67,13 +125,39 @@ public class RandomMatrixGenerator {
                
                setupValuePRNG();
        }
-       
-       public RandomMatrixGenerator(String pdf, int r, int c, int rpb, int 
cpb, double sp, double min, double max, double mean) throws DMLRuntimeException 
+
+       /**
+        * Instantiates a random number generator with a specific Poisson mean
+        * @param pdf    probability density function
+        * @param r      number of rows
+        * @param c      number of columns
+        * @param rpb    rows per block
+        * @param cpb    columns per block
+        * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+        * @param min    minimum of range of random numbers
+        * @param max    maximum of range of random numbers
+        * @param mean   the Poisson mean
+        * @throws DMLRuntimeException if error
+        */
+       public RandomMatrixGenerator(PDF pdf, int r, int c, int rpb, int cpb, 
double sp, double min, double max, double mean) throws DMLRuntimeException
        {
                init(pdf, r, c, rpb, cpb, sp, min, max, mean);
        }
-       
-       public void init(String pdf, int r, int c, int rpb, int cpb, double sp, 
double min, double max, double mean) throws DMLRuntimeException 
+
+       /**
+        * Initializes internal data structures with a specific Poisson mean. 
Called by Constructor
+        * @param pdf    probability density function
+        * @param r      number of rows
+        * @param c      number of columns
+        * @param rpb    rows per block
+        * @param cpb    columns per block
+        * @param sp     sparsity (0 = completely sparse, 1 = completely dense)
+        * @param min    minimum of range of random numbers
+        * @param max    maximum of range of random numbers
+        * @param mean   the Poisson mean
+        * @throws DMLRuntimeException if error
+        */
+       public void init(PDF pdf, int r, int c, int rpb, int cpb, double sp, 
double min, double max, double mean) throws DMLRuntimeException
        {
                _pdf = pdf;
                _rows = r;
@@ -89,15 +173,20 @@ public class RandomMatrixGenerator {
        
        protected void setupValuePRNG() throws DMLRuntimeException 
        {
-               if ( _pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_NORMAL) ) 
+               switch (_pdf) {
+               case NORMAL:
                        _valuePRNG = new NormalPRNGenerator();
-               else if ( 
_pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_UNIFORM) ) 
+                       break;
+               case UNIFORM:
                        _valuePRNG = new UniformPRNGenerator();
-               else if ( 
_pdf.equalsIgnoreCase(LibMatrixDatagen.RAND_PDF_POISSON) ) 
-               {
+                       break;
+               case POISSON:
                        if(_mean <= 0)
                                throw new DMLRuntimeException("Invalid 
parameter (" + _mean + ") for Poisson distribution.");
                        _valuePRNG = new PoissonPRNGenerator(_mean);
+                       break;
+               default:
+                       throw new DMLRuntimeException("Unsupported probability 
density function");
                }
        }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java 
b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
new file mode 100644
index 0000000..0b229f0
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests Aggregate Unary ops
+ */
+public class AggregateUnaryOpTests extends UnaryOpTestsBase {
+
+       private final static String TEST_NAME = "AggregateUnaryOpTests";
+
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_DIR, TEST_NAME);
+               getAndLoadTestConfiguration(TEST_NAME);
+       }
+
+       @Test
+       public void sum() {
+               testSimpleUnaryOpMatrixOutput("sum", "gpu_uak+");
+       }
+
+       @Test
+       public void colSums() {
+               testSimpleUnaryOpMatrixOutput("colSums", "gpu_uack+");
+       }
+
+       @Test
+       public void rowSums() {
+               testSimpleUnaryOpMatrixOutput("rowSums", "gpu_uark+");
+       }
+
+       @Test
+       public void mult() {
+               testSimpleUnaryOpMatrixOutput("prod", "gpu_ua*");
+       }
+
+       @Test
+       public void mean() {
+               testSimpleUnaryOpMatrixOutput("mean", "gpu_uamean");
+       }
+
+       @Test
+       public void colMeans() {
+               testSimpleUnaryOpMatrixOutput("colMeans", "gpu_uacmean");
+       }
+
+       @Test
+       public void rowMeans() {
+               testSimpleUnaryOpMatrixOutput("rowMeans", "gpu_uarmean");
+       }
+
+       @Test
+       public void max() {
+               testSimpleUnaryOpMatrixOutput("max", "gpu_uamax");
+       }
+
+       @Test
+       public void rowMaxs() {
+               testSimpleUnaryOpMatrixOutput("rowMaxs", "gpu_uarmax");
+       }
+
+       @Test
+       public void colMaxs() {
+               testSimpleUnaryOpMatrixOutput("colMaxs", "gpu_uacmax");
+       }
+
+       @Test
+       public void min() {
+               testSimpleUnaryOpMatrixOutput("min", "gpu_uamin");
+       }
+
+       @Test
+       public void rowMins() {
+               testSimpleUnaryOpMatrixOutput("rowMins", "gpu_uarmin");
+       }
+
+       @Test
+       public void colMins() {
+               testSimpleUnaryOpMatrixOutput("colMins", "gpu_uacmin");
+       }
+
+       @Test
+       public void var() {
+               testSimpleUnaryOpMatrixOutput("var", "gpu_uavar");
+       }
+
+       @Test
+       public void colVars() {
+               testSimpleUnaryOpMatrixOutput("colVars", "gpu_uacvar");
+       }
+
+       @Test
+       public void rowVars() {
+               testSimpleUnaryOpMatrixOutput("rowVars", "gpu_uarvar");
+       }
+
+       @Test
+       public void sumsq() {
+               testUnaryOpMatrixOutput("out = sum(in1*in1)", "gpu_uasqk+", 
"in1", "out");
+       }
+
+       @Test
+       public void rowSumsqs() {
+               testUnaryOpMatrixOutput("out = rowSums(in1*in1)", 
"gpu_uarsqk+", "in1", "out");
+       }
+
+       @Test
+       public void colSumsqs() {
+               testUnaryOpMatrixOutput("out = colSums(in1*in1)", 
"gpu_uacsqk+", "in1", "out");
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java 
b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
new file mode 100644
index 0000000..f3d2b21
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/BinaryOpTests.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+/**
+ * Tests builtin binary ops on GPU
+ */
+public class BinaryOpTests extends GPUTests {
+
+       private final static String TEST_NAME = "BinaryOpTests";
+       private final int seed = 42;
+
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_DIR, TEST_NAME);
+               getAndLoadTestConfiguration(TEST_NAME);
+       }
+
+       @Test
+       public void testSolve() {
+               // Test Ax = b
+               // Dimensions of A = m * n
+               // Dimensions of x = n * 1
+               // Dimensions of b = m * 1
+
+               double sparsity = 1.0; // Only dense matrices supported by 
"solve"
+               final int[] sides = { 32, 33, 128, 256, 513, 2049 };
+               for (int i = 0; i < sides.length; i++) {
+                       for (int j = i; j < sides.length; j++) {
+                               int m = sides[j];
+                               int n = sides[i];
+                               runSolveTest(sparsity, m, n);
+                       }
+               }
+
+       }
+
+       /**
+        * Runs the test for solve (Ax = b) for input with given dimensions and 
sparsities
+        * A can be overdetermined (rows in A > columns in A)
+        *
+        * @param sparsity sparsity for the block A and b
+        * @param m        rows in A
+        * @param n        columns in A
+        */
+       protected void runSolveTest(double sparsity, int m, int n) {
+               String scriptStr = "x = solve(A, b)";
+               System.out.println("In solve, A[" + m + ", " + n + "], b[" + m 
+ ", 1]");
+               Matrix A = generateInputMatrix(spark, m, n, sparsity, seed);
+               Matrix b = generateInputMatrix(spark, m, 1, sparsity, seed);
+               HashMap<String, Object> inputs = new HashMap<>();
+               inputs.put("A", A);
+               inputs.put("b", b);
+               List<Object> outCPU = runOnCPU(spark, scriptStr, inputs, 
Arrays.asList("x"));
+               List<Object> outGPU = runOnGPU(spark, scriptStr, inputs, 
Arrays.asList("x"));
+               assertHeavyHitterPresent("gpu_solve");
+               assertEqualObjects(outCPU.get(0), outGPU.get(0));
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/772fb588/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/gpu/GPUTests.java 
b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
new file mode 100644
index 0000000..8af9104
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/gpu/GPUTests.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.gpu;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.spark.sql.SparkSession;
+import org.apache.sysml.api.mlcontext.MLContext;
+import org.apache.sysml.api.mlcontext.Matrix;
+import org.apache.sysml.api.mlcontext.Script;
+import org.apache.sysml.api.mlcontext.ScriptFactory;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.utils.Statistics;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+
+/**
+ * Parent class for all GPU tests
+ */
+public abstract class GPUTests extends AutomatedTestBase {
+
+       protected final static String TEST_DIR = 
"org/apache/sysml/api/mlcontext";
+       protected static SparkSession spark;
+       protected final double THRESHOLD = 1e-9;    // for relative error
+
+       @BeforeClass
+       public static void beforeClass() {
+               spark = createSystemMLSparkSession("GPUTests", "local");
+       }
+
+       @AfterClass
+       public static void afterClass() {
+               spark.close();
+       }
+
+       /**
+        * Gets threshold for relative error in tests
+        *
+        * @return a valid threshold
+        */
+       protected double getTHRESHOLD() {
+               return THRESHOLD;
+       }
+
+       @After
+       public void tearDown() {
+               clearGPUMemory();
+               super.tearDown();
+       }
+
+       /**
+        * Clear out the memory on all GPUs
+        */
+       protected void clearGPUMemory() {
+               try {
+                       int count = GPUContextPool.getDeviceCount();
+                       int freeCount = GPUContextPool.getAvailableCount();
+                       Assert.assertTrue("All GPUContexts have not been 
returned to the GPUContextPool", count == freeCount);
+                       ArrayList<GPUContext> gpuContexts = new ArrayList<>();
+                       for (int i = 0; i < count; i++) {
+                               GPUContext gCtx = GPUContextPool.getFromPool();
+                               gCtx.initializeThread();
+                               gCtx.clearMemory();
+                               gpuContexts.add(gCtx);
+                       }
+                       for (GPUContext gCtx : gpuContexts) {
+                               GPUContextPool.returnToPool(gCtx);
+                       }
+
+               } catch (DMLRuntimeException e) {
+                       // Ignore
+               }
+       }
+
+       /**
+        * Generates a random input matrix with a given size and sparsity
+        *
+        * @param spark    valid instance of {@link SparkSession}
+        * @param m        number of rows
+        * @param n        number of columns
+        * @param sparsity sparsity (1 = completely dense, 0 = completely 
sparse)
+        * @return a random matrix with given size and sparsity
+        */
+       protected Matrix generateInputMatrix(SparkSession spark, int m, int n, 
double sparsity, int seed) {
+               // Generate a random matrix of size m * n
+               MLContext genMLC = new MLContext(spark);
+               String scriptStr;
+               if (sparsity == 0.0) {
+                       scriptStr = "in1 = matrix(0, rows=" + m + ", cols=" + n 
+ ")";
+               } else {
+                       scriptStr = "in1 = rand(rows=" + m + ", cols=" + n + ", 
sparsity = " + sparsity + ", seed= " + seed
+                                       + ", min=-1.0, max=1.0)";
+               }
+               Script generateScript = 
ScriptFactory.dmlFromString(scriptStr).out("in1");
+               Matrix in1 = genMLC.execute(generateScript).getMatrix("in1");
+               genMLC.close();
+               return in1;
+       }
+
+       /**
+        * Asserts that the values in two matrices are in {@link 
UnaryOpTests#THRESHOLD} of each other
+        *
+        * @param expected expected matrix
+        * @param actual   actual matrix
+        */
+       private void assertEqualMatrices(Matrix expected, Matrix actual) {
+               try {
+                       MatrixBlock expectedMB = 
expected.toMatrixObject().acquireRead();
+                       MatrixBlock actualMB = 
actual.toMatrixObject().acquireRead();
+
+                       long rows = expectedMB.getNumRows();
+                       long cols = expectedMB.getNumColumns();
+                       Assert.assertEquals(rows, actualMB.getNumRows());
+                       Assert.assertEquals(cols, actualMB.getNumColumns());
+
+                       for (int i = 0; i < rows; i++) {
+                               for (int j = 0; j < cols; j++) {
+                                       double expectedDouble = 
expectedMB.quickGetValue(i, j);
+                                       double actualDouble = 
actualMB.quickGetValue(i, j);
+                                       if (expectedDouble != 0.0 && 
!Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+                                               double relativeError = 
Math.abs((expectedDouble - actualDouble) / expectedDouble);
+                                               Assert.assertTrue("Comparing 
floating point numbers, relative error(" + relativeError
+                                                               + ") is more 
than threshold (" + getTHRESHOLD() + ")", relativeError < getTHRESHOLD());
+                                       } else {
+                                               
Assert.assertEquals(expectedDouble, actualDouble, getTHRESHOLD());
+                                       }
+                               }
+                       }
+                       expected.toMatrixObject().release();
+                       actual.toMatrixObject().release();
+               } catch (DMLRuntimeException e) {
+                       throw new RuntimeException(e);
+               }
+       }
+
+       /**
+        * asserts that the expected op was executed
+        *
+        * @param heavyHitterOpCode opcode of the heavy hitter for the unary op
+        */
+       protected void assertHeavyHitterPresent(String heavyHitterOpCode) {
+               Set<String> heavyHitterOpCodes = 
Statistics.getCPHeavyHitterOpCodes();
+               
Assert.assertTrue(heavyHitterOpCodes.contains(heavyHitterOpCode));
+       }
+
+       /**
+        * Runs a program on the CPU
+        *
+        * @param spark     a valid {@link SparkSession}
+        * @param scriptStr the script to run (as a string)
+        * @param inputs    map of input variables names in the scriptStr (of 
variable_name -> object)
+        * @param outStrs   list of variable names needed as output from the 
scriptStr
+        * @return list of output objects in order of outStrs
+        */
+       protected List<Object> runOnCPU(SparkSession spark, String scriptStr, 
Map<String, Object> inputs,
+                       List<String> outStrs) {
+               MLContext cpuMLC = new MLContext(spark);
+               List<Object> outputs = new ArrayList<>();
+               Script script = 
ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs);
+               for (String outStr : outStrs) {
+                       Object output = cpuMLC.execute(script).get(outStr);
+                       outputs.add(output);
+               }
+               cpuMLC.close();
+               return outputs;
+       }
+
+       /**
+        * Runs a program on the GPU
+        *
+        * @param spark     a valid {@link SparkSession}
+        * @param scriptStr the script to run (as a string)
+        * @param inputs    map of input variables names in the scriptStr (of 
variable_name -> object)
+        * @param outStrs   list of variable names needed as output from the 
scriptStr
+        * @return list of output objects in order of outStrs
+        */
+       protected List<Object> runOnGPU(SparkSession spark, String scriptStr, 
Map<String, Object> inputs,
+                       List<String> outStrs) {
+               MLContext gpuMLC = new MLContext(spark);
+               gpuMLC.setGPU(true);
+               gpuMLC.setForceGPU(true);
+               gpuMLC.setStatistics(true);
+               List<Object> outputs = new ArrayList<>();
+               Script script = 
ScriptFactory.dmlFromString(scriptStr).in(inputs).out(outStrs);
+               for (String outStr : outStrs) {
+                       Object output = gpuMLC.execute(script).get(outStr);
+                       outputs.add(output);
+               }
+               gpuMLC.close();
+               return outputs;
+       }
+
+       /**
+        * Assert that the two objects are equal. Supported types are Boolean, 
Integer, String, Double and Matrix
+        *
+        * @param expected
+        * @param actual
+        */
+       protected void assertEqualObjects(Object expected, Object actual) {
+               Assert.assertEquals(expected.getClass(), actual.getClass());
+
+               if (expected instanceof Boolean) {
+                       Assert.assertEquals(((Boolean) 
expected).booleanValue(), ((Boolean) actual).booleanValue());
+               } else if (expected instanceof Double) {
+                       double expectedDouble = ((Double) 
expected).doubleValue();
+                       double actualDouble = ((Double) actual).doubleValue();
+                       if (expectedDouble != 0.0 && 
!Double.isNaN(expectedDouble) && Double.isFinite(expectedDouble)) {
+                               double relativeError = Math.abs((expectedDouble 
- actualDouble) / expectedDouble);
+                               Assert.assertTrue("Comparing floating point 
numbers, relative error(" + relativeError
+                                               + ") is more than threshold (" 
+ getTHRESHOLD() + ")", relativeError < getTHRESHOLD());
+                       } else {
+                               Assert.assertEquals(expectedDouble, 
actualDouble, getTHRESHOLD());
+                       }
+               } else if (expected instanceof String) {
+                       Assert.assertEquals(expected.toString(), 
actual.toString());
+               } else if (expected instanceof Integer) {
+                       Assert.assertEquals(((Integer) expected).intValue(), 
((Integer) actual).intValue());
+               } else if (expected instanceof Matrix)
+                       assertEqualMatrices((Matrix) expected, (Matrix) actual);
+               else {
+                       Assert.fail("Invalid types for comparison");
+               }
+       }
+}

Reply via email to