Repository: incubator-systemml
Updated Branches:
  refs/heads/master c528b769c -> 627fdbe2d


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index bc6bacb..e0e0cb2 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -320,7 +320,7 @@ public class DMLScript
                                                        throw new DMLRuntimeException("Unsupported flag for -gpu:" + flag);
                                                }
                                        }
-                                       GPUContext.createGPUContext(); // Set GPU memory budget
+                                       GPUContext.getGPUContext(); // creates the singleton GPU context object. Return value ignored.
                                }
                                else if( args[i].equalsIgnoreCase("-python") ) {
                                        parsePyDML = true;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index ec3a30d..922ba82 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -74,7 +74,7 @@ public class DMLConfig
        // Fraction of available memory to use. The available memory is computed when the JCudaContext is created
        // to handle the tradeoff on calling cudaMemGetInfo too often.
        public static final String GPU_MEMORY_UTILIZATION_FACTOR    = "gpu.memory.util.factor";
-       // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple applications. 
+       // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple applications.
        public static final String REFRESH_AVAILABLE_MEMORY_EVERY_TIME    = "gpu.memory.refresh";
 
        // supported prefixes for custom map/reduce configurations
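
For context, a minimal sketch (not part of this commit) of how these two settings combine into the
memory budget that JCudaContext.getAvailableMemory() reports further down in this patch; the values
are illustrative:

    double utilFactor = 0.9;            // gpu.memory.util.factor
    boolean refreshEveryTime = false;   // gpu.memory.refresh: true = call cudaMemGetInfo on every query
    long freeBytes = 4_000_000_000L;    // free device memory, from cudaMemGetInfo or internal bookkeeping
    long budget = (long) (freeBytes * utilFactor);   // the value getAvailableMemory() would return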

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index e2b8f12..797855b 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -144,11 +144,11 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
                                }                               
                                else { //general case           
                                        int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-                                       if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) {
+                                       if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
                                                // Only implemented methods for GPU
                                                if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-                                                                               || (_op == AggOp.MAX && (_direction == Direction.RowCol))
-                                                                               || (_op == AggOp.MIN && (_direction == Direction.RowCol))
+                                                                               || (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+                                                                               || (_op == AggOp.MIN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
                                                                                || (_op == AggOp.MEAN && (_direction == Direction.RowCol))){
                                                        et = ExecType.GPU;
                                                        k = 1;
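
In other words, after this hunk the compiler may pick ExecType.GPU for full, row-wise, and column-wise
SUM, MAX, and MIN aggregates, while MEAN remains limited to the full reduction. A hedged restatement of
that predicate as a standalone helper (illustrative only, not part of the commit):

    private static boolean isGpuSupportedAggregate(AggOp op, Direction dir) {
        boolean anyDir = dir == Direction.RowCol || dir == Direction.Row || dir == Direction.Col;
        switch (op) {
            case SUM:
            case MAX:
            case MIN:  return anyDir;                    // row and column directions newly enabled
            case MEAN: return dir == Direction.RowCol;   // mean is still full-reduction only here
            default:   return false;                     // everything else keeps its CPU/Spark operator
        }
    }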

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 5e3ab62..127cafd 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -73,15 +73,22 @@ public class GPUInstructionParser  extends InstructionParser
                String2GPUInstructionType.put( "sel+"  , GPUINSTRUCTION_TYPE.BuiltinUnary);
 
                // Aggregate Unary
-               String2GPUInstructionType.put( "ua+"     , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uak+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uar+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uark+"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uac+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uack+"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uamean"  , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uamax"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-               String2GPUInstructionType.put( "uamin"   , GPUINSTRUCTION_TYPE.AggregateUnary);
+               String2GPUInstructionType.put( "ua+"     , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+               String2GPUInstructionType.put( "uak+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+               String2GPUInstructionType.put( "uar+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+               String2GPUInstructionType.put( "uark+"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+               String2GPUInstructionType.put( "uac+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+               String2GPUInstructionType.put( "uack+"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+               String2GPUInstructionType.put( "uamean"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
+               String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
+               String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
+               String2GPUInstructionType.put( "uamax"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Max
+               String2GPUInstructionType.put( "uarmax"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
+               String2GPUInstructionType.put( "uacmax"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
+               String2GPUInstructionType.put( "uamin"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Min
+               String2GPUInstructionType.put( "uarmin"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
+               String2GPUInstructionType.put( "uacmin"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
+
        }
        
        public static GPUInstruction parseSingleInstruction (String str ) 
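
The table above is a plain opcode-to-instruction-type lookup; a hedged usage sketch (illustrative):

    GPUINSTRUCTION_TYPE rowMax = String2GPUInstructionType.get("uarmax");   // row max -> AggregateUnary
    GPUINSTRUCTION_TYPE colMin = String2GPUInstructionType.get("uacmin");   // col min -> AggregateUnary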

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index f0e31c4..c1b77eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -51,12 +51,11 @@ public abstract class GPUContext {
        public abstract void ensureComputeCapability() throws DMLRuntimeException;
        
        /**
-        * Creation / Destruction of GPUContext and related handles
-        * 
+        * Singleton Factory method for creation of {@link GPUContext}
         * @return GPU context
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static GPUContext createGPUContext() throws DMLRuntimeException {
+       public static GPUContext getGPUContext() throws DMLRuntimeException {
                if(currContext == null && DMLScript.USE_ACCELERATOR) {
                        synchronized(isGPUContextCreated) {
                                currContext = new JCudaContext();
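
Call sites such as DMLScript (first hunk of this commit) now obtain the context through this factory
instead of constructing it. A hedged usage sketch; the null result when the accelerator is disabled is
an assumption based on the guard above:

    GPUContext ctx = GPUContext.getGPUContext();    // lazily creates the JCudaContext singleton
    if (ctx != null) {                              // stays null when -gpu was not requested (assumption)
        long budgetBytes = ctx.getAvailableMemory();
    }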

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
index 893f416..1157214 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -45,9 +45,7 @@ import static jcuda.jcusparse.JCusparse.cusparseDestroy;
 import static jcuda.jcusparse.JCusparse.cusparseCreate;
 import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
-import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.*;
 import static jcuda.runtime.cudaError.cudaSuccess;
 
 /**
@@ -59,23 +57,30 @@ import static jcuda.runtime.cudaError.cudaSuccess;
  */
 public class JCudaContext extends GPUContext {
 
+       private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
+
        // The minimum CUDA Compute capability needed for SystemML.
        // After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
        // If SystemML needs to run on an older card, this logic can be revisited.
        final int MAJOR_REQUIRED = 3;
        final int MINOR_REQUIRED = 0;
 
-       private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
-       
+       /** The total number of CUDA devices on this machine */
+       public static int deviceCount = -1;
+
+       /** Enable this to print debug information before code pertaining to the GPU is executed */
        public static boolean DEBUG = false;
-       
-       public static long totalNumBytes = 0;
-       public static AtomicLong availableNumBytesWithoutUtilFactor = new AtomicLong(0);
-       // Fraction of available memory to use. The available memory is computer when the JCudaContext is created
-       // to handle the tradeoff on calling cudaMemGetInfo too often.
-       public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
+
+       /** Total bytes available on the currently active CUDA device; please be careful with its bookkeeping */
+       private AtomicLong deviceMemBytes = new AtomicLong(0);
+
+       /** Stores the cached deviceProperties */
+       private static cudaDeviceProp[] deviceProperties;
+
        // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple applications.
        public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+       // Whether to invoke cudaMemGetInfo for available memory or rely on internal bookkeeping for memory info.
+       public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
        static {
                long start = System.nanoTime();
                JCuda.setExceptionsEnabled(true);
@@ -84,28 +89,57 @@ public class JCudaContext extends GPUContext {
                JCusparse.setExceptionsEnabled(true);
                JCudaDriver.setExceptionsEnabled(true);
                cuInit(0); // Initialize the driver
-               // Obtain the number of devices
-        int deviceCountArray[] = { 0 };
-        cuDeviceGetCount(deviceCountArray);
-        int deviceCount = deviceCountArray[0];
-        LOG.info("Total number of GPUs on the machine: " + deviceCount);
-        Statistics.cudaInitTime = System.nanoTime() - start;
+
+               int deviceCountArray[] = { 0 };
+               cuDeviceGetCount(deviceCountArray);                             // Obtain the number of devices
+               deviceCount = deviceCountArray[0];
+               deviceProperties = new cudaDeviceProp[deviceCount];
+
+               LOG.info("Total number of GPUs on the machine: " + deviceCount);
+               int maxBlocks = getMaxBlocks();
+               int maxThreadsPerBlock = getMaxThreadsPerBlock();
+               long sharedMemPerBlock = getMaxSharedMemory();
+               int[] device = {-1};
+               cudaGetDevice(device);
+               LOG.info("Active CUDA device number : " + device[0]);
+               LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+
+               Statistics.cudaInitTime = System.nanoTime() - start;
+
+               start = System.nanoTime();
+               LibMatrixCUDA.cudnnHandle = new cudnnHandle();
+               cudnnCreate(LibMatrixCUDA.cudnnHandle);
+               LibMatrixCUDA.cublasHandle = new cublasHandle();
+               cublasCreate(LibMatrixCUDA.cublasHandle);
+               // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
+               // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
+               // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+               LibMatrixCUDA.cusparseHandle = new cusparseHandle();
+               cusparseCreate(LibMatrixCUDA.cusparseHandle);
+               Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
+
+               try {
+                       LibMatrixCUDA.kernels = new JCudaKernels();
+               } catch (DMLRuntimeException e) {
+                       System.err.println("ERROR - Unable to initialize JCudaKernels. System in an inconsistent state");
+                       LibMatrixCUDA.kernels = null;
+               }
+
        }
 
        @Override
        public long getAvailableMemory() {
-               if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
-                       long free [] = { 0 };
-               long total [] = { 0 };
-               if(cudaMemGetInfo(free, total) == cudaSuccess) {
-                       totalNumBytes = total[0];
-                       availableNumBytesWithoutUtilFactor.set(free[0]);
-               }
-               else {
-                       throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
-               }
+               if (REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
+                       long free[] = {0};
+                       long total[] = {0};
+                       if (cudaMemGetInfo(free, total) == cudaSuccess) {
+                               long totalNumBytes = total[0];
+                               deviceMemBytes.set(free[0]);
+                       } else {
+                               throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
+                       }
                }
-               return (long) (availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR);
+               return (long) (deviceMemBytes.get()*GPU_MEMORY_UTILIZATION_FACTOR);
        }
 
        @Override
@@ -117,8 +151,7 @@ public class JCudaContext extends GPUContext {
                }
                boolean isComputeCapable = true;
                for (int i=0; i<devices[0]; i++) {
-                       cudaDeviceProp properties = new cudaDeviceProp();
-                       cudaGetDeviceProperties(properties, i);
+                       cudaDeviceProp properties = getGPUProperties(i);
                        int major = properties.major;
                        int minor = properties.minor;
                        if (major < MAJOR_REQUIRED) {
@@ -131,8 +164,77 @@ public class JCudaContext extends GPUContext {
                        throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
                }
        }
-       
-       
+
+       /**
+        * Gets the device properties for the active GPU (set with cudaSetDevice())
+        * @return the device properties of the currently active CUDA device
+        */
+       public static cudaDeviceProp getGPUProperties() {
+               int[] device = {-1};
+               cudaGetDevice(device);  // Get currently active device
+               return getGPUProperties(device[0]);
+       }
+
+       /**
+        * Gets the device properties
+        * @param device the device number (on a machine with more than 1 GPU)
+        * @return the cached device properties for the given device
+        */
+       public static cudaDeviceProp getGPUProperties(int device){
+               if (deviceProperties[device] == null) {
+                       cudaDeviceProp properties = new cudaDeviceProp();
+                       cudaGetDeviceProperties(properties, device);
+                       deviceProperties[device] = properties;
+               }
+               return deviceProperties[device];
+       }
+
+
+       /**
+        * Gets the maximum number of threads per block for "active" GPU
+        * @return the maximum number of threads per block
+        */
+       public static int getMaxThreadsPerBlock() {
+               cudaDeviceProp deviceProps = getGPUProperties();
+               return deviceProps.maxThreadsPerBlock;
+       }
+
+       /**
+        * Gets the maximum number of blocks supported by the active cuda device
+        * @return the maximum number of blocks (maxGridSize[0])
+        */
+       public static int getMaxBlocks() {
+               cudaDeviceProp deviceProp = getGPUProperties();
+               return deviceProp.maxGridSize[0];
+       }
+
+       /**
+        * Gets the shared memory per block supported by the active cuda device
+        * @return the shared memory available per block, in bytes
+        */
+       public static long getMaxSharedMemory() {
+               cudaDeviceProp deviceProp = getGPUProperties();
+               return deviceProp.sharedMemPerBlock;
+       }
+
+       /**
+        * Gets the warp size supported by the active cuda device
+        * @return the warp size of the device
+        */
+       public static int getWarpSize() {
+               cudaDeviceProp deviceProp = getGPUProperties();
+               return deviceProp.warpSize;
+       }
+
+       /**
+        * Gets the available memory and then adds value to it
+        * @param v the value to add
+        * @return the available memory (in bytes) before v was added
+        */
+       public long getAndAddAvailableMemory(long v){
+               return deviceMemBytes.getAndAdd(v);
+       }
+
        public JCudaContext() throws DMLRuntimeException {
                if(isGPUContextCreated) {
                        // Wait until it is deleted. This case happens during multi-threaded testing.
@@ -149,33 +251,23 @@ public class JCudaContext extends GPUContext {
                                }
                        }
                }
-               GPUContext.currContext = this;
-               
-               long start = System.nanoTime();
-               LibMatrixCUDA.cudnnHandle = new cudnnHandle();
-               cudnnCreate(LibMatrixCUDA.cudnnHandle);
-               LibMatrixCUDA.cublasHandle = new cublasHandle();
-               cublasCreate(LibMatrixCUDA.cublasHandle);
-               // For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
-               // This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
-               // cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE); 
-               LibMatrixCUDA.cusparseHandle = new cusparseHandle();
-               cusparseCreate(LibMatrixCUDA.cusparseHandle);
-               Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
-               
+               synchronized (isGPUContextCreated){
+                       GPUContext.currContext = this;
+               }
+
                long free [] = { 0 };
-        long total [] = { 0 };
-        if(cudaMemGetInfo(free, total) == cudaSuccess) {
-               totalNumBytes = total[0];
-               availableNumBytesWithoutUtilFactor.set(free[0]);
-        }
-        else {
-               throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
-        }
-        LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
-        LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB");
-        
-        LibMatrixCUDA.kernels = new JCudaKernels();
+               long total [] = { 0 };
+               long totalNumBytes = 0;
+               if(cudaMemGetInfo(free, total) == cudaSuccess) {
+                       totalNumBytes = total[0];
+                       deviceMemBytes.set(free[0]);
+               }
+               else {
+                       throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
+               }
+               LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
+               LOG.info("Available GPU memory: " + (deviceMemBytes.get()*(1e-6)) + " MB");
+
        }
 
        @Override
@@ -194,4 +286,4 @@ public class JCudaContext extends GPUContext {
                }
        }
 
-}
\ No newline at end of file
+}
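
A hedged usage sketch of the new cached device-property accessors introduced above; the numbers in the
comments are typical for a compute-capability 3.x card, not guaranteed:

    int  maxThreads = JCudaContext.getMaxThreadsPerBlock();   // e.g. 1024
    int  maxBlocks  = JCudaContext.getMaxBlocks();            // maxGridSize[0], e.g. 2^31 - 1
    long sharedMem  = JCudaContext.getMaxSharedMemory();      // sharedMemPerBlock, in bytes
    int  warpSize   = JCudaContext.getWarpSize();             // typically 32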

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index f39e804..58bc9ec 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -643,7 +643,7 @@ public class JCudaObject extends GPUObject {
                        if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) {
                                setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
                                numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
-                               JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+                               JCudaContext.deviceMemBytes.addAndGet(-numBytes);
                                isInSparseFormat = true;
                                //throw new DMLRuntimeException("Sparse format not implemented");
                        } else if(numElemToAllocate == -1) {
@@ -651,7 +651,7 @@ public class JCudaObject extends GPUObject {
                                setDenseMatrixCudaPointer(new Pointer());
                                numBytes = mat.getNumRows()*getDoubleSizeOf(mat.getNumColumns());
                                cudaMalloc(jcudaDenseMatrixPtr, numBytes);
-                               JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+                               JCudaContext.deviceMemBytes.addAndGet(-numBytes);
                        }
                        else {
                                // Called for dense output
@@ -660,7 +660,7 @@ public class JCudaObject extends GPUObject {
                                if(numElemToAllocate <= 0 || numBytes <= 0)
                                        throw new DMLRuntimeException("Cannot allocate dense matrix object with " + numElemToAllocate + " elements and size " + numBytes);
                                cudaMalloc(jcudaDenseMatrixPtr,  numBytes);
-                               JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+                               JCudaContext.deviceMemBytes.addAndGet(-numBytes);
                        }
 
                        Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
@@ -712,7 +712,7 @@ public class JCudaObject extends GPUObject {
        public void setDeviceModify(long numBytes) {
                this.numLocks.addAndGet(1);
                this.numBytes = numBytes;
-               JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+               ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(-numBytes);
        }
 
        @Override
@@ -720,14 +720,14 @@ public class JCudaObject extends GPUObject {
                if(jcudaDenseMatrixPtr != null) {
                        long start = System.nanoTime();
                        cudaFree(jcudaDenseMatrixPtr);
-                       JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
+                       ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
                        Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
                        Statistics.cudaDeAllocCount.addAndGet(1);
                }
                if (jcudaSparseMatrixPtr != null) {
                        long start = System.nanoTime();
                        jcudaSparseMatrixPtr.deallocate();
-                       JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
+                       ((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
                        Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
                        Statistics.cudaDeAllocCount.addAndGet(1);
                }
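
The bookkeeping convention after this change, as a hedged sketch (illustrative sizes): every allocation
subtracts its byte count from the tracked free memory of the active context, and the matching
deallocation adds it back:

    JCudaContext ctx = (JCudaContext) GPUContext.currContext;   // set by the JCudaContext constructor
    long numRows = 1000, numCols = 1000;
    long numBytes = numRows * numCols * Sizeof.DOUBLE;          // dense double matrix, illustrative
    ctx.getAndAddAvailableMemory(-numBytes);                    // reserve alongside cudaMalloc
    // ... kernel work ...
    ctx.getAndAddAvailableMemory(numBytes);                     // release alongside cudaFree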

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 1af2a1d..87a66f4 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -65,9 +65,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.*;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
-import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
+import org.apache.sysml.runtime.instructions.gpu.context.*;
 import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
 import org.apache.sysml.runtime.matrix.operators.*;
 import org.apache.sysml.utils.Statistics;
@@ -90,15 +88,68 @@ import jcuda.jcusparse.cusparseHandle;
 public class LibMatrixCUDA {
 
        // Assume Compute Capability 3.0
-       public static final int MAX_THREADS = 1024;                              // For compute capability > 3.0
-       public static final int MAX_BLOCKS = 2147483647;        // 2^31 - 1 For compute capability > 3.0
+       // MAX_BLOCKS is 2^31 - 1 for compute capability > 3.0
+       // MAX_THREADS is 1024 for compute capability > 3.0
+       private static int _MAX_THREADS = -1;
+       private static int _MAX_BLOCKS  = -1;
+       private static int _WARP_SIZE   = -1;
+
+       /**
+        * Utility function to get the maximum number of threads supported by the active CUDA device.
+        * This is done lazily rather than in a static block because the GPUContext is not fully initialized when
+        * the LibMatrixCUDA class is loaded. If {@link GPUContext#getGPUContext()} were invoked in a
+        * static block in this class, it would access the {@link JCudaContext} instance before it is
+        * properly initialized, and the synchronization checks in place would then cause a deadlock.
+        * @return max threads
+        * @throws DMLRuntimeException if exception occurs
+        */
+       static int getMaxThreads() throws DMLRuntimeException{
+               if (_MAX_THREADS == -1){
+                       _MAX_THREADS = JCudaContext.getMaxThreadsPerBlock();
+               }
+               return _MAX_THREADS;
+       }
+
+       /**
+        * Utility function to get the maximum number of blocks supported by the active CUDA device.
+        * This is done lazily rather than in a static block because the GPUContext is not fully initialized when
+        * the LibMatrixCUDA class is loaded. If {@link GPUContext#getGPUContext()} were invoked in a
+        * static block in this class, it would access the {@link JCudaContext} instance before it is
+        * properly initialized, and the synchronization checks in place would then cause a deadlock.
+        * @return max blocks
+        * @throws DMLRuntimeException if exception occurs
+        */
+       static int getMaxBlocks() throws DMLRuntimeException{
+               if (_MAX_BLOCKS == -1){
+                       _MAX_BLOCKS = JCudaContext.getMaxBlocks();
+               }
+               return _MAX_BLOCKS;
+       }
+
+       /**
+        * Utility function to get the warp size supported by the active CUDA device.
+        * This is done lazily rather than in a static block because the GPUContext is not fully initialized when
+        * the LibMatrixCUDA class is loaded. If {@link GPUContext#getGPUContext()} were invoked in a
+        * static block in this class, it would access the {@link JCudaContext} instance before it is
+        * properly initialized, and the synchronization checks in place would then cause a deadlock.
+        * @return warp size
+        * @throws DMLRuntimeException if exception occurs
+        */
+       static int getWarpSize() throws DMLRuntimeException {
+               if (_WARP_SIZE == -1) {
+                       _WARP_SIZE = JCudaContext.getWarpSize();
+               }
+               return _WARP_SIZE;
+       }
+
+
 
        public static cudnnHandle cudnnHandle;
        public static cublasHandle cublasHandle;
        public static cusparseHandle cusparseHandle;
        public static JCudaKernels kernels; // Used to launch custom kernels
 
-    private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
+       private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
 
        private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
 
@@ -240,7 +291,7 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropagation errors for previous layer 
of relu operation
-        * 
+        *
         * @param input input image
         * @param dout  next layer error propogation
         * @param outputBlock output
@@ -262,13 +313,13 @@ public class LibMatrixCUDA {
                                
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
                                imagePointer, doutPointer, outputPointer, 
(int)rows, (int)cols);
        }
-       
+
        /**
         * Performs the operation corresponding to the DML script:
-        * ones = matrix(1, rows=1, cols=Hout*Wout)             
+        * ones = matrix(1, rows=1, cols=Hout*Wout)
         * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
         * This operation is often followed by conv2d and hence we have 
introduced bias_add(input, bias) built-in function
-        * 
+        *
         * @param input input image
         * @param bias bias
         * @param outputBlock output
@@ -299,8 +350,8 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropogation errors for filter of 
convolution operation
-        * 
-        * @param image input image 
+        *
+        * @param image input image
         * @param dout errors from next layer
         * @param outputBlock  output errors
         * @param N number of images
@@ -312,7 +363,7 @@ public class LibMatrixCUDA {
         * @param S filter width
         * @param pad_h pad height
         * @param pad_w pad width
-        * @param stride_h stride height 
+        * @param stride_h stride height
         * @param stride_w stride width
         * @param P output activation height
         * @param Q output activation width
@@ -1063,11 +1114,11 @@ public class LibMatrixCUDA {
                                                break;
                                        }
                                        case REDUCTION_COL : {  // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
-                                               reduceRow("reduce_row", in, out, rlen, clen);
+                                               reduceRow("reduce_row_sum", in, out, rlen, clen);
                                                break;
                                        }
                                        case REDUCTION_ROW : {
-                                               reduceCol("reduce_col", in, out, rlen, clen);
+                                               reduceCol("reduce_col_sum", in, out, rlen, clen);
                                                break;
                                        }
 
@@ -1101,7 +1152,7 @@ public class LibMatrixCUDA {
                                        default:
                                                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
                                }
-                               // break;
+                               break;
                        }
                        case OP_VARIANCE : {
                                switch(reductionDirection) {
@@ -1130,13 +1181,18 @@ public class LibMatrixCUDA {
                                                ec.setScalarOutput(output, new DoubleObject(result));
                                                break;
                                        }
-                                       case REDUCTION_COL:
-                                       case REDUCTION_ROW:
-                                               throw new DMLRuntimeException("Internal Error - All, Row & Column max of matrix not implemented yet for GPU ");
+                                       case REDUCTION_COL: {
+                                               reduceRow("reduce_row_max", in, out, rlen, clen);
+                                               break;
+                                       }
+                                       case REDUCTION_ROW: {
+                                               reduceCol("reduce_col_max", in, out, rlen, clen);
+                                               break;
+                                       }
                                        default:
                                                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
                                }
-                               // break;
+                               break;
                        }
                        case OP_MIN :{
                                switch(reductionDirection) {
@@ -1145,13 +1201,18 @@ public class LibMatrixCUDA {
                                                ec.setScalarOutput(output, new DoubleObject(result));
                                                break;
                                        }
-                                       case REDUCTION_COL:
-                                       case REDUCTION_ROW:
-                                               throw new DMLRuntimeException("Internal Error - All, Row & Column min of matrix not implemented yet for GPU ");
+                                       case REDUCTION_COL: {
+                                               reduceRow("reduce_row_min", in, out, rlen, clen);
+                                               break;
+                                       }
+                                       case REDUCTION_ROW: {
+                                               reduceCol("reduce_col_min", in, out, rlen, clen);
+                                               break;
+                                       }
                                        default:
                                                throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
                                }
-                               // break;
+                               break;
                        }
                        case OP_MAXINDEX : {
                                switch(reductionDirection) {
@@ -1237,7 +1298,7 @@ public class LibMatrixCUDA {
        private static void reduceCol(String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
                int[] tmp = getKernelParamsForReduceByCol(rows, cols);
                int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
-               kernels.launchKernel("reduce_col", new ExecutionConfig(blocks, threads, sharedMem),
+               kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
                                                in, out, rows, cols);
                cudaDeviceSynchronize();
        }
@@ -1247,14 +1308,17 @@ public class LibMatrixCUDA {
         * @param n size of input array
         * @return integer array containing {blocks, threads, shared memory}
         */
-       private static int[] getKernelParamsForReduceAll(int n){
-               int threads = (n < MAX_THREADS*2) ? nextPow2((n + 1)/ 2) : MAX_THREADS;
+       private static int[] getKernelParamsForReduceAll(int n) throws DMLRuntimeException{
+               final int MAX_THREADS = getMaxThreads();
+               final int MAX_BLOCKS = getMaxBlocks();
+               final int WARP_SIZE = getWarpSize();
+               int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : MAX_THREADS;
 
                int blocks = (n + (threads * 2 - 1)) / (threads * 2);
                blocks = Math.min(MAX_BLOCKS, blocks);
 
                int sharedMemSize = threads * Sizeof.DOUBLE;
-               if (threads <= 32){
+               if (threads <= WARP_SIZE){
                        sharedMemSize *= 2;
                }
                return new int[] {blocks, threads, sharedMemSize};
@@ -1266,23 +1330,27 @@ public class LibMatrixCUDA {
         * @param cols number of columns in input matrix
         * @return integer array containing {blocks, threads, shared memory}
         */
-       private static int[] getKernelParamsForReduceByRow(int rows, int cols) {
-               final int WARP_SIZE = 32;
-               int threads = Math.min(cols, WARP_SIZE);
+       private static int[] getKernelParamsForReduceByRow(int rows, int cols) throws DMLRuntimeException {
+               final int WARP_SIZE = getWarpSize();
+               final int MAX_THREADS = getMaxThreads();
+               int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS;
                int blocks = rows;
                int sharedMemSize = threads * Sizeof.DOUBLE;
-               if (threads <= 32){
+               if (threads <= WARP_SIZE){
                        sharedMemSize *=2;
                }
                return new int[] {blocks, threads, sharedMemSize};
        }
 
-       private static int[] getKernelParamsForReduceByCol(int rows, int cols) {
+       private static int[] getKernelParamsForReduceByCol(int rows, int cols) throws DMLRuntimeException {
+               final int MAX_THREADS = getMaxThreads();
+               final int MAX_BLOCKS = getMaxBlocks();
+               final int WARP_SIZE = getWarpSize();
                int threads = Math.min(cols, MAX_THREADS);
-               int blocks = cols/1024;
-               if (cols % 1024 != 0) blocks++;
+               int blocks = cols/MAX_THREADS;
+               if (cols % MAX_THREADS != 0) blocks++;
                int sharedMemSize = threads * Sizeof.DOUBLE;
-               if (threads <= 32){
+               if (threads <= WARP_SIZE){
                        sharedMemSize *=2;
                }
                return new int[] {blocks, threads, sharedMemSize};
@@ -1307,8 +1375,8 @@ public class LibMatrixCUDA {
 
        /**
         * This method computes the backpropogation errors for previous layer 
of convolution operation
-        * 
-        * @param filter filter used in conv2d 
+        *
+        * @param filter filter used in conv2d
         * @param dout errors from next layer
         * @param output  output errors
         * @param N number of images
@@ -1320,7 +1388,7 @@ public class LibMatrixCUDA {
         * @param S filter width
         * @param pad_h pad height
         * @param pad_w pad width
-        * @param stride_h stride height 
+        * @param stride_h stride height
         * @param stride_w stride width
         * @param P output activation height
         * @param Q output activation width
@@ -1464,7 +1532,7 @@ public class LibMatrixCUDA {
        /**
         * Performs maxpoolingBackward on GPU by exploiting 
cudnnPoolingBackward(...)
         * This method computes the backpropogation errors for previous layer 
of maxpooling operation
-        * 
+        *
         * @param image image as matrix object
         * @param dout                  delta matrix, output of previous layer
         * @param outputBlock output matrix
@@ -1831,7 +1899,7 @@ public class LibMatrixCUDA {
 
        /**
         * Performs daxpy operation
-        * 
+        *
         * @param ec execution context
         * @param in1 input matrix 1
         * @param in2 input matrix 2
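
To illustrate the launch-parameter math in getKernelParamsForReduceAll above, a worked example assuming
maxThreadsPerBlock = 1024 and warpSize = 32 (typical for a compute-capability 3.x device):

    int n = 10000;                                          // number of cells to reduce
    int threads = 1024;                                     // n >= 2*MAX_THREADS, so capped at MAX_THREADS
                                                            // (for smaller n it would be nextPow2((n + 1) / 2))
    int blocks = (n + (threads * 2 - 1)) / (threads * 2);   // = 12047 / 2048 = 5; each block reduces 2048 cells
    int sharedMemSize = threads * Sizeof.DOUBLE;            // = 8192 bytes, one double of shared memory per thread
    if (threads <= 32) sharedMemSize *= 2;                  // only taken for sub-warp launches, not here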
