Repository: incubator-systemml
Updated Branches:
  refs/heads/master c528b769c -> 627fdbe2d
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java
index bc6bacb..e0e0cb2 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -320,7 +320,7 @@ public class DMLScript
 					throw new DMLRuntimeException("Unsupported flag for -gpu:" + flag);
 				}
 			}
-			GPUContext.createGPUContext(); // Set GPU memory budget
+			GPUContext.getGPUContext(); // creates the singleton GPU context object. Return value ignored.
 		}
 		else if( args[i].equalsIgnoreCase("-python") ) {
 			parsePyDML = true;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index ec3a30d..922ba82 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -74,7 +74,7 @@ public class DMLConfig
 	// Fraction of available memory to use. The available memory is computer when the JCudaContext is created
 	// to handle the tradeoff on calling cudaMemGetInfo too often.
 	public static final String GPU_MEMORY_UTILIZATION_FACTOR = "gpu.memory.util.factor";
-	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. 
+	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
 	public static final String REFRESH_AVAILABLE_MEMORY_EVERY_TIME = "gpu.memory.refresh";
 
 	// supported prefixes for custom map/reduce configurations

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index e2b8f12..797855b 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -144,11 +144,11 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
 			}
 			else { //general case
 				int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-				if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) && (_op == AggOp.SUM)) {
+				if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
 					// Only implemented methods for GPU
 					if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
-						|| (_op == AggOp.MAX && (_direction == Direction.RowCol))
-						|| (_op == AggOp.MIN && (_direction == Direction.RowCol))
+						|| (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
+						|| (_op == AggOp.MIN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col))
 						|| (_op == AggOp.MEAN && (_direction == Direction.RowCol))){
 						et = ExecType.GPU;
 						k = 1;
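The AggUnaryOp hunk above widens the set of aggregations considered for the GPU backend (row-wise and column-wise max and min, in addition to the sums and full-matrix aggregates) while keeping the same guard: the accelerator must be enabled, and either GPU execution is forced or the hop's memory estimate must fit within the GPU memory budget. A self-contained sketch of that kind of budget check follows; the class name, the 4 GB budget, and the eight-bytes-per-double sizing are illustrative assumptions, not SystemML's actual OptimizerUtils logic.

// Illustrative only: mirrors the shape of the GPU-eligibility check, not SystemML's code.
public class GpuBudgetSketch {
    // Hypothetical budget; SystemML derives GPU_MEMORY_BUDGET from the device and a utilization factor.
    static final long GPU_MEMORY_BUDGET = 4L * 1024 * 1024 * 1024; // 4 GB, assumed

    /** Rough dense estimate: one double (8 bytes) per cell. */
    static long denseSizeInBytes(long rows, long cols) {
        return rows * cols * 8L;
    }

    static boolean runOnGpu(boolean useAccelerator, boolean forceAccelerator, long memEstimate) {
        return useAccelerator && (forceAccelerator || memEstimate < GPU_MEMORY_BUDGET);
    }

    public static void main(String[] args) {
        long est = denseSizeInBytes(10000, 1000);        // 80,000,000 bytes, roughly 80 MB
        System.out.println(runOnGpu(true, false, est));  // true: fits the assumed 4 GB budget
    }
}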
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 5e3ab62..127cafd 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -73,15 +73,22 @@ public class GPUInstructionParser extends InstructionParser
 		String2GPUInstructionType.put( "sel+"    , GPUINSTRUCTION_TYPE.BuiltinUnary);
 
 		// Aggregate Unary
-		String2GPUInstructionType.put( "ua+"     , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uak+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uar+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uark+"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uac+"    , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uack+"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uamean"  , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uamax"   , GPUINSTRUCTION_TYPE.AggregateUnary);
-		String2GPUInstructionType.put( "uamin"   , GPUINSTRUCTION_TYPE.AggregateUnary);
+		String2GPUInstructionType.put( "ua+"     , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+		String2GPUInstructionType.put( "uak+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Sum
+		String2GPUInstructionType.put( "uar+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+		String2GPUInstructionType.put( "uark+"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Sum
+		String2GPUInstructionType.put( "uac+"    , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+		String2GPUInstructionType.put( "uack+"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Sum
+		String2GPUInstructionType.put( "uamean"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Mean
+		String2GPUInstructionType.put( "uarmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Mean
+		String2GPUInstructionType.put( "uacmean" , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean
+		String2GPUInstructionType.put( "uamax"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Max
+		String2GPUInstructionType.put( "uarmax"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
+		String2GPUInstructionType.put( "uacmax"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Max
+		String2GPUInstructionType.put( "uamin"   , GPUINSTRUCTION_TYPE.AggregateUnary); // Min
+		String2GPUInstructionType.put( "uarmin"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Row Min
+		String2GPUInstructionType.put( "uacmin"  , GPUINSTRUCTION_TYPE.AggregateUnary); // Col Min
+
 	}
 
 	public static GPUInstruction parseSingleInstruction (String str )
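The new entries above register the row-wise and column-wise mean, max and min opcodes so the parser recognises them as GPU AggregateUnary instructions, matching the aggregations enabled in AggUnaryOp earlier in this commit. A minimal lookup sketch (self-contained, re-declaring a tiny map rather than using GPUInstructionParser) shows how such an opcode table is consulted:

// Illustrative lookup against a table shaped like the one above.
import java.util.HashMap;

public class GpuOpcodeLookupSketch {
    enum GPUINSTRUCTION_TYPE { AggregateUnary, BuiltinUnary }

    public static void main(String[] args) {
        HashMap<String, GPUINSTRUCTION_TYPE> map = new HashMap<>();
        map.put("uarmax",  GPUINSTRUCTION_TYPE.AggregateUnary); // Row Max
        map.put("uacmean", GPUINSTRUCTION_TYPE.AggregateUnary); // Col Mean

        // An instruction string carries its opcode as one of its delimiter-separated fields;
        // once extracted, the opcode selects the instruction type and hence the parser to use.
        String opcode = "uarmax";
        System.out.println(map.get(opcode));                    // AggregateUnary
    }
}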
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index f0e31c4..c1b77eb 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -51,12 +51,11 @@ public abstract class GPUContext
 {
 	public abstract void ensureComputeCapability() throws DMLRuntimeException;
 	/**
-	 * Creation / Destruction of GPUContext and related handles
-	 * 
+	 * Singleton Factory method for creation of {@link GPUContext}
 	 * @return GPU context
 	 * @throws DMLRuntimeException if DMLRuntimeException occurs
 	 */
-	public static GPUContext createGPUContext() throws DMLRuntimeException {
+	public static GPUContext getGPUContext() throws DMLRuntimeException {
 		if(currContext == null && DMLScript.USE_ACCELERATOR) {
 			synchronized(isGPUContextCreated) {
 				currContext = new JCudaContext();
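createGPUContext() is renamed getGPUContext() and documented as a singleton factory: the context is created at most once, under a lock, and only when the accelerator is enabled, so callers such as the -gpu handling in DMLScript can invoke it purely for its side effect and ignore the return value. A generic sketch of that lazily-created singleton pattern, using illustrative names rather than SystemML's classes:

// Generic lazily-initialized singleton sketch (hypothetical names, not SystemML code).
public class DeviceContext {
    private static volatile DeviceContext instance;    // volatile makes the unsynchronized null check safe
    private static final Object LOCK = new Object();

    private DeviceContext() { /* acquire device handles here */ }

    public static DeviceContext getInstance() {
        if (instance == null) {                         // fast path: already created
            synchronized (LOCK) {
                if (instance == null) {                 // re-check under the lock
                    instance = new DeviceContext();
                }
            }
        }
        return instance;
    }
}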
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
index 893f416..1157214 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaContext.java
@@ -6,9 +6,9 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- * 
+ *
  *   http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -45,9 +45,7 @@ import static jcuda.jcusparse.JCusparse.cusparseDestroy;
 import static jcuda.jcusparse.JCusparse.cusparseCreate;
 import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
-import static jcuda.runtime.JCuda.cudaGetDeviceProperties;
-import static jcuda.runtime.JCuda.cudaGetDeviceCount;
-import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.JCuda.*;
 import static jcuda.runtime.cudaError.cudaSuccess;
 
 /**
@@ -59,23 +57,30 @@ import static jcuda.runtime.cudaError.cudaSuccess;
  */
 public class JCudaContext extends GPUContext {
 	
+	private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
+	
 	// The minimum CUDA Compute capability needed for SystemML.
 	// After compute capability 3.0, 2^31 - 1 blocks and 1024 threads per block are supported.
 	// If SystemML needs to run on an older card, this logic can be revisited.
 	final int MAJOR_REQUIRED = 3;
 	final int MINOR_REQUIRED = 0;
 	
-	private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());
-	
+	/** The total number of cuda devices on this machine */
+	public static int deviceCount = -1;
+	
+	/** enable this to print debug information before code pertaining to the GPU is executed */
 	public static boolean DEBUG = false;
-	
-	public static long totalNumBytes = 0;
-	public static AtomicLong availableNumBytesWithoutUtilFactor = new AtomicLong(0);
-	// Fraction of available memory to use. The available memory is computer when the JCudaContext is created
-	// to handle the tradeoff on calling cudaMemGetInfo too often.
-	public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
+	
+	/** total bytes available on currently active cude device, please be careful with its bookkeeping */
+	private AtomicLong deviceMemBytes = new AtomicLong(0);
+	
+	/** Stores the cached deviceProperties */
+	private static cudaDeviceProp[] deviceProperties;
+	
 	// Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application.
 	public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig().getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+	// Whether to invoke cudaMemGetInfo for available memory or rely on internal bookkeeping for memory info.
+	public boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.REFRESH_AVAILABLE_MEMORY_EVERY_TIME);
 	
 	static {
 		long start = System.nanoTime();
 		JCuda.setExceptionsEnabled(true);
@@ -84,28 +89,57 @@ public class JCudaContext extends GPUContext {
 		JCusparse.setExceptionsEnabled(true);
 		JCudaDriver.setExceptionsEnabled(true);
 		cuInit(0); // Initialize the driver
-		// Obtain the number of devices
-		int deviceCountArray[] = { 0 };
-		cuDeviceGetCount(deviceCountArray);
-		int deviceCount = deviceCountArray[0];
-		LOG.info("Total number of GPUs on the machine: " + deviceCount);
-		Statistics.cudaInitTime = System.nanoTime() - start;
+		
+		int deviceCountArray[] = { 0 };
+		cuDeviceGetCount(deviceCountArray);		// Obtain the number of devices
+		deviceCount = deviceCountArray[0];
+		deviceProperties = new cudaDeviceProp[deviceCount];
+		
+		LOG.info("Total number of GPUs on the machine: " + deviceCount);
+		int maxBlocks = getMaxBlocks();
+		int maxThreadsPerBlock = getMaxThreadsPerBlock();
+		long sharedMemPerBlock = getMaxSharedMemory();
+		int[] device = {-1};
+		cudaGetDevice(device);
+		LOG.info("Active CUDA device number : " + device[0]);
+		LOG.info("Max Blocks/Threads/SharedMem : " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
+		
+		Statistics.cudaInitTime = System.nanoTime() - start;
+		
+		start = System.nanoTime();
+		LibMatrixCUDA.cudnnHandle = new cudnnHandle();
+		cudnnCreate(LibMatrixCUDA.cudnnHandle);
+		LibMatrixCUDA.cublasHandle = new cublasHandle();
+		cublasCreate(LibMatrixCUDA.cublasHandle);
+		// For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
+		// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
+		// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
+		LibMatrixCUDA.cusparseHandle = new cusparseHandle();
+		cusparseCreate(LibMatrixCUDA.cusparseHandle);
+		Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
+		
+		try {
+			LibMatrixCUDA.kernels = new JCudaKernels();
+		} catch (DMLRuntimeException e) {
+			System.err.println("ERROR - Unable to initialize JCudaKernels. System in an inconsistent state");
+			LibMatrixCUDA.kernels = null;
+		}
+		
 	}
 	
 	@Override
 	public long getAvailableMemory() {
-		if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
-			long free [] = { 0 };
-			long total [] = { 0 };
-			if(cudaMemGetInfo(free, total) == cudaSuccess) {
-				totalNumBytes = total[0];
-				availableNumBytesWithoutUtilFactor.set(free[0]);
-			}
-			else {
-				throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
-			}
+		if (REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
+			long free[] = {0};
+			long total[] = {0};
+			if (cudaMemGetInfo(free, total) == cudaSuccess) {
+				long totalNumBytes = total[0];
+				deviceMemBytes.set(free[0]);
+			} else {
+				throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
+			}
 		}
-		return (long) (availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR);
+		return (long) (deviceMemBytes.get()*GPU_MEMORY_UTILIZATION_FACTOR);
 	}
 	
 	@Override
@@ -117,8 +151,7 @@ public class JCudaContext extends GPUContext {
 		}
 		boolean isComputeCapable = true;
 		for (int i=0; i<devices[0]; i++) {
-			cudaDeviceProp properties = new cudaDeviceProp();
-			cudaGetDeviceProperties(properties, i);
+			cudaDeviceProp properties = getGPUProperties(i);
 			int major = properties.major;
 			int minor = properties.minor;
 			if (major < MAJOR_REQUIRED) {
@@ -131,8 +164,77 @@ public class JCudaContext extends GPUContext {
 			throw new DMLRuntimeException("One of the CUDA cards on the system has compute capability lower than " + MAJOR_REQUIRED + "." + MINOR_REQUIRED);
 		}
 	}
-	
-	
+	
+	/**
+	 * Gets the device properties for the active GPU (set with cudaSetDevice())
+	 * @return
+	 */
+	public static cudaDeviceProp getGPUProperties() {
+		int[] device = {-1};
+		cudaGetDevice(device);	// Get currently active device
+		return getGPUProperties(device[0]);
+	}
+	
+	/**
+	 * Gets the device properties
+	 * @param device the device number (on a machine with more than 1 GPU)
+	 * @return
+	 */
+	public static cudaDeviceProp getGPUProperties(int device){
+		if (deviceProperties[device] == null) {
+			cudaDeviceProp properties = new cudaDeviceProp();
+			cudaGetDeviceProperties(properties, device);
+			deviceProperties[device] = properties;
+		}
+		return deviceProperties[device];
+	}
+	
+	/**
+	 * Gets the maximum number of threads per block for "active" GPU
+	 * @return
+	 */
+	public static int getMaxThreadsPerBlock() {
+		cudaDeviceProp deviceProps = getGPUProperties();
+		return deviceProps.maxThreadsPerBlock;
+	}
+	
+	/**
+	 * Gets the maximum number of blocks supported by the active cuda device
+	 * @return
+	 */
+	public static int getMaxBlocks() {
+		cudaDeviceProp deviceProp = getGPUProperties();
+		return deviceProp.maxGridSize[0];
+	}
+	
+	/**
+	 * Gets the shared memory per block supported by the active cuda device
+	 * @return
+	 */
+	public static long getMaxSharedMemory() {
+		cudaDeviceProp deviceProp = getGPUProperties();
+		return deviceProp.sharedMemPerBlock;
+	}
+	
+	/**
+	 * Gets the warp size supported by the active cuda device
+	 * @return
+	 */
+	public static int getWarpSize() {
+		cudaDeviceProp deviceProp = getGPUProperties();
+		return deviceProp.warpSize;
+	}
+	
+	/**
+	 * Gets the available memory and then adds value to it
+	 * @param v the value to add
+	 * @return
+	 */
+	public long getAndAddAvailableMemory(long v){
+		return deviceMemBytes.getAndAdd(v);
+	}
+	
 	public JCudaContext() throws DMLRuntimeException {
 		if(isGPUContextCreated) {
 			// Wait until it is deleted. This case happens during multi-threaded testing.
@@ -149,33 +251,23 @@ public class JCudaContext extends GPUContext {
 				}
 			}
 		}
-		GPUContext.currContext = this;
-		
-		long start = System.nanoTime();
-		LibMatrixCUDA.cudnnHandle = new cudnnHandle();
-		cudnnCreate(LibMatrixCUDA.cudnnHandle);
-		LibMatrixCUDA.cublasHandle = new cublasHandle();
-		cublasCreate(LibMatrixCUDA.cublasHandle);
-		// For cublas v2, cublasSetPointerMode tells Cublas whether to expect scalar arguments on device or on host
-		// This applies to arguments like "alpha" in Dgemm, and "y" in Ddot.
-		// cublasSetPointerMode(LibMatrixCUDA.cublasHandle, cublasPointerMode.CUBLAS_POINTER_MODE_DEVICE);
-		LibMatrixCUDA.cusparseHandle = new cusparseHandle();
-		cusparseCreate(LibMatrixCUDA.cusparseHandle);
-		Statistics.cudaLibrariesInitTime = System.nanoTime() - start;
-		
+		synchronized (isGPUContextCreated){
+			GPUContext.currContext = this;
+		}
+		
 		long free [] = { 0 };
-		long total [] = { 0 };
-		if(cudaMemGetInfo(free, total) == cudaSuccess) {
-			totalNumBytes = total[0];
-			availableNumBytesWithoutUtilFactor.set(free[0]);
-		}
-		else {
-			throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
-		}
-		LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
-		LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB");
-		
-		LibMatrixCUDA.kernels = new JCudaKernels();
+		long total [] = { 0 };
+		long totalNumBytes = 0;
+		if(cudaMemGetInfo(free, total) == cudaSuccess) {
+			totalNumBytes = total[0];
+			deviceMemBytes.set(free[0]);
+		}
+		else {
+			throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
+		}
+		LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
+		LOG.info("Available GPU memory: " + (deviceMemBytes.get()*(1e-6)) + " MB");
+		
 	}
 	
 	@Override
@@ -194,4 +286,4 @@ public class JCudaContext extends GPUContext {
 		}
 	}
 	
-}
\ No newline at end of file
+}
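The JCudaContext changes above replace the static availableNumBytesWithoutUtilFactor counter with an instance-level deviceMemBytes AtomicLong: allocations subtract their size, deallocations add it back (getAndAddAvailableMemory), and getAvailableMemory either trusts that bookkeeping or re-queries cudaMemGetInfo when gpu.memory.refresh is set, scaling the result by gpu.memory.util.factor. A self-contained sketch of the same bookkeeping idea, with no JCuda calls and with the names and the 0.9 utilization factor as assumptions:

import java.util.concurrent.atomic.AtomicLong;

// Illustrative bookkeeping sketch, not SystemML code.
public class DeviceMemoryTracker {
    private final AtomicLong freeBytes;
    private final double utilizationFactor;

    public DeviceMemoryTracker(long initialFreeBytes, double utilizationFactor) {
        this.freeBytes = new AtomicLong(initialFreeBytes);
        this.utilizationFactor = utilizationFactor;
    }

    /** Record an allocation (negative delta) or a free (positive delta). */
    public long getAndAdd(long delta) {
        return freeBytes.getAndAdd(delta);
    }

    /** Budget visible to the planner: tracked free bytes scaled by the utilization factor. */
    public long getAvailableMemory() {
        return (long) (freeBytes.get() * utilizationFactor);
    }

    public static void main(String[] args) {
        DeviceMemoryTracker t = new DeviceMemoryTracker(4L << 30, 0.9); // assume 4 GB free
        t.getAndAdd(-80_000_000L);          // allocate an 80 MB buffer
        System.out.println(t.getAvailableMemory());
        t.getAndAdd(80_000_000L);           // free it again
    }
}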
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index f39e804..58bc9ec 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -643,7 +643,7 @@ public class JCudaObject extends GPUObject {
 		if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) {
 			setSparseMatrixCudaPointer(CSRPointer.allocateEmpty(mat.getNnz(), mat.getNumRows()));
 			numBytes = CSRPointer.estimateSize(mat.getNnz(), mat.getNumRows());
-			JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+			JCudaContext.deviceMemBytes.addAndGet(-numBytes);
 			isInSparseFormat = true;
 			//throw new DMLRuntimeException("Sparse format not implemented");
 		} else if(numElemToAllocate == -1) {
@@ -651,7 +651,7 @@ public class JCudaObject extends GPUObject {
 			setDenseMatrixCudaPointer(new Pointer());
 			numBytes = mat.getNumRows()*getDoubleSizeOf(mat.getNumColumns());
 			cudaMalloc(jcudaDenseMatrixPtr, numBytes);
-			JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+			JCudaContext.deviceMemBytes.addAndGet(-numBytes);
 		}
 		else {
 			// Called for dense output
@@ -660,7 +660,7 @@ public class JCudaObject extends GPUObject {
 			if(numElemToAllocate <= 0 || numBytes <= 0)
 				throw new DMLRuntimeException("Cannot allocate dense matrix object with " + numElemToAllocate + " elements and size " + numBytes);
 			cudaMalloc(jcudaDenseMatrixPtr, numBytes);
-			JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+			JCudaContext.deviceMemBytes.addAndGet(-numBytes);
 		}
 		
 		Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start);
@@ -712,7 +712,7 @@ public class JCudaObject extends GPUObject {
 	public void setDeviceModify(long numBytes) {
 		this.numLocks.addAndGet(1);
 		this.numBytes = numBytes;
-		JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes);
+		((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(-numBytes);
 	}
 	
 	@Override
@@ -720,14 +720,14 @@ public class JCudaObject extends GPUObject {
 		if(jcudaDenseMatrixPtr != null) {
 			long start = System.nanoTime();
 			cudaFree(jcudaDenseMatrixPtr);
-			JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
+			((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
 			Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
 			Statistics.cudaDeAllocCount.addAndGet(1);
 		}
 		if (jcudaSparseMatrixPtr != null) {
 			long start = System.nanoTime();
 			jcudaSparseMatrixPtr.deallocate();
-			JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes);
+			((JCudaContext)GPUContext.currContext).getAndAddAvailableMemory(numBytes);
 			Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start);
 			Statistics.cudaDeAllocCount.addAndGet(1);
 		}
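In the allocation paths above, the number of bytes charged against the tracked free memory comes either from a dense estimate (rows x columns doubles) or from CSRPointer.estimateSize for sparse inputs. As a rough illustration of the arithmetic involved (the exact layout used by CSRPointer is not shown in this diff, so the sparse formula below is the standard CSR estimate, not necessarily SystemML's):

// Back-of-the-envelope sizing sketch; constants follow the usual dense and CSR layouts.
public class GpuSizeEstimates {
    static long denseBytes(long rows, long cols) {
        return rows * cols * 8L;                       // one double per cell
    }

    static long csrBytes(long nnz, long rows) {
        return nnz * 8L                                // non-zero values (double)
             + nnz * 4L                                // column indices (int)
             + (rows + 1) * 4L;                        // row pointers (int)
    }

    public static void main(String[] args) {
        // A 10,000 x 1,000 matrix with 1% non-zeros (nnz = 100,000):
        System.out.println(denseBytes(10000, 1000));   // 80,000,000 bytes
        System.out.println(csrBytes(100000, 10000));   // 1,240,004 bytes
    }
}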
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/627fdbe2/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 1af2a1d..87a66f4 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -65,9 +65,7 @@ import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.*;
 import org.apache.sysml.runtime.instructions.cp.DoubleObject;
-import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
-import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
+import org.apache.sysml.runtime.instructions.gpu.context.*;
 import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
 import org.apache.sysml.runtime.matrix.operators.*;
 import org.apache.sysml.utils.Statistics;
@@ -90,15 +88,68 @@ import jcuda.jcusparse.cusparseHandle;
 public class LibMatrixCUDA {
 	
 	// Assume Compute Capability 3.0
-	public static final int MAX_THREADS = 1024;		// For compute capability > 3.0
-	public static final int MAX_BLOCKS = 2147483647;	// 2^31 - 1 For compute capability > 3.0
+	// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
+	// MAX_THREADS is 1024 For compute capability > 3.0
+	private static int _MAX_THREADS = -1;
+	private static int _MAX_BLOCKS = -1;
+	private static int _WARP_SIZE = -1;
+	
+	/**
+	 * Utility function to get maximum number of threads supported by the active CUDA device.
+	 * This is put into a singleton style method because the GPUContext is not fully initialized when
+	 * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
+	 * static block in this class, it will access the {@link JCudaContext} instance when it is not
+	 * properly initialized. Due to the proper checks in place, a deadlock occurs.
+	 * @return max threads
+	 * @throws DMLRuntimeException if exception occurs
+	 */
+	static int getMaxThreads() throws DMLRuntimeException{
+		if (_MAX_THREADS == -1){
+			_MAX_THREADS = JCudaContext.getMaxThreadsPerBlock();
+		}
+		return _MAX_THREADS;
+	}
+	
+	/**
+	 * Utility function to get maximum number of blocks supported by the active CUDA device.
+	 * This is put into a singleton style method because the GPUContext is not fully initialized when
+	 * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
+	 * static block in this class, it will access the {@link JCudaContext} instance when it is not
+	 * properly initialized. Due to the proper checks in place, a deadlock occurs.
+	 * @return max blocks
+	 * @throws DMLRuntimeException if exception occurs
+	 */
+	static int getMaxBlocks() throws DMLRuntimeException{
+		if (_MAX_BLOCKS == -1){
+			_MAX_BLOCKS = JCudaContext.getMaxBlocks();
+		}
+		return _MAX_BLOCKS;
+	}
+	
+	/**
+	 * Utility function to get the warp size supported by the active CUDA device.
+	 * This is put into a singleton style method because the GPUContext is not fully initialized when
+	 * the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
+	 * static block in this class, it will access the {@link JCudaContext} instance when it is not
+	 * properly initialized. Due to the proper checks in place, a deadlock occurs.
+	 * @return warp size
+	 * @throws DMLRuntimeException if exception occurs
+	 */
+	static int getWarpSize() throws DMLRuntimeException {
+		if (_WARP_SIZE == -1) {
+			_WARP_SIZE = JCudaContext.getWarpSize();
+		}
+		return _WARP_SIZE;
+	}
+	
+	
 	public static cudnnHandle cudnnHandle;
 	public static cublasHandle cublasHandle;
 	public static cusparseHandle cusparseHandle;
 	public static JCudaKernels kernels; // Used to launch custom kernels
 	
-	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName()); 
+	private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
 	
 	private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
@@ -240,7 +291,7 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * This method computes the backpropagation errors for previous layer of relu operation
-	 * 
+	 *
 	 * @param input input image
 	 * @param dout next layer error propogation
 	 * @param outputBlock output
@@ -262,13 +313,13 @@ public class LibMatrixCUDA {
 				ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
 				imagePointer, doutPointer, outputPointer, (int)rows, (int)cols);
 	}
-	
+
 	/**
 	 * Performs the operation corresponding to the DML script:
-	 * ones = matrix(1, rows=1, cols=Hout*Wout) 
+	 * ones = matrix(1, rows=1, cols=Hout*Wout)
 	 * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 	 * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
-	 * 
+	 *
 	 * @param input input image
 	 * @param bias bias
 	 * @param outputBlock output
@@ -299,8 +350,8 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * This method computes the backpropogation errors for filter of convolution operation
-	 * 
-	 * @param image input image 
+	 *
+	 * @param image input image
 	 * @param dout errors from next layer
 	 * @param outputBlock output errors
 	 * @param N number of images
@@ -312,7 +363,7 @@ public class LibMatrixCUDA {
 	 * @param S filter width
 	 * @param pad_h pad height
 	 * @param pad_w pad width
-	 * @param stride_h stride height 
+	 * @param stride_h stride height
 	 * @param stride_w stride width
 	 * @param P output activation height
 	 * @param Q output activation width
@@ -1063,11 +1114,11 @@ public class LibMatrixCUDA {
 				break;
 			}
 			case REDUCTION_COL : {	// The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
-				reduceRow("reduce_row", in, out, rlen, clen);
+				reduceRow("reduce_row_sum", in, out, rlen, clen);
 				break;
 			}
 			case REDUCTION_ROW : {
-				reduceCol("reduce_col", in, out, rlen, clen);
+				reduceCol("reduce_col_sum", in, out, rlen, clen);
 				break;
 			}
@@ -1101,7 +1152,7 @@ public class LibMatrixCUDA {
 				default:
 					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
 			}
-			// break;
+			break;
 		}
 		case OP_VARIANCE : {
 			switch(reductionDirection) {
@@ -1130,13 +1181,18 @@ public class LibMatrixCUDA {
 					ec.setScalarOutput(output, new DoubleObject(result));
 					break;
 				}
-				case REDUCTION_COL:
-				case REDUCTION_ROW:
-					throw new DMLRuntimeException("Internal Error - All, Row & Column max of matrix not implemented yet for GPU ");
+				case REDUCTION_COL: {
+					reduceRow("reduce_row_max", in, out, rlen, clen);
+					break;
+				}
+				case REDUCTION_ROW: {
+					reduceCol("reduce_col_max", in, out, rlen, clen);
+					break;
+				}
 				default:
 					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
 			}
-			// break;
+			break;
 		}
 		case OP_MIN :{
 			switch(reductionDirection) {
@@ -1145,13 +1201,18 @@ public class LibMatrixCUDA {
 					ec.setScalarOutput(output, new DoubleObject(result));
 					break;
 				}
-				case REDUCTION_COL:
-				case REDUCTION_ROW:
-					throw new DMLRuntimeException("Internal Error - All, Row & Column min of matrix not implemented yet for GPU ");
+				case REDUCTION_COL: {
+					reduceRow("reduce_row_min", in, out, rlen, clen);
+					break;
+				}
+				case REDUCTION_ROW: {
+					reduceCol("reduce_col_min", in, out, rlen, clen);
+					break;
+				}
 				default:
 					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
 			}
-			// break;
+			break;
 		}
 		case OP_MAXINDEX : {
 			switch(reductionDirection) {
@@ -1237,7 +1298,7 @@ public class LibMatrixCUDA {
 	private static void reduceCol(String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
 		int[] tmp = getKernelParamsForReduceByCol(rows, cols);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
-		kernels.launchKernel("reduce_col", new ExecutionConfig(blocks, threads, sharedMem),
+		kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
 				in, out, rows, cols);
 		cudaDeviceSynchronize();
 	}
@@ -1247,14 +1308,17 @@ public class LibMatrixCUDA {
 	 * @param n size of input array
 	 * @return integer array containing {blocks, threads, shared memory}
 	 */
-	private static int[] getKernelParamsForReduceAll(int n){
-		int threads = (n < MAX_THREADS*2) ? nextPow2((n + 1)/ 2) : MAX_THREADS;
+	private static int[] getKernelParamsForReduceAll(int n) throws DMLRuntimeException{
+		final int MAX_THREADS = getMaxThreads();
+		final int MAX_BLOCKS = getMaxBlocks();
+		final int WARP_SIZE = getWarpSize();
+		int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : MAX_THREADS;
 		int blocks = (n + (threads * 2 - 1)) / (threads * 2);
 		blocks = Math.min(MAX_BLOCKS, blocks);
 		int sharedMemSize = threads * Sizeof.DOUBLE;
-		if (threads <= 32){
+		if (threads <= WARP_SIZE){
 			sharedMemSize *= 2;
 		}
 		return new int[] {blocks, threads, sharedMemSize};
@@ -1266,23 +1330,27 @@ public class LibMatrixCUDA {
 	 * @param cols number of columns in input matrix
 	 * @return integer array containing {blocks, threads, shared memory}
 	 */
-	private static int[] getKernelParamsForReduceByRow(int rows, int cols) {
-		final int WARP_SIZE = 32;
-		int threads = Math.min(cols, WARP_SIZE);
+	private static int[] getKernelParamsForReduceByRow(int rows, int cols) throws DMLRuntimeException {
+		final int WARP_SIZE = getWarpSize();
+		final int MAX_THREADS = getMaxThreads();
+		int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS;
 		int blocks = rows;
 		int sharedMemSize = threads * Sizeof.DOUBLE;
-		if (threads <= 32){
+		if (threads <= WARP_SIZE){
 			sharedMemSize *=2;
 		}
 		return new int[] {blocks, threads, sharedMemSize};
 	}
 	
-	private static int[] getKernelParamsForReduceByCol(int rows, int cols) {
+	private static int[] getKernelParamsForReduceByCol(int rows, int cols) throws DMLRuntimeException {
+		final int MAX_THREADS = getMaxThreads();
+		final int MAX_BLOCKS = getMaxBlocks();
+		final int WARP_SIZE = getWarpSize();
 		int threads = Math.min(cols, MAX_THREADS);
-		int blocks = cols/1024;
-		if (cols % 1024 != 0) blocks++;
+		int blocks = cols/MAX_THREADS;
+		if (cols % MAX_THREADS != 0) blocks++;
 		int sharedMemSize = threads * Sizeof.DOUBLE;
-		if (threads <= 32){
+		if (threads <= WARP_SIZE){
 			sharedMemSize *=2;
 		}
 		return new int[] {blocks, threads, sharedMemSize};
@@ -1307,8 +1375,8 @@ public class LibMatrixCUDA {
 	
 	/**
 	 * This method computes the backpropogation errors for previous layer of convolution operation
-	 * 
-	 * @param filter filter used in conv2d 
+	 *
+	 * @param filter filter used in conv2d
 	 * @param dout errors from next layer
 	 * @param output output errors
 	 * @param N number of images
@@ -1320,7 +1388,7 @@ public class LibMatrixCUDA {
 	 * @param S filter width
 	 * @param pad_h pad height
 	 * @param pad_w pad width
-	 * @param stride_h stride height 
+	 * @param stride_h stride height
 	 * @param stride_w stride width
 	 * @param P output activation height
 	 * @param Q output activation width
@@ -1464,7 +1532,7 @@ public class LibMatrixCUDA {
 	/**
 	 * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
 	 * This method computes the backpropogation errors for previous layer of maxpooling operation
-	 * 
+	 *
 	 * @param image image as matrix object
 	 * @param dout delta matrix, output of previous layer
 	 * @param outputBlock output matrix
@@ -1831,7 +1899,7 @@ public class LibMatrixCUDA {
 	/**
 	 * Performs daxpy operation
-	 * 
+	 *
 	 * @param ec execution context
 	 * @param in1 input matrix 1
 	 * @param in2 input matrix 2
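The reworked getKernelParamsForReduce* helpers above derive MAX_THREADS, MAX_BLOCKS and WARP_SIZE from the active device instead of hard-coding 1024 and 32, and they size the shared-memory scratch space as one double per thread (doubled for thread counts at or below one warp). A standalone worked example of the same reduce-all arithmetic, assuming a device with 1024 threads per block and a warp size of 32 (the real values come from the device properties):

// Worked example of the reduce-all launch-parameter arithmetic shown above; constants are assumed.
public class ReduceLaunchParamsSketch {
    static final int MAX_THREADS = 1024;
    static final int MAX_BLOCKS  = 2147483647;
    static final int WARP_SIZE   = 32;
    static final int SIZEOF_DOUBLE = 8;

    static int nextPow2(int x) {
        int p = 1;
        while (p < x) p <<= 1;
        return p;
    }

    /** Returns {blocks, threads, sharedMemBytes} for an n-element reduction. */
    static int[] paramsForReduceAll(int n) {
        int threads = (n < MAX_THREADS * 2) ? nextPow2((n + 1) / 2) : MAX_THREADS;
        int blocks = (n + (threads * 2 - 1)) / (threads * 2);   // each thread reduces two elements
        blocks = Math.min(MAX_BLOCKS, blocks);
        int sharedMemSize = threads * SIZEOF_DOUBLE;
        if (threads <= WARP_SIZE) {
            sharedMemSize *= 2;                                  // mirrors the doubling for <= one warp
        }
        return new int[] {blocks, threads, sharedMemSize};
    }

    public static void main(String[] args) {
        int[] p = paramsForReduceAll(1_000_000);
        // 1,000,000 elements -> 489 blocks of 1024 threads, 8192 bytes of shared memory each
        System.out.println(p[0] + " blocks, " + p[1] + " threads, " + p[2] + " bytes shared");
    }
}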
