Repository: incubator-systemml Updated Branches: refs/heads/master 464600e3e -> be2632127
[SYSTEMML-445][SYSTEMML-540] Adding NNZ to Convolution CP instruction and also refactoring GPUContext and GPUObject. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/be263212 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/be263212 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/be263212 Branch: refs/heads/master Commit: be2632127269bb6a705923e0996fa27a9978b03e Parents: 464600e Author: Niketan Pansare <npan...@us.ibm.com> Authored: Fri Jun 24 10:30:14 2016 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Fri Jun 24 10:31:21 2016 -0700 ---------------------------------------------------------------------- .../java/org/apache/sysml/api/DMLScript.java | 2 +- .../controlprogram/caching/CacheableData.java | 2 +- .../controlprogram/caching/MatrixObject.java | 2 +- .../context/ExecutionContext.java | 1 + .../controlprogram/context/GPUContext.java | 72 ----- .../controlprogram/context/GPUObject.java | 168 ------------ .../controlprogram/context/JCudaContext.java | 148 ----------- .../controlprogram/context/JCudaObject.java | 263 ------------------- .../cp/ConvolutionCPInstruction.java | 10 +- .../instructions/gpu/GPUInstruction.java | 2 +- .../instructions/gpu/context/GPUContext.java | 71 +++++ .../instructions/gpu/context/GPUObject.java | 167 ++++++++++++ .../instructions/gpu/context/JCudaContext.java | 148 +++++++++++ .../instructions/gpu/context/JCudaObject.java | 263 +++++++++++++++++++ .../runtime/matrix/data/LibMatrixCUDA.java | 2 +- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 16 +- 16 files changed, 677 insertions(+), 660 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index a7c2db6..814bcb8 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -69,7 +69,7 @@ import org.apache.sysml.runtime.controlprogram.caching.CacheStatistics; import org.apache.sysml.runtime.controlprogram.caching.CacheableData; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory; -import org.apache.sysml.runtime.controlprogram.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.controlprogram.parfor.ProgramConverter; import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java index 4bab6b8..d60c607 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java @@ -33,7 +33,7 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy; -import org.apache.sysml.runtime.controlprogram.context.GPUObject; +import org.apache.sysml.runtime.instructions.gpu.context.GPUObject; import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence; 
import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java index 2144ef8..5d5f41f 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/MatrixObject.java @@ -32,8 +32,8 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; -import org.apache.sysml.runtime.controlprogram.context.GPUContext; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.instructions.spark.data.RDDObject; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java index 051347b..7e089c8 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java +++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java @@ -45,6 +45,7 @@ import org.apache.sysml.runtime.instructions.cp.FunctionCallCPInstruction; import org.apache.sysml.runtime.instructions.cp.IntObject; import org.apache.sysml.runtime.instructions.cp.ScalarObject; import org.apache.sysml.runtime.instructions.cp.StringObject; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData; import org.apache.sysml.runtime.matrix.MatrixFormatMetaData; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java deleted file mode 100644 index 91a236b..0000000 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUContext.java +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.sysml.runtime.controlprogram.context; - -import java.util.ArrayList; - -import org.apache.sysml.api.DMLScript; -import org.apache.sysml.hops.OptimizerUtils; -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; - -//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA specific -//FIXME move to gpu instruction package -public abstract class GPUContext { - - public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>(); - protected static GPUContext currContext; - protected GPUContext() { } - - public static volatile Boolean isGPUContextCreated = false; - - public abstract long getAvailableMemory(); - - // Creation / Destruction of GPUContext and related handles - public static GPUContext createGPUContext() { - if(currContext == null && DMLScript.USE_ACCELERATOR) { - // TODO: Handle this thread and resolve concurrency related bugs if any - new Thread(new Runnable() { - @Override - public void run() { - // Lazy GPU context creation - synchronized(isGPUContextCreated) { - currContext = new JCudaContext(); - OptimizerUtils.GPU_MEMORY_BUDGET = ((JCudaContext)currContext).getAvailableMemory(); - isGPUContextCreated = true; - } - } - }).start(); - } - return currContext; - } - public static GPUObject createGPUObject(MatrixObject mo) { - if(DMLScript.USE_ACCELERATOR) { - synchronized(isGPUContextCreated) { - if(currContext == null) - throw new RuntimeException("GPUContext is not created"); - if(currContext instanceof JCudaContext) - return new JCudaObject(mo); - } - } - throw new RuntimeException("Cannot create createGPUObject when USE_ACCELERATOR is off"); - } - public abstract void destroy() throws DMLRuntimeException; - - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java deleted file mode 100644 index 8037b8a..0000000 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/GPUObject.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.sysml.runtime.controlprogram.context; - -import java.util.Collections; -import java.util.Comparator; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.caching.CacheException; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; -import org.apache.sysml.utils.Statistics; - -//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity -//FIXME move to gpu instruction package -public abstract class GPUObject -{ - protected boolean isDeviceCopyModified = false; - protected AtomicInteger numLocks = new AtomicInteger(0); - protected boolean isInSparseFormat = false; - protected boolean isAllocated = false; - protected MatrixObject mat = null; - - protected GPUObject(MatrixObject mat2) { - this.mat = mat2; - } - - public boolean isInSparseFormat() { - return isInSparseFormat; - } - - public boolean isAllocated() { - return isAllocated; - } - - public abstract void acquireDeviceRead() throws DMLRuntimeException; - public abstract void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException; - public abstract void acquireHostRead() throws CacheException; - public abstract void acquireHostModify() throws CacheException; - public abstract void release(boolean isGPUCopyModified) throws CacheException; - - - // package-level visibility as these methods are guarded by underlying GPUContext - abstract void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException; - abstract void deallocateMemoryOnDevice() throws DMLRuntimeException; - abstract long getSizeOnDevice() throws DMLRuntimeException; - abstract void copyFromHostToDevice() throws DMLRuntimeException; - abstract void copyFromDeviceToHost() throws DMLRuntimeException; // Called by export() - - - /** - * It finds matrix toBeRemoved such that toBeRemoved.GPUSize >= size - * // TODO: it is the smallest matrix size that satisfy the 
above condition. For now just evicting the largest pointer. - * Then returns toBeRemoved. - * - */ - protected void evict(long GPUSize) throws DMLRuntimeException { - if(GPUContext.allocatedPointers.size() == 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); - } - - Statistics.cudaEvictionCount.addAndGet(1); - - synchronized(evictionLock) { - Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() { - - @Override - public int compare(GPUObject p1, GPUObject p2) { - int p1Val = p1.numLocks.get(); - int p2Val = p2.numLocks.get(); - - if(p1Val < 0 || p2Val < 0) { - throw new RuntimeException("Number of locks cannot be negative"); - } - else if(p1Val == 0 && p2Val == 0) { - // Both p1 and p2 are unlocked, return largest object - // TODO: Modify this !! - long p1Size = 0; long p2Size = 0; - try { - p1Size = p1.getSizeOnDevice(); - p2Size = p2.getSizeOnDevice(); - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); - } - if(p1Size == p2Size) { - return 0; - } - else if(p1Size < p2Size) { - return 1; - } - else { - return -1; - } - } - else if(p1Val > p2Val) { - // There are more locks on p1 - return 1; - } - else { - // There are more locks on p2 - return -1; - } - } - }); - - - while(GPUSize > getAvailableMemory() && GPUContext.allocatedPointers.size() > 0) { - GPUObject toBeRemoved = GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1); - if(toBeRemoved.numLocks.get() != 0) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix!"); - } - if(toBeRemoved.isDeviceCopyModified) { - toBeRemoved.copyFromDeviceToHost(); - } - toBeRemoved.clearData(); - } - } - } - - public void clearData() throws CacheException { - synchronized(evictionLock) { - GPUContext.allocatedPointers.remove(this); - } - try { - deallocateMemoryOnDevice(); - } catch (DMLRuntimeException e) { - throw new CacheException(e); - } - } - - static Boolean evictionLock = new 
Boolean(true); - - protected long getAvailableMemory() { - return GPUContext.currContext.getAvailableMemory(); - } - -// // Copying from device -> host occurs here -// // Called by MatrixObject's exportData -// public void exportData() throws CacheException { -// boolean isDeviceCopyModified = mat.getGPUObject() != null && mat.getGPUObject().isDeviceCopyModified; -// boolean isHostCopyUnavailable = mat.getMatrixBlock() == null || -// (mat.getMatrixBlock().getDenseBlock() == null && mat.getMatrixBlock().getSparseBlock() == null); -// -// if(mat.getGPUObject() != null && (isDeviceCopyModified || isHostCopyUnavailable)) { -// try { -// mat.getGPUObject().copyFromDeviceToHost(); -// } catch (DMLRuntimeException e) { -// throw new CacheException(e); -// } -// } -// } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java deleted file mode 100644 index bfb823e..0000000 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaContext.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.sysml.runtime.controlprogram.context; - -import java.util.concurrent.atomic.AtomicLong; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; -import org.apache.sysml.utils.Statistics; - -import jcuda.driver.JCudaDriver; -import jcuda.jcublas.JCublas2; -import jcuda.jcublas.cublasHandle; -import jcuda.jcudnn.JCudnn; -import jcuda.runtime.JCuda; -import jcuda.jcudnn.cudnnHandle; -import static jcuda.jcudnn.JCudnn.cudnnCreate; -import static jcuda.jcublas.JCublas2.cublasCreate; -import static jcuda.jcublas.JCublas2.cublasDestroy; -import static jcuda.jcudnn.JCudnn.cudnnDestroy; -import static jcuda.driver.JCudaDriver.cuInit; -import static jcuda.driver.JCudaDriver.cuDeviceGetCount; -import static jcuda.runtime.JCuda.cudaMemGetInfo; -import static jcuda.runtime.cudaError.cudaSuccess; - -/** - * Setup: - * 1. Install CUDA 7.5 - * 2. Install CuDNN v4 from http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip - * 3. Download JCuda binaries version 0.7.5b and JCudnn version 0.7.5. Copy the DLLs into C:\lib (or /lib) directory. 
Link: http://www.jcuda.org/downloads/downloads.html - * - */ -public class JCudaContext extends GPUContext { - - private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName()); - - public static boolean DEBUG = false; - - public static long totalNumBytes = 0; - public static AtomicLong availableNumBytesWithoutUtilFactor = new AtomicLong(0); - // Fraction of available memory to use. The available memory is computer when the JCudaContext is created - // to handle the tradeoff on calling cudaMemGetInfo too often. - public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; - public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true; - - static { - long start = System.nanoTime(); - JCuda.setExceptionsEnabled(true); - JCudnn.setExceptionsEnabled(true); - JCublas2.setExceptionsEnabled(true); - JCudaDriver.setExceptionsEnabled(true); - cuInit(0); // Initialize the driver - // Obtain the number of devices - int deviceCountArray[] = { 0 }; - cuDeviceGetCount(deviceCountArray); - int deviceCount = deviceCountArray[0]; - LOG.info("Total number of GPUs on the machine: " + deviceCount); - Statistics.cudaInitTime = System.nanoTime() - start; - } - - public long getAvailableMemory() { - if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) { - long free [] = { 0 }; - long total [] = { 0 }; - if(cudaMemGetInfo(free, total) == cudaSuccess) { - totalNumBytes = total[0]; - availableNumBytesWithoutUtilFactor.set(free[0]); - } - else { - throw new RuntimeException("ERROR: Unable to get memory information of the GPU."); - } - } - return (long) (availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR); - } - - - public JCudaContext() { - if(isGPUContextCreated) { - // Wait until it is deleted. This case happens during multi-threaded testing. 
- // This also allows for multi-threaded execute calls - long startTime = System.currentTimeMillis(); - do { - try { - Thread.sleep(100); - } catch (InterruptedException e) {} - } while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000); - synchronized(isGPUContextCreated) { - if(GPUContext.currContext != null) { - throw new RuntimeException("Cannot create multiple JCudaContext. Waited for 10 min to close previous GPUContext"); - } - } - } - GPUContext.currContext = this; - - long start = System.nanoTime(); - LibMatrixCUDA.cudnnHandle = new cudnnHandle(); - cudnnCreate(LibMatrixCUDA.cudnnHandle); - LibMatrixCUDA.cublasHandle = new cublasHandle(); - cublasCreate(LibMatrixCUDA.cublasHandle); - Statistics.cudaLibrariesInitTime = System.nanoTime() - start; - - long free [] = { 0 }; - long total [] = { 0 }; - if(cudaMemGetInfo(free, total) == cudaSuccess) { - totalNumBytes = total[0]; - availableNumBytesWithoutUtilFactor.set(free[0]); - } - else { - throw new RuntimeException("ERROR: Unable to get memory information of the GPU."); - } - LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB"); - LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB"); - } - - @Override - public void destroy() throws DMLRuntimeException { - if(currContext != null) { - synchronized(isGPUContextCreated) { - cudnnDestroy(LibMatrixCUDA.cudnnHandle); - cublasDestroy(LibMatrixCUDA.cublasHandle); - currContext = null; - isGPUContextCreated = false; - } - } - else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) { - throw new DMLRuntimeException("Error while destroying the GPUContext"); - } - } - -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java deleted file mode 100644 index 5d37909..0000000 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/context/JCudaObject.java +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.sysml.runtime.controlprogram.context; - -import static jcuda.runtime.JCuda.cudaFree; -import static jcuda.runtime.JCuda.cudaMalloc; -import static jcuda.runtime.JCuda.cudaMemcpy; -import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; -import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; -import jcuda.Pointer; -import jcuda.Sizeof; - -import org.apache.sysml.runtime.DMLRuntimeException; -import org.apache.sysml.runtime.controlprogram.caching.CacheException; -import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; -import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; -import org.apache.sysml.runtime.matrix.data.MatrixBlock; -import org.apache.sysml.utils.Statistics; - -public class JCudaObject extends GPUObject { - - public Pointer jcudaPointer = null; - public long numBytes; - - JCudaObject(MatrixObject mat2) { - super(mat2); - } - - private void prepare(boolean isInput, int numElemsToAllocate) throws DMLRuntimeException { - if(jcudaPointer != null) { - // Already allocated on GPU and expected to be in sync - } - else { - if(isInput) { - if(numElemsToAllocate != -1) - throw new DMLRuntimeException("Expected numElemsToAllocate to be -1 as it is inferred from the input"); - // Copy performs allocation - copyFromHostToDevice(); - } - else { - // Don't copy just allocate - ensureFreeSpaceForDenseBlock(numElemsToAllocate); - allocateMemoryOnDevice(numElemsToAllocate); - synchronized(evictionLock) { - GPUContext.allocatedPointers.add(this); - } - } - } - numLocks.addAndGet(1); - } - - @Override - public void acquireDeviceRead() throws DMLRuntimeException { - prepare(true, -1); - if(!isAllocated) - throw new DMLRuntimeException("Expected device data to be allocated"); - } - - @Override - public void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException { - prepare(false, numElemsToAllocate); - isDeviceCopyModified = true; - if(!isAllocated) - throw new DMLRuntimeException("Expected 
device data to be allocated"); - } - - @Override - public void acquireHostRead() throws CacheException { - if(isAllocated) { - try { - if(isDeviceCopyModified) { - copyFromDeviceToHost(); - } - } catch (DMLRuntimeException e) { - throw new CacheException(e); - } - } - } - - @Override - public void acquireHostModify() throws CacheException { - if(isAllocated) { - try { - if(isDeviceCopyModified) { - throw new DMLRuntimeException("Potential overwrite of GPU data"); - // copyFromDeviceToHost(); - } - clearData(); - } catch (DMLRuntimeException e) { - throw new CacheException(e); - } - } - } - - public void release(boolean isGPUCopyModified) throws CacheException { - if(numLocks.addAndGet(-1) < 0) { - throw new CacheException("Redundant release of GPU object"); - } - isDeviceCopyModified = isGPUCopyModified; - } - - @Override - void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException { - if(jcudaPointer == null) { - long start = System.nanoTime(); - jcudaPointer = new Pointer(); - if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) - throw new DMLRuntimeException("Sparse format not implemented"); - else if(numElemToAllocate == -1) { - // Called for dense input - numBytes = mat.getNumRows()*mat.getNumColumns()*Sizeof.DOUBLE; - cudaMalloc(jcudaPointer, numBytes); - JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes); - } - else { - // Called for dense output - numBytes = numElemToAllocate*Sizeof.DOUBLE; - cudaMalloc(jcudaPointer, numBytes); - JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes); - } - - Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start); - Statistics.cudaAllocCount.addAndGet(1); - - } - isAllocated = true; - } - - @Override - void deallocateMemoryOnDevice() { - if(jcudaPointer != null) { - long start = System.nanoTime(); - cudaFree(jcudaPointer); - JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes); - 
Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); - Statistics.cudaDeAllocCount.addAndGet(1); - - } - jcudaPointer = null; - isAllocated = false; - numLocks.set(0); - } - - void ensureFreeSpaceForDenseBlock(int numElem) throws DMLRuntimeException { - long GPUSize = (Sizeof.DOUBLE) * numElem; - if(GPUSize >= getAvailableMemory()) { - evict(GPUSize); - } - } - - @Override - void copyFromHostToDevice() - throws DMLRuntimeException - { - printCaller(); - long start = System.nanoTime(); - - MatrixBlock tmp = mat.acquireRead(); - if(tmp.isInSparseFormat()) { - throw new DMLRuntimeException("Sparse matrix is not implemented"); - // tmp.sparseToDense(); - } - else { - double[] data = tmp.getDenseBlock(); - - if( data == null && tmp.getSparseBlock() != null ) - throw new DMLRuntimeException("Incorrect sparsity calculation"); - else if( data==null && tmp.getNonZeros() != 0 ) - throw new DMLRuntimeException("MatrixBlock is not allocated"); - else if( tmp.getNonZeros() == 0 ) - data = new double[tmp.getNumRows()*tmp.getNumColumns()]; - - // Copy dense block - ensureFreeSpaceForDenseBlock(data.length); - allocateMemoryOnDevice(data.length); - synchronized(evictionLock) { - GPUContext.allocatedPointers.add(this); - } - cudaMemcpy(jcudaPointer, Pointer.to(data), mat.getNumRows()*mat.getNumColumns() * Sizeof.DOUBLE, cudaMemcpyHostToDevice); - } - - mat.release(); - - Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start); - Statistics.cudaToDevCount.addAndGet(1); - } - - @Override - protected void copyFromDeviceToHost() throws DMLRuntimeException { - if(jcudaPointer != null) { - printCaller(); - if(LibMatrixCUDA.isInSparseFormat(mat)) - throw new DMLRuntimeException("Sparse format not implemented"); - else { - long start = System.nanoTime(); - MatrixBlock tmp = new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), false); - tmp.allocateDenseBlock(); - double [] data = tmp.getDenseBlock(); - - cudaMemcpy(Pointer.to(data), jcudaPointer, data.length * 
Sizeof.DOUBLE, cudaMemcpyDeviceToHost); - - tmp.recomputeNonZeros(); - mat.acquireModify(tmp); - mat.release(); - - Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); - Statistics.cudaFromDevCount.addAndGet(1); - } - } - else { - throw new DMLRuntimeException("Cannot copy from device to host as JCuda pointer is not allocated"); - } - isDeviceCopyModified = false; - } - - @Override - protected long getSizeOnDevice() throws DMLRuntimeException { - long GPUSize = 0; - int rlen = (int) mat.getNumRows(); - int clen = (int) mat.getNumColumns(); - - if(LibMatrixCUDA.isInSparseFormat(mat)) { - throw new DMLRuntimeException("Sparse format not implemented"); - } - else { - GPUSize = (Sizeof.DOUBLE) * (long) (rlen * clen); - } - return GPUSize; - } - - private String getClassAndMethod(StackTraceElement st) { - String [] str = st.getClassName().split("\\."); - return str[str.length - 1] + "." + st.getMethodName(); - } - - private void printCaller() { - if(JCudaContext.DEBUG) { - StackTraceElement[] st = Thread.currentThread().getStackTrace(); - String ret = getClassAndMethod(st[1]); - for(int i = 2; i < st.length && i < 7; i++) { - ret += "->" + getClassAndMethod(st[i]); - } - System.out.println("CALL_STACK:" + ret); - } - - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index 4148a7d..1030071 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -191,6 +191,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { 
outputBlock = getDenseOutputBlock(ec, C * R * S, N * P * Q, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.im2col(matBlock, outputBlock, params); + outputBlock.setNonZeros(params.outputNNZ.get()); } else if (instOpcode.equalsIgnoreCase("reshape_col")) { checkHeightWidth(ec, params); @@ -199,6 +200,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, N, K * P * Q, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.reshape_col(matBlock, outputBlock, params); + outputBlock.setNonZeros(matBlock.getNonZeros()); // As number of non-zeros doesnot change for reshape_col } else if (instOpcode.equalsIgnoreCase("rotate180")) { checkHeightWidth(ec, params); @@ -206,6 +208,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, N * P * Q, K, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.rotate180(matBlock, outputBlock, params); + outputBlock.setNonZeros(matBlock.getNonZeros()); // As number of non-zeros doesnot change for rotate180 } else if (instOpcode.equalsIgnoreCase("col2im")) { checkHeightWidth(ec, params); @@ -213,7 +216,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { // needs to be zeroed-out outputBlock = getDenseOutputBlock(ec, N, C * H * W, false); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); - LibMatrixDNN.col2im(matBlock, outputBlock, params); + LibMatrixDNN.col2im(matBlock, outputBlock, params); // No efficient nnz computation, so setting it to -1 } else if (instOpcode.equalsIgnoreCase("maxpooling")) { // Is eligible for REUSE_NONZEROED_OUTPUT but cannot guarantee that previous output has been rmvar-ed @@ -221,6 +224,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { outputBlock = getDenseOutputBlock(ec, N, C*P*Q, true); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); LibMatrixDNN.maxpooling(matBlock, outputBlock, 
params); + outputBlock.setNonZeros(params.outputNNZ.get()); } else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) { MatrixBlock dout = ec.getMatrixInput(_in2.getName()); @@ -228,7 +232,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { // without somewhat expensive HashMap checks outputBlock = getDenseOutputBlock(ec, N, C*H*W, false); params.setReuseNonZeroedOutput(_reuseNonZeroedOutput); - LibMatrixDNN.maxpooling_backward(matBlock, dout, outputBlock, params); + LibMatrixDNN.maxpooling_backward(matBlock, dout, outputBlock, params); // No efficient nnz computation, so setting it to -1 ec.releaseMatrixInput(_in2.getName()); } else { @@ -246,7 +250,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { if(DMLScript.STATISTICS) start = System.nanoTime(); - MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, numRows * numCols); + MatrixBlock outputBlock = new MatrixBlock(numRows, numCols, false, numRows * numCols); _reuseNonZeroedOutput = false; if(reuseNonZeroedOutput1 && DMLScript.REUSE_NONZEROED_OUTPUT) { _reuseNonZeroedOutput = true; http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java index 618f3b8..ce9646b 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java @@ -36,7 +36,7 @@ public abstract class GPUInstruction extends Instruction protected boolean _requiresLabelUpdate = false; public GPUInstruction(String opcode, String istr) { - type = INSTRUCTION_TYPE.CONTROL_PROGRAM; + type = INSTRUCTION_TYPE.GPU; instString = istr; //prepare opcode and update requirement for 
repeated usage http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java new file mode 100644 index 0000000..0127154 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.sysml.runtime.instructions.gpu.context;

import java.util.ArrayList;

import org.apache.sysml.api.DMLScript;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;

//FIXME merge JCudaContext into GPUContext as this context is anyway CUDA specific
/**
 * Process-wide holder of the (single) GPU context and of the global list of
 * matrices currently resident on the device. Concrete behavior lives in
 * {@link JCudaContext}; this class only manages lifecycle and object creation.
 */
public abstract class GPUContext {

	// Global registry of device-resident matrices; mutated under GPUObject.evictionLock.
	public static ArrayList<GPUObject> allocatedPointers = new ArrayList<GPUObject>();
	// At most one active context per JVM; null until the background creation thread finishes.
	protected static GPUContext currContext;
	protected GPUContext() { }

	// NOTE(review): synchronizing on this field (here and in JCudaContext/JCudaObject)
	// is unsafe — the field is reassigned (false -> true), and autoboxed Booleans are
	// interned, so threads may end up locking DIFFERENT objects. Replacing it with a
	// dedicated private static final Object lock needs a coordinated change in all
	// three classes that lock on it.
	public static volatile Boolean isGPUContextCreated = false;

	/** @return usable device memory in bytes (after the utilization factor is applied). */
	public abstract long getAvailableMemory();

	// Creation / Destruction of GPUContext and related handles
	/**
	 * Lazily creates the GPU context on a background thread when the accelerator is enabled.
	 * NOTE(review): the method returns {@code currContext} immediately, which is typically
	 * still null on the first call because initialization is asynchronous — callers must
	 * not rely on a non-null return value here.
	 */
	public static GPUContext createGPUContext() {
		if(currContext == null && DMLScript.USE_ACCELERATOR) {
			// TODO: Handle this thread and resolve concurrency related bugs if any
			new Thread(new Runnable() {
				@Override
				public void run() {
					// Lazy GPU context creation
					synchronized(isGPUContextCreated) {
						currContext = new JCudaContext();
						OptimizerUtils.GPU_MEMORY_BUDGET = ((JCudaContext)currContext).getAvailableMemory();
						isGPUContextCreated = true;
					}
				}
			}).start();
		}
		return currContext;
	}
	/**
	 * Creates the device-side handle for the given matrix.
	 *
	 * @param mo the host-side matrix object to wrap
	 * @return a new {@link JCudaObject} bound to {@code mo}
	 * @throws RuntimeException if the accelerator is disabled or the context is not yet created
	 */
	public static GPUObject createGPUObject(MatrixObject mo) {
		if(DMLScript.USE_ACCELERATOR) {
			synchronized(isGPUContextCreated) {
				if(currContext == null)
					throw new RuntimeException("GPUContext is not created");
				if(currContext instanceof JCudaContext)
					return new JCudaObject(mo);
			}
		}
		throw new RuntimeException("Cannot create createGPUObject when USE_ACCELERATOR is off");
	}
	/** Tears down the context and the underlying CUDA library handles. */
	public abstract void destroy() throws DMLRuntimeException;


}
--git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java new file mode 100644 index 0000000..45b8c5b --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.apache.sysml.runtime.instructions.gpu.context;

import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.CacheException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.utils.Statistics;

//FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
/**
 * Device-side handle for a {@link MatrixObject}: tracks the allocation state,
 * a host/device dirty flag, and a lock count, and implements the eviction
 * policy over the shared {@link GPUContext#allocatedPointers} list.
 */
public abstract class GPUObject
{
	// True iff the device copy was modified and the host copy is stale.
	protected boolean isDeviceCopyModified = false;
	// Number of outstanding acquire*() calls; an object with locks > 0 must never be evicted.
	protected AtomicInteger numLocks = new AtomicInteger(0);
	protected boolean isInSparseFormat = false;
	public boolean isAllocated = false;
	// Host-side matrix this handle shadows on the device.
	protected MatrixObject mat = null;

	// Guards GPUContext.allocatedPointers across all GPUObject instances.
	// (Was `new Boolean(true)`; a plain final Object is the correct lock idiom.)
	static final Object evictionLock = new Object();

	protected GPUObject(MatrixObject mat2) {
		this.mat = mat2;
	}

	public boolean isInSparseFormat() {
		return isInSparseFormat;
	}

	public boolean isAllocated() {
		return isAllocated;
	}

	public abstract void acquireDeviceRead() throws DMLRuntimeException;
	public abstract void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException;
	public abstract void acquireHostRead() throws CacheException;
	public abstract void acquireHostModify() throws CacheException;
	public abstract void release(boolean isGPUCopyModified) throws CacheException;


	// package-level visibility as these methods are guarded by underlying GPUContext
	abstract void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException;
	abstract void deallocateMemoryOnDevice() throws DMLRuntimeException;
	abstract long getSizeOnDevice() throws DMLRuntimeException;
	abstract void copyFromHostToDevice() throws DMLRuntimeException;
	abstract void copyFromDeviceToHost() throws DMLRuntimeException; // Called by export()


	/**
	 * Frees device memory until at least {@code GPUSize} bytes are available,
	 * repeatedly evicting the LAST element of {@link GPUContext#allocatedPointers}.
	 * The list is therefore sorted so that the best eviction candidates — unlocked
	 * objects, largest last — end up at the end. (This fixes the previous ordering,
	 * which sorted locked objects and the SMALLEST unlocked objects to the end,
	 * so eviction either freed too little or failed while unlocked objects existed.)
	 *
	 * @param GPUSize number of bytes that must be available after eviction
	 * @throws DMLRuntimeException if the remaining objects are locked and memory
	 *         still cannot be freed
	 */
	protected void evict(long GPUSize) throws DMLRuntimeException {
		if(GPUContext.allocatedPointers.size() == 0) {
			throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
		}

		Statistics.cudaEvictionCount.addAndGet(1);

		synchronized(evictionLock) {
			Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() {

				@Override
				public int compare(GPUObject p1, GPUObject p2) {
					int p1Val = p1.numLocks.get();
					int p2Val = p2.numLocks.get();

					if(p1Val < 0 || p2Val < 0) {
						throw new RuntimeException("Number of locks cannot be negative");
					}
					if(p1Val == 0 && p2Val == 0) {
						// Both unlocked: ascending by size, so the largest lands last
						// and is evicted first (current policy: evict largest).
						long p1Size = 0; long p2Size = 0;
						try {
							p1Size = p1.getSizeOnDevice();
							p2Size = p2.getSizeOnDevice();
						} catch (DMLRuntimeException e) {
							throw new RuntimeException(e);
						}
						return Long.compare(p1Size, p2Size);
					}
					// More locks -> earlier in the list (less evictable); fewer locks drift
					// toward the end, where eviction happens.
					return Integer.compare(p2Val, p1Val);
				}
			});


			while(GPUSize > getAvailableMemory() && GPUContext.allocatedPointers.size() > 0) {
				GPUObject toBeRemoved = GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
				if(toBeRemoved.numLocks.get() != 0) {
					// Only locked objects remain; no further memory can be freed.
					throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
				}
				if(toBeRemoved.isDeviceCopyModified) {
					// Write dirty device data back to the host before freeing it.
					toBeRemoved.copyFromDeviceToHost();
				}
				toBeRemoved.clearData();
			}
		}
	}

	/**
	 * Unregisters this object and frees its device memory.
	 * Safe to call while already holding evictionLock (synchronized is reentrant).
	 */
	public void clearData() throws CacheException {
		synchronized(evictionLock) {
			GPUContext.allocatedPointers.remove(this);
		}
		try {
			deallocateMemoryOnDevice();
		} catch (DMLRuntimeException e) {
			throw new CacheException(e);
		}
	}

	/** @return usable device memory in bytes, delegated to the active context. */
	protected long getAvailableMemory() {
		return GPUContext.currContext.getAvailableMemory();
	}
}
package org.apache.sysml.runtime.instructions.gpu.context;

import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
import org.apache.sysml.utils.Statistics;

import jcuda.driver.JCudaDriver;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasHandle;
import jcuda.jcudnn.JCudnn;
import jcuda.runtime.JCuda;
import jcuda.jcudnn.cudnnHandle;
import static jcuda.jcudnn.JCudnn.cudnnCreate;
import static jcuda.jcublas.JCublas2.cublasCreate;
import static jcuda.jcublas.JCublas2.cublasDestroy;
import static jcuda.jcudnn.JCudnn.cudnnDestroy;
import static jcuda.driver.JCudaDriver.cuInit;
import static jcuda.driver.JCudaDriver.cuDeviceGetCount;
import static jcuda.runtime.JCuda.cudaMemGetInfo;
import static jcuda.runtime.cudaError.cudaSuccess;

/**
 * JCuda-backed implementation of {@link GPUContext}: initializes the CUDA driver,
 * creates the cuDNN/cuBLAS handles, and tracks available device memory.
 *
 * Setup:
 * 1. Install CUDA 7.5
 * 2. Install CuDNN v4 from http://developer.download.nvidia.com/compute/redist/cudnn/v4/cudnn-7.0-win-x64-v4.0-prod.zip
 * 3. Download JCuda binaries version 0.7.5b and JCudnn version 0.7.5. Copy the DLLs into C:\lib (or /lib) directory.
 * Link: http://www.jcuda.org/downloads/downloads.html
 *
 */
public class JCudaContext extends GPUContext {

	private static final Log LOG = LogFactory.getLog(JCudaContext.class.getName());

	// When true, JCudaObject.printCaller() dumps a short call stack per GPU transfer.
	public static boolean DEBUG = false;

	public static long totalNumBytes = 0;
	public static AtomicLong availableNumBytesWithoutUtilFactor = new AtomicLong(0);
	// Fraction of available memory to use. The available memory is computed when the JCudaContext is created
	// to handle the tradeoff on calling cudaMemGetInfo too often.
	public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9;
	public static boolean REFRESH_AVAILABLE_MEMORY_EVERY_TIME = true;

	// One-time driver initialization; timed into Statistics.cudaInitTime.
	static {
		long start = System.nanoTime();
		JCuda.setExceptionsEnabled(true);
		JCudnn.setExceptionsEnabled(true);
		JCublas2.setExceptionsEnabled(true);
		JCudaDriver.setExceptionsEnabled(true);
		cuInit(0); // Initialize the driver
		// Obtain the number of devices
		int deviceCountArray[] = { 0 };
		cuDeviceGetCount(deviceCountArray);
		int deviceCount = deviceCountArray[0];
		LOG.info("Total number of GPUs on the machine: " + deviceCount);
		Statistics.cudaInitTime = System.nanoTime() - start;
	}

	/**
	 * @return free device memory in bytes, scaled by GPU_MEMORY_UTILIZATION_FACTOR.
	 *         Re-queries the device when REFRESH_AVAILABLE_MEMORY_EVERY_TIME is set;
	 *         otherwise returns the cached value maintained by JCudaObject.
	 */
	public long getAvailableMemory() {
		if(REFRESH_AVAILABLE_MEMORY_EVERY_TIME) {
			long free [] = { 0 };
			long total [] = { 0 };
			if(cudaMemGetInfo(free, total) == cudaSuccess) {
				totalNumBytes = total[0];
				availableNumBytesWithoutUtilFactor.set(free[0]);
			}
			else {
				throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
			}
		}
		return (long) (availableNumBytesWithoutUtilFactor.get()*GPU_MEMORY_UTILIZATION_FACTOR);
	}


	/**
	 * Creates the cuDNN and cuBLAS handles and records the memory budget.
	 * If another context still exists, waits up to one minute for it to be destroyed
	 * (this happens during multi-threaded testing).
	 */
	public JCudaContext() {
		if(isGPUContextCreated) {
			// Wait until it is deleted. This case happens during multi-threaded testing.
			// This also allows for multi-threaded execute calls
			long startTime = System.currentTimeMillis();
			do {
				try {
					Thread.sleep(100);
				} catch (InterruptedException e) {}
			} while(isGPUContextCreated && (System.currentTimeMillis() - startTime) < 60000);
			synchronized(isGPUContextCreated) {
				if(GPUContext.currContext != null) {
					// NOTE: timeout above is 60000 ms; message corrected from "10 min".
					throw new RuntimeException("Cannot create multiple JCudaContext. Waited for 1 min to close previous GPUContext");
				}
			}
		}
		GPUContext.currContext = this;

		long start = System.nanoTime();
		LibMatrixCUDA.cudnnHandle = new cudnnHandle();
		cudnnCreate(LibMatrixCUDA.cudnnHandle);
		LibMatrixCUDA.cublasHandle = new cublasHandle();
		cublasCreate(LibMatrixCUDA.cublasHandle);
		Statistics.cudaLibrariesInitTime = System.nanoTime() - start;

		long free [] = { 0 };
		long total [] = { 0 };
		if(cudaMemGetInfo(free, total) == cudaSuccess) {
			totalNumBytes = total[0];
			availableNumBytesWithoutUtilFactor.set(free[0]);
		}
		else {
			throw new RuntimeException("ERROR: Unable to get memory information of the GPU.");
		}
		LOG.info("Total GPU memory: " + (totalNumBytes*(1e-6)) + " MB");
		LOG.info("Available GPU memory: " + (availableNumBytesWithoutUtilFactor.get()*(1e-6)) + " MB");
	}

	/**
	 * Destroys the cuDNN/cuBLAS handles and clears the singleton so a new
	 * context can be created.
	 */
	@Override
	public void destroy() throws DMLRuntimeException {
		if(currContext != null) {
			synchronized(isGPUContextCreated) {
				cudnnDestroy(LibMatrixCUDA.cudnnHandle);
				cublasDestroy(LibMatrixCUDA.cublasHandle);
				currContext = null;
				isGPUContextCreated = false;
			}
		}
		else if(LibMatrixCUDA.cudnnHandle != null || LibMatrixCUDA.cublasHandle != null) {
			throw new DMLRuntimeException("Error while destroying the GPUContext");
		}
	}

}
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java new file mode 100644 index 0000000..811f2dd --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java @@ -0,0 +1,263 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import static jcuda.runtime.JCuda.cudaFree; +import static jcuda.runtime.JCuda.cudaMalloc; +import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; +import jcuda.Pointer; +import jcuda.Sizeof; + +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.controlprogram.caching.CacheException; +import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; +import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; +import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.utils.Statistics; + +public class JCudaObject extends GPUObject { + + public Pointer jcudaPointer = null; + public long numBytes; + + JCudaObject(MatrixObject mat2) { + super(mat2); + } + + private void prepare(boolean isInput, int numElemsToAllocate) throws DMLRuntimeException { + if(jcudaPointer != null) { + // Already allocated on GPU and expected to be in sync + } + else { + if(isInput) { + if(numElemsToAllocate != -1) + throw new DMLRuntimeException("Expected numElemsToAllocate to be -1 as it is inferred from the input"); + // Copy performs allocation + copyFromHostToDevice(); + } + else { + // Don't copy just allocate + ensureFreeSpaceForDenseBlock(numElemsToAllocate); + allocateMemoryOnDevice(numElemsToAllocate); + synchronized(evictionLock) { + GPUContext.allocatedPointers.add(this); + } + } + } + numLocks.addAndGet(1); + } + + @Override + public void acquireDeviceRead() throws DMLRuntimeException { + prepare(true, -1); + if(!isAllocated) + throw new DMLRuntimeException("Expected device data to be allocated"); + } + + @Override + public void acquireDenseDeviceModify(int numElemsToAllocate) throws DMLRuntimeException { + prepare(false, numElemsToAllocate); + isDeviceCopyModified = true; + if(!isAllocated) + throw new 
DMLRuntimeException("Expected device data to be allocated"); + } + + @Override + public void acquireHostRead() throws CacheException { + if(isAllocated) { + try { + if(isDeviceCopyModified) { + copyFromDeviceToHost(); + } + } catch (DMLRuntimeException e) { + throw new CacheException(e); + } + } + } + + @Override + public void acquireHostModify() throws CacheException { + if(isAllocated) { + try { + if(isDeviceCopyModified) { + throw new DMLRuntimeException("Potential overwrite of GPU data"); + // copyFromDeviceToHost(); + } + clearData(); + } catch (DMLRuntimeException e) { + throw new CacheException(e); + } + } + } + + public void release(boolean isGPUCopyModified) throws CacheException { + if(numLocks.addAndGet(-1) < 0) { + throw new CacheException("Redundant release of GPU object"); + } + isDeviceCopyModified = isGPUCopyModified; + } + + @Override + void allocateMemoryOnDevice(int numElemToAllocate) throws DMLRuntimeException { + if(jcudaPointer == null) { + long start = System.nanoTime(); + jcudaPointer = new Pointer(); + if(numElemToAllocate == -1 && LibMatrixCUDA.isInSparseFormat(mat)) + throw new DMLRuntimeException("Sparse format not implemented"); + else if(numElemToAllocate == -1) { + // Called for dense input + numBytes = mat.getNumRows()*mat.getNumColumns()*Sizeof.DOUBLE; + cudaMalloc(jcudaPointer, numBytes); + JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes); + } + else { + // Called for dense output + numBytes = numElemToAllocate*Sizeof.DOUBLE; + cudaMalloc(jcudaPointer, numBytes); + JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(-numBytes); + } + + Statistics.cudaAllocTime.addAndGet(System.nanoTime()-start); + Statistics.cudaAllocCount.addAndGet(1); + + } + isAllocated = true; + } + + @Override + void deallocateMemoryOnDevice() { + if(jcudaPointer != null) { + long start = System.nanoTime(); + cudaFree(jcudaPointer); + JCudaContext.availableNumBytesWithoutUtilFactor.addAndGet(numBytes); + 
Statistics.cudaDeAllocTime.addAndGet(System.nanoTime()-start); + Statistics.cudaDeAllocCount.addAndGet(1); + + } + jcudaPointer = null; + isAllocated = false; + numLocks.set(0); + } + + void ensureFreeSpaceForDenseBlock(int numElem) throws DMLRuntimeException { + long GPUSize = (Sizeof.DOUBLE) * numElem; + if(GPUSize >= getAvailableMemory()) { + evict(GPUSize); + } + } + + @Override + void copyFromHostToDevice() + throws DMLRuntimeException + { + printCaller(); + long start = System.nanoTime(); + + MatrixBlock tmp = mat.acquireRead(); + if(tmp.isInSparseFormat()) { + throw new DMLRuntimeException("Sparse matrix is not implemented"); + // tmp.sparseToDense(); + } + else { + double[] data = tmp.getDenseBlock(); + + if( data == null && tmp.getSparseBlock() != null ) + throw new DMLRuntimeException("Incorrect sparsity calculation"); + else if( data==null && tmp.getNonZeros() != 0 ) + throw new DMLRuntimeException("MatrixBlock is not allocated"); + else if( tmp.getNonZeros() == 0 ) + data = new double[tmp.getNumRows()*tmp.getNumColumns()]; + + // Copy dense block + ensureFreeSpaceForDenseBlock(data.length); + allocateMemoryOnDevice(data.length); + synchronized(evictionLock) { + GPUContext.allocatedPointers.add(this); + } + cudaMemcpy(jcudaPointer, Pointer.to(data), mat.getNumRows()*mat.getNumColumns() * Sizeof.DOUBLE, cudaMemcpyHostToDevice); + } + + mat.release(); + + Statistics.cudaToDevTime.addAndGet(System.nanoTime()-start); + Statistics.cudaToDevCount.addAndGet(1); + } + + @Override + protected void copyFromDeviceToHost() throws DMLRuntimeException { + if(jcudaPointer != null) { + printCaller(); + if(LibMatrixCUDA.isInSparseFormat(mat)) + throw new DMLRuntimeException("Sparse format not implemented"); + else { + long start = System.nanoTime(); + MatrixBlock tmp = new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), false); + tmp.allocateDenseBlock(); + double [] data = tmp.getDenseBlock(); + + cudaMemcpy(Pointer.to(data), jcudaPointer, data.length * 
Sizeof.DOUBLE, cudaMemcpyDeviceToHost); + + tmp.recomputeNonZeros(); + mat.acquireModify(tmp); + mat.release(); + + Statistics.cudaFromDevTime.addAndGet(System.nanoTime()-start); + Statistics.cudaFromDevCount.addAndGet(1); + } + } + else { + throw new DMLRuntimeException("Cannot copy from device to host as JCuda pointer is not allocated"); + } + isDeviceCopyModified = false; + } + + @Override + protected long getSizeOnDevice() throws DMLRuntimeException { + long GPUSize = 0; + int rlen = (int) mat.getNumRows(); + int clen = (int) mat.getNumColumns(); + + if(LibMatrixCUDA.isInSparseFormat(mat)) { + throw new DMLRuntimeException("Sparse format not implemented"); + } + else { + GPUSize = (Sizeof.DOUBLE) * (long) (rlen * clen); + } + return GPUSize; + } + + private String getClassAndMethod(StackTraceElement st) { + String [] str = st.getClassName().split("\\."); + return str[str.length - 1] + "." + st.getMethodName(); + } + + private void printCaller() { + if(JCudaContext.DEBUG) { + StackTraceElement[] st = Thread.currentThread().getStackTrace(); + String ret = getClassAndMethod(st[1]); + for(int i = 2; i < st.length && i < 7; i++) { + ret += "->" + getClassAndMethod(st[i]); + } + System.out.println("CALL_STACK:" + ret); + } + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 45f68dd..fc1f657 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -49,7 +49,7 @@ import jcuda.jcudnn.cudnnTensorDescriptor; import org.apache.sysml.runtime.DMLRuntimeException; import 
org.apache.sysml.runtime.controlprogram.caching.MatrixObject; -import org.apache.sysml.runtime.controlprogram.context.JCudaObject; +import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject; //FIXME move could to respective instructions, this is not a block library public class LibMatrixCUDA { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/be263212/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index b68a51c..0565136 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -24,6 +24,7 @@ import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicLong; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; @@ -31,7 +32,7 @@ import org.apache.sysml.runtime.util.ConvolutionUtils; public class LibMatrixDNN { - public static boolean ALLOW_MULTI_THREADED_OPS = true; + public static final boolean ALLOW_MULTI_THREADED_OPS = true; // Using hashmap to avoid any performance impacts of multimap private static final ConcurrentHashMap<Integer, SoftReference<double[]>> non_zeroed_double_arr = new ConcurrentHashMap<Integer, SoftReference<double[]>>(); private static final int NON_ZEROED_DOUBLE_ARR_THRESHOLD = 100; @@ -64,6 +65,8 @@ public class LibMatrixDNN { public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w; public int P; public int Q; public int numThreads; + public AtomicLong outputNNZ = new AtomicLong(-1); + MatrixBlock input1; MatrixBlock input2; 
MatrixBlock output; boolean reuseNonZeroedOutput = false; @@ -219,6 +222,8 @@ public class LibMatrixDNN { throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.K*params.P*params.Q); } + params.outputNNZ.set(0); + int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { for (int n = 0; n < params.N; n++) { @@ -240,6 +245,7 @@ public class LibMatrixDNN { if (!params.output.isInSparseFormat()) outputArray = params.output.getDenseBlock(); + long tmpNNZ = 0; for (int p = 0; p < params.P; p++) { for (int q = 0; q < params.Q; q++) { int start_index_h = p * params.stride_h - params.pad_h; @@ -258,10 +264,13 @@ public class LibMatrixDNN { else inVal = params.input1.quickGetValue(n, c*params.H*params.W + h*params.W + w); outputArray[out_index] = Math.max(outputArray[out_index], inVal); + if(outputArray[out_index] != 0) + tmpNNZ++; } } } } + params.outputNNZ.addAndGet(tmpNNZ); } // Reshape a 4D tensor of dimension (N, K, P, Q) to matrix of dimension (K, NPQ) @@ -436,6 +445,7 @@ public class LibMatrixDNN { params.input1 = input; params.output = outputBlock; + params.outputNNZ.set(0); int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads); if(!ALLOW_MULTI_THREADED_OPS || constrainedNumThreads <= 1) { for (int n = 0; n < params.N; n++) { // Do following for all images @@ -525,6 +535,7 @@ public class LibMatrixDNN { final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W; final int outputOffset = (c*params.R*params.S*params.N + n)*params.P*params.Q; + long tmpNNZ = 0; for (int r = 0; r < params.R; r++) { // Get an input patch of size R X S for (int s = 0; s < params.S; s++) { int localIndex = outputOffset + ((r*params.S*params.N + s*params.N)*params.P*params.Q); @@ -541,6 +552,8 @@ public class LibMatrixDNN { outputArray[localIndex] = 
inputArray[inputOffset + input_row*params.W + input_col]; else outputArray[localIndex] = params.input1.quickGetValue(n, c*params.H*params.W + input_row*params.W + input_col); + if(outputArray[localIndex] != 0) + tmpNNZ++; } else if(params.reuseNonZeroedOutput) { outputArray[localIndex] = 0; @@ -560,5 +573,6 @@ public class LibMatrixDNN { } } + params.outputNNZ.addAndGet(tmpNNZ); } }