http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index 07e766a..c737e92 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -112,6 +112,7 @@ public class DMLScript LFU, // Evict the least frequently used GPUObject. MIN_EVICT, MRU, // http://www.vldb.org/conf/1985/P127.PDF + ALIGN_MEMORY // TODO: // ARC, // https://dbs.uni-leipzig.de/file/ARC.pdf // LOOP_AWARE // different policies for operations in for/while/parfor loop vs out-side the loop @@ -172,7 +173,8 @@ public class DMLScript public static ExplainType EXPLAIN = DMLOptions.defaultOptions.explainType; // explain type public static String DML_FILE_PATH_ANTLR_PARSER = DMLOptions.defaultOptions.filePath; // filename of dml/pydml script public static String FLOATING_POINT_PRECISION = "double"; // data type to use internally - public static EvictionPolicy GPU_EVICTION_POLICY = EvictionPolicy.LRU; // currently employed GPU eviction policy + public static EvictionPolicy GPU_EVICTION_POLICY = EvictionPolicy.ALIGN_MEMORY; // currently employed GPU eviction policy + public static boolean PRINT_GPU_MEMORY_INFO = false; // whether to print GPU memory-related information /** * Global variable indicating the script type (DML or PYDML). Can be used
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index d587f48..2d913b6 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -27,6 +27,7 @@ import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.codegen.SpoofCompiler; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.Program; +import org.apache.sysml.runtime.controlprogram.caching.CacheableData; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; @@ -70,7 +71,9 @@ public class ScriptExecutorUtils { // Whether extra statistics useful for developers and others interested // in digging into performance problems are recorded and displayed DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); + DMLScript.PRINT_GPU_MEMORY_INFO = dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO); DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU); + CacheableData.CACHING_BUFFER_SIZE = dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE); DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE); DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN); NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim()); http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/conf/DMLConfig.java 
---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index fb0237f..7279c57 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -81,6 +81,7 @@ public class DMLConfig public static final String CODEGEN_OPTIMIZER = "sysml.codegen.optimizer"; //see SpoofCompiler.PlanSelector public static final String CODEGEN_PLANCACHE = "sysml.codegen.plancache"; //boolean public static final String CODEGEN_LITERALS = "sysml.codegen.literals"; //1..heuristic, 2..always + public static final String CACHING_BUFFER_SIZE = "sysml.caching.bufferSize"; //double: default:0.15 public static final String EXTRA_FINEGRAINED_STATS = "sysml.stats.finegrained"; //boolean public static final String STATS_MAX_WRAP_LEN = "sysml.stats.maxWrapLength"; //int @@ -92,7 +93,8 @@ public class DMLConfig // to handle the tradeoff on calling cudaMemGetInfo too often. 
public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor"; public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single - + public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo"; + // supported prefixes for custom map/reduce configurations public static final String PREFIX_MAPRED = "mapred"; public static final String PREFIX_MAPREDUCE = "mapreduce"; @@ -136,7 +138,8 @@ public class DMLConfig _defaultVals.put(STATS_MAX_WRAP_LEN, "30" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); _defaultVals.put(AVAILABLE_GPUS, "-1"); - _defaultVals.put(GPU_EVICTION_POLICY, "lru"); + _defaultVals.put(GPU_EVICTION_POLICY, "align_memory"); + _defaultVals.put(SYNCHRONIZE_GPU, "false" ); + _defaultVals.put(CACHING_BUFFER_SIZE, "0.15" ); - _defaultVals.put(SYNCHRONIZE_GPU, "true" ); _defaultVals.put(EAGER_CUDA_FREE, "false" ); _defaultVals.put(FLOATING_POINT_PRECISION, "double" ); @@ -420,7 +424,7 @@ public class DMLConfig CP_PARALLEL_OPS, CP_PARALLEL_IO, NATIVE_BLAS, NATIVE_BLAS_DIR, COMPRESSED_LINALG, CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, - EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, + EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE, AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY }; http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java index 73f6c08..0bb2a67 100644 ---
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java @@ -77,7 +77,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data // global constant configuration parameters public static final long CACHING_THRESHOLD = (long)Math.max(4*1024, //obj not s.t. caching 1e-5 * InfrastructureAnalyzer.getLocalMaxMemory()); //if below threshold [in bytes] - public static final double CACHING_BUFFER_SIZE = 0.15; + public static double CACHING_BUFFER_SIZE = 0.15; public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO; public static final boolean CACHING_BUFFER_PAGECACHE = false; public static final boolean CACHING_WRITE_CACHE_ON_READ = false; @@ -607,7 +607,7 @@ public abstract class CacheableData<T extends CacheBlock> extends Data if( _gpuObjects != null ) for (GPUObject gObj : _gpuObjects.values()) if (gObj != null) - gObj.clearData(); + gObj.clearData(null, DMLScript.EAGER_CUDA_FREE); // change object state EMPTY setDirty(false); http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java index bdc5b21..7eb4033 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CSRPointer.java @@ -416,10 +416,6 @@ public class CSRPointer { return getGPUContext().allocate(size); } - private void cudaFreeHelper(Pointer toFree, boolean eager) { - getGPUContext().cudaFreeHelper(toFree, eager); - } - private GPUContext getGPUContext() { return gpuContext; } @@ -494,17 +490,20 @@ public class CSRPointer { */ public void 
deallocate(boolean eager) { if (nnz > 0) { - cudaFreeHelper(val, eager); - cudaFreeHelper(rowPtr, eager); - cudaFreeHelper(colInd, eager); - val = null; - rowPtr = null; - colInd = null; + if (val != null) + getGPUContext().cudaFreeHelper(null, val, eager); + if (rowPtr != null) + getGPUContext().cudaFreeHelper(null, rowPtr, eager); + if (colInd != null) + getGPUContext().cudaFreeHelper(null, colInd, eager); } + val = null; + rowPtr = null; + colInd = null; } @Override public String toString() { return "CSRPointer{" + "nnz=" + nnz + '}'; } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 7f8f106..2d3918c 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -37,6 +37,7 @@ import org.apache.sysml.api.DMLScript; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.utils.GPUStatistics; + import jcuda.Pointer; import jcuda.jcublas.cublasHandle; import jcuda.jcudnn.cudnnHandle; @@ -63,6 +64,7 @@ public class GPUContext { * active device assigned to this GPUContext instance */ private final int deviceNum; + /** * cudnnHandle for Deep Neural Network operations on the GPU */ @@ -130,9 +132,11 @@ public class GPUContext { } } - private void initializeCudaLibraryHandles() { - deleteCudaLibraryHandles(); - + private void initializeCudaLibraryHandles() throws DMLRuntimeException { + // We don't need to explicitly delete the handles if we are planning to create them again. 
+ // This has a huge performance impact on scripts that has large number of layers (i.e. FunctionCallCP) for example ResNet. + // If this is absolutely required for parfor, please add appropriate safeguard for non-parfor scripts. + // deleteCudaLibraryHandles(); if (cudnnHandle == null) { cudnnHandle = new cudnnHandle(); cudnnCreate(cudnnHandle); @@ -150,11 +154,6 @@ public class GPUContext { cusparseHandle = new cusparseHandle(); cusparseCreate(cusparseHandle); } - - if (cusolverDnHandle == null) { - cusolverDnHandle = new cusolverDnHandle(); - cusolverDnCreate(cusolverDnHandle); - } if (kernels == null) { kernels = new JCudaKernels(); @@ -204,36 +203,6 @@ public class GPUContext { return memoryManager.malloc(instructionName, size); } - - /** - * Does lazy cudaFree calls. - * - * @param toFree {@link Pointer} instance to be freed - */ - public void cudaFreeHelper(final Pointer toFree) { - cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE); - } - - /** - * Does lazy/eager cudaFree calls. - * - * @param toFree {@link Pointer} instance to be freed - * @param eager true if to be done eagerly - */ - public void cudaFreeHelper(final Pointer toFree, boolean eager) { - cudaFreeHelper(null, toFree, eager); - } - - /** - * Does lazy cudaFree calls. - * - * @param instructionName name of the instruction for which to record per instruction free time, null if do not want to record - * @param toFree {@link Pointer} instance to be freed - */ - public void cudaFreeHelper(String instructionName, final Pointer toFree) { - cudaFreeHelper(instructionName, toFree, DMLScript.EAGER_CUDA_FREE); - } - /** * Does cudaFree calls, lazily. 
* @@ -290,7 +259,7 @@ public class GPUContext { */ public GPUObject createGPUObject(MatrixObject mo) { GPUObject ret = new GPUObject(this, mo); - getMemoryManager().addGPUObject(ret); + getMemoryManager().getGPUMatrixMemoryManager().addGPUObject(ret); return ret; } @@ -376,6 +345,15 @@ public class GPUContext { * @return cusolverDnHandle for current thread */ public cusolverDnHandle getCusolverDnHandle() { + if (cusolverDnHandle == null) { + synchronized(this) { + if (cusolverDnHandle == null) { + // Since cusolverDnHandle handle is rarely used and occupies unnecessary memory, it is only initialized when needed. + cusolverDnHandle = new cusolverDnHandle(); + cusolverDnCreate(cusolverDnHandle); + } + } + } return cusolverDnHandle; } @@ -442,4 +420,4 @@ public class GPUContext { public String toString() { return "GPUContext{" + "deviceNum=" + deviceNum + '}'; } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java new file mode 100644 index 0000000..830f7df --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPULazyCudaFreeMemoryManager.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.stream.Collectors; +import java.util.Optional; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.api.DMLScript; +import org.apache.sysml.runtime.DMLRuntimeException; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; + +import jcuda.Pointer; + +public class GPULazyCudaFreeMemoryManager { + protected static final Log LOG = LogFactory.getLog(GPULazyCudaFreeMemoryManager.class.getName()); + GPUMemoryManager gpuManager; + public GPULazyCudaFreeMemoryManager(GPUMemoryManager gpuManager) { + this.gpuManager = gpuManager; + } + + /** + * Map of free blocks allocate on GPU. 
maps size_of_block -> pointer on GPU + */ + private HashMap<Long, Set<Pointer>> rmvarGPUPointers = new HashMap<Long, Set<Pointer>>(); + + /** + * Get any pointer of the given size from rmvar-ed pointers (applicable if eager cudaFree is set to false) + * + * @param opcode opcode + * @param size size in bytes + * @return pointer + */ + public Pointer getRmvarPointer(String opcode, long size) { + if (rmvarGPUPointers.containsKey(size)) { + if(LOG.isTraceEnabled()) + LOG.trace("Getting rmvar-ed pointers for size:" + size); + Pointer A = remove(rmvarGPUPointers, size); // remove from rmvarGPUPointers as you are not calling cudaFree + return A; + } + else { + return null; + } + } + + public Set<Pointer> getAllPointers() { + return rmvarGPUPointers.values().stream().flatMap(ptrs -> ptrs.stream()).collect(Collectors.toSet()); + } + + public void clearAll() { + Set<Pointer> toFree = new HashSet<Pointer>(); + for(Set<Pointer> ptrs : rmvarGPUPointers.values()) { + toFree.addAll(ptrs); + } + rmvarGPUPointers.clear(); + for(Pointer ptr : toFree) { + gpuManager.guardedCudaFree(ptr); + } + } + + public Pointer getRmvarPointerMinSize(String opcode, long minSize) throws DMLRuntimeException { + Optional<Long> toClear = rmvarGPUPointers.entrySet().stream().filter(e -> e.getValue().size() > 0).map(e -> e.getKey()) + .filter(size -> size >= minSize).min((s1, s2) -> s1 < s2 ? -1 : 1); + if(toClear.isPresent()) { + long t0 = opcode != null && DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; + Pointer A = remove(rmvarGPUPointers, toClear.get()); // remove from rmvarGPUPointers as you are not calling cudaFree + gpuManager.addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0); + return A; + } + return null; + } + + + /** + * Remove any pointer in the given hashmap + * + * @param hm hashmap of size, pointers + * @param size size in bytes + * @return the pointer that was removed + */ + private Pointer remove(HashMap<Long, Set<Pointer>> hm, long size) { + Pointer A = hm.get(size).iterator().next(); + remove(hm, size, A); + return A; + } + + /** + * Remove a specific pointer in the given hashmap + * + * @param hm hashmap of size, pointers + * @param size size in bytes + * @param ptr pointer to be removed + */ + private void remove(HashMap<Long, Set<Pointer>> hm, long size, Pointer ptr) { + hm.get(size).remove(ptr); + if (hm.get(size).isEmpty()) + hm.remove(size); + } + + /** + * Return the total memory in bytes used by this memory manager + * @return number of bytes + */ + public long getTotalMemoryAllocated() { + long rmvarMemoryAllocated = 0; + for(long numBytes : rmvarGPUPointers.keySet()) { + rmvarMemoryAllocated += numBytes; + } + return rmvarMemoryAllocated; + } + + /** + * Get total number of rmvared pointers + * + * @return number of pointers + */ + public int getNumPointers() { + return rmvarGPUPointers.size(); + } + + /** + * Add a pointer to the rmvar-ed list + * @param size size of the pointer + * @param toFree pointer + */ + public void add(long size, Pointer toFree) { + Set<Pointer> freeList = rmvarGPUPointers.get(size); + if (freeList == null) { + freeList = new HashSet<Pointer>(); + rmvarGPUPointers.put(size, freeList); + } + if (freeList.contains(toFree)) + throw new RuntimeException("GPU : Internal state corrupted, double free"); + freeList.add(toFree); + } + + /** + * Remove a specific pointer if present in the internal hashmap + * + * @param size size in bytes + * @param ptr pointer to be removed + */ + public 
void removeIfPresent(long size, Pointer ptr) { + if(rmvarGPUPointers.containsKey(size) && rmvarGPUPointers.get(size).contains(ptr)) { + rmvarGPUPointers.get(size).remove(ptr); + if (rmvarGPUPointers.get(size).isEmpty()) + rmvarGPUPointers.remove(size); + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java new file mode 100644 index 0000000..066905b --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import java.util.Comparator; +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import jcuda.Pointer; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysml.runtime.DMLRuntimeException; + +public class GPUMatrixMemoryManager { + protected static final Log LOG = LogFactory.getLog(GPUMatrixMemoryManager.class.getName()); + GPUMemoryManager gpuManager; + public GPUMatrixMemoryManager(GPUMemoryManager gpuManager) { + this.gpuManager = gpuManager; + } + + /** + * Adds the GPU object to the memory manager + * + * @param gpuObj the handle to the GPU object + */ + void addGPUObject(GPUObject gpuObj) { + gpuObjects.add(gpuObj); + } + + /** + * Returns worst-case contiguous memory size + * @param gpuObj gpu object + * @return memory size in bytes + */ + long getWorstCaseContiguousMemorySize(GPUObject gpuObj) { + long ret = 0; + if(!gpuObj.isDensePointerNull()) { + ret = gpuManager.allPointers.get(gpuObj.getDensePointer()).getSizeInBytes(); + } + else if(gpuObj.getJcudaSparseMatrixPtr() != null) { + CSRPointer sparsePtr = gpuObj.getJcudaSparseMatrixPtr(); + if(sparsePtr.nnz > 0) { + if(sparsePtr.rowPtr != null) + ret = Math.max(ret, gpuManager.allPointers.get(sparsePtr.rowPtr).getSizeInBytes()); + if(sparsePtr.colInd != null) + ret = Math.max(ret, gpuManager.allPointers.get(sparsePtr.colInd).getSizeInBytes()); + if(sparsePtr.val != null) + ret = Math.max(ret, gpuManager.allPointers.get(sparsePtr.val).getSizeInBytes()); + } + } + return ret; + } + + /** + * Get list of all Pointers in a GPUObject + * @param gObj gpu object + * @return set of pointers + */ + Set<Pointer> getPointers(GPUObject gObj) { + Set<Pointer> ret = new HashSet<>(); + if(!gObj.isDensePointerNull() && gObj.getSparseMatrixCudaPointer() != null) { + LOG.warn("Matrix allocated in both dense and sparse format"); + } + 
if(!gObj.isDensePointerNull()) { + ret.add(gObj.getDensePointer()); + } + if(gObj.getSparseMatrixCudaPointer() != null) { + CSRPointer sparsePtr = gObj.getSparseMatrixCudaPointer(); + if(sparsePtr != null) { + if(sparsePtr.rowPtr != null) + ret.add(sparsePtr.rowPtr); + else if(sparsePtr.colInd != null) + ret.add(sparsePtr.colInd); + else if(sparsePtr.val != null) + ret.add(sparsePtr.val); + } + } + return ret; + } + + /** + * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU + * These are matrices allocated on the GPU on which rmvar hasn't been called yet. + * If a {@link GPUObject} has more than one lock on it, it cannot be freed + * If it has zero locks on it, it can be freed, but it is preferrable to keep it around + * so that an extraneous host to dev transfer can be avoided + */ + HashSet<GPUObject> gpuObjects = new HashSet<>(); + + /** + * Get GPUObjects from the first memory sections "Matrix Memory" + * @param locked return locked GPU objects if true + * @param dirty return dirty GPU objects if true + * @return set of GPU Objects + */ + Set<GPUObject> getGPUObjects(boolean locked, boolean dirty) { + return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).collect(Collectors.toSet()); + } + + /** + * Return all pointers in the first section + * @return all pointers in this section + */ + Set<Pointer> getPointers() { + return gpuObjects.stream().flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet()); + } + + /** + * Get pointers from the first memory sections "Matrix Memory" + * @param locked return locked pointers if true + * @param dirty return dirty pointers if true + * @return set of pointers + */ + Set<Pointer> getPointers(boolean locked, boolean dirty) { + return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet()); + } + + /** + * Clear the memory of 
the gpu object that matches the provided parameters + * + * @param locked is locked + * @param dirty is dirty + * @param minSize of atleast given size + * @param comparator sorting comparator in case there are more than one gpu object that matches above parameters + * @param opcode instruction code + * @return true if a gpu object satisfies the above condition else false + * @throws DMLRuntimeException if error occurs + */ + boolean clear(boolean locked, boolean dirty, long minSize, Comparator<GPUObject> comparator, String opcode) throws DMLRuntimeException { + Optional<GPUObject> toClear = getGPUObjects(locked, dirty).stream() + .filter(gObj -> getWorstCaseContiguousMemorySize(gObj) >= minSize) + .max(comparator); + if(toClear.isPresent()) { + GPUObject gObj = toClear.get(); + if(gObj.dirty) + gObj.copyFromDeviceToHost(opcode, true, true); // Perform eviction if dirty + else + gObj.clearData(opcode, true); + gpuObjects.remove(gObj); + } + return toClear.isPresent(); + } + + /** + * Clear all unlocked gpu objects + * + * @param opcode instruction code + * @throws DMLRuntimeException if error + */ + void clearAllUnlocked(String opcode) throws DMLRuntimeException { + Set<GPUObject> unlockedGPUObjects = gpuObjects.stream() + .filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toSet()); + if(unlockedGPUObjects.size() > 0) { + if(LOG.isWarnEnabled()) + LOG.warn("Clearing all unlocked matrices (count=" + unlockedGPUObjects.size() + ")."); + for(GPUObject toBeRemoved : unlockedGPUObjects) { + if(toBeRemoved.dirty) + toBeRemoved.copyFromDeviceToHost(opcode, true, true); + else + toBeRemoved.clearData(opcode, true); + } + gpuObjects.removeAll(unlockedGPUObjects); + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index e2012a7..411606d 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -23,13 +23,14 @@ import static jcuda.runtime.JCuda.cudaMalloc; import static jcuda.runtime.JCuda.cudaMemGetInfo; import static jcuda.runtime.JCuda.cudaMemset; -import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.concurrent.atomic.LongAdder; +import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -42,7 +43,6 @@ import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; import org.apache.sysml.utils.GPUStatistics; import jcuda.Pointer; - /** * - All cudaFree and cudaMalloc in SystemML should go through this class to avoid OOM or incorrect results. * - This class can be refactored in future to accept a chunk of memory ahead of time rather than while execution. This will only thow memory-related errors during startup. @@ -50,67 +50,87 @@ import jcuda.Pointer; public class GPUMemoryManager { protected static final Log LOG = LogFactory.getLog(GPUMemoryManager.class.getName()); - // If the available free size is less than this factor, GPUMemoryManager will warn users of multiple programs grabbing onto GPU memory. - // This often happens if user tries to use both TF and SystemML, and TF grabs onto 90% of the memory ahead of time. - private static final double WARN_UTILIZATION_FACTOR = 0.7; + /*****************************************************************************************/ + // GPU Memory is divided into three major sections: + // 1. 
Matrix Memory: Memory allocated to matrices in SystemML and addressable by GPUObjects. + // This memory section is divided into three minor sections: + // 1.1 Locked Matrix Memory + // 1.2 UnLocked + Non-Dirty Matrix Memory + // 1.3 UnLocked + Dirty Matrix Memory + // To get the GPUObjects/Pointers in this section, please use getGPUObjects and getPointers methods of GPUMatrixMemoryManager. + // To clear GPUObjects/Pointers in this section, please use clear and clearAll methods of GPUMatrixMemoryManager. + // Both these methods allow to get/clear unlocked/locked and dirty/non-dirty objects of a certain size. + GPUMatrixMemoryManager matrixMemoryManager; + public GPUMatrixMemoryManager getGPUMatrixMemoryManager() { + return matrixMemoryManager; + } - // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. - public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig() - .getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); + // 2. Rmvar-ed pointers: If sysml.gpu.eager.cudaFree is set to false, + // then this manager caches pointers of the GPUObject on which rmvar instruction has been executed for future reuse. + // We observe 2-3x improvement with this approach and hence recommend to set this flag to false. + GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager; + public GPULazyCudaFreeMemoryManager getGPULazyCudaFreeMemoryManager() { + return lazyCudaFreeMemoryManager; + } - /** - * Map of free blocks allocate on GPU. maps size_of_block -> pointer on GPU - */ - private HashMap<Long, Set<Pointer>> rmvarGPUPointers = new HashMap<Long, Set<Pointer>>(); + // 3. Non-matrix locked pointers: Other pointers (required for execution of an instruction that are not memory). 
For example: workspace + // These pointers are not explicitly tracked by a memory manager but one can get them by using getNonMatrixLockedPointers + private Set<Pointer> getNonMatrixLockedPointers() { + Set<Pointer> managedPointers = matrixMemoryManager.getPointers(); + managedPointers.addAll(lazyCudaFreeMemoryManager.getAllPointers()); + return nonIn(allPointers.keySet(), managedPointers); + } - /** - * list of allocated {@link GPUObject} instances allocated on {@link GPUContext#deviceNum} GPU - * These are matrices allocated on the GPU on which rmvar hasn't been called yet. - * If a {@link GPUObject} has more than one lock on it, it cannot be freed - * If it has zero locks on it, it can be freed, but it is preferrable to keep it around - * so that an extraneous host to dev transfer can be avoided - */ - private ArrayList<GPUObject> allocatedGPUObjects = new ArrayList<>(); /** - * To record size of allocated blocks + * To record size of all allocated pointers allocated by above memory managers */ - private HashMap<Pointer, Long> allocatedGPUPointers = new HashMap<>(); - - /** - * Adds the GPU object to the memory manager - * - * @param gpuObj the handle to the GPU object - */ - public void addGPUObject(GPUObject gpuObj) { - allocatedGPUObjects.add(gpuObj); - } - - /** - * Removes the GPU object from the memory manager - * - * @param gpuObj the handle to the GPU object - */ - public void removeGPUObject(GPUObject gpuObj) { - if(LOG.isDebugEnabled()) - LOG.debug("Removing the GPU object: " + gpuObj); - allocatedGPUObjects.removeIf(a -> a.equals(gpuObj)); - } + HashMap<Pointer, PointerInfo> allPointers = new HashMap<>(); + /*****************************************************************************************/ + /** * Get size of allocated GPU Pointer * @param ptr pointer to get size of * @return either the size or -1 if no such pointer exists */ public long getSizeAllocatedGPUPointer(Pointer ptr) { - if(allocatedGPUPointers.containsKey(ptr)) { - return 
allocatedGPUPointers.get(ptr); + if(allPointers.containsKey(ptr)) { + return allPointers.get(ptr).getSizeInBytes(); } return -1; } + /** + * Utility to debug memory leaks + */ + static class PointerInfo { + private long sizeInBytes; + private StackTraceElement[] stackTraceElements; + public PointerInfo(long sizeInBytes) { + if(DMLScript.PRINT_GPU_MEMORY_INFO) { + this.stackTraceElements = Thread.currentThread().getStackTrace(); + } + this.sizeInBytes = sizeInBytes; + } + public long getSizeInBytes() { + return sizeInBytes; + } + } + + // If the available free size is less than this factor, GPUMemoryManager will warn users of multiple programs grabbing onto GPU memory. + // This often happens if user tries to use both TF and SystemML, and TF grabs onto 90% of the memory ahead of time. + private static final double WARN_UTILIZATION_FACTOR = 0.7; + + // Invoke cudaMemGetInfo to get available memory information. Useful if GPU is shared among multiple application. + public double GPU_MEMORY_UTILIZATION_FACTOR = ConfigurationManager.getDMLConfig() + .getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); + + public GPUMemoryManager(GPUContext gpuCtx) { + matrixMemoryManager = new GPUMatrixMemoryManager(this); + lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this); long free[] = { 0 }; long total[] = { 0 }; cudaMemGetInfo(free, total); @@ -135,19 +155,48 @@ public class GPUMemoryManager { * @param size size in bytes * @return allocated pointer */ - private Pointer cudaMallocWarnIfFails(Pointer A, long size) { + private Pointer cudaMallocNoWarn(Pointer A, long size) { try { cudaMalloc(A, size); - allocatedGPUPointers.put(A, size); + allPointers.put(A, new PointerInfo(size)); return A; } catch(jcuda.CudaException e) { - LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " + size + " is available. 
" - + "This usually happens if there are external programs trying to grab on to memory in parallel."); return null; } } /** + * Pretty printing utility to debug OOM error + * + * @param stackTrace stack trace + * @param index call depth + * @return pretty printed string + */ + private String getCallerInfo(StackTraceElement [] stackTrace, int index) { + if(stackTrace.length <= index) + return "->"; + else + return "->" + stackTrace[index].getClassName() + "." + stackTrace[index].getMethodName() + "(" + stackTrace[index].getFileName() + ":" + stackTrace[index].getLineNumber() + ")"; + } + + /** + * Pretty printing utility to print bytes + * + * @param numBytes number of bytes + * @return a human-readable display value + */ + private String byteCountToDisplaySize(long numBytes) { + // return org.apache.commons.io.FileUtils.byteCountToDisplaySize(bytes); // performs rounding + if (numBytes < 1024) { + return numBytes + " bytes"; + } + else { + int exp = (int) (Math.log(numBytes) / 6.931471805599453); + return String.format("%.3f %sB", ((double)numBytes) / Math.pow(1024, exp), "KMGTP".charAt(exp-1)); + } + } + + /** * Allocate pointer of the given size in bytes. * * @param opcode instruction name @@ -156,20 +205,26 @@ public class GPUMemoryManager { */ public Pointer malloc(String opcode, long size) { if(size < 0) { - throw new DMLRuntimeException("Cannot allocate memory of size " + size); + throw new DMLRuntimeException("Cannot allocate memory of size " + byteCountToDisplaySize(size)); } long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; + long mallocStart = 0; // Step 1: First try reusing exact match in rmvarGPUPointers to avoid holes in the GPU memory - Pointer A = getRmvarPointer(opcode, size); + Pointer A = lazyCudaFreeMemoryManager.getRmvarPointer(opcode, size); + if(A != null) + addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0); + Pointer tmpA = (A == null) ? 
new Pointer() : null; // Step 2: Allocate a new pointer in the GPU memory (since memory is available) if(A == null && size <= getAvailableMemory()) { - A = cudaMallocWarnIfFails(new Pointer(), size); + mallocStart = DMLScript.STATISTICS ? System.nanoTime() : 0; + A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues + addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); if(LOG.isTraceEnabled()) { if(A == null) - LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + size); + LOG.trace("Couldnot allocate a new pointer in the GPU memory:" + byteCountToDisplaySize(size)); else - LOG.trace("Allocated a new pointer in the GPU memory:" + size); + LOG.trace("Allocated a new pointer in the GPU memory:" + byteCountToDisplaySize(size)); } } @@ -178,21 +233,19 @@ public class GPUMemoryManager { // Step 3: Try reusing non-exact match entry of rmvarGPUPointers if(A == null) { - // Find minimum key that is greater than size - long key = Long.MAX_VALUE; - for(Long k : rmvarGPUPointers.keySet()) { - key = k > size ? Math.min(key, k) : key; - } - if(key != Long.MAX_VALUE) { - A = getRmvarPointer(opcode, key); - // To avoid potential for holes in the GPU memory + A = lazyCudaFreeMemoryManager.getRmvarPointerMinSize(opcode, size); + if(A != null) { guardedCudaFree(A); - A = cudaMallocWarnIfFails(new Pointer(), size); - if(LOG.isTraceEnabled()) { + mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues + addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { if(A == null) - LOG.trace("Couldnot reuse non-exact match of rmvarGPUPointers:" + size); - else - LOG.trace("Reuses a non-exact match from rmvarGPUPointers:" + size); + LOG.info("Couldnot reuse non-exact match of rmvarGPUPointers:" + byteCountToDisplaySize(size)); + else { + LOG.info("Reuses a non-exact match from rmvarGPUPointers:" + byteCountToDisplaySize(size)); + LOG.info("GPU Memory info after reusing a non-exact match from rmvarGPUPointers:" + toString()); + } } } } @@ -205,20 +258,16 @@ public class GPUMemoryManager { // Step 4: Eagerly free-up rmvarGPUPointers and check if memory is available on GPU if(A == null) { - Set<Pointer> toFree = new HashSet<Pointer>(); - for(Set<Pointer> ptrs : rmvarGPUPointers.values()) { - toFree.addAll(ptrs); - } - for(Pointer ptr : toFree) { - guardedCudaFree(ptr); - } + lazyCudaFreeMemoryManager.clearAll(); if(size <= getAvailableMemory()) { - A = cudaMallocWarnIfFails(new Pointer(), size); - if(LOG.isTraceEnabled()) { + A = cudaMallocNoWarn(tmpA, size); + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { if(A == null) - LOG.trace("Couldnot allocate a new pointer in the GPU memory after eager free:" + size); - else - LOG.trace("Allocated a new pointer in the GPU memory after eager free:" + size); + LOG.info("Couldnot allocate a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size)); + else { + LOG.info("Allocated a new pointer in the GPU memory after eager free:" + byteCountToDisplaySize(size)); + LOG.info("GPU Memory info after allocating new pointer post lazyCudaFreeMemoryManager.clearAll():" + toString()); + } } } } @@ -228,36 +277,114 @@ 
public class GPUMemoryManager { // Step 5: Try eviction based on the given policy if(A == null) { t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; - // Sort based on the eviction policy - Collections.sort(allocatedGPUObjects, new GPUComparator(size)); - while (size > getAvailableMemory() && allocatedGPUObjects.size() > 0) { - GPUObject toBeRemoved = allocatedGPUObjects.get(allocatedGPUObjects.size() - 1); - if (toBeRemoved.isLocked()) { - // All remaining blocks will also be locked - break; - } - else { - // Perform eviction - if (toBeRemoved.dirty) { - toBeRemoved.copyFromDeviceToHost(opcode, true); - } - toBeRemoved.clearData(true); + + // First, clear unlocked non-dirty matrices greater than or equal to size + // Comparator clears the largest matrix to avoid future evictions + boolean success = matrixMemoryManager.clear(false, false, size, SIMPLE_COMPARATOR_SORT_BY_SIZE, opcode); + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { + if(success) { + LOG.info("Cleared an unlocked non-dirty matrix greater than or equal to " + byteCountToDisplaySize(size)); + LOG.info("GPU Memory info after clearing an unlocked non-dirty matrix:" + toString()); } + else + LOG.info("No unlocked non-dirty matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for clearing."); } - addMiscTime(opcode, GPUStatistics.cudaEvictionCount, GPUStatistics.cudaEvictTime, GPUInstruction.MISC_TIMER_EVICT, t0); - if(size <= getAvailableMemory()) { - A = cudaMallocWarnIfFails(new Pointer(), size); - if(LOG.isTraceEnabled()) { - if(A == null) - LOG.trace("Couldnot allocate a new pointer in the GPU memory after eviction:" + size); + if(!success) { + // First, clear unlocked dirty matrices greater than or equal to size using the eviction policy + // Comparator clears the largest matrix to avoid future evictions + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { + LOG.info("GPU Memory info before eviction:" + toString()); + } + success = 
matrixMemoryManager.clear(false, true, size, new EvictionPolicyBasedComparator(size), opcode); + // JCuda.cudaDeviceSynchronize(); + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { + if(success) { + LOG.info("Evicted an unlocked dirty matrix greater than or equal to " + byteCountToDisplaySize(size)); + LOG.info("GPU Memory info after evicting an unlocked dirty matrix:" + toString()); + } else - LOG.trace("Allocated a new pointer in the GPU memory after eviction:" + size); + LOG.info("No unlocked dirty matrix greater than or equal to " + byteCountToDisplaySize(size) + " found for evicted."); } + + if(!success) { + // Minor optimization: clear all unlocked non-dirty matrices before attempting eviction + // Delete all non-dirty + List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() + .filter(gpuObj -> !gpuObj.isLocked() && !gpuObj.isDirty()).collect(Collectors.toList()); + matrixMemoryManager.gpuObjects.removeAll(unlockedGPUObjects); + for(GPUObject toBeRemoved : unlockedGPUObjects) { + toBeRemoved.clearData(opcode, true); + } + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { + LOG.info("GPU Memory info after clearing all unlocked non-dirty matrices:" + toString()); + } + mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues + addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); + + // --------------------------------------------------------------- + // Evict unlocked GPU objects one-by-one and try malloc + unlockedGPUObjects = null; + if(A == null) { + unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() + .filter(gpuObj -> !gpuObj.isLocked() && gpuObj.isDirty()).collect(Collectors.toList()); + Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size)); + while(A == null && unlockedGPUObjects.size() > 0) { + if(DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) { + // TODO: Optimize later using sliding window + // Evict as many sequential dense objects from back of the queue as possible + long neededSize = size; + while(neededSize >= 0 && unlockedGPUObjects.size() > 0) { + GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1); + neededSize -= matrixMemoryManager.getWorstCaseContiguousMemorySize(gpuObj); + gpuObj.copyFromDeviceToHost(opcode, true, true); + } + } + else { + GPUObject gpuObj = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1); + gpuObj.copyFromDeviceToHost(opcode, true, true); + } + mallocStart = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + A = cudaMallocNoWarn(tmpA, size); // Try malloc rather than check available memory to avoid fragmentation related issues + addMiscTime(null, GPUStatistics.cudaEvictMallocTime, GPUStatistics.cudaEvictionMallocCount, GPUInstruction.MISC_TIMER_EVICT, mallocStart); + } + if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { + // greater than or equal to " + byteCountToDisplaySize(size) + LOG.info("GPU Memory info after eviction:" + toString()); + } + if(unlockedGPUObjects != null && unlockedGPUObjects.size() == 0) { + LOG.warn("Evicted all unlocked matrices"); + } + } + + } + // --------------------------------------------------------------- + } + addMiscTime(opcode, GPUStatistics.cudaEvictTime, GPUStatistics.cudaEvictionCount, GPUInstruction.MISC_TIMER_EVICT, t0); + if(A == null) { + A = cudaMallocNoWarn(tmpA, size); // if the matrix is not allocated via eviction } + if(A == null) { + LOG.warn("cudaMalloc failed immediately after cudaMemGetInfo reported that memory of size " + + byteCountToDisplaySize(size) + " is available. " + + "This usually happens if there are external programs trying to grab on to memory in parallel or there is potential fragmentation."); + } + else if(DMLScript.PRINT_GPU_MEMORY_INFO || LOG.isTraceEnabled()) { + LOG.info("Malloc after eviction/clearing is successful."); + } + } + + // Step 6: Handle defragmentation + if(A == null) { + LOG.warn("Potential fragmentation of the GPU memory. Forcibly evicting all ..."); + LOG.info("Before clearAllUnlocked, GPU Memory info:" + toString()); + matrixMemoryManager.clearAllUnlocked(opcode); + LOG.info("GPU Memory info after evicting all unlocked matrices:" + toString()); + A = cudaMallocNoWarn(tmpA, size); } if(A == null) { - throw new DMLRuntimeException("There is not enough memory on device for this matrix, request (" + size + "). " + throw new DMLRuntimeException("There is not enough memory on device for this matrix, requested = " + byteCountToDisplaySize(size) + ". 
\n " + toString()); } @@ -267,26 +394,50 @@ public class GPUMemoryManager { return A; } + // --------------- Developer Utilities to debug potential memory leaks ------------------------ + @SuppressWarnings("unused") + private void printPointers(List<PointerInfo> pointers) { + for(PointerInfo ptrInfo : pointers) { + System.out.println(">>" + + // getCallerInfo(ptrInfo.stackTraceElements, 5) + getCallerInfo(ptrInfo.stackTraceElements, 6) + getCallerInfo(ptrInfo.stackTraceElements, 7) + + getCallerInfo(ptrInfo.stackTraceElements, 8) + getCallerInfo(ptrInfo.stackTraceElements, 9) + getCallerInfo(ptrInfo.stackTraceElements, 10)); + } + } + + @SuppressWarnings("unused") + private void printPointers(Set<Pointer> pointers, StringBuilder sb) { + for(Pointer ptr : pointers) { + PointerInfo ptrInfo = allPointers.get(ptr); + sb.append(">>"); + // getCallerInfo(ptrInfo.stackTraceElements, 5) + getCallerInfo(ptrInfo.stackTraceElements, 6) + getCallerInfo(ptrInfo.stackTraceElements, 7) + + sb.append(getCallerInfo(ptrInfo.stackTraceElements, 8)); + sb.append(getCallerInfo(ptrInfo.stackTraceElements, 9)); + sb.append(getCallerInfo(ptrInfo.stackTraceElements, 10)); + sb.append("\n"); + } + } + // -------------------------------------------------------------------------------------------- + /** * Note: This method should not be called from an iterator as it removes entries from allocatedGPUPointers and rmvarGPUPointers * * @param toFree pointer to call cudaFree method on */ - private void guardedCudaFree(Pointer toFree) { - if (toFree != new Pointer()) { - if(allocatedGPUPointers.containsKey(toFree)) { - Long size = allocatedGPUPointers.remove(toFree); - if(rmvarGPUPointers.containsKey(size) && rmvarGPUPointers.get(size).contains(toFree)) { - remove(rmvarGPUPointers, size, toFree); - } - if(LOG.isDebugEnabled()) - LOG.debug("Free-ing up the pointer: " + toFree); - cudaFree(toFree); - } - else { - throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree); + 
void guardedCudaFree(Pointer toFree) { + if(allPointers.containsKey(toFree)) { + long size = allPointers.get(toFree).getSizeInBytes(); + if(LOG.isTraceEnabled()) { + LOG.trace("Free-ing up the pointer of size " + byteCountToDisplaySize(size)); } + allPointers.remove(toFree); + lazyCudaFreeMemoryManager.removeIfPresent(size, toFree); + cudaFree(toFree); + // JCuda.cudaDeviceSynchronize(); // Force a device synchronize after free-ing the pointer for debugging } + else { + throw new RuntimeException("Attempting to free an unaccounted pointer:" + toFree); + } + } /** @@ -295,88 +446,62 @@ public class GPUMemoryManager { * @param opcode instruction name * @param toFree pointer to free * @param eager whether to deallocate eagerly + * @throws DMLRuntimeException if error occurs */ - public void free(String opcode, Pointer toFree, boolean eager) { - Pointer dummy = new Pointer(); - if (toFree == dummy) { // trying to free a null pointer - return; - } + public void free(String opcode, Pointer toFree, boolean eager) throws DMLRuntimeException { + if(LOG.isTraceEnabled()) + LOG.trace("Free-ing the pointer with eager=" + eager); if (eager) { long t0 = DMLScript.STATISTICS ? 
System.nanoTime() : 0; guardedCudaFree(toFree); addMiscTime(opcode, GPUStatistics.cudaDeAllocTime, GPUStatistics.cudaDeAllocCount, GPUInstruction.MISC_TIMER_CUDA_FREE, t0); } else { - if (!allocatedGPUPointers.containsKey(toFree)) + if (!allPointers.containsKey(toFree)) { + LOG.info("GPU memory info before failure:" + toString()); throw new RuntimeException("ERROR : Internal state corrupted, cache block size map is not aware of a block it trying to free up"); - long size = allocatedGPUPointers.get(toFree); - Set<Pointer> freeList = rmvarGPUPointers.get(size); - if (freeList == null) { - freeList = new HashSet<Pointer>(); - rmvarGPUPointers.put(size, freeList); } - if (freeList.contains(toFree)) - throw new RuntimeException("GPU : Internal state corrupted, double free"); - freeList.add(toFree); + long size = allPointers.get(toFree).getSizeInBytes(); + lazyCudaFreeMemoryManager.add(size, toFree); } } /** + * Removes the GPU object from the memory manager + * + * @param gpuObj the handle to the GPU object + */ + public void removeGPUObject(GPUObject gpuObj) { + if(LOG.isDebugEnabled()) + LOG.debug("Removing the GPU object: " + gpuObj); + matrixMemoryManager.gpuObjects.removeIf(a -> a.equals(gpuObj)); + } + + + /** * Clear the allocated GPU objects */ public void clearMemory() { // First deallocate all the GPU objects - for(GPUObject gpuObj : allocatedGPUObjects) { + for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) { if(gpuObj.isDirty()) { - LOG.debug("Attempted to free GPU Memory when a block[" + gpuObj + "] is still on GPU memory, copying it back to host."); - gpuObj.acquireHostRead(null); + if(LOG.isDebugEnabled()) + LOG.debug("Attempted to free GPU Memory when a block[" + gpuObj + "] is still on GPU memory, copying it back to host."); + gpuObj.copyFromDeviceToHost(null, true, true); } - gpuObj.clearData(true); + else + gpuObj.clearData(null, true); } - allocatedGPUObjects.clear(); + matrixMemoryManager.gpuObjects.clear(); // Then clean up remaining allocated 
GPU pointers - Set<Pointer> remainingPtr = new HashSet<>(allocatedGPUPointers.keySet()); + Set<Pointer> remainingPtr = new HashSet<>(allPointers.keySet()); for(Pointer toFree : remainingPtr) { guardedCudaFree(toFree); // cleans up allocatedGPUPointers and rmvarGPUPointers as well } + allPointers.clear(); } - - /** - * Get all pointers withing allocatedGPUObjects such that GPUObject is in dirty state - * - * @return set of pointers - */ - private HashSet<Pointer> getDirtyPointers() { - HashSet<Pointer> nonTemporaryPointers = new HashSet<Pointer>(); - for (GPUObject o : allocatedGPUObjects) { - if(o.isDirty()) { - if (o.isSparse()) { - CSRPointer p = o.getSparseMatrixCudaPointer(); - if (p == null) - throw new RuntimeException("CSRPointer is null in clearTemporaryMemory"); - if (p.rowPtr != null) { - nonTemporaryPointers.add(p.rowPtr); - } - if (p.colInd != null) { - nonTemporaryPointers.add(p.colInd); - } - if (p.val != null) { - nonTemporaryPointers.add(p.val); - } - - } else { - Pointer p = o.getJcudaDenseMatrixPtr(); - if (p == null) - throw new RuntimeException("Pointer is null in clearTemporaryMemory"); - nonTemporaryPointers.add(p); - } - } - } - return nonTemporaryPointers; - } - /** * Performs a non-in operation * @@ -399,7 +524,8 @@ public class GPUMemoryManager { */ public void clearTemporaryMemory() { // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up. 
- Set<Pointer> temporaryPointers = nonIn(allocatedGPUPointers.keySet(), getDirtyPointers()); + Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true); + Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyPointers); for(Pointer tmpPtr : temporaryPointers) { guardedCudaFree(tmpPtr); } @@ -431,86 +557,84 @@ public class GPUMemoryManager { * @param instructionLevelTimer member of GPUInstruction * @param startTime start time */ - private void addMiscTime(String opcode, String instructionLevelTimer, long startTime) { + void addMiscTime(String opcode, String instructionLevelTimer, long startTime) { if (opcode != null && DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(opcode, instructionLevelTimer, System.nanoTime() - startTime); } - /** - * Get any pointer of the given size from rmvar-ed pointers (applicable if eager cudaFree is set to false) - * - * @param opcode opcode - * @param size size in bytes - * @return pointer - */ - private Pointer getRmvarPointer(String opcode, long size) { - if (rmvarGPUPointers.containsKey(size)) { - if(LOG.isTraceEnabled()) - LOG.trace("Getting rmvar-ed pointers for size:" + size); - long t0 = opcode != null && DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; - Pointer A = remove(rmvarGPUPointers, size); // remove from rmvarGPUPointers as you are not calling cudaFree - addMiscTime(opcode, GPUInstruction.MISC_TIMER_REUSE, t0); - return A; - } - else { - return null; - } - } - - /** - * Remove any pointer in the given hashmap - * - * @param hm hashmap of size, pointers - * @param size size in bytes - * @return the pointer that was removed - */ - private Pointer remove(HashMap<Long, Set<Pointer>> hm, long size) { - Pointer A = hm.get(size).iterator().next(); - remove(hm, size, A); - return A; - } - - /** - * Remove a specific pointer in the given hashmap - * - * @param hm hashmap of size, pointers - * @param size size in bytes - * @param ptr pointer to be removed - */ - private void remove(HashMap<Long, Set<Pointer>> hm, long size, Pointer ptr) { - hm.get(size).remove(ptr); - if (hm.get(size).isEmpty()) - hm.remove(size); - } - /** * Print debugging information */ public String toString() { - long sizeOfLockedGPUObjects = 0; long numLockedGPUObjects = 0; - long sizeOfUnlockedGPUObjects = 0; long numUnlockedGPUObjects = 0; - for(GPUObject gpuObj : allocatedGPUObjects) { - try { - if(gpuObj.isLocked()) { - numLockedGPUObjects++; - sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice(); + long sizeOfLockedGPUObjects = 0; int numLockedGPUObjects = 0; int numLockedPointers = 0; + long sizeOfUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyGPUObjects = 0; int numUnlockedDirtyPointers = 0; + long sizeOfUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyGPUObjects = 0; int numUnlockedNonDirtyPointers = 0; + for(GPUObject gpuObj : matrixMemoryManager.gpuObjects) { + if(gpuObj.isLocked()) { + numLockedGPUObjects++; + sizeOfLockedGPUObjects += gpuObj.getSizeOnDevice(); + numLockedPointers += matrixMemoryManager.getPointers(gpuObj).size(); + } + else { + if(gpuObj.isDirty()) { + numUnlockedDirtyGPUObjects++; + sizeOfUnlockedDirtyGPUObjects += gpuObj.getSizeOnDevice(); + numUnlockedDirtyPointers += 
matrixMemoryManager.getPointers(gpuObj).size(); } else { - numUnlockedGPUObjects++; - sizeOfUnlockedGPUObjects += gpuObj.getSizeOnDevice(); + numUnlockedNonDirtyGPUObjects++; + sizeOfUnlockedNonDirtyGPUObjects += gpuObj.getSizeOnDevice(); + numUnlockedNonDirtyPointers += matrixMemoryManager.getPointers(gpuObj).size(); } - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); } } + + long totalMemoryAllocated = 0; - for(Long numBytes : allocatedGPUPointers.values()) { - totalMemoryAllocated += numBytes; + for(PointerInfo ptrInfo : allPointers.values()) { + totalMemoryAllocated += ptrInfo.getSizeInBytes(); + } + + + Set<Pointer> potentiallyLeakyPointers = getNonMatrixLockedPointers(); + List<Long> sizePotentiallyLeakyPointers = potentiallyLeakyPointers.stream(). + map(ptr -> allPointers.get(ptr).sizeInBytes).collect(Collectors.toList()); + long totalSizePotentiallyLeakyPointers = 0; + for(long size : sizePotentiallyLeakyPointers) { + totalSizePotentiallyLeakyPointers += size; } - return "Num of GPU objects: [unlocked:" + numUnlockedGPUObjects + ", locked:" + numLockedGPUObjects + "]. " - + "Size of GPU objects in bytes: [unlocked:" + sizeOfUnlockedGPUObjects + ", locked:" + sizeOfLockedGPUObjects + "]. 
" - + "Total memory allocated by the current GPU context in bytes:" + totalMemoryAllocated; + StringBuilder ret = new StringBuilder(); + //if(DMLScript.PRINT_GPU_MEMORY_INFO) { + // if(potentiallyLeakyPointers.size() > 0) { + // ret.append("Non-matrix pointers were allocated by:\n"); + // printPointers(potentiallyLeakyPointers, ret); + // } + //} + ret.append("\n====================================================\n"); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "", + "Num Objects", "Num Pointers", "Size")); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked Dirty GPU objects", + numUnlockedDirtyGPUObjects, numUnlockedDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedDirtyGPUObjects))); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Unlocked NonDirty GPU objects", + numUnlockedNonDirtyGPUObjects, numUnlockedNonDirtyPointers, byteCountToDisplaySize(sizeOfUnlockedNonDirtyGPUObjects))); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Locked GPU objects", + numLockedGPUObjects, numLockedPointers, byteCountToDisplaySize(sizeOfLockedGPUObjects))); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Cached rmvar-ed pointers", + "-", lazyCudaFreeMemoryManager.getNumPointers(), byteCountToDisplaySize(lazyCudaFreeMemoryManager.getTotalMemoryAllocated()))); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Non-matrix/non-cached pointers", + "-", potentiallyLeakyPointers.size(), byteCountToDisplaySize(totalSizePotentiallyLeakyPointers))); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "All pointers", + "-", allPointers.size(), byteCountToDisplaySize(totalMemoryAllocated))); + long free[] = { 0 }; + long total[] = { 0 }; + cudaMemGetInfo(free, total); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Free mem (from cudaMemGetInfo)", + "-", "-", byteCountToDisplaySize(free[0]))); + ret.append(String.format("%-35s%-15s%-15s%-15s\n", "Total mem (from cudaMemGetInfo)", + "-", "-", byteCountToDisplaySize(total[0]))); + 
ret.append("====================================================\n"); + return ret.toString(); } /** @@ -525,14 +649,39 @@ public class GPUMemoryManager { return (long) (free[0] * GPU_MEMORY_UTILIZATION_FACTOR); } + private static Comparator<GPUObject> SIMPLE_COMPARATOR_SORT_BY_SIZE = (o1, o2) -> o1.getSizeOnDevice() < o2.getSizeOnDevice() ? -1 : 1; + + private static class CustomPointer extends Pointer { + public CustomPointer(Pointer p) { + super(p); + } + + @Override + public long getNativePointer() { + return super.getNativePointer(); + } + + } /** * Class that governs the eviction policy */ - public static class GPUComparator implements Comparator<GPUObject> { + public static class EvictionPolicyBasedComparator implements Comparator<GPUObject> { private long neededSize; - public GPUComparator(long neededSize) { + public EvictionPolicyBasedComparator(long neededSize) { this.neededSize = neededSize; } + + private int minEvictCompare(GPUObject p1, GPUObject p2) { + long p1Size = p1.getSizeOnDevice() - neededSize; + long p2Size = p2.getSizeOnDevice() - neededSize; + + if (p1Size >= 0 && p2Size >= 0) { + return Long.compare(p2Size, p1Size); + } else { + return Long.compare(p1Size, p2Size); + } + } + @Override public int compare(GPUObject p1, GPUObject p2) { if (p1.isLocked() && p2.isLocked()) { @@ -548,25 +697,32 @@ public class GPUMemoryManager { return 1; } else { // Both are unlocked - if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.MIN_EVICT) { - long p1Size = 0; - long p2Size = 0; - try { - p1Size = p1.getSizeOnDevice() - neededSize; - p2Size = p2.getSizeOnDevice() - neededSize; - } catch (DMLRuntimeException e) { - throw new RuntimeException(e); + if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.ALIGN_MEMORY) { + if(!p1.isDensePointerNull() && !p2.isDensePointerNull()) { + long p1Ptr = new CustomPointer(p1.getDensePointer()).getNativePointer(); + long p2Ptr = new CustomPointer(p2.getDensePointer()).getNativePointer(); + + if(p1Ptr 
<= p2Ptr) + return -1; + else + return 1; } - - if (p1Size >= 0 && p2Size >= 0) { - return Long.compare(p2Size, p1Size); - } else { - return Long.compare(p1Size, p2Size); + else if(p1.isDensePointerNull() && !p2.isDensePointerNull()) { + return -1; + } + else if(!p1.isDensePointerNull() && p2.isDensePointerNull()) { + return 1; } + else { + return minEvictCompare(p1, p2); + } + } + else if (DMLScript.GPU_EVICTION_POLICY == DMLScript.EvictionPolicy.MIN_EVICT) { + return minEvictCompare(p1, p2); } else { return Long.compare(p2.timestamp.get(), p1.timestamp.get()); } } } } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index 613e16f..bf44895 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -23,6 +23,7 @@ import static jcuda.runtime.JCuda.cudaMemcpy; import static jcuda.runtime.JCuda.cudaMemset; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; + import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.LongAdder; @@ -97,6 +98,56 @@ public class GPUObject { * Enclosing {@link MatrixObject} instance */ protected MatrixObject mat = null; + + // ---------------------------------------------------------------------- + // Methods used to access, set and check jcudaDenseMatrixPtr + + /** + * Pointer to dense matrix + * + * @return a pointer to the dense matrix + */ + public Pointer getDensePointer() { + return jcudaDenseMatrixPtr; + } + + /** + * Checks if the dense pointer 
is null + * + * @return if the state of dense pointer is null + */ + public boolean isDensePointerNull() { + return jcudaDenseMatrixPtr == null; + } + + /** + * Removes the dense pointer and potential soft reference + */ + public void clearDensePointer() { + jcudaDenseMatrixPtr = null; + } + + /** + * Convenience method to directly set the dense matrix pointer on GPU + * + * @param densePtr dense pointer + */ + public void setDensePointer(Pointer densePtr) { + if (!this.isDensePointerNull()) { + throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU"); + } + this.jcudaDenseMatrixPtr = densePtr; + this.isSparse = false; + if(LOG.isDebugEnabled()) { + LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr)); + } + if (getJcudaSparseMatrixPtr() != null) { + getJcudaSparseMatrixPtr().deallocate(); + jcudaSparseMatrixPtr = null; + } + } + // ---------------------------------------------------------------------- + @Override public Object clone() { @@ -111,12 +162,12 @@ public class GPUObject { that.isSparse = me.isSparse; try { - if (me.jcudaDenseMatrixPtr != null) { + if (!me.isDensePointerNull()) { long rows = me.mat.getNumRows(); long cols = me.mat.getNumColumns(); long size = rows * cols * LibMatrixCUDA.sizeOfDataType; - that.jcudaDenseMatrixPtr = allocate(size); - cudaMemcpy(that.jcudaDenseMatrixPtr, me.jcudaDenseMatrixPtr, size, cudaMemcpyDeviceToDevice); + that.setDensePointer(allocate(size)); + cudaMemcpy(that.getDensePointer(), me.getDensePointer(), size, cudaMemcpyDeviceToDevice); } if (me.getJcudaSparseMatrixPtr() != null) { @@ -135,12 +186,8 @@ public class GPUObject { return getGPUContext().allocate(size); } - private void cudaFreeHelper(Pointer toFree) { - getGPUContext().cudaFreeHelper(toFree); - } - - private void cudaFreeHelper(String instName, Pointer toFree, boolean eager) { - getGPUContext().cudaFreeHelper(instName, 
toFree, eager); + private void cudaFreeHelper(Pointer toFree) throws DMLRuntimeException { + getGPUContext().cudaFreeHelper(null, toFree, DMLScript.EAGER_CUDA_FREE); } private GPUContext getGPUContext() { @@ -172,6 +219,7 @@ public class GPUObject { lda, C, ldc); return C; } + /** * Convenience method to convert a CSR matrix to a dense matrix on the GPU @@ -224,8 +272,8 @@ public class GPUObject { C.colInd); //cudaDeviceSynchronize(); - gCtx.cudaFreeHelper(nnzPerRowPtr); - gCtx.cudaFreeHelper(nnzTotalDevHostPtr); + gCtx.cudaFreeHelper(null, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(null, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE); return C; } @@ -251,32 +299,12 @@ public class GPUObject { } this.jcudaSparseMatrixPtr = sparseMatrixPtr; this.isSparse = true; - if (getJcudaDenseMatrixPtr() != null) { - cudaFreeHelper(getJcudaDenseMatrixPtr()); - jcudaDenseMatrixPtr = null; - } - } - - /** - * Convenience method to directly set the dense matrix pointer on GPU - * - * @param densePtr dense pointer - */ - public void setDenseMatrixCudaPointer(Pointer densePtr) { - if (this.jcudaDenseMatrixPtr != null) { - throw new DMLRuntimeException("jcudaDenseMatrixPtr was already allocated for " + this + ", this will cause a memory leak on the GPU"); - } - this.jcudaDenseMatrixPtr = densePtr; - this.isSparse = false; - if(LOG.isDebugEnabled()) { - LOG.debug("Setting dense pointer of size " + getGPUContext().getMemoryManager().getSizeAllocatedGPUPointer(densePtr)); - } - if (getJcudaSparseMatrixPtr() != null) { - getJcudaSparseMatrixPtr().deallocate(); - jcudaSparseMatrixPtr = null; + if (!isDensePointerNull()) { + cudaFreeHelper(getDensePointer()); + clearDensePointer(); } } - + /** * Converts this GPUObject from dense to sparse format. 
*/ @@ -293,12 +321,12 @@ public class GPUObject { int rows = toIntExact(mat.getNumRows()); int cols = toIntExact(mat.getNumColumns()); - if (getJcudaDenseMatrixPtr() == null || !isAllocated()) + if (isDensePointerNull() || !isAllocated()) throw new DMLRuntimeException("Expected allocated dense matrix before denseToSparse() call"); denseRowMajorToColumnMajor(); setSparseMatrixCudaPointer( - columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getJcudaDenseMatrixPtr(), rows, + columnMajorDenseToRowMajorSparse(getGPUContext(), cusparseHandle, getDensePointer(), rows, cols)); // TODO: What if mat.getNnz() is -1 ? if (DMLScript.STATISTICS) @@ -322,10 +350,10 @@ public class GPUObject { throw new DMLRuntimeException("Error in converting row major to column major : data is not allocated"); } - Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc); - cudaFreeHelper(getJcudaDenseMatrixPtr()); - jcudaDenseMatrixPtr = null; - setDenseMatrixCudaPointer(tmp); + Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc); + cudaFreeHelper(getDensePointer()); + clearDensePointer(); + setDensePointer(tmp); } /** @@ -344,10 +372,10 @@ public class GPUObject { throw new DMLRuntimeException("Error in converting column major to row major : data is not allocated"); } - Pointer tmp = transpose(getGPUContext(), getJcudaDenseMatrixPtr(), m, n, lda, ldc); - cudaFreeHelper(getJcudaDenseMatrixPtr()); - jcudaDenseMatrixPtr = null; - setDenseMatrixCudaPointer(tmp); + Pointer tmp = transpose(getGPUContext(), getDensePointer(), m, n, lda, ldc); + cudaFreeHelper(getDensePointer()); + clearDensePointer(); + setDensePointer(tmp); } /** @@ -400,7 +428,7 @@ public class GPUObject { throw new DMLRuntimeException("Expected cusparse to be initialized"); int rows = toIntExact(mat.getNumRows()); int cols = toIntExact(mat.getNumColumns()); - setDenseMatrixCudaPointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, 
cols, null)); + setDensePointer(getJcudaSparseMatrixPtr().toColumnMajorDenseMatrix(cusparseHandle, null, rows, cols, null)); } /** @@ -426,7 +454,7 @@ public class GPUObject { } public boolean isAllocated() { - boolean eitherAllocated = (getJcudaDenseMatrixPtr() != null || getJcudaSparseMatrixPtr() != null); + boolean eitherAllocated = (!isDensePointerNull() || getJcudaSparseMatrixPtr() != null); return eitherAllocated; } @@ -455,13 +483,13 @@ public class GPUObject { long cols = mat.getNumColumns(); int numElems = toIntExact(rows * cols); long size = getDatatypeSizeOf(numElems); - setDenseMatrixCudaPointer(allocate(size)); + setDensePointer(allocate(size)); // The "fill" kernel is called which treats the matrix "jcudaDensePtr" like a vector and fills it with value "v" // If the fill value is 0, no need to call the special kernel, the allocate memsets the allocated region to 0 if (v != 0) getGPUContext().getKernels() .launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(numElems), - getJcudaDenseMatrixPtr(), v, numElems); + getDensePointer(), v, numElems); } /** @@ -475,6 +503,7 @@ public class GPUObject { boolean isEmptyAndSparseAndAllocated = isSparseAndAllocated && getJcudaSparseMatrixPtr().nnz == 0; return isEmptyAndSparseAndAllocated; } + /** * Being allocated is a prerequisite for computing nnz. 
@@ -505,7 +534,7 @@ public class GPUObject { Pointer nnzTotalDevHostPtr = null; nnzPerRowPtr = gCtx.allocate(getIntSizeOf(rows)); nnzTotalDevHostPtr = gCtx.allocate(getIntSizeOf(1)); - LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getJcudaDenseMatrixPtr(), rows, + LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, rows, cols, matDescr, getDensePointer(), rows, nnzPerRowPtr, nnzTotalDevHostPtr); int[] nnzC = { -1 }; cudaMemcpy(Pointer.to(nnzC), nnzTotalDevHostPtr, getIntSizeOf(1), cudaMemcpyDeviceToHost); @@ -513,8 +542,8 @@ public class GPUObject { throw new DMLRuntimeException( "cusparseDnnz did not calculate the correct number of nnz on the GPU"); } - gCtx.cudaFreeHelper(nnzPerRowPtr); - gCtx.cudaFreeHelper(nnzTotalDevHostPtr); + gCtx.cudaFreeHelper(instName, nnzPerRowPtr, DMLScript.EAGER_CUDA_FREE); + gCtx.cudaFreeHelper(instName, nnzTotalDevHostPtr, DMLScript.EAGER_CUDA_FREE); if(DMLScript.FINEGRAINED_STATISTICS) { GPUStatistics.maintainCPMiscTimes(instName, CPInstruction.MISC_TIMER_RECOMPUTE_NNZ, System.nanoTime()-t1); } @@ -600,8 +629,18 @@ public class GPUObject { LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + getGPUContext()); } - copyFromDeviceToHost(instName, false); - copied = true; + + if (isAllocated() && dirty) { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : data is dirty on device, copying to host, on " + this + ", GPUContext=" + + getGPUContext()); + } + // TODO: Future optimization: + // For now, we are deleting the device data when copied from device to host. 
+ // This can be optimized later by treating acquiredModify+release as a new state + copyFromDeviceToHost(instName, false, true); + copied = true; + } } return copied; } @@ -657,6 +696,7 @@ public class GPUObject { timestamp.addAndGet(1); break; case MIN_EVICT: /* Do Nothing */ + case ALIGN_MEMORY: break; case MRU: timestamp.set(-System.nanoTime()); @@ -704,7 +744,7 @@ public class GPUObject { throw new DMLRuntimeException("Internal error - invalid number of columns when allocating dense matrix;"); long size = getDatatypeSizeOf(rows * cols); Pointer tmp = allocate(size); - setDenseMatrixCudaPointer(tmp); + setDensePointer(tmp); } void allocateSparseMatrixOnDevice() { @@ -723,21 +763,6 @@ public class GPUObject { setSparseMatrixCudaPointer(tmp); } - void deallocateMemoryOnDevice(boolean eager) { - if(LOG.isTraceEnabled()) { - LOG.trace("GPU : deallocateMemoryOnDevice, on " + this + ", GPUContext=" + getGPUContext()); - } - if (getJcudaDenseMatrixPtr() != null) { - cudaFreeHelper(null, getJcudaDenseMatrixPtr(), eager); - } - if (getJcudaSparseMatrixPtr() != null) { - getJcudaSparseMatrixPtr().deallocate(eager); - } - jcudaDenseMatrixPtr = null; - jcudaSparseMatrixPtr = null; - resetReadWriteLock(); - } - protected long getSizeOnDevice() { long GPUSize = 0; long rlen = mat.getNumRows(); @@ -850,14 +875,14 @@ public class GPUObject { // Minor optimization: No need to allocate empty error for CPU // data = new double[tmp.getNumRows() * tmp.getNumColumns()]; long t1 = DMLScript.FINEGRAINED_STATISTICS ? 
System.nanoTime() : 0; - cudaMemset(getJcudaDenseMatrixPtr(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns())); + cudaMemset(getDensePointer(), 0, getDatatypeSizeOf(mat.getNumRows() * mat.getNumColumns())); if(DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(opcode, GPUInstruction.MISC_TIMER_SET_ZERO, System.nanoTime() - t1); } else { // Copy dense block // H2D now only measures the time taken to do - LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getJcudaDenseMatrixPtr(), opcode); + LibMatrixCUDA.cudaSupportFunctions.hostToDevice(getGPUContext(), data, getDensePointer(), opcode); } } @@ -875,103 +900,102 @@ public class GPUObject { } return (int) l; } + - protected void copyFromDeviceToHost(String instName, boolean isEviction) { + /** + * Copies the data from device to host. + * Currently eagerDelete and isEviction are both provided for better control in different scenarios. + * In future, we can force eagerDelete if isEviction is true, else false. + * + * @param instName opcode of the instruction for fine-grained statistics + * @param isEviction is called for eviction + * @param eagerDelete whether to perform eager deletion of the device data. 
+ * @throws DMLRuntimeException if error occurs + */ + protected void copyFromDeviceToHost(String instName, boolean isEviction, boolean eagerDelete) throws DMLRuntimeException { if(LOG.isTraceEnabled()) { LOG.trace("GPU : copyFromDeviceToHost, on " + this + ", GPUContext=" + getGPUContext()); } - if (getJcudaDenseMatrixPtr() != null && getJcudaSparseMatrixPtr() != null) { + if (isDensePointerNull() && getJcudaSparseMatrixPtr() == null) { + throw new DMLRuntimeException( + "Cannot copy from device to host as JCuda dense/sparse pointer is not allocated"); + } + else if (!isDensePointerNull() && getJcudaSparseMatrixPtr() != null) { throw new DMLRuntimeException("Invalid state : JCuda dense/sparse pointer are both allocated"); } - - if (getJcudaDenseMatrixPtr() != null) { - long start = 0; - if (DMLScript.STATISTICS) - start = System.nanoTime(); - MatrixBlock tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); + else if(getJcudaSparseMatrixPtr() != null && !LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) { + throw new DMLRuntimeException( + "Block not in sparse format on host yet the device sparse matrix pointer is not null"); + } + else if(getJcudaSparseMatrixPtr() != null && isSparseAndEmpty()) { + mat.acquireModify(new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l)); // empty block + mat.release(); + return; + } + + MatrixBlock tmp = null; + long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + if (!isDensePointerNull()) { + tmp = new MatrixBlock(toIntExact(mat.getNumRows()), toIntExact(mat.getNumColumns()), false); tmp.allocateDenseBlock(); LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), - getJcudaDenseMatrixPtr(), tmp.getDenseBlockValues(), instName, isEviction); - - tmp.recomputeNonZeros(); - mat.acquireModify(tmp); - mat.release(); - - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.add(1); - } else if (getJcudaSparseMatrixPtr() != null) { - if (!LibMatrixCUDA.isInSparseFormat(getGPUContext(), mat)) - throw new DMLRuntimeException( - "Block not in sparse format on host yet the device sparse matrix pointer is not null"); - - if (this.isSparseAndEmpty()) { - MatrixBlock tmp = new MatrixBlock((int)mat.getNumRows(), (int)mat.getNumColumns(), 0l); // Empty Block - mat.acquireModify(tmp); - mat.release(); - } else { - long start = 0; - if (DMLScript.STATISTICS) - start = System.nanoTime(); - - int rows = toIntExact(mat.getNumRows()); - int cols = toIntExact(mat.getNumColumns()); - int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz); - double[] values = new double[nnz]; - LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName, isEviction); - int[] rowPtr = new int[rows + 1]; - int[] colInd = new int[nnz]; - long t0 = DMLScript.STATISTICS ? 
System.nanoTime() : 0; - CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.add(System.nanoTime() - t0); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.add(3); - - SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz); - MatrixBlock tmp = new MatrixBlock(rows, cols, nnz, sparseBlock); - mat.acquireModify(tmp); - mat.release(); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start); - if (DMLScript.STATISTICS) - GPUStatistics.cudaFromDevCount.add(1); - } + getDensePointer(), tmp.getDenseBlockValues(), instName, isEviction); + // int nnz = LibMatrixCUDA.computeNNZ(getGPUContext(), getJcudaDenseMatrixPtr(), toIntExact(mat.getNumRows()*mat.getNumColumns())); + // tmp.setNonZeros(nnz); + if(eagerDelete) + clearData(instName, true); + // tmp.recomputeNonZeros(); + tmp.setNonZeros(-1); } else { - throw new DMLRuntimeException( - "Cannot copy from device to host as JCuda dense/sparse pointer is not allocated"); + int rows = toIntExact(mat.getNumRows()); + int cols = toIntExact(mat.getNumColumns()); + int nnz = toIntExact(getJcudaSparseMatrixPtr().nnz); + double[] values = new double[nnz]; + LibMatrixCUDA.cudaSupportFunctions.deviceToHost(getGPUContext(), getJcudaSparseMatrixPtr().val, values, instName, isEviction); + int[] rowPtr = new int[rows + 1]; + int[] colInd = new int[nnz]; + CSRPointer.copyPtrToHost(getJcudaSparseMatrixPtr(), rows, nnz, rowPtr, colInd); + if(eagerDelete) + clearData(instName, true); + SparseBlockCSR sparseBlock = new SparseBlockCSR(rowPtr, colInd, values, nnz); + tmp = new MatrixBlock(rows, cols, nnz, sparseBlock); + } + mat.acquireModify(tmp); + mat.release(); + if (DMLScript.STATISTICS) + GPUStatistics.cudaFromDevTime.add(System.nanoTime() - start); + if (DMLScript.STATISTICS) { + int count = !isDensePointerNull() ? 
1 : 3; + GPUStatistics.cudaFromDevCount.add(count); } dirty = false; } - /** - * lazily clears the data associated with this {@link GPUObject} instance - */ - public void clearData() { - clearData(DMLScript.EAGER_CUDA_FREE); - } /** * Clears the data associated with this {@link GPUObject} instance * + * @param opcode opcode of the instruction * @param eager whether to be done synchronously or asynchronously + * @throws DMLRuntimeException if error occurs */ - public void clearData(boolean eager) { - deallocateMemoryOnDevice(eager); + public void clearData(String opcode, boolean eager) throws DMLRuntimeException { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : clearData on " + this + ", GPUContext=" + getGPUContext()); + } + if (!isDensePointerNull()) { + getGPUContext().cudaFreeHelper(opcode, getDensePointer(), eager); + } + if (getJcudaSparseMatrixPtr() != null) { + getJcudaSparseMatrixPtr().deallocate(eager); + } + clearDensePointer(); + jcudaSparseMatrixPtr = null; + resetReadWriteLock(); getGPUContext().getMemoryManager().removeGPUObject(this); } /** - * Pointer to dense matrix - * - * @return ? - */ - public Pointer getJcudaDenseMatrixPtr() { - return jcudaDenseMatrixPtr; - } - - /** * Pointer to sparse matrix * * @return ? @@ -997,12 +1021,12 @@ public class GPUObject { sb.append(", writeLock=").append(writeLock); sb.append(", sparse? ").append(isSparse); sb.append(", dims=[").append(mat.getNumRows()).append(",").append(mat.getNumColumns()).append("]"); - if(jcudaDenseMatrixPtr != null) - sb.append(", densePtr=").append(jcudaDenseMatrixPtr); + if(!isDensePointerNull()) + sb.append(", densePtr=").append(getDensePointer()); if(jcudaSparseMatrixPtr != null) sb.append(", sparsePtr=").append(jcudaSparseMatrixPtr); sb.append('}'); return sb.toString(); } -} +} \ No newline at end of file
