Repository: systemml Updated Branches: refs/heads/master be2b3e220 -> cc7ec7625
[SYSTEMML-1325] Bugfix for GPU memory manager clear temporary memory. Fixes bug in GPU cleanup with JMLC. In GPUMemoryManager.clearTemporaryMemory() we deallocate pointers but do not set the corresponding Pointer slots to null in the associated GPUObject instances. This can lead to attempted double-freeing of Pointers which results in an exception. This commit fixes this issue by creating a list of GPU objects associated with Pointers that have been freed as part of clearTemporaryMemory() and setting the corresponding pointer slots to null. This commit also addresses a minor issue with cleanup in JMLC which was causing Pointers for pinned data to be improperly cleared. Note this commit will reduce performance of GPUMemoryManager.clearTemporaryMemory() because it is now necessary to search through the list of managed GPUObjects to find the ones corresponding to pointers being freed. However, this method is only called once at the end of script invocation and so the performance cost will be small. Closes #839. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cc7ec762 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cc7ec762 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cc7ec762 Branch: refs/heads/master Commit: cc7ec762583aec137ed0bd10a8754e29140b2e35 Parents: be2b3e2 Author: Anthony Thomas <ahtho...@eng.ucsd.edu> Authored: Sat Nov 3 05:41:42 2018 +0530 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Sat Nov 3 05:41:42 2018 +0530 ---------------------------------------------------------------------- .../gpu/context/GPUMatrixMemoryManager.java | 27 +++++++++++++++++--- .../gpu/context/GPUMemoryManager.java | 13 +++++++--- .../instructions/gpu/context/GPUObject.java | 2 +- 3 files changed, 34 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/cc7ec762/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java index 47a8391..f69d340 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMatrixMemoryManager.java @@ -18,7 +18,9 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; +import java.util.Collections; import java.util.HashSet; +import java.util.List; import java.util.Set; import java.util.stream.Collectors; @@ -43,8 +45,7 @@ public class GPUMatrixMemoryManager { void addGPUObject(GPUObject gpuObj) { gpuObjects.add(gpuObj); } - - + /** * Get list of all Pointers in a GPUObject * @param gObj gpu object @@ -81,6 +82,20 @@ public class GPUMatrixMemoryManager { * so that an extraneous host to dev transfer can be avoided */ HashSet<GPUObject> gpuObjects = new HashSet<>(); + + /** + * Return a set of GPU Objects associated with a list of pointers + * @param pointers A list of pointers + * @return A set of GPU objects corresponding to any of these pointers + */ + Set<GPUObject> getGpuObjects(Set<Pointer> pointers) { + Set<GPUObject> gObjs = new HashSet<>(); + for (GPUObject g : gpuObjects) { + if (!Collections.disjoint(getPointers(g), pointers)) + gObjs.add(g); + } + return gObjs; + } /** * Return all pointers in the first section @@ -94,10 +109,14 @@ public class GPUMatrixMemoryManager { * Get pointers from the first memory sections "Matrix Memory" * @param locked return locked pointers if true * @param dirty return dirty pointers if true + * @param isCleanupEnabled return pointers marked for cleanup if true * @return set of pointers */ - Set<Pointer> getPointers(boolean locked, boolean dirty) { - return gpuObjects.stream().filter(gObj -> gObj.isLocked() == locked && gObj.isDirty() == dirty).flatMap(gObj -> getPointers(gObj).stream()).collect(Collectors.toSet()); + Set<Pointer> getPointers(boolean locked, boolean dirty, boolean isCleanupEnabled) { + return gpuObjects.stream().filter( + gObj -> (gObj.isLocked() == locked && gObj.isDirty() == dirty) || + (gObj.mat.isCleanupEnabled() == isCleanupEnabled)).flatMap( + gObj -> getPointers(gObj).stream()).collect(Collectors.toSet()); } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/cc7ec762/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index 6a04d97..d403ca7 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -429,7 +429,6 @@ public class GPUMemoryManager { else { throw new RuntimeException("ERROR : Internal state corrupted, attempting to free an unaccounted pointer:" + toFree); } - } /** @@ -521,11 +520,19 @@ public class GPUMemoryManager { */ public void clearTemporaryMemory() { // To record the cuda block sizes needed by allocatedGPUObjects, others are cleared up. - Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true); + Set<Pointer> unlockedDirtyPointers = matrixMemoryManager.getPointers(false, true, false); Set<Pointer> temporaryPointers = nonIn(allPointers.keySet(), unlockedDirtyPointers); - for(Pointer tmpPtr : temporaryPointers) { + for (Pointer tmpPtr : temporaryPointers) { guardedCudaFree(tmpPtr); } + + // Also set the pointer(s) to null in the corresponding GPU objects to avoid double freeing pointers + Set<GPUObject> gObjs = matrixMemoryManager.getGpuObjects(temporaryPointers); + for (GPUObject g : gObjs) { + g.jcudaDenseMatrixPtr = null; + g.jcudaSparseMatrixPtr = null; + removeGPUObject(g); + } } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/cc7ec762/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java index b95d471..edcd683 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java @@ -68,7 +68,7 @@ public class GPUObject { /** * Pointer to the underlying sparse matrix block on GPU */ - private CSRPointer jcudaSparseMatrixPtr = null; + CSRPointer jcudaSparseMatrixPtr = null; /** * whether the block attached to this {@link GPUContext} is dirty on the device and needs to be copied back to host