Repository: systemml Updated Branches: refs/heads/master b366c0f89 -> 13baec95c
[SYSTEMML-445] Added support for unified memory allocator for GPU backend - The allocator can be configured using the property 'sysml.gpu.memory.allocator'. - Minor bugfix for setting configuration properties related to GPU. - Improved the performance of eviction by reducing number of cudaMalloc calls. This gave ~1.6x end-to-end improvement for ResNet-200 with batch size of 48 on Intel+P100. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/13baec95 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/13baec95 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/13baec95 Branch: refs/heads/master Commit: 13baec95c37595576766b8be45f514a0d67053ea Parents: b366c0f Author: Niketan Pansare <npan...@us.ibm.com> Authored: Sat Aug 4 15:03:46 2018 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Sat Aug 4 15:03:46 2018 -0700 ---------------------------------------------------------------------- conf/SystemML-config.xml.template | 2 + .../java/org/apache/sysml/api/DMLScript.java | 70 ++++++++++++--- .../apache/sysml/api/ScriptExecutorUtils.java | 44 ---------- .../sysml/api/mlcontext/ScriptExecutor.java | 11 +-- .../java/org/apache/sysml/conf/DMLConfig.java | 6 +- .../gpu/context/CudaMemoryAllocator.java | 83 ++++++++++++++++++ .../instructions/gpu/context/GPUContext.java | 2 +- .../gpu/context/GPUMemoryAllocator.java | 57 ++++++++++++ .../gpu/context/GPUMemoryManager.java | 60 ++++++++----- .../gpu/context/UnifiedMemoryAllocator.java | 91 ++++++++++++++++++++ 10 files changed, 336 insertions(+), 90 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/conf/SystemML-config.xml.template ---------------------------------------------------------------------- diff --git a/conf/SystemML-config.xml.template b/conf/SystemML-config.xml.template index ca1c17b..3ce88c1 100644 --- 
a/conf/SystemML-config.xml.template +++ b/conf/SystemML-config.xml.template @@ -112,4 +112,6 @@ <!-- Fraction of available GPU memory to use. This is similar to TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 0.9) --> <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor> + <!-- Allocator to use to allocate GPU device memory. Supported values are cuda, unified_memory (default: cuda) --> + <sysml.gpu.memory.allocator>cuda</sysml.gpu.memory.allocator> </root> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/api/DMLScript.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java b/src/main/java/org/apache/sysml/api/DMLScript.java index fd0b861..d9413a8 100644 --- a/src/main/java/org/apache/sysml/api/DMLScript.java +++ b/src/main/java/org/apache/sysml/api/DMLScript.java @@ -79,6 +79,7 @@ import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration; import org.apache.sysml.runtime.util.LocalFileUtils; import org.apache.sysml.runtime.util.MapReduceTool; import org.apache.sysml.utils.Explain; +import org.apache.sysml.utils.NativeHelper; import org.apache.sysml.utils.Explain.ExplainCounts; import org.apache.sysml.utils.Explain.ExplainType; import org.apache.sysml.utils.Statistics; @@ -125,7 +126,8 @@ public class DMLScript public static long EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; // maximum number of bytes to use for shadow buffer public static long EVICTION_SHADOW_BUFFER_CURR_BYTES = 0; // number of bytes to use for shadow buffer public static double GPU_MEMORY_UTILIZATION_FACTOR = 0.9; // fraction of available GPU memory to use - + public static String GPU_MEMORY_ALLOCATOR = "cuda"; // GPU memory allocator to use + /** * Global variable indicating the script type (DML or PYDML). 
Can be used * for DML/PYDML-specific tasks, such as outputting booleans in the correct @@ -416,16 +418,8 @@ public class DMLScript CompilerConfig cconf = OptimizerUtils.constructCompilerConfig(dmlconf); ConfigurationManager.setGlobalConfig(cconf); LOG.debug("\nDML config: \n" + dmlconf.getConfigInfo()); - - // Sets the GPUs to use for this process (a range, all GPUs, comma separated list or a specific GPU) - GPUContextPool.AVAILABLE_GPUS = dmlconf.getTextValue(DMLConfig.AVAILABLE_GPUS); - String evictionPolicy = dmlconf.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase(); - try { - DMLScript.GPU_EVICTION_POLICY = EvictionPolicy.valueOf(evictionPolicy); - } catch(IllegalArgumentException e) { - throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy); - } + setGlobalFlags(dmlconf); //Step 2: set local/remote memory if requested (for compile in AM context) if( dmlconf.getBooleanValue(DMLConfig.YARN_APPMASTER) ){ @@ -499,6 +493,62 @@ public class DMLScript } /** + * Sets the global flags in DMLScript based on user provided configuration + * + * @param dmlconf user provided configuration + */ + public static void setGlobalFlags(DMLConfig dmlconf) { + // Sets the GPUs to use for this process (a range, all GPUs, comma separated list or a specific GPU) + GPUContextPool.AVAILABLE_GPUS = dmlconf.getTextValue(DMLConfig.AVAILABLE_GPUS); + + String evictionPolicy = dmlconf.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase(); + try { + DMLScript.GPU_EVICTION_POLICY = EvictionPolicy.valueOf(evictionPolicy); + } catch(IllegalArgumentException e) { + throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy); + } + + // Whether extra statistics useful for developers and others interested + // in digging into performance problems are recorded and displayed + DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); + CacheableData.CACHING_BUFFER_SIZE = 
dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE); + if(CacheableData.CACHING_BUFFER_SIZE < 0 || CacheableData.CACHING_BUFFER_SIZE > 1) + throw new RuntimeException("Incorrect value (" + CacheableData.CACHING_BUFFER_SIZE + ") for the configuration " + DMLConfig.CACHING_BUFFER_SIZE); + + DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN); + NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim()); + + DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU); + DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE); + DMLScript.PRINT_GPU_MEMORY_INFO = dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO); + DMLScript.GPU_MEMORY_UTILIZATION_FACTOR = dmlconf.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); + DMLScript.GPU_MEMORY_ALLOCATOR = dmlconf.getTextValue(DMLConfig.GPU_MEMORY_ALLOCATOR); + if(DMLScript.GPU_MEMORY_UTILIZATION_FACTOR < 0) { + throw new RuntimeException("Incorrect value (" + DMLScript.GPU_MEMORY_UTILIZATION_FACTOR + ") for the configuration:" + DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); + } + + DMLScript.FLOATING_POINT_PRECISION = dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION); + org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision(); + if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) { + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; + } + else { + double shadowBufferSize = dmlconf.getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE); + if(shadowBufferSize < 0 || shadowBufferSize > 1) + throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); + if(DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES > 0 && + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES > 
DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES) { + // This will be printed in a very rare situation when: + // 1. There is a memory leak which leads to non-cleared shadow buffer OR + // 2. MLContext is registering to a bunch of outputs that are all part of shadow buffer + System.out.println("WARN: Cannot use the shadow buffer due to potentially cached GPU objects. Current shadow buffer size (in bytes):" + + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + " > Max shadow buffer size (in bytes):" + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES); + } + } + } + + /** + * Launcher for DML debugger. This method should be called after + * execution and debug properties have been correctly set, and customized parameters + * http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java index 7a97fcf..4af6949 100644 --- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java +++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java @@ -28,15 +28,12 @@ import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.codegen.SpoofCompiler; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.Program; -import org.apache.sysml.runtime.controlprogram.caching.CacheableData; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; -import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysml.runtime.instructions.cp.Data; import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.instructions.gpu.context.GPUObject; -import 
org.apache.sysml.utils.NativeHelper; import org.apache.sysml.utils.Statistics; public class ScriptExecutorUtils { @@ -75,47 +72,6 @@ public class ScriptExecutorUtils { * output variables that were registered as part of MLContext */ public static void executeRuntimeProgram(Program rtprog, ExecutionContext ec, DMLConfig dmlconf, int statisticsMaxHeavyHitters, Set<String> outputVariables) { - // Whether extra statistics useful for developers and others interested - // in digging into performance problems are recorded and displayed - DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS); - CacheableData.CACHING_BUFFER_SIZE = dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE); - if(CacheableData.CACHING_BUFFER_SIZE < 0 || CacheableData.CACHING_BUFFER_SIZE > 1) - throw new RuntimeException("Incorrect value (" + CacheableData.CACHING_BUFFER_SIZE + ") for the configuration " + DMLConfig.CACHING_BUFFER_SIZE); - - DMLScript.STATISTICS_MAX_WRAP_LEN = dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN); - NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim()); - - if(DMLScript.USE_ACCELERATOR) { - DMLScript.SYNCHRONIZE_GPU = dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU); - DMLScript.EAGER_CUDA_FREE = dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE); - DMLScript.PRINT_GPU_MEMORY_INFO = dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO); - DMLScript.GPU_MEMORY_UTILIZATION_FACTOR = dmlconf.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); - if(DMLScript.GPU_MEMORY_UTILIZATION_FACTOR < 0 || DMLScript.GPU_MEMORY_UTILIZATION_FACTOR > 1) { - throw new RuntimeException("Incorrect value (" + DMLScript.GPU_MEMORY_UTILIZATION_FACTOR + ") for the configuration:" + DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR); - } - - DMLScript.FLOATING_POINT_PRECISION = dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION); - 
org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision(); - if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) { - DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 0; - } - else { - double shadowBufferSize = dmlconf.getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE); - if(shadowBufferSize < 0 || shadowBufferSize > 1) - throw new RuntimeException("Incorrect value (" + shadowBufferSize + ") for the configuration:" + DMLConfig.EVICTION_SHADOW_BUFFERSIZE); - DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize); - if(DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES > 0 && - DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES > DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES) { - // This will be printed in a very rare situation when: - // 1. There is a memory leak which leads to non-cleared shadow buffer OR - // 2. MLContext is registering to bunch of outputs that are all part of shadow buffer - System.out.println("WARN: Cannot use the shadow buffer due to potentially cached GPU objects. 
Current shadow buffer size (in bytes):" - + DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + " > Max shadow buffer size (in bytes):" + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES); - } - } - } - - boolean exceptionThrown = false; Statistics.startRunTimer(); http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java index 00fc096..8e3bdaf 100644 --- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java +++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java @@ -246,16 +246,7 @@ public class ScriptExecutor { throw new RuntimeException(ex); } - // set the GPUs to use for this process (a range, all GPUs, comma separated list or a specific GPU) - GPUContextPool.AVAILABLE_GPUS = config.getTextValue(DMLConfig.AVAILABLE_GPUS); - - String evictionPolicy = config.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase(); - try { - DMLScript.GPU_EVICTION_POLICY = EvictionPolicy.valueOf(evictionPolicy); - } - catch(IllegalArgumentException e) { - throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy); - } + DMLScript.setGlobalFlags(config); } http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/conf/DMLConfig.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java b/src/main/java/org/apache/sysml/conf/DMLConfig.java index 4aad400..1333075 100644 --- a/src/main/java/org/apache/sysml/conf/DMLConfig.java +++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java @@ -92,6 +92,7 @@ public class DMLConfig // Fraction of available memory to use. 
The available memory is computed when the GPUContext is created // to handle the tradeoff on calling cudaMemGetInfo too often. public static final String GPU_MEMORY_UTILIZATION_FACTOR = "sysml.gpu.memory.util.factor"; + public static final String GPU_MEMORY_ALLOCATOR = "sysml.gpu.memory.allocator"; // String to specify the memory allocator to use. Supported values are: cuda, unified_memory public static final String FLOATING_POINT_PRECISION = "sysml.floating.point.precision"; // String to specify the datatype to use internally: supported values are double, single public static final String PRINT_GPU_MEMORY_INFO = "sysml.gpu.print.memoryInfo"; public static final String EVICTION_SHADOW_BUFFERSIZE = "sysml.gpu.eviction.shadow.bufferSize"; @@ -140,13 +141,13 @@ public class DMLConfig _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE, "0.0" ); _defaultVals.put(STATS_MAX_WRAP_LEN, "30" ); _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR, "0.9" ); + _defaultVals.put(GPU_MEMORY_ALLOCATOR, "cuda"); _defaultVals.put(AVAILABLE_GPUS, "-1"); _defaultVals.put(GPU_EVICTION_POLICY, "align_memory"); _defaultVals.put(SYNCHRONIZE_GPU, "false" ); _defaultVals.put(CACHING_BUFFER_SIZE, "0.15" ); _defaultVals.put(EAGER_CUDA_FREE, "false" ); _defaultVals.put(FLOATING_POINT_PRECISION, "double" ); - _defaultVals.put(PRINT_GPU_MEMORY_INFO, "false"); } public DMLConfig() { @@ -428,7 +429,8 @@ public class DMLConfig COMPRESSED_LINALG, CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, CODEGEN_PLANCACHE, CODEGEN_LITERALS, EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE, - AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE + AVAILABLE_GPUS, SYNCHRONIZE_GPU, EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, EVICTION_SHADOW_BUFFERSIZE, + GPU_MEMORY_ALLOCATOR, GPU_MEMORY_UTILIZATION_FACTOR }; StringBuilder sb = new StringBuilder(); 
http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java new file mode 100644 index 0000000..e74bea3 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import static jcuda.runtime.JCuda.cudaMemGetInfo; + +import org.apache.sysml.api.DMLScript; + +import jcuda.CudaException; +import jcuda.Pointer; +import static jcuda.runtime.cudaError.cudaSuccess; +import static jcuda.runtime.JCuda.cudaMalloc; +import jcuda.runtime.cudaError; +import static jcuda.runtime.JCuda.cudaFree; + +public class CudaMemoryAllocator implements GPUMemoryAllocator { + + /** + * Allocate memory on the device. 
+ * + * @param devPtr Pointer to allocated device memory + * @param size size in bytes + * @throws jcuda.CudaException if unable to allocate + */ + public void allocate(Pointer devPtr, long size) throws CudaException { + int status = cudaMalloc(devPtr, size); + if(status != cudaSuccess) { + throw new jcuda.CudaException("cudaMalloc failed:" + cudaError.stringFor(status)); + } + } + + /** + * Frees memory on the device + * + * @param devPtr Device pointer to memory to free + * @throws jcuda.CudaException if error occurs + */ + public void free(Pointer devPtr) throws CudaException { + int status = cudaFree(devPtr); + if(status != cudaSuccess) { + throw new jcuda.CudaException("cudaFree failed:" + cudaError.stringFor(status)); + } + } + + /** + * Check if there is enough memory to allocate a pointer of given size + * + * @param size size in bytes + * @return true if there is enough available memory to allocate a pointer of the given size + */ + public boolean canAllocate(long size) { + return size <= getAvailableMemory(); + } + + /** + * Gets the available memory on GPU that SystemML can use. 
+ * + * @return the available memory in bytes + */ + public long getAvailableMemory() { + long free[] = { 0 }; + long total[] = { 0 }; + cudaMemGetInfo(free, total); + return (long) (free[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR); + } + +} http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java index 2ac92a7..180a60f 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java @@ -211,7 +211,7 @@ public class GPUContext { * @return the available memory in bytes */ public long getAvailableMemory() { - return memoryManager.getAvailableMemory(); + return memoryManager.allocator.getAvailableMemory(); } /** http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java new file mode 100644 index 0000000..0c99127 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import jcuda.Pointer; + +public interface GPUMemoryAllocator { + + /** + * Allocate memory on the device. + * + * @param devPtr Pointer to allocated device memory + * @param size size in bytes + * @throws jcuda.CudaException if unable to allocate + */ + public void allocate(Pointer devPtr, long size) throws jcuda.CudaException; + + /** + * Frees memory on the device + * + * @param devPtr Device pointer to memory to free + * @throws jcuda.CudaException if error occurs + */ + public void free(Pointer devPtr) throws jcuda.CudaException; + + /** + * Check if there is enough memory to allocate a pointer of given size + * + * @param size size in bytes + * @return true if there is enough available memory to allocate a pointer of the given size + */ + public boolean canAllocate(long size); + + /** + * Gets the available memory on GPU that SystemML can use. 
+ * + * @return the available memory in bytes + */ + public long getAvailableMemory(); + +} http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java index c4ae253..75bce87 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java @@ -18,8 +18,6 @@ */ package org.apache.sysml.runtime.instructions.gpu.context; -import static jcuda.runtime.JCuda.cudaFree; -import static jcuda.runtime.JCuda.cudaMalloc; import static jcuda.runtime.JCuda.cudaMemGetInfo; import static jcuda.runtime.JCuda.cudaMemset; @@ -37,6 +35,7 @@ import java.util.stream.Collectors; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.api.DMLScript; +import org.apache.sysml.conf.DMLConfig; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; @@ -55,6 +54,7 @@ public class GPUMemoryManager { private static final boolean DEBUG_MEMORY_LEAK = false; private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 7, 8, 9, 10}; // Avoids printing too much text while debugging + protected final GPUMemoryAllocator allocator; /*****************************************************************************************/ // GPU Memory is divided into three major sections: // 1. Matrix Memory: Memory allocated to matrices in SystemML and addressable by GPUObjects. 
@@ -65,7 +65,7 @@ public class GPUMemoryManager { // To get the GPUObjects/Pointers in this section, please use getGPUObjects and getPointers methods of GPUMatrixMemoryManager. // To clear GPUObjects/Pointers in this section, please use clear and clearAll methods of GPUMatrixMemoryManager. // Both these methods allow to get/clear unlocked/locked and dirty/non-dirty objects of a certain size. - GPUMatrixMemoryManager matrixMemoryManager; + protected final GPUMatrixMemoryManager matrixMemoryManager; public GPUMatrixMemoryManager getGPUMatrixMemoryManager() { return matrixMemoryManager; } @@ -73,7 +73,7 @@ public class GPUMemoryManager { // 2. Rmvar-ed pointers: If sysml.gpu.eager.cudaFree is set to false, // then this manager caches pointers of the GPUObject on which rmvar instruction has been executed for future reuse. // We observe 2-3x improvement with this approach and hence recommend to set this flag to false. - GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager; + protected final GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager; public GPULazyCudaFreeMemoryManager getGPULazyCudaFreeMemoryManager() { return lazyCudaFreeMemoryManager; } @@ -90,7 +90,7 @@ public class GPUMemoryManager { /** * To record size of all allocated pointers allocated by above memory managers */ - HashMap<Pointer, PointerInfo> allPointers = new HashMap<>(); + protected final HashMap<Pointer, PointerInfo> allPointers = new HashMap<>(); /*****************************************************************************************/ @@ -131,6 +131,16 @@ public class GPUMemoryManager { public GPUMemoryManager(GPUContext gpuCtx) { matrixMemoryManager = new GPUMatrixMemoryManager(this); lazyCudaFreeMemoryManager = new GPULazyCudaFreeMemoryManager(this); + if(DMLScript.GPU_MEMORY_ALLOCATOR.equals("cuda")) { + allocator = new CudaMemoryAllocator(); + } + else if(DMLScript.GPU_MEMORY_ALLOCATOR.equals("unified_memory")) { + allocator = new UnifiedMemoryAllocator(); + } + else { + throw new 
RuntimeException("Unsupported value (" + DMLScript.GPU_MEMORY_ALLOCATOR + ") for the configuration " + DMLConfig.GPU_MEMORY_ALLOCATOR + + ". Supported values are cuda, unified_memory."); + } long free[] = { 0 }; long total[] = { 0 }; cudaMemGetInfo(free, total); @@ -159,7 +169,7 @@ public class GPUMemoryManager { private Pointer cudaMallocNoWarn(Pointer A, long size, String printDebugMessage) { long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0; try { - cudaMalloc(A, size); + allocator.allocate(A, size); allPointers.put(A, new PointerInfo(size)); if(DMLScript.STATISTICS) { long totalTime = System.nanoTime() - t0; @@ -241,7 +251,7 @@ public class GPUMemoryManager { Pointer tmpA = (A == null) ? new Pointer() : null; // Step 2: Allocate a new pointer in the GPU memory (since memory is available) // Step 3 has potential to create holes as well as limit future reuse, hence perform this step before step 3. - if(A == null && size <= getAvailableMemory()) { + if(A == null && allocator.canAllocate(size)) { // This can fail in case of fragmented memory, so don't issue any warning A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer"); } @@ -262,7 +272,7 @@ public class GPUMemoryManager { // than doing cuda free/malloc/memset. So, rmvar-ing every blocks (step 4) is preferred to eviction (step 5). if(A == null) { lazyCudaFreeMemoryManager.clearAll(); - if(size <= getAvailableMemory()) { + if(allocator.canAllocate(size)) { // This can fail in case of fragmented memory, so don't issue any warning A = cudaMallocNoWarn(tmpA, size, "allocate a new pointer after eager free"); } @@ -292,14 +302,26 @@ public class GPUMemoryManager { // Step 6: Try eviction/clearing one-by-one based on the given policy without size restriction if(A == null) { long t0 = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + long currentAvailableMemory = allocator.getAvailableMemory(); + boolean canFit = false; // --------------------------------------------------------------- // Evict unlocked GPU objects one-by-one and try malloc List<GPUObject> unlockedGPUObjects = matrixMemoryManager.gpuObjects.stream() .filter(gpuObj -> !gpuObj.isLocked()).collect(Collectors.toList()); Collections.sort(unlockedGPUObjects, new EvictionPolicyBasedComparator(size)); while(A == null && unlockedGPUObjects.size() > 0) { - evictOrClear(unlockedGPUObjects.remove(unlockedGPUObjects.size()-1), opcode); - A = cudaMallocNoWarn(tmpA, size, null); + GPUObject evictedGPUObject = unlockedGPUObjects.remove(unlockedGPUObjects.size()-1); + evictOrClear(evictedGPUObject, opcode); + if(!canFit) { + currentAvailableMemory += evictedGPUObject.getSizeOnDevice(); + if(currentAvailableMemory >= size) + canFit = true; + } + if(canFit) { + // Checking before invoking cudaMalloc reduces the time spent in unnecessary cudaMalloc. + // This was the bottleneck for ResNet200 experiments with batch size > 32 on P100+Intel + A = cudaMallocNoWarn(tmpA, size, null); + } if(DMLScript.STATISTICS) GPUStatistics.cudaEvictCount.increment(); } @@ -382,7 +404,7 @@ public class GPUMemoryManager { } allPointers.remove(toFree); lazyCudaFreeMemoryManager.removeIfPresent(size, toFree); - cudaFree(toFree); + allocator.free(toFree); // JCuda.cudaDeviceSynchronize(); // Force a device synchronize after free-ing the pointer for debugging } else { @@ -587,23 +609,15 @@ public class GPUMemoryManager { return ret.toString(); } - /** - * Gets the available memory on GPU that SystemML can use. 
- * - * @return the available memory in bytes - */ - public long getAvailableMemory() { - long free[] = { 0 }; - long total[] = { 0 }; - cudaMemGetInfo(free, total); - return (long) (free[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR); - } - private static class CustomPointer extends Pointer { public CustomPointer(Pointer p) { super(p); } + public CustomPointer() { + super(); + } + @Override public long getNativePointer() { return super.getNativePointer(); http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java new file mode 100644 index 0000000..71c6fc3 --- /dev/null +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.sysml.runtime.instructions.gpu.context; + +import static jcuda.runtime.JCuda.cudaFree; +import static jcuda.runtime.JCuda.cudaMallocManaged; +import static jcuda.runtime.JCuda.cudaMemGetInfo; +import static jcuda.runtime.cudaError.cudaSuccess; +import static jcuda.runtime.JCuda.cudaMemAttachGlobal; +import org.apache.sysml.api.DMLScript; + +import jcuda.CudaException; +import jcuda.Pointer; +import jcuda.runtime.cudaError; + +public class UnifiedMemoryAllocator implements GPUMemoryAllocator { + + /** + * Allocate memory on the device. + * + * @param devPtr Pointer to allocated device memory + * @param size size in bytes + * @throws jcuda.CudaException if unable to allocate + */ + public void allocate(Pointer devPtr, long size) throws CudaException { + int status = cudaMallocManaged(devPtr, size, cudaMemAttachGlobal); + if(status != cudaSuccess) { + throw new jcuda.CudaException("cudaMallocManaged failed:" + cudaError.stringFor(status)); + } + + } + + /** + * Frees memory on the device + * + * @param devPtr Device pointer to memory to free + * @throws jcuda.CudaException if error occurs + */ + public void free(Pointer devPtr) throws CudaException { + int status = cudaFree(devPtr); + if(status != cudaSuccess) { + throw new jcuda.CudaException("cudaFree failed:" + cudaError.stringFor(status)); + } + } + + private static long maxAvailableMemory = -1; + private static double gpuUtilizationFactor = -1; + + /** + * Check if there is enough memory to allocate a pointer of given size + * + * @param size size in bytes + * @return true if there is enough available memory to allocate a pointer of the given size + */ + public boolean canAllocate(long size) { + return true; // Unified memory can allocate any amount of memory. Note: all allocations are guarded by SystemML's optimizer which uses getAvailableMemory + } + + /** + * Gets the available memory on GPU that SystemML can use. 
+ * + * @return the available memory in bytes + */ + public long getAvailableMemory() { + if(maxAvailableMemory < 0 || gpuUtilizationFactor != DMLScript.GPU_MEMORY_UTILIZATION_FACTOR) { + long free[] = { 0 }; + long total[] = { 0 }; + cudaMemGetInfo(free, total); + maxAvailableMemory = (long) (total[0] * DMLScript.GPU_MEMORY_UTILIZATION_FACTOR); + gpuUtilizationFactor = DMLScript.GPU_MEMORY_UTILIZATION_FACTOR; + } + return maxAvailableMemory; + } + +}