Repository: systemml
Updated Branches:
  refs/heads/master b366c0f89 -> 13baec95c


[SYSTEMML-445] Added support for unified memory allocator for GPU backend

- The allocator can be configured using the property 
'sysml.gpu.memory.allocator'.
- Minor bugfix for setting configuration properties related to GPU.
- Improved the performance of eviction by reducing number of cudaMalloc calls. 
This gave ~1.6x end-to-end improvement for ResNet-200 with batch size of 48 on 
Intel+P100.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/13baec95
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/13baec95
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/13baec95

Branch: refs/heads/master
Commit: 13baec95c37595576766b8be45f514a0d67053ea
Parents: b366c0f
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Sat Aug 4 15:03:46 2018 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Sat Aug 4 15:03:46 2018 -0700

----------------------------------------------------------------------
 conf/SystemML-config.xml.template               |  2 +
 .../java/org/apache/sysml/api/DMLScript.java    | 70 ++++++++++++---
 .../apache/sysml/api/ScriptExecutorUtils.java   | 44 ----------
 .../sysml/api/mlcontext/ScriptExecutor.java     | 11 +--
 .../java/org/apache/sysml/conf/DMLConfig.java   |  6 +-
 .../gpu/context/CudaMemoryAllocator.java        | 83 ++++++++++++++++++
 .../instructions/gpu/context/GPUContext.java    |  2 +-
 .../gpu/context/GPUMemoryAllocator.java         | 57 ++++++++++++
 .../gpu/context/GPUMemoryManager.java           | 60 ++++++++-----
 .../gpu/context/UnifiedMemoryAllocator.java     | 91 ++++++++++++++++++++
 10 files changed, 336 insertions(+), 90 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/conf/SystemML-config.xml.template
----------------------------------------------------------------------
diff --git a/conf/SystemML-config.xml.template 
b/conf/SystemML-config.xml.template
index ca1c17b..3ce88c1 100644
--- a/conf/SystemML-config.xml.template
+++ b/conf/SystemML-config.xml.template
@@ -112,4 +112,6 @@
    <!-- Fraction of available GPU memory to use. This is similar to 
TensorFlow's per_process_gpu_memory_fraction configuration property. (default: 
0.9) -->
    <sysml.gpu.memory.util.factor>0.9</sysml.gpu.memory.util.factor>
    
+   <!-- Allocator to use to allocate GPU device memory. Supported values are 
cuda, unified_memory (default: cuda) -->
+   <sysml.gpu.memory.allocator>cuda</sysml.gpu.memory.allocator>
 </root>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/api/DMLScript.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/DMLScript.java 
b/src/main/java/org/apache/sysml/api/DMLScript.java
index fd0b861..d9413a8 100644
--- a/src/main/java/org/apache/sysml/api/DMLScript.java
+++ b/src/main/java/org/apache/sysml/api/DMLScript.java
@@ -79,6 +79,7 @@ import 
org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
 import org.apache.sysml.runtime.util.LocalFileUtils;
 import org.apache.sysml.runtime.util.MapReduceTool;
 import org.apache.sysml.utils.Explain;
+import org.apache.sysml.utils.NativeHelper;
 import org.apache.sysml.utils.Explain.ExplainCounts;
 import org.apache.sysml.utils.Explain.ExplainType;
 import org.apache.sysml.utils.Statistics;
@@ -125,7 +126,8 @@ public class DMLScript
        public static long              EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;   
                      // maximum number of bytes to use for shadow buffer
        public static long              EVICTION_SHADOW_BUFFER_CURR_BYTES = 0;  
                      // number of bytes to use for shadow buffer
        public static double                    GPU_MEMORY_UTILIZATION_FACTOR = 
0.9;                                              // fraction of available GPU 
memory to use
-
+       public static String                    GPU_MEMORY_ALLOCATOR = "cuda";  
                                                          // GPU memory 
allocator to use
+       
        /**
         * Global variable indicating the script type (DML or PYDML). Can be 
used
         * for DML/PYDML-specific tasks, such as outputting booleans in the 
correct
@@ -416,16 +418,8 @@ public class DMLScript
                CompilerConfig cconf = 
OptimizerUtils.constructCompilerConfig(dmlconf);
                ConfigurationManager.setGlobalConfig(cconf);
                LOG.debug("\nDML config: \n" + dmlconf.getConfigInfo());
-
-               // Sets the GPUs to use for this process (a range, all GPUs, 
comma separated list or a specific GPU)
-               GPUContextPool.AVAILABLE_GPUS = 
dmlconf.getTextValue(DMLConfig.AVAILABLE_GPUS);
                
-               String evictionPolicy = 
dmlconf.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase();
-               try {
-                       DMLScript.GPU_EVICTION_POLICY = 
EvictionPolicy.valueOf(evictionPolicy);
-               } catch(IllegalArgumentException e) {
-                       throw new RuntimeException("Unsupported eviction 
policy:" + evictionPolicy);
-               }
+               setGlobalFlags(dmlconf);
 
                //Step 2: set local/remote memory if requested (for compile in 
AM context) 
                if( dmlconf.getBooleanValue(DMLConfig.YARN_APPMASTER) ){
@@ -499,6 +493,62 @@ public class DMLScript
        }
        
        /**
+        * Sets the global flags in DMLScript based on user provided 
configuration
+        * 
+        * @param dmlconf user provided configuration
+        */
+       public static void setGlobalFlags(DMLConfig dmlconf) {
+               // Sets the GPUs to use for this process (a range, all GPUs, 
comma separated list or a specific GPU)
+               GPUContextPool.AVAILABLE_GPUS = 
dmlconf.getTextValue(DMLConfig.AVAILABLE_GPUS);
+               
+               String evictionPolicy = 
dmlconf.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase();
+               try {
+                       DMLScript.GPU_EVICTION_POLICY = 
EvictionPolicy.valueOf(evictionPolicy);
+               } catch(IllegalArgumentException e) {
+                       throw new RuntimeException("Unsupported eviction 
policy:" + evictionPolicy);
+               }
+               
+               // Whether extra statistics useful for developers and others 
interested
+               // in digging into performance problems are recorded and 
displayed
+               DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && 
dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
+               CacheableData.CACHING_BUFFER_SIZE = 
dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE);
+               if(CacheableData.CACHING_BUFFER_SIZE < 0 || 
CacheableData.CACHING_BUFFER_SIZE > 1) 
+                       throw new RuntimeException("Incorrect value (" + 
CacheableData.CACHING_BUFFER_SIZE + ") for the configuration " + 
DMLConfig.CACHING_BUFFER_SIZE);
+               
+               DMLScript.STATISTICS_MAX_WRAP_LEN = 
dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);          
+               
NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), 
dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim());
+               
+               DMLScript.SYNCHRONIZE_GPU = 
dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
+               DMLScript.EAGER_CUDA_FREE = 
dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
+               DMLScript.PRINT_GPU_MEMORY_INFO = 
dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
+               DMLScript.GPU_MEMORY_UTILIZATION_FACTOR = 
dmlconf.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+               DMLScript.GPU_MEMORY_ALLOCATOR = 
dmlconf.getTextValue(DMLConfig.GPU_MEMORY_ALLOCATOR);
+               if(DMLScript.GPU_MEMORY_UTILIZATION_FACTOR < 0) {
+                       throw new RuntimeException("Incorrect value (" + 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR + ") for the configuration:" + 
DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
+               }
+               
+               DMLScript.FLOATING_POINT_PRECISION = 
dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION);
+               
org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision();
+               if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) {
+                       DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
+               }
+               else {
+                       double shadowBufferSize = 
dmlconf.getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+                       if(shadowBufferSize < 0 || shadowBufferSize > 1) 
+                               throw new RuntimeException("Incorrect value (" 
+ shadowBufferSize + ") for the configuration:" + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
+                       DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = (long) 
(((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
+                       if(DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES > 0 && 
+                                       
DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES > 
DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES) {
+                               // This will be printed in a very rare 
situation when:
+                               // 1. There is a memory leak which leads to 
non-cleared shadow buffer OR
+                               // 2. MLContext is registering to bunch of 
outputs that are all part of shadow buffer
+                               System.out.println("WARN: Cannot use the shadow 
buffer due to potentially cached GPU objects. Current shadow buffer size (in 
bytes):" 
+                                       + 
DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + " > Max shadow buffer size (in 
bytes):" + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES);
+                       }
+               }
+       }
+       
+       /**
         * Launcher for DML debugger. This method should be called after 
         * execution and debug properties have been correctly set, and 
customized parameters
         * 

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java 
b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
index 7a97fcf..4af6949 100644
--- a/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
+++ b/src/main/java/org/apache/sysml/api/ScriptExecutorUtils.java
@@ -28,15 +28,12 @@ import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.codegen.SpoofCompiler;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.Program;
-import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.instructions.cp.Data;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
-import org.apache.sysml.utils.NativeHelper;
 import org.apache.sysml.utils.Statistics;
 
 public class ScriptExecutorUtils {
@@ -75,47 +72,6 @@ public class ScriptExecutorUtils {
         *            output variables that were registered as part of MLContext
         */
        public static void executeRuntimeProgram(Program rtprog, 
ExecutionContext ec, DMLConfig dmlconf, int statisticsMaxHeavyHitters, 
Set<String> outputVariables) {
-               // Whether extra statistics useful for developers and others 
interested
-               // in digging into performance problems are recorded and 
displayed
-               DMLScript.FINEGRAINED_STATISTICS = DMLScript.STATISTICS && 
dmlconf.getBooleanValue(DMLConfig.EXTRA_FINEGRAINED_STATS);
-               CacheableData.CACHING_BUFFER_SIZE = 
dmlconf.getDoubleValue(DMLConfig.CACHING_BUFFER_SIZE);
-               if(CacheableData.CACHING_BUFFER_SIZE < 0 || 
CacheableData.CACHING_BUFFER_SIZE > 1) 
-                       throw new RuntimeException("Incorrect value (" + 
CacheableData.CACHING_BUFFER_SIZE + ") for the configuration " + 
DMLConfig.CACHING_BUFFER_SIZE);
-               
-               DMLScript.STATISTICS_MAX_WRAP_LEN = 
dmlconf.getIntValue(DMLConfig.STATS_MAX_WRAP_LEN);          
-               
NativeHelper.initialize(dmlconf.getTextValue(DMLConfig.NATIVE_BLAS_DIR), 
dmlconf.getTextValue(DMLConfig.NATIVE_BLAS).trim());
-               
-               if(DMLScript.USE_ACCELERATOR) {
-                       DMLScript.SYNCHRONIZE_GPU = 
dmlconf.getBooleanValue(DMLConfig.SYNCHRONIZE_GPU);
-                       DMLScript.EAGER_CUDA_FREE = 
dmlconf.getBooleanValue(DMLConfig.EAGER_CUDA_FREE);
-                       DMLScript.PRINT_GPU_MEMORY_INFO = 
dmlconf.getBooleanValue(DMLConfig.PRINT_GPU_MEMORY_INFO);
-                       DMLScript.GPU_MEMORY_UTILIZATION_FACTOR = 
dmlconf.getDoubleValue(DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-                       if(DMLScript.GPU_MEMORY_UTILIZATION_FACTOR < 0 || 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR > 1) {
-                               throw new RuntimeException("Incorrect value (" 
+ DMLScript.GPU_MEMORY_UTILIZATION_FACTOR + ") for the configuration:" + 
DMLConfig.GPU_MEMORY_UTILIZATION_FACTOR);
-                       }
-                       
-                       DMLScript.FLOATING_POINT_PRECISION = 
dmlconf.getTextValue(DMLConfig.FLOATING_POINT_PRECISION);
-                       
org.apache.sysml.runtime.matrix.data.LibMatrixCUDA.resetFloatingPointPrecision();
-                       if(DMLScript.FLOATING_POINT_PRECISION.equals("double")) 
{
-                               DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 0;
-                       }
-                       else {
-                               double shadowBufferSize = 
dmlconf.getDoubleValue(DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-                               if(shadowBufferSize < 0 || shadowBufferSize > 
1) 
-                                       throw new RuntimeException("Incorrect 
value (" + shadowBufferSize + ") for the configuration:" + 
DMLConfig.EVICTION_SHADOW_BUFFERSIZE);
-                               DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES = 
(long) (((double)InfrastructureAnalyzer.getLocalMaxMemory())*shadowBufferSize);
-                               if(DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES > 
0 && 
-                                               
DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES > 
DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES) {
-                                       // This will be printed in a very rare 
situation when:
-                                       // 1. There is a memory leak which 
leads to non-cleared shadow buffer OR
-                                       // 2. MLContext is registering to bunch 
of outputs that are all part of shadow buffer
-                                       System.out.println("WARN: Cannot use 
the shadow buffer due to potentially cached GPU objects. Current shadow buffer 
size (in bytes):" 
-                                               + 
DMLScript.EVICTION_SHADOW_BUFFER_CURR_BYTES + " > Max shadow buffer size (in 
bytes):" + DMLScript.EVICTION_SHADOW_BUFFER_MAX_BYTES);
-                               }
-                       }
-               }
-               
-
                boolean exceptionThrown = false;
 
                Statistics.startRunTimer();

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java 
b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
index 00fc096..8e3bdaf 100644
--- a/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
+++ b/src/main/java/org/apache/sysml/api/mlcontext/ScriptExecutor.java
@@ -246,16 +246,7 @@ public class ScriptExecutor {
                        throw new RuntimeException(ex);
                }
 
-               // set the GPUs to use for this process (a range, all GPUs, 
comma separated list or a specific GPU)
-               GPUContextPool.AVAILABLE_GPUS = 
config.getTextValue(DMLConfig.AVAILABLE_GPUS);
-               
-               String evictionPolicy = 
config.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase();
-               try {
-                       DMLScript.GPU_EVICTION_POLICY = 
EvictionPolicy.valueOf(evictionPolicy);
-               } 
-               catch(IllegalArgumentException e) {
-                       throw new RuntimeException("Unsupported eviction 
policy:" + evictionPolicy);
-               }
+               DMLScript.setGlobalFlags(config);
        }
        
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/conf/DMLConfig.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/conf/DMLConfig.java 
b/src/main/java/org/apache/sysml/conf/DMLConfig.java
index 4aad400..1333075 100644
--- a/src/main/java/org/apache/sysml/conf/DMLConfig.java
+++ b/src/main/java/org/apache/sysml/conf/DMLConfig.java
@@ -92,6 +92,7 @@ public class DMLConfig
        // Fraction of available memory to use. The available memory is 
computed when the GPUContext is created
        // to handle the tradeoff on calling cudaMemGetInfo too often.
        public static final String GPU_MEMORY_UTILIZATION_FACTOR = 
"sysml.gpu.memory.util.factor";
+       public static final String GPU_MEMORY_ALLOCATOR = 
"sysml.gpu.memory.allocator"; // String to specify the memory allocator to use. 
Supported values are: cuda, unified_memory
        public static final String FLOATING_POINT_PRECISION = 
"sysml.floating.point.precision"; // String to specify the datatype to use 
internally: supported values are double, single
        public static final String PRINT_GPU_MEMORY_INFO = 
"sysml.gpu.print.memoryInfo";
        public static final String EVICTION_SHADOW_BUFFERSIZE = 
"sysml.gpu.eviction.shadow.bufferSize";
@@ -140,13 +141,13 @@ public class DMLConfig
                _defaultVals.put(EVICTION_SHADOW_BUFFERSIZE,  "0.0" );
                _defaultVals.put(STATS_MAX_WRAP_LEN,     "30" );
                _defaultVals.put(GPU_MEMORY_UTILIZATION_FACTOR,      "0.9" );
+               _defaultVals.put(GPU_MEMORY_ALLOCATOR,   "cuda");
                _defaultVals.put(AVAILABLE_GPUS,         "-1");
                _defaultVals.put(GPU_EVICTION_POLICY,    "align_memory");
                _defaultVals.put(SYNCHRONIZE_GPU,        "false" );
                _defaultVals.put(CACHING_BUFFER_SIZE,    "0.15" );
                _defaultVals.put(EAGER_CUDA_FREE,        "false" );
                _defaultVals.put(FLOATING_POINT_PRECISION,               
"double" );
-               _defaultVals.put(PRINT_GPU_MEMORY_INFO,  "false");
        }
        
        public DMLConfig() {
@@ -428,7 +429,8 @@ public class DMLConfig
                                COMPRESSED_LINALG, 
                                CODEGEN, CODEGEN_COMPILER, CODEGEN_OPTIMIZER, 
CODEGEN_PLANCACHE, CODEGEN_LITERALS,
                                EXTRA_FINEGRAINED_STATS, STATS_MAX_WRAP_LEN, 
PRINT_GPU_MEMORY_INFO, CACHING_BUFFER_SIZE,
-                               AVAILABLE_GPUS, SYNCHRONIZE_GPU, 
EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, 
EVICTION_SHADOW_BUFFERSIZE
+                               AVAILABLE_GPUS, SYNCHRONIZE_GPU, 
EAGER_CUDA_FREE, FLOATING_POINT_PRECISION, GPU_EVICTION_POLICY, 
EVICTION_SHADOW_BUFFERSIZE,
+                               GPU_MEMORY_ALLOCATOR, 
GPU_MEMORY_UTILIZATION_FACTOR
                }; 
                
                StringBuilder sb = new StringBuilder();

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java
new file mode 100644
index 0000000..e74bea3
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/CudaMemoryAllocator.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+
+import org.apache.sysml.api.DMLScript;
+
+import jcuda.CudaException;
+import jcuda.Pointer;
+import static jcuda.runtime.cudaError.cudaSuccess;
+import static jcuda.runtime.JCuda.cudaMalloc;
+import jcuda.runtime.cudaError;
+import static jcuda.runtime.JCuda.cudaFree;
+
+public class CudaMemoryAllocator implements GPUMemoryAllocator {
+       
+       /**
+        * Allocate memory on the device. 
+        * 
+        * @param devPtr Pointer to allocated device memory
+        * @param size size in bytes
+        * @throws jcuda.CudaException if unable to allocate
+        */
+       public void allocate(Pointer devPtr, long size) throws CudaException {
+               int status = cudaMalloc(devPtr, size);
+               if(status != cudaSuccess) {
+                       throw new jcuda.CudaException("cudaMalloc failed:" + 
cudaError.stringFor(status));
+               }
+       }
+
+       /**
+        * Frees memory on the device
+        * 
+        * @param devPtr Device pointer to memory to free
+        * @throws jcuda.CudaException if error occurs
+        */
+       public void free(Pointer devPtr) throws CudaException {
+               int status = cudaFree(devPtr);
+               if(status != cudaSuccess) {
+                       throw new jcuda.CudaException("cudaFree failed:" + 
cudaError.stringFor(status));
+               }
+       }
+
+       /**
+        * Check if there is enough memory to allocate a pointer of given size 
+        * 
+        * @param size size in bytes
+        * @return true if there is enough available memory to allocate a 
pointer of the given size 
+        */
+       public boolean canAllocate(long size) {
+               return size <= getAvailableMemory();
+       }
+       
+       /**
+        * Gets the available memory on GPU that SystemML can use.
+        *
+        * @return the available memory in bytes
+        */
+       public long getAvailableMemory() {
+               long free[] = { 0 };
+               long total[] = { 0 };
+               cudaMemGetInfo(free, total);
+               return (long) (free[0] * 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
index 2ac92a7..180a60f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContext.java
@@ -211,7 +211,7 @@ public class GPUContext {
         * @return the available memory in bytes
         */
        public long getAvailableMemory() {
-               return memoryManager.getAvailableMemory();
+               return memoryManager.allocator.getAvailableMemory();
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java
new file mode 100644
index 0000000..0c99127
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryAllocator.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import jcuda.Pointer;
+
+public interface GPUMemoryAllocator {
+       
+       /**
+        * Allocate memory on the device. 
+        * 
+        * @param devPtr Pointer to allocated device memory
+        * @param size size in bytes
+        * @throws jcuda.CudaException if unable to allocate
+        */
+       public void allocate(Pointer devPtr, long size) throws 
jcuda.CudaException;
+       
+       /**
+        * Frees memory on the device
+        * 
+        * @param devPtr Device pointer to memory to free
+        * @throws jcuda.CudaException if error occurs
+        */
+       public void free(Pointer devPtr) throws jcuda.CudaException;
+       
+       /**
+        * Check if there is enough memory to allocate a pointer of given size 
+        * 
+        * @param size size in bytes
+        * @return true if there is enough available memory to allocate a 
pointer of the given size 
+        */
+       public boolean canAllocate(long size);
+       
+       /**
+        * Gets the available memory on GPU that SystemML can use.
+        *
+        * @return the available memory in bytes
+        */
+       public long getAvailableMemory();
+       
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
index c4ae253..75bce87 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUMemoryManager.java
@@ -18,8 +18,6 @@
  */
 package org.apache.sysml.runtime.instructions.gpu.context;
 
-import static jcuda.runtime.JCuda.cudaFree;
-import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaMemGetInfo;
 import static jcuda.runtime.JCuda.cudaMemset;
 
@@ -37,6 +35,7 @@ import java.util.stream.Collectors;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
@@ -55,6 +54,7 @@ public class GPUMemoryManager {
        private static final boolean DEBUG_MEMORY_LEAK = false;
        private static final int [] DEBUG_MEMORY_LEAK_STACKTRACE_DEPTH = {5, 6, 
7, 8, 9, 10}; // Avoids printing too much text while debugging
        
+       protected final GPUMemoryAllocator allocator;
        
/*****************************************************************************************/
        // GPU Memory is divided into three major sections:
        // 1. Matrix Memory: Memory allocated to matrices in SystemML and 
addressable by GPUObjects.
@@ -65,7 +65,7 @@ public class GPUMemoryManager {
        // To get the GPUObjects/Pointers in this section, please use 
getGPUObjects and getPointers methods of GPUMatrixMemoryManager.
        // To clear GPUObjects/Pointers in this section, please use clear and 
clearAll methods of GPUMatrixMemoryManager.
        // Both these methods allow to get/clear unlocked/locked and 
dirty/non-dirty objects of a certain size.
-       GPUMatrixMemoryManager matrixMemoryManager;
+       protected final GPUMatrixMemoryManager matrixMemoryManager;
        public GPUMatrixMemoryManager getGPUMatrixMemoryManager() {
                return matrixMemoryManager;
        }
@@ -73,7 +73,7 @@ public class GPUMemoryManager {
        // 2. Rmvar-ed pointers: If sysml.gpu.eager.cudaFree is set to false,
        // then this manager caches pointers of the GPUObject on which rmvar 
instruction has been executed for future reuse.
        // We observe 2-3x improvement with this approach and hence recommend 
to set this flag to false.
-       GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager;
+       protected final GPULazyCudaFreeMemoryManager lazyCudaFreeMemoryManager;
        public GPULazyCudaFreeMemoryManager getGPULazyCudaFreeMemoryManager() {
                return lazyCudaFreeMemoryManager;
        }
@@ -90,7 +90,7 @@ public class GPUMemoryManager {
        /**
         * To record size of all allocated pointers allocated by above memory 
managers
         */
-       HashMap<Pointer, PointerInfo> allPointers = new HashMap<>();
+       protected final HashMap<Pointer, PointerInfo> allPointers = new 
HashMap<>();
        
        
/*****************************************************************************************/
        
@@ -131,6 +131,16 @@ public class GPUMemoryManager {
        public GPUMemoryManager(GPUContext gpuCtx) {
                matrixMemoryManager = new GPUMatrixMemoryManager(this);
                lazyCudaFreeMemoryManager = new 
GPULazyCudaFreeMemoryManager(this);
+               if(DMLScript.GPU_MEMORY_ALLOCATOR.equals("cuda")) {
+                       allocator = new CudaMemoryAllocator();
+               }
+               else 
if(DMLScript.GPU_MEMORY_ALLOCATOR.equals("unified_memory")) {
+                       allocator = new UnifiedMemoryAllocator();
+               }
+               else {
+                       throw new RuntimeException("Unsupported value (" + 
DMLScript.GPU_MEMORY_ALLOCATOR + ") for the configuration " + 
DMLConfig.GPU_MEMORY_ALLOCATOR 
+                                       + ". Supported values are cuda, 
unified_memory.");
+               }
                long free[] = { 0 };
                long total[] = { 0 };
                cudaMemGetInfo(free, total);
@@ -159,7 +169,7 @@ public class GPUMemoryManager {
        private Pointer cudaMallocNoWarn(Pointer A, long size, String 
printDebugMessage) {
                long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
                try {
-                       cudaMalloc(A, size);
+                       allocator.allocate(A, size);
                        allPointers.put(A, new PointerInfo(size));
                        if(DMLScript.STATISTICS) {
                                long totalTime = System.nanoTime() - t0;
@@ -241,7 +251,7 @@ public class GPUMemoryManager {
                Pointer tmpA = (A == null) ? new Pointer() : null;
                // Step 2: Allocate a new pointer in the GPU memory (since 
memory is available)
                // Step 3 has potential to create holes as well as limit future 
reuse, hence perform this step before step 3.
-               if(A == null && size <= getAvailableMemory()) {
+               if(A == null && allocator.canAllocate(size)) {
                        // This can fail in case of fragmented memory, so don't 
issue any warning
                        A = cudaMallocNoWarn(tmpA, size, "allocate a new 
pointer");
                }
@@ -262,7 +272,7 @@ public class GPUMemoryManager {
                // than doing cuda free/malloc/memset. So, rmvar-ing every 
blocks (step 4) is preferred to eviction (step 5).
                if(A == null) {
                        lazyCudaFreeMemoryManager.clearAll();
-                       if(size <= getAvailableMemory()) {
+                       if(allocator.canAllocate(size)) {
                                // This can fail in case of fragmented memory, 
so don't issue any warning
                                A = cudaMallocNoWarn(tmpA, size, "allocate a 
new pointer after eager free");
                        }
@@ -292,14 +302,26 @@ public class GPUMemoryManager {
                // Step 6: Try eviction/clearing one-by-one based on the given 
policy without size restriction
                if(A == null) {
                        long t0 =  DMLScript.STATISTICS ? System.nanoTime() : 0;
+                       long currentAvailableMemory = 
allocator.getAvailableMemory();
+                       boolean canFit = false;
                        // 
---------------------------------------------------------------
                        // Evict unlocked GPU objects one-by-one and try malloc
                        List<GPUObject> unlockedGPUObjects = 
matrixMemoryManager.gpuObjects.stream()
                                                .filter(gpuObj -> 
!gpuObj.isLocked()).collect(Collectors.toList());
                        Collections.sort(unlockedGPUObjects, new 
EvictionPolicyBasedComparator(size));
                        while(A == null && unlockedGPUObjects.size() > 0) {
-                               
evictOrClear(unlockedGPUObjects.remove(unlockedGPUObjects.size()-1), opcode);
-                               A = cudaMallocNoWarn(tmpA, size, null);
+                               GPUObject evictedGPUObject = 
unlockedGPUObjects.remove(unlockedGPUObjects.size()-1);
+                               evictOrClear(evictedGPUObject, opcode);
+                               if(!canFit) {
+                                       currentAvailableMemory += 
evictedGPUObject.getSizeOnDevice();
+                                       if(currentAvailableMemory >= size)
+                                               canFit = true;
+                               }
+                               if(canFit) {
+                                       // Checking before invoking cudaMalloc 
reduces the time spent in unnecessary cudaMalloc.
+                                       // This was the bottleneck for 
ResNet200 experiments with batch size > 32 on P100+Intel
+                                       A = cudaMallocNoWarn(tmpA, size, null); 
+                               }
                                if(DMLScript.STATISTICS) 
                                        
GPUStatistics.cudaEvictCount.increment();
                        }
@@ -382,7 +404,7 @@ public class GPUMemoryManager {
                        }
                        allPointers.remove(toFree);
                        lazyCudaFreeMemoryManager.removeIfPresent(size, toFree);
-                       cudaFree(toFree);
+                       allocator.free(toFree);
                        // JCuda.cudaDeviceSynchronize(); // Force a device 
synchronize after free-ing the pointer for debugging
                }
                else {
@@ -587,23 +609,15 @@ public class GPUMemoryManager {
                return ret.toString();
        }
        
-       /**
-        * Gets the available memory on GPU that SystemML can use.
-        *
-        * @return the available memory in bytes
-        */
-       public long getAvailableMemory() {
-               long free[] = { 0 };
-               long total[] = { 0 };
-               cudaMemGetInfo(free, total);
-               return (long) (free[0] * 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
-       }
-       
        private static class CustomPointer extends Pointer {
                public CustomPointer(Pointer p) {
                        super(p);
                }
                
+               public CustomPointer() {
+                       super();
+               }
+               
                @Override
                public long getNativePointer() {
                        return super.getNativePointer();

http://git-wip-us.apache.org/repos/asf/systemml/blob/13baec95/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java
new file mode 100644
index 0000000..71c6fc3
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/UnifiedMemoryAllocator.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.instructions.gpu.context;
+
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaMallocManaged;
+import static jcuda.runtime.JCuda.cudaMemGetInfo;
+import static jcuda.runtime.cudaError.cudaSuccess;
+import static jcuda.runtime.JCuda.cudaMemAttachGlobal;
+import org.apache.sysml.api.DMLScript;
+
+import jcuda.CudaException;
+import jcuda.Pointer;
+import jcuda.runtime.cudaError;
+
+public class UnifiedMemoryAllocator  implements GPUMemoryAllocator {
+
+       /**
+        * Allocate memory on the device. 
+        * 
+        * @param devPtr Pointer to allocated device memory
+        * @param size size in bytes
+        * @throws jcuda.CudaException if unable to allocate
+        */
+       public void allocate(Pointer devPtr, long size) throws CudaException {
+               int status = cudaMallocManaged(devPtr, size, 
cudaMemAttachGlobal);
+               if(status != cudaSuccess) {
+                       throw new jcuda.CudaException("cudaMallocManaged 
failed:" + cudaError.stringFor(status));
+               }
+               
+       }
+
+       /**
+        * Frees memory on the device
+        * 
+        * @param devPtr Device pointer to memory to free
+        * @throws jcuda.CudaException if error occurs
+        */
+       public void free(Pointer devPtr) throws CudaException {
+               int status = cudaFree(devPtr);
+               if(status != cudaSuccess) {
+                       throw new jcuda.CudaException("cudaFree failed:" + 
cudaError.stringFor(status));
+               }
+       }
+       
+       private static long maxAvailableMemory = -1;
+       private static double gpuUtilizationFactor = -1;
+       
+       /**
+        * Check if there is enough memory to allocate a pointer of given size 
+        * 
+        * @param size size in bytes
+        * @return true if there is enough available memory to allocate a 
pointer of the given size 
+        */
+       public boolean canAllocate(long size) {
+               return true; // Unified memory can allocate any amount of 
memory. Note: all allocations are guarded by SystemML's optimizer which uses 
getAvailableMemory
+       }
+       
+       /**
+        * Gets the available memory on GPU that SystemML can use.
+        *
+        * @return the available memory in bytes
+        */
+       public long getAvailableMemory() {
+               if(maxAvailableMemory < 0 || gpuUtilizationFactor != 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR) {
+                       long free[] = { 0 };
+                       long total[] = { 0 };
+                       cudaMemGetInfo(free, total);
+                       maxAvailableMemory = (long) (total[0] * 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR);
+                       gpuUtilizationFactor = 
DMLScript.GPU_MEMORY_UTILIZATION_FACTOR;
+               }
+               return maxAvailableMemory;
+       }
+
+}

Reply via email to