systemml git commit: [SYSTEMML-1701] fix need to use -force for gpu

nakul02 Mon, 19 Jun 2017 11:48:06 -0700

Repository: systemml
Updated Branches:
  refs/heads/master 3cde999c0 -> df8d4a63d



[SYSTEMML-1701] fix need to use -force for gpu

Closes #546


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/df8d4a63
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/df8d4a63
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/df8d4a63

Branch: refs/heads/master
Commit: df8d4a63d8d09cae94b6ca2634e31da554302c72
Parents: 3cde999
Author: Nakul Jindal <naku...@gmail.com>
Authored: Mon Jun 19 11:44:22 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Mon Jun 19 11:47:27 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/AggBinaryOp.java |  9 +++--
 .../java/org/apache/sysml/hops/AggUnaryOp.java  | 18 +++++-----
 .../java/org/apache/sysml/hops/BinaryOp.java    |  7 ++--
 src/main/java/org/apache/sysml/hops/Hop.java    |  4 ++-
 .../org/apache/sysml/hops/OptimizerUtils.java   |  5 +--
 .../java/org/apache/sysml/hops/ReorgOp.java     |  4 ++-
 .../java/org/apache/sysml/hops/TernaryOp.java   |  4 ++-
 .../gpu/context/GPUContextPool.java             | 35 +++++++++++++++++++-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  2 +-
 9 files changed, 66 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index 21dbbf1..c721efe 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -49,6 +49,7 @@ import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import 
org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput;
@@ -150,7 +151,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
         */
        @Override
        public Lop constructLops() 
-               throws HopsException, LopsException 
+               throws HopsException, LopsException
        {
                //return already created lops
                if( getLops() != null )
@@ -546,7 +547,8 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
                int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
                
                ExecType et = ExecType.CP;
-               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
+               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < GPUContextPool
+                               .initialGPUMemBudget())) {
                        et = ExecType.GPU;
                }
                
@@ -625,7 +627,8 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop
        {       
                Lop matmultCP = null;
                
-               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
+               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < GPUContextPool
+                               .initialGPUMemBudget())) {
                        Hop h1 = getInput().get(0);
                        Hop h2 = getInput().get(1);
                        Lop left; Lop right;

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
index 8e681c1..eb469ab 100644
--- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java
@@ -39,6 +39,7 @@ import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 
 
@@ -149,15 +150,16 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop
                                }                               
                                else { //general case           
                                        int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-                                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET)) {
+                                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool
+                                                       
.initialGPUMemBudget())) {
                                                // Only implemented methods for 
GPU
-                                               if (                     (_op 
== AggOp.SUM                      && (_direction == Direction.RowCol || 
_direction == Direction.Row || _direction == Direction.Col))
-                                                                               
|| (_op == AggOp.SUM_SQ         && (_direction == Direction.RowCol || 
_direction == Direction.Row || _direction == Direction.Col))
-                                                                               
|| (_op == AggOp.MAX                    && (_direction == Direction.RowCol || 
_direction == Direction.Row || _direction == Direction.Col))
-                                                                               
|| (_op == AggOp.MIN                    && (_direction == Direction.RowCol || 
_direction == Direction.Row || _direction == Direction.Col))
-                                                                               
|| (_op == AggOp.MEAN           && (_direction == Direction.RowCol || 
_direction == Direction.Row || _direction == Direction.Col))
-                                                                               
|| (_op == AggOp.VAR            && (_direction == Direction.RowCol || 
_direction == Direction.Row || _direction == Direction.Col))
-                                                                               
|| (_op == AggOp.PROD           && (_direction == Direction.RowCol))){
+                                               if ((_op == AggOp.SUM    && 
(_direction == Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                                || (_op == AggOp.SUM_SQ && 
(_direction == Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                                || (_op == AggOp.MAX    && 
(_direction == Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                                || (_op == AggOp.MIN    && 
(_direction == Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                                || (_op == AggOp.MEAN   && 
(_direction == Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                                || (_op == AggOp.VAR    && 
(_direction == Direction.RowCol || _direction == Direction.Row || _direction == 
Direction.Col))
+                                                || (_op == AggOp.PROD   && 
(_direction == Direction.RowCol))){
                                                        et = ExecType.GPU;
                                                        k = 1;
                                                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/BinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java
index f3a2fa0..ed0d9ad 100644
--- a/src/main/java/org/apache/sysml/hops/BinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java
@@ -53,6 +53,7 @@ import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import 
org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput;
 
@@ -578,7 +579,8 @@ public class BinaryOp extends Hop
                        else //general case
                                ot = HopsOpOp2LopsU.get(op);
                        
-                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET) 
+                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool
+                                       .initialGPUMemBudget())
                                        && (op == OpOp2.MULT || op == 
OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW) ) {
                                et = ExecType.GPU;
                        }
@@ -596,7 +598,8 @@ public class BinaryOp extends Hop
                        ExecType et = optFindExecType();
                        if ( et == ExecType.CP ) 
                        {
-                               if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET) 
+                               if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool
+                                               .initialGPUMemBudget())
                                                && (op == OpOp2.MULT || op == 
OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW || op == 
OpOp2.SOLVE)) {
                                        et = ExecType.GPU;
                                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/Hop.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java
index 6541edc..9a71e7c 100644
--- a/src/main/java/org/apache/sysml/hops/Hop.java
+++ b/src/main/java/org/apache/sysml/hops/Hop.java
@@ -44,6 +44,7 @@ import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType;
 import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
 import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.util.UtilFunctions;
@@ -787,7 +788,8 @@ public abstract class Hop
        }
        
        protected ExecType findGPUExecTypeByMemEstimate(ExecType et) {
-               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
+               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < GPUContextPool
+                               .initialGPUMemBudget())) {
                        return ExecType.GPU;
                }
                return et;

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
index 79b7ee6..dcbc27a 100644
--- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java
@@ -199,10 +199,7 @@ public class OptimizerUtils
         * 
         */
        public static final boolean ALLOW_COMBINE_FILE_INPUT_FORMAT = true;
-       
-       
-       public static long GPU_MEMORY_BUDGET = -1;
-       
+
        //////////////////////
        // Optimizer levels //
        //////////////////////

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/ReorgOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java
index 7645c46..8ed308b 100644
--- a/src/main/java/org/apache/sysml/hops/ReorgOp.java
+++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java
@@ -35,6 +35,7 @@ import org.apache.sysml.lops.LopProperties.ExecType;
 import org.apache.sysml.lops.Transform.OperationTypes;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 
 /**
@@ -151,7 +152,8 @@ public class ReorgOp extends Hop implements MultiThreadedHop
                                        setLops(lin); //if input of size 1x1, 
avoid unnecessary transpose
                                else { //general case
                                        int k = 
OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
-                                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < 
OptimizerUtils.GPU_MEMORY_BUDGET)) {
+                                       if(DMLScript.USE_ACCELERATOR && 
(DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool
+                                                       
.initialGPUMemBudget())) {
                                                et = ExecType.GPU;
                                        }
                                        Transform transform1 = new Transform( 
lin, 

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/TernaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/TernaryOp.java b/src/main/java/org/apache/sysml/hops/TernaryOp.java
index b875387..458a346 100644
--- a/src/main/java/org/apache/sysml/hops/TernaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/TernaryOp.java
@@ -42,6 +42,7 @@ import org.apache.sysml.lops.PartialAggregate.CorrectionLocationType;
 import org.apache.sysml.parser.Statement;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
+import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
 
 /** Primary use cases for now, are
@@ -649,7 +650,8 @@ public class TernaryOp extends Hop
                        throw new HopsException("Unexpected operation: " + _op 
+ ", expecting " + OpOp3.PLUS_MULT + " or" +  OpOp3.MINUS_MULT);
                
                ExecType et = null;
-               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) )
+               if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || 
getMemEstimate() < GPUContextPool
+                               .initialGPUMemBudget()) )
                        et = ExecType.GPU;
                else
                        et = optFindExecType();

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
index ac1c059..ef38da8 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java
@@ -46,6 +46,9 @@ public class GPUContextPool {
         */
        public static int PER_PROCESS_MAX_GPUS = -1;
 
+
+       private static long INITIAL_GPU_MEMORY_BUDGET = -1;
+
        /**
         * Whether cuda has been initialized
         */
@@ -80,6 +83,7 @@ public class GPUContextPool {
         * @throws DMLRuntimeException ?
         */
        public synchronized static void initializeGPU() throws 
DMLRuntimeException {
+               initialized = true;
                GPUContext.LOG.info("Initializing CUDA");
                long start = System.nanoTime();
                JCuda.setExceptionsEnabled(true);
@@ -110,7 +114,22 @@ public class GPUContextPool {
                        pool.add(gCtx);
                }
 
+               // Initialize the initial memory budget
+               // If there are heterogeneous GPUs on the machine (different 
memory sizes)
+               // initially available memory is set to the GPU with the lowest 
memory
+               // This is because at runtime, we wouldn't know which GPU a 
certain
+               // operation gets scheduled on
+               long minAvailableMemory = Integer.MAX_VALUE;
+               for (GPUContext gCtx : pool) {
+                       gCtx.initializeThread();
+                       minAvailableMemory = Math.min(minAvailableMemory, 
gCtx.getAvailableMemory());
+               }
+               INITIAL_GPU_MEMORY_BUDGET = minAvailableMemory;
+
+
                GPUContext.LOG.info("Total number of GPUs on the machine: " + 
deviceCount);
+               GPUContext.LOG.info("Initial GPU memory: " + 
initialGPUMemBudget());
+
                //int[] device = {-1};
                //cudaGetDevice(device);
                //cudaDeviceProp prop = getGPUProperties(device[0]);
@@ -119,7 +138,6 @@ public class GPUContextPool {
                //long sharedMemPerBlock = prop.sharedMemPerBlock;
                //LOG.debug("Active CUDA device number : " + device[0]);
                //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + 
maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock);
-               initialized = true;
                GPUStatistics.cudaInitTime = System.nanoTime() - start;
        }
 
@@ -187,4 +205,19 @@ public class GPUContextPool {
 
        }
 
+       /**
+        * Gets the initial GPU memory budget. This is the minimum of the
+        * available memories across all the GPUs on the machine(s)
+        * @return minimum available memory
+        * @throws RuntimeException if error initializing the GPUs
+        */
+       public static synchronized long initialGPUMemBudget() throws 
RuntimeException {
+               try {
+                       if (!initialized)
+                               initializeGPU();
+                       return INITIAL_GPU_MEMORY_BUDGET;
+               } catch (DMLRuntimeException e){
+                       throw new RuntimeException(e);
+               }
+       }
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 48dd391..d8e0068 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -3261,7 +3261,7 @@ public class LibMatrixCUDA {
 
                        // step 4: compute QR factorization
             Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE);
-            Pointer tau = gCtx.allocate(instName, Math.max(m, m) * 
Sizeof.DOUBLE);
+            Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE);
             Pointer devInfo = gCtx.allocate(Sizeof.INT);
                        if (GPUStatistics.DISPLAY_STATISTICS) t0 = 
System.nanoTime();
                        
JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, 
lwork[0], devInfo);

systemml git commit: [SYSTEMML-1701] fix need to use -force for gpu

Reply via email to