Repository: systemml Updated Branches: refs/heads/master 3cde999c0 -> df8d4a63d
[SYSTEMML-1701] fix need to use -force for gpu Closes #546 Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/df8d4a63 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/df8d4a63 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/df8d4a63 Branch: refs/heads/master Commit: df8d4a63d8d09cae94b6ca2634e31da554302c72 Parents: 3cde999 Author: Nakul Jindal <naku...@gmail.com> Authored: Mon Jun 19 11:44:22 2017 -0700 Committer: Nakul Jindal <naku...@gmail.com> Committed: Mon Jun 19 11:47:27 2017 -0700 ---------------------------------------------------------------------- .../java/org/apache/sysml/hops/AggBinaryOp.java | 9 +++-- .../java/org/apache/sysml/hops/AggUnaryOp.java | 18 +++++----- .../java/org/apache/sysml/hops/BinaryOp.java | 7 ++-- src/main/java/org/apache/sysml/hops/Hop.java | 4 ++- .../org/apache/sysml/hops/OptimizerUtils.java | 5 +-- .../java/org/apache/sysml/hops/ReorgOp.java | 4 ++- .../java/org/apache/sysml/hops/TernaryOp.java | 4 ++- .../gpu/context/GPUContextPool.java | 35 +++++++++++++++++++- .../runtime/matrix/data/LibMatrixCUDA.java | 2 +- 9 files changed, 66 insertions(+), 22 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/AggBinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java index 21dbbf1..c721efe 100644 --- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java @@ -49,6 +49,7 @@ import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput; @@ -150,7 +151,7 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop */ @Override public Lop constructLops() - throws HopsException, LopsException + throws HopsException, LopsException { //return already created lops if( getLops() != null ) @@ -546,7 +547,8 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); ExecType et = ExecType.CP; - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) { + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget())) { et = ExecType.GPU; } @@ -625,7 +627,8 @@ public class AggBinaryOp extends Hop implements MultiThreadedHop { Lop matmultCP = null; - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) { + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget())) { Hop h1 = getInput().get(0); Hop h2 = getInput().get(1); Lop left; Lop right; http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/AggUnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java index 8e681c1..eb469ab 100644 --- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java @@ -39,6 +39,7 @@ import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; @@ -149,15 +150,16 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop } else { //general case int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) { + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget())) { // Only implemented methods for GPU - if ( (_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.MIN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.MEAN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.VAR && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) - || (_op == AggOp.PROD && (_direction == Direction.RowCol))){ + if ((_op == AggOp.SUM && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.SUM_SQ && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.MAX && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.MIN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.MEAN && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.VAR && (_direction == Direction.RowCol || _direction == Direction.Row || _direction == Direction.Col)) + || (_op == AggOp.PROD && (_direction == Direction.RowCol))){ et = ExecType.GPU; k = 1; } http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/BinaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/BinaryOp.java b/src/main/java/org/apache/sysml/hops/BinaryOp.java index f3a2fa0..ed0d9ad 100644 --- a/src/main/java/org/apache/sysml/hops/BinaryOp.java +++ b/src/main/java/org/apache/sysml/hops/BinaryOp.java @@ -53,6 +53,7 @@ import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.mapred.DistributedCacheInput; @@ -578,7 +579,8 @@ public class BinaryOp extends Hop else //general case ot = HopsOpOp2LopsU.get(op); - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget()) && (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW) ) { et = ExecType.GPU; } @@ -596,7 +598,8 @@ public class BinaryOp extends Hop ExecType et = optFindExecType(); if ( et == ExecType.CP ) { - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget()) && (op == OpOp2.MULT || op == OpOp2.PLUS || op == OpOp2.MINUS || op == OpOp2.DIV || op == OpOp2.POW || op == OpOp2.SOLVE)) { et = ExecType.GPU; } http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/Hop.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/Hop.java b/src/main/java/org/apache/sysml/hops/Hop.java index 6541edc..9a71e7c 100644 --- a/src/main/java/org/apache/sysml/hops/Hop.java +++ b/src/main/java/org/apache/sysml/hops/Hop.java @@ -44,6 +44,7 @@ import org.apache.sysml.runtime.controlprogram.LocalVariableMap; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType; import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext; import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.util.UtilFunctions; @@ -787,7 +788,8 @@ public abstract class Hop } protected ExecType findGPUExecTypeByMemEstimate(ExecType et) { - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) { + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget())) { return ExecType.GPU; } return et; http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/OptimizerUtils.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java index 79b7ee6..dcbc27a 100644 --- a/src/main/java/org/apache/sysml/hops/OptimizerUtils.java +++ b/src/main/java/org/apache/sysml/hops/OptimizerUtils.java @@ -199,10 +199,7 @@ public class OptimizerUtils * */ public static final boolean ALLOW_COMBINE_FILE_INPUT_FORMAT = true; - - - public static long GPU_MEMORY_BUDGET = -1; - + ////////////////////// // Optimizer levels // ////////////////////// http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/ReorgOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/ReorgOp.java b/src/main/java/org/apache/sysml/hops/ReorgOp.java index 7645c46..8ed308b 100644 --- a/src/main/java/org/apache/sysml/hops/ReorgOp.java +++ b/src/main/java/org/apache/sysml/hops/ReorgOp.java @@ -35,6 +35,7 @@ import org.apache.sysml.lops.LopProperties.ExecType; import org.apache.sysml.lops.Transform.OperationTypes; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; /** @@ -151,7 +152,8 @@ public class ReorgOp extends Hop implements MultiThreadedHop setLops(lin); //if input of size 1x1, avoid unnecessary transpose else { //general case int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) { + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget())) { et = ExecType.GPU; } Transform transform1 = new Transform( lin, http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/hops/TernaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/TernaryOp.java b/src/main/java/org/apache/sysml/hops/TernaryOp.java index b875387..458a346 100644 --- a/src/main/java/org/apache/sysml/hops/TernaryOp.java +++ b/src/main/java/org/apache/sysml/hops/TernaryOp.java @@ -42,6 +42,7 @@ import org.apache.sysml.lops.PartialAggregate.CorrectionLocationType; import org.apache.sysml.parser.Statement; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContextPool; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; /** Primary use cases for now, are @@ -649,7 +650,8 @@ public class TernaryOp extends Hop throw new HopsException("Unexpected operation: " + _op + ", expecting " + OpOp3.PLUS_MULT + " or" + OpOp3.MINUS_MULT); ExecType et = null; - if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) ) + if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < GPUContextPool + .initialGPUMemBudget()) ) et = ExecType.GPU; else et = optFindExecType(); http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java index ac1c059..ef38da8 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUContextPool.java @@ -46,6 +46,9 @@ public class GPUContextPool { */ public static int PER_PROCESS_MAX_GPUS = -1; + + private static long INITIAL_GPU_MEMORY_BUDGET = -1; + /** * Whether cuda has been initialized */ @@ -80,6 +83,7 @@ public class GPUContextPool { * @throws DMLRuntimeException ? */ public synchronized static void initializeGPU() throws DMLRuntimeException { + initialized = true; GPUContext.LOG.info("Initializing CUDA"); long start = System.nanoTime(); JCuda.setExceptionsEnabled(true); @@ -110,7 +114,22 @@ public class GPUContextPool { pool.add(gCtx); } + // Initialize the initial memory budget + // If there are heterogeneous GPUs on the machine (different memory sizes) + // initially available memory is set to the GPU with the lowest memory + // This is because at runtime, we wouldn't know which GPU a certain + // operation gets scheduled on + long minAvailableMemory = Integer.MAX_VALUE; + for (GPUContext gCtx : pool) { + gCtx.initializeThread(); + minAvailableMemory = Math.min(minAvailableMemory, gCtx.getAvailableMemory()); + } + INITIAL_GPU_MEMORY_BUDGET = minAvailableMemory; + + GPUContext.LOG.info("Total number of GPUs on the machine: " + deviceCount); + GPUContext.LOG.info("Initial GPU memory: " + initialGPUMemBudget()); + //int[] device = {-1}; //cudaGetDevice(device); //cudaDeviceProp prop = getGPUProperties(device[0]); @@ -119,7 +138,6 @@ public class GPUContextPool { //long sharedMemPerBlock = prop.sharedMemPerBlock; //LOG.debug("Active CUDA device number : " + device[0]); //LOG.debug("Max Blocks/Threads/SharedMem on active device: " + maxBlocks + "/" + maxThreadsPerBlock + "/" + sharedMemPerBlock); - initialized = true; GPUStatistics.cudaInitTime = System.nanoTime() - start; } @@ -187,4 +205,19 @@ public class GPUContextPool { } + /** + * Gets the initial GPU memory budget. This is the minimum of the + * available memories across all the GPUs on the machine(s) + * @return minimum available memory + * @throws RuntimeException if error initializing the GPUs + */ + public static synchronized long initialGPUMemBudget() throws RuntimeException { + try { + if (!initialized) + initializeGPU(); + return INITIAL_GPU_MEMORY_BUDGET; + } catch (DMLRuntimeException e){ + throw new RuntimeException(e); + } + } } http://git-wip-us.apache.org/repos/asf/systemml/blob/df8d4a63/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 48dd391..d8e0068 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -3261,7 +3261,7 @@ public class LibMatrixCUDA { // step 4: compute QR factorization Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE); - Pointer tau = gCtx.allocate(instName, Math.max(m, m) * Sizeof.DOUBLE); + Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE); Pointer devInfo = gCtx.allocate(Sizeof.INT); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo);