[SYSTEMML-1881] Tuning parfor degree of parallelism (over-provisioning) This patch addresses issues of under-utilized CPU resources in parfor contexts. For example, on Kmeans or MSVM with few runs or classes that are not a factor of the number of hardware threads, we assign the remaining parallelism too conservatively. Consider Kmeans with 10 runs and 16 hardware threads - in this case, we assign k=10 to parfor and k=1 to the operations in the parfor body. This patch fine-tunes this assignment by slightly over-provisioning CPU resources, which is usually a good idea due to barriers between operations. We now assign the remaining operation parallelism with k=round(maxK/parforK).
On the perftest Kmeans 100K x 1K scenario with 50 classes, 10 runs, and 16 hardware threads, this patch improved performance from 196s to 168s Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ba73291c Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ba73291c Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ba73291c Branch: refs/heads/master Commit: ba73291c985d876eaeeb5719623461131bcc7f66 Parents: 2c57cf7 Author: Matthias Boehm <[email protected]> Authored: Sat Sep 2 14:30:11 2017 -0700 Committer: Matthias Boehm <[email protected]> Committed: Sat Sep 2 14:30:11 2017 -0700 ---------------------------------------------------------------------- .../parfor/opt/OptimizerRuleBased.java | 34 +++++++++++++------- 1 file changed, 22 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/ba73291c/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java index b8da25a..9dada01 100644 --- a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java @@ -1247,12 +1247,12 @@ public class OptimizerRuleBased extends Optimizer //constrain max parfor parallelism by problem size int parforK = (int)((_N<kMax)? _N : kMax); - - + // if gpu mode is enabled, the amount of parallelism is set to // the smaller of the number of iterations and the number of GPUs // otherwise it default to the number of CPU cores and the // operations are run in CP mode + //FIXME rework for nested parfor parallelism and body w/o gpu ops if (DMLScript.USE_ACCELERATOR) { long perGPUBudget = GPUContextPool.initialGPUMemBudget(); double maxMemUsage = getMaxCPOnlyBudget(n); @@ -1264,15 +1264,14 @@ public class OptimizerRuleBased extends Optimizer parforK + "]"); } } - //set parfor degree of parallelism pfpb.setDegreeOfParallelism(parforK); - n.setK(parforK); + n.setK(parforK); //distribute remaining parallelism - int remainParforK = (int)Math.ceil(((double)(kMax-parforK+1))/parforK); - int remainOpsK = Math.max(_lkmaxCP / parforK, 1); + int remainParforK = getRemainingParallelismParFor(kMax, parforK); + int remainOpsK = getRemainingParallelismOps(_lkmaxCP, parforK); rAssignRemainingParallelism( n, remainParforK, remainOpsK ); } else // ExecType.MR/ExecType.SPARK @@ -1334,13 +1333,13 @@ public class OptimizerRuleBased extends Optimizer //set parfor degree of parallelism long id = c.getID(); c.setK(tmpK); - ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter - .getAbstractPlanMapping().getMappedProg(id)[1]; + ParForProgramBlock pfpb = (ParForProgramBlock) + OptTreeConverter.getAbstractPlanMapping().getMappedProg(id)[1]; pfpb.setDegreeOfParallelism(tmpK); - //distribute remaining parallelism - int remainParforK = (int)Math.ceil(((double)(parforK-tmpK+1))/tmpK); - int remainOpsK = Math.max(opsK / tmpK, 1); + //distribute remaining parallelism + int remainParforK = getRemainingParallelismParFor(parforK, tmpK); + int remainOpsK = getRemainingParallelismOps(opsK, tmpK); rAssignRemainingParallelism(c, remainParforK, remainOpsK); } else if( c.getNodeType() == NodeType.HOP ) @@ -1387,7 +1386,18 @@ public class OptimizerRuleBased extends Optimizer } } } - + + private static int getRemainingParallelismParFor(int parforK, int tmpK) { + //compute max remaining parfor parallelism k such that k * tmpK <= parforK + return (int)Math.ceil((double)(parforK-tmpK+1) / tmpK); + } + + private static int getRemainingParallelismOps(int opsK, int tmpK) { + //compute max remaining operations parallelism k with slight over-provisioning + //such that k * tmpK <= 1.5 * opsK; note that if parfor already exploits the + //maximum parallelism, this will not introduce any over-provisioning. + return (int)Math.max(Math.round((double)opsK / tmpK), 1); + } /////// //REWRITE set task partitioner
