[SYSTEMML-1881] Tuning parfor degree of parallelism (over-provisioning)

This patch addresses issues of under-utilized CPU resources in parfor
contexts. For example, on Kmeans or MSVM with few runs or classes that
are not a factor of the number of hardware threads, we assign the
remaining parallelism too conservatively. Consider Kmeans with 10 runs
and 16 hardware threads - in this case, we assign k=10 to parfor and k=1
to the operations in the parfor body. This patch fine-tunes this
assignment by slightly over-provisioning CPU resources, which is usually
a good idea due to barriers between operations. We now assign the
remaining operation parallelism with k=round(maxK/parforK). 

On the perftest Kmeans 100K x 1K scenario with 50 classes, 10 runs, and
16 hardware threads, this patch improved performance from 196s to 168s


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ba73291c
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ba73291c
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ba73291c

Branch: refs/heads/master
Commit: ba73291c985d876eaeeb5719623461131bcc7f66
Parents: 2c57cf7
Author: Matthias Boehm <[email protected]>
Authored: Sat Sep 2 14:30:11 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Sat Sep 2 14:30:11 2017 -0700

----------------------------------------------------------------------
 .../parfor/opt/OptimizerRuleBased.java          | 34 +++++++++++++-------
 1 file changed, 22 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ba73291c/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
index b8da25a..9dada01 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
@@ -1247,12 +1247,12 @@ public class OptimizerRuleBased extends Optimizer
                        
                        //constrain max parfor parallelism by problem size
                        int parforK = (int)((_N<kMax)? _N : kMax);
-
-
+                       
                        // if gpu mode is enabled, the amount of parallelism is 
set to
                        // the smaller of the number of iterations and the 
number of GPUs
                        // otherwise it default to the number of CPU cores and 
the
                        // operations are run in CP mode
+                       //FIXME rework for nested parfor parallelism and body 
w/o gpu ops
                        if (DMLScript.USE_ACCELERATOR) {
                                long perGPUBudget = 
GPUContextPool.initialGPUMemBudget();
                                double maxMemUsage = getMaxCPOnlyBudget(n);
@@ -1264,15 +1264,14 @@ public class OptimizerRuleBased extends Optimizer
                                                        parforK + "]");
                                }
                        }
-
                        
                        //set parfor degree of parallelism
                        pfpb.setDegreeOfParallelism(parforK);
-                       n.setK(parforK);        
+                       n.setK(parforK);
                        
                        //distribute remaining parallelism 
-                       int remainParforK = 
(int)Math.ceil(((double)(kMax-parforK+1))/parforK);
-                       int remainOpsK = Math.max(_lkmaxCP / parforK, 1);
+                       int remainParforK = getRemainingParallelismParFor(kMax, 
parforK);
+                       int remainOpsK = getRemainingParallelismOps(_lkmaxCP, 
parforK);
                        rAssignRemainingParallelism( n, remainParforK, 
remainOpsK ); 
                }
                else // ExecType.MR/ExecType.SPARK
@@ -1334,13 +1333,13 @@ public class OptimizerRuleBased extends Optimizer
                                        //set parfor degree of parallelism
                                        long id = c.getID();
                                        c.setK(tmpK);
-                                       ParForProgramBlock pfpb = 
(ParForProgramBlock) OptTreeConverter
-                                                  
.getAbstractPlanMapping().getMappedProg(id)[1];
+                                       ParForProgramBlock pfpb = 
(ParForProgramBlock) 
+                                               
OptTreeConverter.getAbstractPlanMapping().getMappedProg(id)[1];
                                        pfpb.setDegreeOfParallelism(tmpK);
                                        
-                                       //distribute remaining parallelism 
-                                       int remainParforK = 
(int)Math.ceil(((double)(parforK-tmpK+1))/tmpK);
-                                       int remainOpsK = Math.max(opsK / tmpK, 
1);
+                                       //distribute remaining parallelism
+                                       int remainParforK = 
getRemainingParallelismParFor(parforK, tmpK);
+                                       int remainOpsK = 
getRemainingParallelismOps(opsK, tmpK);
                                        rAssignRemainingParallelism(c, 
remainParforK, remainOpsK);
                                }
                                else if( c.getNodeType() == NodeType.HOP )
@@ -1387,7 +1386,18 @@ public class OptimizerRuleBased extends Optimizer
                        }
                }
        }
-
+       
+       private static int getRemainingParallelismParFor(int parforK, int tmpK) 
{
+               //compute max remaining parfor parallelism k such that k * tmpK 
<= parforK
+               return (int)Math.ceil((double)(parforK-tmpK+1) / tmpK);
+       }
+       
+       private static int getRemainingParallelismOps(int opsK, int tmpK) {
+               //compute max remaining operations parallelism k with slight 
over-provisioning 
+               //such that k * tmpK <= 1.5 * opsK; note that if parfor already 
exploits the
+               //maximum parallelism, this will not introduce any 
over-provisioning.
+               return (int)Math.max(Math.round((double)opsK / tmpK), 1);
+       }
        
        ///////
        //REWRITE set task partitioner

Reply via email to