Repository: systemml
Updated Branches:
  refs/heads/master 070f93976 -> ae9eadc85


[SYSTEMML-2450] Fix parfor optimizer partitioning rewrite, univar stats

With SYSTEMML-2357 we reworked the parfor partitioning rewrites from
statements to hops for correctness and to leverage other compilation
phases. However, this led to a performance regression on perftest univar
stats, 10Mx1K because two matrices instead of one were identified as
partitioning candidates and hence the fused data-partition-execute did
not apply (only valid for a single partitioned matrix).

We now guard this rewrite with a minimal memory constraint based on
local and remote memory budgets (as is done at runtime anyway), which
improved the performance of univariate stats 10Mx1K from 452s to 218s
end-to-end.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ae9eadc8
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ae9eadc8
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ae9eadc8

Branch: refs/heads/master
Commit: ae9eadc85597d239043732be83db04a06a17abbc
Parents: 070f939
Author: Matthias Boehm <[email protected]>
Authored: Tue Jul 17 21:21:17 2018 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Tue Jul 17 21:21:17 2018 -0700

----------------------------------------------------------------------
 .../parfor/opt/OptimizerRuleBased.java          | 29 +++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ae9eadc8/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
index d345d01..035b265 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java
@@ -111,9 +111,9 @@ import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;
  * - 3) rewrite result partitioning (incl. recompile LIX)
  * - 4) rewrite set execution strategy
  * - 5) rewrite set operations exec type (incl. recompile)
- * - 6) rewrite use data colocation             
+ * - 6) rewrite use data colocation
  * - 7) rewrite set partition replication factor
- * - 8) rewrite set export replication factor 
+ * - 8) rewrite set export replication factor
  * - 9) rewrite use nested parallelism 
  * - 10) rewrite set degree of parallelism
  * - 11) rewrite set task partitioner
@@ -125,10 +125,10 @@ import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;
  * - 17) rewrite inject spark loop checkpointing 
  * - 18) rewrite inject spark repartition (for zipmm)
  * - 19) rewrite set spark eager rdd caching 
- * - 20) rewrite set result merge                               
+ * - 20) rewrite set result merge
  * - 21) rewrite set recompile memory budget
- * - 22) rewrite remove recursive parfor       
- * - 23) rewrite remove unnecessary parfor             
+ * - 22) rewrite remove recursive parfor
+ * - 23) rewrite remove unnecessary parfor
  *      
  * TODO fuse also result merge into fused data partitioning and execute
  *      (for writing the result directly from execute we need to partition
@@ -414,8 +414,10 @@ public class OptimizerRuleBased extends Optimizer
                        HashMap<String, PartitionFormat> cand2 = new 
HashMap<>();
                        for( String c : pfsb.getReadOnlyParentMatrixVars() ) {
                                PartitionFormat dpf = 
pfsb.determineDataPartitionFormat( c );
+                               double mem = getMemoryEstimate(c, vars);
                                if( dpf != PartitionFormat.NONE 
-                                       && dpf._dpf != 
PDataPartitionFormat.BLOCK_WISE_M_N ) {
+                                       && dpf._dpf != 
PDataPartitionFormat.BLOCK_WISE_M_N
+                                       && mem > _lm/2 && mem > _rm/2 ) {
                                        cand2.put( c, dpf );
                                }
                        }
@@ -426,7 +428,7 @@ public class OptimizerRuleBased extends Optimizer
                
                PDataPartitioner REMOTE = OptimizerUtils.isSparkExecutionMode() 
? 
                                PDataPartitioner.REMOTE_SPARK : 
PDataPartitioner.REMOTE_MR;
-               PDataPartitioner pdp = (apply)? REMOTE : PDataPartitioner.NONE; 
        
+               PDataPartitioner pdp = (apply)? REMOTE : PDataPartitioner.NONE;
                //NOTE: since partitioning is only applied in case of MR index 
access, we assume a large
                //      matrix and hence always apply REMOTE_MR (the benefit 
for large matrices outweigths
                //      potentially unnecessary MR jobs for smaller matrices)
@@ -521,14 +523,21 @@ public class OptimizerRuleBased extends Optimizer
                                        break;
                                case ROW_BLOCK_WISE_N:
                                        mem = 
OptimizerUtils.estimateSize(dpf._N, mo.getNumColumns()); 
-                                       break;  
+                                       break;
                                default:
                                        //do nothing
-                       }       
+                       }
                }
                
                return mem;
        }
+       
+       protected double getMemoryEstimate(String varName, LocalVariableMap 
vars) {
+               Data dat = vars.get(varName);
+               return (dat instanceof MatrixObject) ? 
+                       
OptimizerUtils.estimateSize(((MatrixObject)dat).getMatrixCharacteristics()) : 
+                       OptimizerUtils.DEFAULT_SIZE;
+       }
 
        protected static LopProperties.ExecType getRIXExecType( MatrixObject 
mo, PDataPartitionFormat dpf, boolean withSparsity ) 
        {
@@ -558,7 +567,7 @@ public class OptimizerRuleBased extends Optimizer
                                break;
                                
                        default:
-                               //do nothing    
+                               //do nothing
                }
                
                if( mem < OptimizerUtils.getLocalMemBudget() )

Reply via email to