Repository: systemml Updated Branches: refs/heads/master 070f93976 -> ae9eadc85
[SYSTEMML-2450] Fix parfor optimizer partitioning rewrite, univar stats With SYSTEMML-2357 we reworked the parfor partitioning rewrites from statements to hops for correctness and to leverage other compilation phases. However, this led to a performance regression on perftest univar stats, 10Mx1K, because two matrices instead of one were identified as partitioning candidates and hence the fused data-partition-execute did not apply (only valid for a single partitioned matrix). We now guard this rewrite with a minimal memory constraint based on local and remote memory budgets (as is done anyway at runtime), which improved the performance of univariate stats 10Mx1K from 452s to 218s end-to-end. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ae9eadc8 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ae9eadc8 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ae9eadc8 Branch: refs/heads/master Commit: ae9eadc85597d239043732be83db04a06a17abbc Parents: 070f939 Author: Matthias Boehm <[email protected]> Authored: Tue Jul 17 21:21:17 2018 -0700 Committer: Matthias Boehm <[email protected]> Committed: Tue Jul 17 21:21:17 2018 -0700 ---------------------------------------------------------------------- .../parfor/opt/OptimizerRuleBased.java | 29 +++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/ae9eadc8/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java index d345d01..035b265 100644 --- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java +++ b/src/main/java/org/apache/sysml/runtime/controlprogram/parfor/opt/OptimizerRuleBased.java @@ -111,9 +111,9 @@ import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer; * - 3) rewrite result partitioning (incl. recompile LIX) * - 4) rewrite set execution strategy * - 5) rewrite set operations exec type (incl. recompile) - * - 6) rewrite use data colocation + * - 6) rewrite use data colocation * - 7) rewrite set partition replication factor - * - 8) rewrite set export replication factor + * - 8) rewrite set export replication factor * - 9) rewrite use nested parallelism * - 10) rewrite set degree of parallelism * - 11) rewrite set task partitioner @@ -125,10 +125,10 @@ import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer; * - 17) rewrite inject spark loop checkpointing * - 18) rewrite inject spark repartition (for zipmm) * - 19) rewrite set spark eager rdd caching - * - 20) rewrite set result merge + * - 20) rewrite set result merge * - 21) rewrite set recompile memory budget - * - 22) rewrite remove recursive parfor - * - 23) rewrite remove unnecessary parfor + * - 22) rewrite remove recursive parfor + * - 23) rewrite remove unnecessary parfor * * TODO fuse also result merge into fused data partitioning and execute * (for writing the result directly from execute we need to partition @@ -414,8 +414,10 @@ public class OptimizerRuleBased extends Optimizer HashMap<String, PartitionFormat> cand2 = new HashMap<>(); for( String c : pfsb.getReadOnlyParentMatrixVars() ) { PartitionFormat dpf = pfsb.determineDataPartitionFormat( c ); + double mem = getMemoryEstimate(c, vars); if( dpf != PartitionFormat.NONE - && dpf._dpf != PDataPartitionFormat.BLOCK_WISE_M_N ) { + && dpf._dpf != PDataPartitionFormat.BLOCK_WISE_M_N + && mem > _lm/2 && mem > _rm/2 ) { cand2.put( c, dpf ); } } @@ -426,7 +428,7 @@ public class OptimizerRuleBased extends Optimizer PDataPartitioner REMOTE = 
OptimizerUtils.isSparkExecutionMode() ? PDataPartitioner.REMOTE_SPARK : PDataPartitioner.REMOTE_MR; - PDataPartitioner pdp = (apply)? REMOTE : PDataPartitioner.NONE; + PDataPartitioner pdp = (apply)? REMOTE : PDataPartitioner.NONE; //NOTE: since partitioning is only applied in case of MR index access, we assume a large // matrix and hence always apply REMOTE_MR (the benefit for large matrices outweigths // potentially unnecessary MR jobs for smaller matrices) @@ -521,14 +523,21 @@ public class OptimizerRuleBased extends Optimizer break; case ROW_BLOCK_WISE_N: mem = OptimizerUtils.estimateSize(dpf._N, mo.getNumColumns()); - break; + break; default: //do nothing - } + } } return mem; } + + protected double getMemoryEstimate(String varName, LocalVariableMap vars) { + Data dat = vars.get(varName); + return (dat instanceof MatrixObject) ? + OptimizerUtils.estimateSize(((MatrixObject)dat).getMatrixCharacteristics()) : + OptimizerUtils.DEFAULT_SIZE; + } protected static LopProperties.ExecType getRIXExecType( MatrixObject mo, PDataPartitionFormat dpf, boolean withSparsity ) { @@ -558,7 +567,7 @@ public class OptimizerRuleBased extends Optimizer break; default: - //do nothing + //do nothing } if( mem < OptimizerUtils.getLocalMemBudget() )
