[SYSTEMML-1374] New candidate-based code generation algorithm

This patch replaces the existing code generation algorithm with a new
candidate-based algorithm in order to simplify further extensions,
increase fusion potential, and enable cost-based plan decisions. 

The new algorithm has three major phases: (1) candidate exploration via
a new  Open-Fuse-Merge-Close interface of individual templates, (2)
candidate selection (w/ different policies for fuse all, fuse
non-overlapping, fuse cost-based), and (3) CPlan construction for
selected candidates.

Furthermore, this patch also includes various fixes and enhancements:
* Fix plan cache (cplan hashing and input comparisons)
* Fix spoof spark instruction (cellwise ops w/ sideways matrices)
* Fix spoof base operator (handling empty sideways inputs)
* Fix handling of compiled nan, +/-inf literals 
* Fix renaming of template input cnodes (class cast exceptions)
* Fix handling of invalid and closed fusion plans
* Fix hop rewrite utils (check for outer product w/ unknown dims)
* Fix cplan cleanup (exclude literals from input hops)
* Explain functionality of cpplan memo table
* Common subexpression elimination after spoof hop insertions
* Hardened open and fuse conditions of individual templates
* More efficient dag traversal (avoid redundant child probes)
* Test applied rowagg, outer, and cell templates
* New testcase for special row aggregate pattern mmchain-sprop


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c5eddccf
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c5eddccf
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c5eddccf

Branch: refs/heads/master
Commit: c5eddccf88b00462ae223f755679d0164e63ac32
Parents: 143e614
Author: Matthias Boehm <[email protected]>
Authored: Wed Mar 15 10:57:57 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Wed Mar 15 10:57:57 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       | 168 ++++--
 .../apache/sysml/hops/codegen/cplan/CNode.java  |   2 +-
 .../sysml/hops/codegen/cplan/CNodeCell.java     |  10 +-
 .../sysml/hops/codegen/cplan/CNodeData.java     |  12 +-
 .../hops/codegen/cplan/CNodeOuterProduct.java   |   4 +-
 .../sysml/hops/codegen/cplan/CNodeRowAgg.java   | 123 +++++
 .../hops/codegen/cplan/CNodeRowAggVector.java   | 118 ----
 .../sysml/hops/codegen/cplan/CNodeTpl.java      |  26 +
 .../sysml/hops/codegen/template/BaseTpl.java    |  99 +++-
 .../hops/codegen/template/CPlanMemoTable.java   | 280 ++++++++++
 .../sysml/hops/codegen/template/CellTpl.java    | 371 +++++--------
 .../hops/codegen/template/CplanRegister.java    | 169 ------
 .../hops/codegen/template/OuterProductTpl.java  | 551 +++++--------------
 .../sysml/hops/codegen/template/RowAggTpl.java  | 377 +++++--------
 .../hops/codegen/template/TemplateUtils.java    | 233 +++-----
 .../sysml/hops/rewrite/HopRewriteUtils.java     |  16 +-
 .../sysml/runtime/codegen/SpoofCellwise.java    |  32 +-
 .../sysml/runtime/codegen/SpoofOperator.java    |  20 +-
 .../runtime/codegen/SpoofOuterProduct.java      |  38 +-
 .../instructions/cp/SpoofCPInstruction.java     |   4 +-
 .../instructions/spark/SpoofSPInstruction.java  |  12 +-
 .../functions/codegen/AlgorithmKMeans.java      |   9 +-
 .../functions/codegen/AlgorithmLinregCG.java    |   3 +-
 .../functions/codegen/CellwiseTmplTest.java     |   5 +-
 .../functions/codegen/DAGCellwiseTmplTest.java  |   3 +-
 .../functions/codegen/OuterProdTmplTest.java    |  17 +-
 .../functions/codegen/RowAggTmplTest.java       |  16 +-
 .../functions/codegen/SumProductChainTest.java  |   4 +-
 .../scripts/functions/codegen/rowAggPattern5.R  |  31 ++
 .../functions/codegen/rowAggPattern5.dml        |  28 +
 src/test/scripts/functions/codegen/wdivmm.R     |   6 +-
 src/test/scripts/functions/codegen/wdivmm.dml   |   6 +-
 .../scripts/functions/codegen/wdivmmRight.R     |   6 +-
 .../scripts/functions/codegen/wdivmmRight.dml   |   6 +-
 .../functions/codegen/wdivmmRightNotranspose.R  |   6 +-
 .../codegen/wdivmmRightNotranspose.dml          |   6 +-
 .../functions/codegen/wdivmmTransposeOut.R      |   6 +-
 .../functions/codegen/wdivmmTransposeOut.dml    |   6 +-
 .../scripts/functions/codegen/wdivmmbasic.R     |   6 +-
 .../scripts/functions/codegen/wdivmmbasic.dml   |   6 +-
 src/test/scripts/functions/codegen/wsigmoid.R   |   6 +-
 src/test/scripts/functions/codegen/wsigmoid.dml |   6 +-
 42 files changed, 1340 insertions(+), 1513 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java 
b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index f40a883..ec24d41 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -23,6 +23,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedHashMap;
 import java.util.Map.Entry;
 import java.util.concurrent.ConcurrentHashMap;
@@ -39,13 +40,17 @@ import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
 import org.apache.sysml.hops.codegen.template.BaseTpl;
-import org.apache.sysml.hops.codegen.template.CellTpl;
-import org.apache.sysml.hops.codegen.template.CplanRegister;
-import org.apache.sysml.hops.codegen.template.OuterProductTpl;
-import org.apache.sysml.hops.codegen.template.RowAggTpl;
+import org.apache.sysml.hops.codegen.template.BaseTpl.CloseType;
+import org.apache.sysml.hops.codegen.template.BaseTpl.TemplateType;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
+import org.apache.sysml.hops.codegen.template.TemplateUtils;
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.HopsException;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+import org.apache.sysml.hops.rewrite.ProgramRewriteStatus;
+import org.apache.sysml.hops.rewrite.ProgramRewriter;
+import org.apache.sysml.hops.rewrite.RewriteCommonSubexpressionElimination;
 import org.apache.sysml.parser.DMLProgram;
 import org.apache.sysml.parser.ForStatement;
 import org.apache.sysml.parser.ForStatementBlock;
@@ -82,6 +87,8 @@ public class SpoofCompiler
        //for equal operators from (1) different hop dags and (2) repeated 
recompilation 
        private static ConcurrentHashMap<CNode, Class<?>> planCache = new 
ConcurrentHashMap<CNode, Class<?>>();
        
+       private static ProgramRewriter rewriteCSE = new ProgramRewriter(new 
RewriteCommonSubexpressionElimination(true));
+       
        public static void generateCode(DMLProgram dmlp) 
                throws LanguageException, HopsException, DMLRuntimeException
        {
@@ -212,7 +219,7 @@ public class SpoofCompiler
                        cplans = cleanupCPlans(cplans);
                                        
                        //explain before modification
-                       if( LDEBUG && cplans.size() > 0 ) { //existing cplans
+                       if( LDEBUG && !cplans.isEmpty() ) { //existing cplans
                                LOG.info("Codegen EXPLAIN (before optimize): 
\n"+Explain.explainHops(roots));
                        }
                        
@@ -252,12 +259,19 @@ public class SpoofCompiler
                                        
Statistics.incrementCodegenPlanCacheTotal();
                        }
                        
-                       //generate final hop dag
-                       ret = constructModifiedHopDag(roots, cplans, clas);
-                       
-                       //explain after modification
-                       if( LDEBUG && cplans.size() > 0 ) { //existing cplans
-                               LOG.info("Codegen EXPLAIN (after optimize): 
\n"+Explain.explainHops(roots));
+                       //create modified hop dag (operator replacement and CSE)
+                       if( !cplans.isEmpty() ) 
+                       {
+                               //generate final hop dag
+                               ret = constructModifiedHopDag(roots, cplans, 
clas);
+                               
+                               //run common subexpression elimination
+                               ret = rewriteCSE.rewriteHopDAGs(ret, new 
ProgramRewriteStatus());       
+                               
+                               //explain after modification
+                               if( LDEBUG ) {
+                                       LOG.info("Codegen EXPLAIN (after 
optimize): \n"+Explain.explainHops(roots));
+                               }
                        }
                }
                catch( Exception ex ) {
@@ -278,40 +292,112 @@ public class SpoofCompiler
 
        private static HashMap<Long, Pair<Hop[],CNodeTpl>> 
constructCPlans(ArrayList<Hop> roots, boolean compileLiterals) throws 
DMLException
        {
+               //explore cplan candidates
+               CPlanMemoTable memo = new CPlanMemoTable();
+               for( Hop hop : roots )
+                       rExploreCPlans(hop, memo, compileLiterals);
+               
+               //select optimal cplan candidates
+               memo.pruneSuboptimal();
+               
+               //construct actual cplan representations
                LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> ret = new 
LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
-               for( Hop hop : roots ) {
-                       CplanRegister perRootCplans = new CplanRegister();
-                       HashSet<Long> memo = new HashSet<Long>();
-                       rConstructCPlans(hop, perRootCplans, memo, 
compileLiterals);
-                       
-                       for (Entry<Long, Pair<Hop[],CNodeTpl>> entry : 
perRootCplans.getTopLevelCplans().entrySet())
-                               if(!ret.containsKey(entry.getKey()))
-                                       ret.put(entry.getKey(), 
entry.getValue());
-               }
+               Hop.resetVisitStatus(roots);
+               for( Hop hop : roots )
+                       rConstructCPlans(hop, memo, ret, compileLiterals);
+               Hop.resetVisitStatus(roots);
+               
                return ret;
        }
        
-       private static void rConstructCPlans(Hop hop, CplanRegister cplanReg, 
HashSet<Long> memo, boolean compileLiterals) throws DMLException
+       private static void rExploreCPlans(Hop hop, CPlanMemoTable memo, 
boolean compileLiterals) 
+               throws DMLException
        {               
+               //top-down memoization of processed dag nodes
                if( memo.contains(hop.getHopID()) )
                        return;
                
-               //construct template instances
-               BaseTpl[] templates = new BaseTpl[]{
-                               new RowAggTpl(), new CellTpl(), new 
OuterProductTpl()};
+               //recursively process child nodes
+               for( Hop c : hop.getInput() )
+                       rExploreCPlans(c, memo, compileLiterals);
                
-               //process hop with all templates
-               for( BaseTpl tpl : templates ) {
-                       if( tpl.openTpl(hop) && 
tpl.findTplBoundaries(hop,cplanReg) ) {
-                               cplanReg.insertCpplans(tpl.getType(), 
-                                       tpl.constructTplCplan(compileLiterals));
-                       }               
+               //generate new node plans
+               for( BaseTpl tpl : TemplateUtils.TEMPLATES )
+                       if( tpl.open(hop) )
+                               memo.add(hop, tpl.getType());
+               
+               for( Hop c : hop.getInput() ) {
+                       if( memo.contains(c.getHopID()) )
+                               for( MemoTableEntry me : memo.get(c.getHopID()) 
) {
+                                       BaseTpl tpl = 
TemplateUtils.createTemplate(me.type, me.closed);
+                                       if( tpl.fuse(hop, c) )
+                                               genExplorePlans(tpl, hop, memo, 
hop.getInput(), c);     
+                               }       
+               }
+               
+               //prune subsumed / redundant plans
+               memo.pruneRedundant(hop.getHopID());
+               
+               //check if templates require close
+               if( memo.contains(hop.getHopID()) ) {
+                       Iterator<MemoTableEntry> iter = 
memo.get(hop.getHopID()).iterator();
+                       while( iter.hasNext() ) {
+                               MemoTableEntry me = iter.next();
+                               BaseTpl tpl = 
TemplateUtils.createTemplate(me.type);
+                               CloseType ccode = tpl.close(hop);
+                               if( ccode != CloseType.OPEN ) {
+                                       me.closed = true;
+                                       if( ccode == CloseType.CLOSED_INVALID )
+                                               iter.remove();
+                               }
+                       }
+               }
+       }
+       
+       private static void genExplorePlans(BaseTpl tpl, Hop hop, 
CPlanMemoTable memo, ArrayList<Hop> inputs, Hop exclude)
+       {
+               //handle unary operators
+               if( hop.getInput().size() == 1 ) {
+                       memo.add(hop, tpl.getType(), exclude.getHopID());
+               }
+               //handle binary operators
+               //TODO rework plan exploration step
+               else if( hop.getInput().size() == 2 ) {
+                       int input2ix = (inputs.get(0)==exclude ? 1:0);
+                       Hop input2 = inputs.get(input2ix); 
+                       long[] refs = (input2ix==1) ? new 
long[]{exclude.getHopID(), -1} : new long[]{-1, exclude.getHopID()};
+                       memo.add(hop, tpl.getType(), refs[0], refs[1]);         
+                       if( memo.contains(input2.getHopID()) && 
!memo.get(input2.getHopID()).get(0).closed
+                               && memo.get(input2.getHopID()).get(0).type == 
TemplateType.CellTpl && tpl.merge(hop, input2) ) {
+                               refs[input2ix] = input2.getHopID();
+                               memo.add(hop, tpl.getType(), refs[0], refs[1]); 
        
+                       }
+               }
+               else {
+                       LOG.warn("genExplorePlans currently only supports unary 
and binary operators.");
+               }
+       }
+       
+       private static void rConstructCPlans(Hop hop, CPlanMemoTable memo, 
HashMap<Long, Pair<Hop[],CNodeTpl>> cplans, boolean compileLiterals) 
+               throws DMLException
+       {               
+               //top-down memoization of processed dag nodes
+               if( hop.isVisited() )
+                       return;
+               
+               //generate cplan for existing memo table entry
+               if( memo.containsTopLevel(hop.getHopID()) ) {
+                       cplans.put(hop.getHopID(), TemplateUtils
+                               
.createTemplate(memo.getBest(hop.getHopID()).type)
+                               .constructCplan(hop, memo, compileLiterals));
+                       if( DMLScript.STATISTICS )
+                               Statistics.incrementCodegenCPlanCompile(1); 
                }
                
                //process childs recursively
-               memo.add(hop.getHopID());
                for( Hop c : hop.getInput() )
-                       rConstructCPlans(c, cplanReg, memo, compileLiterals);
+                       rConstructCPlans(c, memo, cplans, compileLiterals);
+               hop.setVisited();
        }
        
        ////////////////////
@@ -342,8 +428,15 @@ public class SpoofCompiler
                        CNodeTpl tmpCNode = 
cplans.get(hop.getHopID()).getValue();
                        hnew = new SpoofFusedOp(hop.getName(), 
hop.getDataType(), hop.getValueType(), 
                                        tmpCla.getValue(), false, 
tmpCNode.getOutputDimType());
-                       for( Hop in : tmpCla.getKey() ) {
-                               hnew.addInput(in); //add inputs
+                       Hop[] inHops = tmpCla.getKey();
+                       for( int i=0; i<inHops.length; i++ ) {
+                               if( tmpCNode instanceof CNodeOuterProduct 
+                                       && 
inHops[i].getHopID()==((CNodeData)tmpCNode.getInput().get(2)).getHopID()
+                                       && 
!TemplateUtils.hasTransposeParentUnderOuterProduct(inHops[i]) ) {
+                                       
hnew.addInput(HopRewriteUtils.createTranspose(inHops[i]));
+                               }
+                               else
+                                       hnew.addInput(inHops[i]); //add inputs
                        }
                        hnew.setOutputBlocksizes(hop.getRowsInBlock() , 
hop.getColsInBlock());
                        hnew.setDim1(hop.getDim1());
@@ -387,9 +480,10 @@ public class SpoofCompiler
                        else {
                                tpl.cleanupInputs(leafs);
                                ArrayList<Hop> tmp = new ArrayList<Hop>();
-                               for( Hop hop : inHops )
-                                       if( leafs.contains(hop.getHopID()) )
+                               for( Hop hop : inHops ) {
+                                       if( hop!= null && 
leafs.contains(hop.getHopID()) )
                                                tmp.add(hop);
+                               }
                                cplans2.put(e.getKey(), new 
Pair<Hop[],CNodeTpl>(
                                                tmp.toArray(new Hop[0]),tpl));
                        }
@@ -415,7 +509,7 @@ public class SpoofCompiler
        
        private static void rCollectLeafIDs(CNode node, HashSet<Long> leafs) {
                //collect leaf variable names
-               if( node instanceof CNodeData )
+               if( node instanceof CNodeData && !((CNodeData)node).isLiteral() 
)
                        leafs.add(((CNodeData) node).getHopID());
                
                //recursively process cplan

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
index 446da32..df40d58 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNode.java
@@ -186,7 +186,7 @@ public abstract class CNode
                CNode cthat = (CNode) that;
                boolean ret = _inputs.size() == cthat._inputs.size();
                for( int i=0; i<_inputs.size() && ret; i++ )
-                       ret &= _inputs.get(i).equals(_inputs.get(i));
+                       ret &= _inputs.get(i).equals(cthat._inputs.get(i));
                return ret 
                        && (_output == cthat._output || 
_output.equals(cthat._output))
                        && _dataType == cthat._dataType

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
index db278e4..493c2dd 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeCell.java
@@ -41,7 +41,7 @@ public class CNodeCell extends CNodeTpl
                        + "  public %TMP%() {\n"
                        + "    _type = CellType.%TYPE%;\n"
                        + "  }\n"
-                       + "  protected double genexecDense( double a, 
double[][] b, double[] scalars, int n, int m, int rowIndex, int colIndex) { \n"
+                       + "  protected double genexec( double a, double[][] b, 
double[] scalars, int n, int m, int rowIndex, int colIndex) { \n"
                        + "%BODY_dense%"
                        + "    return %OUT%;\n"
                        + "  } \n"
@@ -51,7 +51,7 @@ public class CNodeCell extends CNodeTpl
        private boolean _multipleConsumers = false;
        
        public CNodeCell(ArrayList<CNode> inputs, CNode output ) {
-               super(inputs,output);
+               super(inputs, output);
        }
        
        public void setMultipleConsumers(boolean flag) {
@@ -138,8 +138,10 @@ public class CNodeCell extends CNodeTpl
                        return false;
                
                CNodeCell that = (CNodeCell)o;
-               return super.equals(that)
-                       && _type == that._type;
+               return super.equals(that) 
+                       && _type == that._type
+                       && equalInputReferences(
+                               _output, that._output, _inputs, that._inputs);
        }
        
        @Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
index 6bf760f..88baa61 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeData.java
@@ -53,7 +53,14 @@ public class CNodeData extends CNode
                
        @Override
        public String getVarname() {
-               return _name;
+               if( "NaN".equals(_name) )
+                       return "Double.NaN";
+               else if( "Infinity".equals(_name) )
+                       return "Double.POSITIVE_INFINITY";
+               else if( "-Infinity".equals(_name) )
+                       return "Double.NEGATIVE_INFINITY";
+               else
+                       return _name;
        }
        
        public long getHopID() {
@@ -89,6 +96,7 @@ public class CNodeData extends CNode
        public boolean equals(Object o) {
                return (o instanceof CNodeData 
                        && super.equals(o)
-                       && (!isLiteral() || 
_name.equals(((CNodeData)o)._name)));
+                       && isLiteral() == ((CNodeData)o).isLiteral()
+                       && (isLiteral() ? _name.equals(((CNodeData)o)._name) : 
true));
        }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
index addc20b..e9d9008 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeOuterProduct.java
@@ -160,7 +160,9 @@ public class CNodeOuterProduct extends CNodeTpl
                CNodeOuterProduct that = (CNodeOuterProduct)o;
                return super.equals(that)
                        && _type == that._type
-                       && _transposeOutput == that._transposeOutput;
+                       && _transposeOutput == that._transposeOutput
+                       && equalInputReferences(
+                               _output, that._output, _inputs, that._inputs);
        }
        
        @Override

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
new file mode 100644
index 0000000..4f13c53
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+
+public class CNodeRowAgg extends CNodeTpl
+{
+       private static final String TEMPLATE = 
+                         "package codegen;\n"
+                       + "import java.util.Arrays;\n"
+                       + "import java.util.ArrayList;\n"
+                       + "import 
org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+                       + "import 
org.apache.sysml.runtime.codegen.SpoofRowAggregate;\n"
+                       + "\n"
+                       + "public final class %TMP% extends SpoofRowAggregate { 
\n"
+                       + "  public %TMP%() {\n"
+                       + "    _colVector = %FLAG%;\n"
+                       + "  }\n"
+                       + "  protected void genexecRowDense( double[] a, int 
ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
+                       + "%BODY_dense%"
+                       + "  } \n"
+                       + "  protected void genexecRowSparse( double[] avals, 
int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int 
rowIndex ) { \n"
+                       + "%BODY_sparse%"
+                       + "  } \n"                      
+                       + "}\n";
+
+       public CNodeRowAgg(ArrayList<CNode> inputs, CNode output ) {
+               super(inputs, output);
+       }
+       
+       
+       @Override
+       public String codegen(boolean sparse) {
+               // note: ignore sparse flag, generate both
+               String tmp = TEMPLATE;
+               
+               //rename inputs
+               rReplaceDataNode(_output, _inputs.get(0), "a"); // input matrix
+               renameInputs(_inputs, 1);
+               
+               //generate dense/sparse bodies
+               String tmpDense = _output.codegen(false);
+               _output.resetGenerated();
+               String tmpSparse = _output.codegen(true);
+               tmp = tmp.replaceAll("%TMP%", createVarname());
+               tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
+               tmp = tmp.replaceAll("%BODY_sparse%", tmpSparse);
+               
+               //replace outputs 
+               tmp = tmp.replaceAll("%OUT%", "c");
+               tmp = tmp.replaceAll("%POSOUT%", "0");
+               
+               //replace size information
+               tmp = tmp.replaceAll("%LEN%", "len");
+               
+               //replace colvector information and start position
+               tmp = tmp.replaceAll("%FLAG%", 
String.valueOf(_output._cols==1));
+               tmp = tmp.replaceAll("bi", "0");
+               
+               return tmp;
+       }
+
+       @Override
+       public void setOutputDims() {
+               // TODO Auto-generated method stub
+               
+       }
+
+       @Override
+       public SpoofOutputDimsType getOutputDimType() {
+               return (_output._cols==1) ? 
+                       SpoofOutputDimsType.COLUMN_DIMS_ROWS : //column vector
+                       SpoofOutputDimsType.COLUMN_DIMS_COLS;  //row vector
+       }
+       
+       @Override
+       public CNodeTpl clone() {
+               return new CNodeRowAgg(_inputs, _output);
+       }
+       
+       @Override
+       public int hashCode() {
+               return super.hashCode();
+       }
+       
+       @Override 
+       public boolean equals(Object o) {
+               if(!(o instanceof CNodeRowAgg))
+                       return false;
+               
+               CNodeRowAgg that = (CNodeRowAgg)o;
+               return super.equals(o)
+                       && equalInputReferences(
+                               _output, that._output, _inputs, that._inputs);
+       }
+       
+       @Override
+       public String getTemplateInfo() {
+               StringBuilder sb = new StringBuilder();
+               sb.append("SPOOF ROWAGGREGATE");
+               return sb.toString();
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java
deleted file mode 100644
index d3a39e3..0000000
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAggVector.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.hops.codegen.cplan;
-
-import java.util.ArrayList;
-
-import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
-
-public class CNodeRowAggVector extends CNodeTpl
-{
-       private static final String TEMPLATE = 
-                         "package codegen;\n"
-                       + "import java.util.Arrays;\n"
-                       + "import java.util.ArrayList;\n"
-                       + "import 
org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
-                       + "import 
org.apache.sysml.runtime.codegen.SpoofRowAggregate;\n"
-                       + "\n"
-                       + "public final class %TMP% extends SpoofRowAggregate { 
\n"
-                       + "  public %TMP%() {\n"
-                       + "    _colVector = %FLAG%;\n"
-                       + "  }\n"
-                       + "  protected void genexecRowDense( double[] a, int 
ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
-                       + "%BODY_dense%"
-                       + "  } \n"
-                       + "  protected void genexecRowSparse( double[] avals, 
int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int 
rowIndex ) { \n"
-                       + "%BODY_sparse%"
-                       + "  } \n"                      
-                       + "}\n";
-
-       public CNodeRowAggVector(ArrayList<CNode> inputs, CNode output ) {
-               super(inputs, output);
-       }
-       
-       
-       @Override
-       public String codegen(boolean sparse) {
-               // note: ignore sparse flag, generate both
-               String tmp = TEMPLATE;
-               
-               //rename inputs
-               rReplaceDataNode(_output, _inputs.get(0), "a"); // input matrix
-               renameInputs(_inputs, 1);
-               
-               //generate dense/sparse bodies
-               String tmpDense = _output.codegen(false);
-               _output.resetGenerated();
-               String tmpSparse = _output.codegen(true);
-               tmp = tmp.replaceAll("%TMP%", createVarname());
-               tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
-               tmp = tmp.replaceAll("%BODY_sparse%", tmpSparse);
-               
-               //replace outputs 
-               tmp = tmp.replaceAll("%OUT%", "c");
-               tmp = tmp.replaceAll("%POSOUT%", "0");
-               
-               //replace size information
-               tmp = tmp.replaceAll("%LEN%", "len");
-               
-               //replace colvector information and start position
-               tmp = tmp.replaceAll("%FLAG%", 
String.valueOf(_output._cols==1));
-               tmp = tmp.replaceAll("bi", "0");
-               
-               return tmp;
-       }
-
-       @Override
-       public void setOutputDims() {
-               // TODO Auto-generated method stub
-               
-       }
-
-       @Override
-       public SpoofOutputDimsType getOutputDimType() {
-               return (_output._cols==1) ? 
-                       SpoofOutputDimsType.COLUMN_DIMS_ROWS : //column vector
-                       SpoofOutputDimsType.COLUMN_DIMS_COLS;  //row vector
-       }
-       
-       @Override
-       public CNodeTpl clone() {
-               return new CNodeRowAggVector(_inputs, _output);
-       }
-       
-       @Override
-       public int hashCode() {
-               return super.hashCode();
-       }
-       
-       @Override 
-       public boolean equals(Object o) {
-               return (o instanceof CNodeRowAggVector
-                       && super.equals(o));
-       }
-       
-       @Override
-       public String getTemplateInfo() {
-               StringBuilder sb = new StringBuilder();
-               sb.append("SPOOF ROWAGGREGATE");
-               return sb.toString();
-       }
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
index df20b4d..7d0ae8d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTpl.java
@@ -124,6 +124,7 @@ public abstract class CNodeTpl extends CNode implements 
Cloneable
                        
                        //replace lookup on top of leaf data node (for CSE only)
                        if( node._inputs.get(i) instanceof CNodeUnary
+                               && node._inputs.get(i)._inputs.get(0) 
instanceof CNodeData
                                && 
(((CNodeUnary)node._inputs.get(i)).getType()==UnaryType.LOOKUP_R
                                || 
((CNodeUnary)node._inputs.get(i)).getType()==UnaryType.LOOKUP_RC)) {
                                CNodeData tmp = 
(CNodeData)node._inputs.get(i)._inputs.get(0);
@@ -213,4 +214,29 @@ public abstract class CNodeTpl extends CNode implements 
Cloneable
                return (o instanceof CNodeTpl
                        && super.equals(o));
        }
+       
+       protected static boolean equalInputReferences(CNode current1, CNode 
current2, ArrayList<CNode> input1, ArrayList<CNode> input2) {
+               boolean ret = true;
+               
+               //process childs recursively
+               for( int i=0; ret && i<current1.getInput().size(); i++ )
+                       ret &= equalInputReferences(
+                                       current1.getInput().get(i), 
current2.getInput().get(i), input1, input2);
+               
+               if( ret && current1 instanceof CNodeData ) {
+                       ret &= indexOf(input1, (CNodeData)current1)
+                               == indexOf(input2, (CNodeData)current2);
+               }
+               
+               return ret;
+       }
+       
+       private static int indexOf(ArrayList<CNode> inputs, CNodeData probe) {
+               for( int i=0; i<inputs.size(); i++ ) {
+                       CNodeData cd = ((CNodeData)inputs.get(i));
+                       if( cd.getHopID()==probe.getHopID() )
+                               return i;
+               }
+               return -1;
+       }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
index 4b7ecbf..2f643af 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/BaseTpl.java
@@ -19,12 +19,7 @@
 
 package org.apache.sysml.hops.codegen.template;
 
-import java.util.ArrayList;
-import java.util.LinkedHashMap;
-
-import org.apache.sysml.api.DMLException;
 import org.apache.sysml.hops.Hop;
-import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysml.runtime.matrix.data.Pair;
 
@@ -32,32 +27,98 @@ public abstract class BaseTpl
 {      
        public enum TemplateType {
                CellTpl,
-               OuterProductTpl,
+               OuterProdTpl,
                RowAggTpl
        }
        
-       private TemplateType _type = null;
-       
-       protected ArrayList<Hop> _matrixInputs = new ArrayList<Hop>();
-       protected Hop _initialHop;
-       protected Hop _endHop;
-       protected ArrayList<CNodeData> _initialCnodes = new 
ArrayList<CNodeData>();
-       protected ArrayList<Hop> _adddedMatrices = new ArrayList<Hop>();
-       protected boolean _endHopReached = false;
+       public enum CloseType {
+               CLOSED_VALID,
+               CLOSED_INVALID,
+               OPEN,
+       }
 
-       protected LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> _cpplans = new 
LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
+       protected TemplateType _type = null;
+       protected boolean _closed = false;
        
        protected BaseTpl(TemplateType type) {
                _type = type;
+               _closed = false;
        }
        
        public TemplateType getType() {
                return _type;
        }
        
-       public abstract boolean openTpl(Hop hop);
-
-       public abstract boolean findTplBoundaries(Hop initialHop, CplanRegister 
cplanRegister);
+       public boolean isClosed() {
+               return _closed;
+       }
+       
+       /////////////////////////////////////////////
+       // Open-Fuse-Merge-Close interface 
+       // (for candidate generation and exploration)
+       
+       /**
+        * Indicates if this template can be opened at the given hop,
+        * where hop represents bottom (first operation on the inputs) 
+        * of the fused operator.
+        * 
+        * @param hop current hop
+        * @return true if template can be opened 
+        */
+       public abstract boolean open(Hop hop);
+       
+       /**
+        * Indicates if the template can be expanded to the given hop
+        * starting from an open template at the input.
+        * 
+        * @param hop current hop
+        * @param input hop with open template of same type
+        * @return true if the current hop can be fused into the operator.
+        */
+       public abstract boolean fuse(Hop hop, Hop input);
+       
+       /**
+        * Indicates if the template at the current hop can be expanded
+        * by merging another template available for one of its other inputs
+        * which is not yet covered by the template of the current hop. 
+        * 
+        * @param hop current hop
+        * @param input direct input of current hop with available template
+        * @return true if the the input hop can be fused into the current hop
+        */
+       public abstract boolean merge(Hop hop, Hop input);
+       
+       /**
+        * Indicates if the template must be closed at the current hop; either
+        * due to final operations (e.g., aggregate) or unsupported operations.
+        * 
+        * @param hop current hop
+        * @return close type (closed invalid, closed valid, open)
+        */
+       public abstract CloseType close(Hop hop);
+       
+       /**
+        * Mark the template as closed either invalid or valid.
+        */
+       public void close() {
+               _closed = true;
+       }
+       
+       /////////////////////////////////////////////
+       // CPlan construction interface
+       // (for plan creation of selected candidates)
        
-       public abstract LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
constructTplCplan(boolean compileLiterals) throws DMLException;
+       /**
+        * Constructs a single cplan rooted at the given hop, according 
+        * to the plan given in the memo structure for this particular 
+        * hop and its recursive inputs.  
+        * 
+        * @param hop root of cplan
+        * @param memo memoization table for partial subplans
+        * @param compileLiterals if true compile non-integer literals 
+        * as constants, otherwise variables. note: integer literals are 
+        * always compiled as constants.
+        * @return
+        */
+       public abstract Pair<Hop[], CNodeTpl> constructCplan(Hop hop, 
CPlanMemoTable memo, boolean compileLiterals);    
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
new file mode 100644
index 0000000..ee41521
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -0,0 +1,280 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map.Entry;
+
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.codegen.template.BaseTpl.TemplateType;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+
+import scala.tools.jline_embedded.internal.Log;
+
+
+public class CPlanMemoTable 
+{
+       public enum PlanSelection {
+               FUSE_ALL,             //maximal fusion, possible w/ redundant 
compute
+               FUSE_NO_REDUNDANCY,   //fusion without redundant compute 
+               FUSE_COST_BASED,      //cost-based decision on materialization 
points
+       }
+       
+       private HashMap<Long, ArrayList<MemoTableEntry>> _plans;
+       private HashMap<Long, Hop> _hopRefs;
+       private HashSet<Long> _plansBlacklist;
+       
+       public CPlanMemoTable() {
+               _plans = new HashMap<Long, ArrayList<MemoTableEntry>>();
+               _hopRefs = new HashMap<Long, Hop>();
+               _plansBlacklist = new HashSet<Long>();
+       }
+       
+       public boolean contains(long hopID) {
+               return _plans.containsKey(hopID);
+       }
+       
+       public boolean containsTopLevel(long hopID) {
+               return !_plansBlacklist.contains(hopID)
+                       && getBest(hopID) != null;
+       }
+       
+       public void add(Hop hop, TemplateType type) {
+               add(hop, type, -1, -1, -1);
+       }
+       
+       public void add(Hop hop, TemplateType type, long in1) {
+               add(hop, type, in1, -1, -1);
+       }
+       
+       public void add(Hop hop, TemplateType type, long in1, long in2) {
+               add(hop, type, in1, in2, -1);
+       }
+       
+       public void add(Hop hop, TemplateType type, long in1, long in2, long 
in3) {
+               _hopRefs.put(hop.getHopID(), hop);
+               if( !_plans.containsKey(hop.getHopID()) )
+                       _plans.put(hop.getHopID(), new 
ArrayList<MemoTableEntry>());
+               _plans.get(hop.getHopID()).add(new MemoTableEntry(type, in1, 
in2, in3));
+       }
+
+       @SuppressWarnings("unchecked")
+       public void pruneRedundant(long hopID) {
+               if( !contains(hopID) )
+                       return;
+               
+               //prune redundant plans (i.e., equivalent) 
+               HashSet<MemoTableEntry> set = new HashSet<MemoTableEntry>();
+               ArrayList<MemoTableEntry> list = _plans.get(hopID);
+               for( MemoTableEntry me : list )
+                       set.add(me);
+               
+               //prune dominated plans (e.g., opened plan subsumed
+               //by fused plan if single consumer of input)
+               ArrayList<MemoTableEntry> rmList = new 
ArrayList<MemoTableEntry>();
+               Hop hop = _hopRefs.get(hopID);
+               for( MemoTableEntry e1 : set )
+                       for( MemoTableEntry e2 : set )
+                               if( e1 != e2 && e1.subsumes(e2) ) {
+                                       //check that childs don't have multiple 
consumers
+                                       boolean rmSafe = true; 
+                                       for( int i=0; i<=2; i++ )
+                                               rmSafe &= (e1.isPlanRef(i) && 
!e2.isPlanRef(i)) ?
+                                                       
hop.getInput().get(i).getParent().size()==1 : true;
+                                       if( rmSafe )
+                                               rmList.add(e2);
+                               }
+               
+               //update current entry list
+               list.clear();
+               list.addAll(CollectionUtils.subtract(set, rmList));
+       }
+
+       public void pruneSuboptimal() {
+               //build index of referenced entries
+               HashSet<Long> ix = new HashSet<Long>();
+               for( Entry<Long, ArrayList<MemoTableEntry>> e : 
_plans.entrySet() )
+                       for( MemoTableEntry me : e.getValue() ) {
+                               ix.add(me.input1); 
+                               ix.add(me.input2); 
+                               ix.add(me.input3);
+                       }
+               
+               //prune single-operation (not referenced, and no child 
references)
+               Iterator<Entry<Long, ArrayList<MemoTableEntry>>> iter = 
_plans.entrySet().iterator();
+               while( iter.hasNext() ) {
+                       Entry<Long, ArrayList<MemoTableEntry>> e = iter.next();
+                       if( !ix.contains(e.getKey()) ) {
+                               ArrayList<MemoTableEntry> list = e.getValue();
+                               for( int i=0; i<list.size(); i++ )
+                                       if( !list.get(i).hasPlanRef() )
+                                               list.remove(i--);
+                               if( list.isEmpty() )
+                                       iter.remove();
+                       }
+               }
+               
+               //prune dominated plans (e.g., plan referenced by other plan 
and this
+               //other plan is single consumer) by marking it as blacklisted 
because
+               //the chain of entries is still required for cplan construction
+               for( Entry<Long, ArrayList<MemoTableEntry>> e : 
_plans.entrySet() )
+                       for( MemoTableEntry me : e.getValue() ) {
+                               for( int i=0; i<=2; i++ )
+                                       if( me.isPlanRef(i) && 
_hopRefs.get(me.intput(i)).getParent().size()==1 )
+                                               
_plansBlacklist.add(me.intput(i));
+                       }
+       }
+
+       public ArrayList<MemoTableEntry> get(long hopID) {
+               return _plans.get(hopID);
+       }
+       
+       public MemoTableEntry getBest(long hopID) {
+               ArrayList<MemoTableEntry> tmp = get(hopID);
+               if( tmp == null || tmp.isEmpty() )
+                       return null;
+               
+               //get first plan by type preference
+               TemplateType[] ttPrefOrder = new 
TemplateType[]{TemplateType.RowAggTpl, 
+                               TemplateType.OuterProdTpl, 
TemplateType.CellTpl}; 
+               for( TemplateType pref : ttPrefOrder )
+                       for( MemoTableEntry me : tmp )
+                               if( me.type == pref && isValid(me, 
_hopRefs.get(hopID)) )
+                                       return me;
+               return null;
+       }
+       
+       //TODO revisit requirement for preference once cost-based pruning 
(pruneSuboptimal) ready
+       public MemoTableEntry getBest(long hopID, TemplateType pref) {
+               ArrayList<MemoTableEntry> tmp = get(hopID);
+               if( tmp.size()==1 ) //single plan available
+                       return tmp.get(0);
+               
+               //try to find plan with preferred type
+               Log.warn("Multiple memo table entries available, searching for 
preferred type.");
+               ArrayList<MemoTableEntry> tmp2 = new 
ArrayList<MemoTableEntry>();
+               for( MemoTableEntry me : tmp )
+                       if( me.type == pref )
+                               tmp2.add(me);
+               if( !tmp2.isEmpty() ) {
+                       if( tmp2.size() > 1 )
+                               Log.warn("Multiple memo table entries w/ 
preferred type available, return max refs entry.");
+                       return getMaxRefsEntry(tmp2);
+               }
+               else {
+                       Log.warn("Multiple memo table entries available but 
none with preferred type, return max refs entry.");
+                       return getMaxRefsEntry(tmp);
+               }
+       }
+       
+       private static MemoTableEntry getMaxRefsEntry(ArrayList<MemoTableEntry> 
tmp) {
+               int maxPos = 0;
+               int maxRefs = 0;
+               for( int i=0; i<tmp.size(); i++ ) {
+                       int cntRefs = tmp.get(i).countPlanRefs();
+                       if( cntRefs > maxRefs ) {
+                               maxRefs = cntRefs;
+                               maxPos = i;
+                       }
+               }
+               return tmp.get(maxPos);
+       }
+       
+       private static boolean isValid(MemoTableEntry me, Hop hop) {
+               return (me.type == TemplateType.OuterProdTpl 
+                               && (me.closed || 
HopRewriteUtils.isBinaryMatrixMatrixOperation(hop)))
+                       || (me.type == TemplateType.RowAggTpl && me.closed)     
+                       || (me.type == TemplateType.CellTpl);
+       }
+       
+       @Override
+       public String toString() {
+               StringBuilder sb = new StringBuilder();
+               sb.append("----------------------------------\n");
+               sb.append("MEMO TABLE: \n");
+               sb.append("----------------------------------\n");
+               for( Entry<Long, ArrayList<MemoTableEntry>> e : 
_plans.entrySet() ) {
+                       sb.append(e.getKey() + " 
"+_hopRefs.get(e.getKey()).getOpString()+": ");
+                       sb.append(Arrays.toString(e.getValue().toArray(new 
MemoTableEntry[0]))+"\n");
+               }
+               sb.append("----------------------------------\n");
+               return sb.toString();   
+       }
+       
+       public static class MemoTableEntry 
+       {
+               public final TemplateType type;
+               public final long input1; 
+               public final long input2;
+               public final long input3;
+               public boolean closed = false;
+               public MemoTableEntry(TemplateType t, long in1, long in2, long 
in3) {
+                       type = t;
+                       input1 = in1;
+                       input2 = in2;
+                       input3 = in3;
+               }
+               public boolean isPlanRef(int index) {
+                       return (index==0 && input1 >=0)
+                               || (index==1 && input2 >=0)
+                               || (index==2 && input3 >=0);
+               }
+               public boolean hasPlanRef() {
+                       return isPlanRef(0) || isPlanRef(1) || isPlanRef(2);
+               }
+               public int countPlanRefs() {
+                       return ((input1 >= 0) ? 1 : 0)
+                               +  ((input2 >= 0) ? 1 : 0)
+                               +  ((input3 >= 0) ? 1 : 0);
+               }
+               public long intput(int index) {
+                       return (index==0) ? input1 : (index==1) ? input2 : 
input3;
+               }
+               public boolean subsumes(MemoTableEntry that) {
+                       return (type == that.type 
+                               && !(!isPlanRef(0) && that.isPlanRef(0))
+                               && !(!isPlanRef(1) && that.isPlanRef(1))
+                               && !(!isPlanRef(2) && that.isPlanRef(2)));
+               }
+               
+               @Override
+               public int hashCode() {
+                       return Arrays.hashCode(
+                               new long[]{(long)type.ordinal(), input1, 
input2, input3});
+               }
+               @Override
+               public boolean equals(Object obj) {
+                       if( !(obj instanceof MemoTableEntry) )
+                               return false;
+                       MemoTableEntry that = (MemoTableEntry)obj;
+                       return type == that.type && input1 == that.input1
+                               && input2 == that.input2 && input3 == 
that.input3;
+               }
+               @Override
+               public String toString() {
+                       return type.name()+"("+input1+","+input2+","+input3+")";
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
index 6b5930c..6a3cd83 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CellTpl.java
@@ -22,10 +22,8 @@ package org.apache.sysml.hops.codegen.template;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.Map.Entry;
+import java.util.LinkedList;
 
-import org.apache.sysml.api.DMLException;
 import org.apache.sysml.hops.AggUnaryOp;
 import org.apache.sysml.hops.BinaryOp;
 import org.apache.sysml.hops.Hop;
@@ -42,271 +40,168 @@ import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.parser.Expression.DataType;
-import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
 import org.apache.sysml.runtime.matrix.data.Pair;
 
 public class CellTpl extends BaseTpl 
-{
-       
+{      
        public CellTpl() {
                super(TemplateType.CellTpl);
        }
-       
+
        @Override
-       public boolean openTpl(Hop hop) {
+       public boolean open(Hop hop) {
                return isValidOperation(hop);
        }
 
        @Override
-       public boolean findTplBoundaries(Hop initialHop, CplanRegister 
cplanRegister) {
-               _initialHop = initialHop;
-               rFindCellwisePattern(initialHop, new HashMap<Long, Hop>());
-               
-               //if cplanRegister has the initial hop then no need to 
reconstruct
-               if(cplanRegister.containsHop(TemplateType.CellTpl, 
_initialHop.getHopID()))
-                       return false;
-                       
-               //re-assign initialHop to fuse the sum/rowsums (before checking 
for chains)
-               //TODO add aggbinary (vector tsmm) as potential head for 
cellwise operation
-               for (Hop h : _initialHop.getParent()) {
-                       if( h instanceof AggUnaryOp && ((AggUnaryOp) h).getOp() 
== AggOp.SUM 
-                               && ((AggUnaryOp) h).getDirection()!= 
Direction.Col ) {
-                               _initialHop = h;  
-                       }
-               }
-               
-               //unary matrix && endHop found && endHop is not direct child of 
the initialHop (i.e., chain of operators)
-               if(_endHop != null && _endHop != _initialHop)
-               {
-                       // if final hop is unary add its child to the input 
-                       if(_endHop instanceof UnaryOp)
-                               _matrixInputs.add(_endHop.getInput().get(0));
-                       //if one input is scalar then add the other as major 
input
-                       else if(_endHop.getInput().get(0).getDataType() == 
DataType.SCALAR)
-                               _matrixInputs.add(_endHop.getInput().get(1));
-                       else if(_endHop.getInput().get(1).getDataType() == 
DataType.SCALAR)
-                               _matrixInputs.add(_endHop.getInput().get(0));
-                       //if one is matrix and the other is vector add the 
matrix
-                       else 
if(TemplateUtils.isMatrix(_endHop.getInput().get(0)) && 
TemplateUtils.isVector(_endHop.getInput().get(1)) )
-                               _matrixInputs.add(_endHop.getInput().get(0));
-                       else 
if(TemplateUtils.isMatrix(_endHop.getInput().get(1)) && 
TemplateUtils.isVector(_endHop.getInput().get(0)) )
-                               _matrixInputs.add(_endHop.getInput().get(1));
-                       //both are vectors (add any of them)
-                       else
-                               _matrixInputs.add(_endHop.getInput().get(0));
-                               
-                       return true;
-               }
-               
-               return false;
+       public boolean fuse(Hop hop, Hop input) {
+               return !isClosed() && (isValidOperation(hop) 
+                       || ( hop instanceof AggUnaryOp && ((AggUnaryOp) 
hop).getOp() == AggOp.SUM 
+                               && ((AggUnaryOp) hop).getDirection()!= 
Direction.Col));
        }
-       
-       private void rFindCellwisePattern(Hop h, HashMap<Long,Hop> memo)
-       {
-               if(memo.containsKey(h.getHopID()))
-                       return;
-               
-               //stop recursion if stopping operator
-               if(h.getDataType() == DataType.SCALAR || !isValidOperation(h))
-                       return;
-               
-               //process childs recursively
-               _endHop = h;
-               for( Hop in : h.getInput() )
-               {
-                       //propagate the _endHop from bottom to top
-                       if(memo.containsKey(in.getHopID()))
-                               _endHop=memo.get(in.getHopID());
-                       else
-                               rFindCellwisePattern(in,memo);
-               }
-       
-               memo.put(h.getHopID(), _endHop);        
+
+       @Override
+       public boolean merge(Hop hop, Hop input) {
+               //merge of other cell tpl possible
+               return (!isClosed() && isValidOperation(hop));
        }
 
        @Override
-       public LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
constructTplCplan(boolean compileLiterals)
-                       throws DMLException {
-               //re-assign the dimensions of inputs to match the generated 
code dimensions
-               _initialCnodes.add(new CNodeData(_matrixInputs.get(0), 1, 1, 
DataType.SCALAR));
-               
-               rConstructCellCplan(_initialHop,_initialHop, new 
HashSet<Long>(), compileLiterals);
-               return _cpplans;
+       public CloseType close(Hop hop) {
+               //need to close cell tpl after aggregation, see fuse for exact 
properties
+               if( hop instanceof AggUnaryOp && 
isValidOperation(hop.getInput().get(0)) )
+                       return CloseType.CLOSED_VALID;
+               else if( hop instanceof AggUnaryOp )
+                       return CloseType.CLOSED_INVALID;
+               else
+                       return CloseType.OPEN;
        }
-       
-       public CNode fuseCellWise(Hop initialHop,Hop matrixInput, boolean 
compileLiterals)
-                       throws DMLException {
-               //re-assign the dimensions of inputs to match the generated 
code dimensions
-               _initialHop = initialHop;
-               _matrixInputs.add(matrixInput);
+
+       @Override
+       public Pair<Hop[], CNodeTpl> constructCplan(Hop hop, CPlanMemoTable 
memo, boolean compileLiterals) 
+       {
+               //recursively process required cplan output
+               HashSet<Hop> inHops = new HashSet<Hop>();
+               HashMap<Long, CNode> tmp = new HashMap<Long, CNode>();
+               hop.resetVisitStatus();
+               rConstructCplan(hop, memo, tmp, inHops, compileLiterals);
+               hop.resetVisitStatus();
+               
+               //reorder inputs (ensure matrices/vectors come first, prune 
literals)
+               LinkedList<Hop> sinHops = new LinkedList<Hop>();
+               for( int i : new int[]{0,1,2} ) //matrices, vectors, scalars
+                       for( Hop h : inHops ) //matrices
+                               if( (i==0 && h.getDataType().isMatrix() && 
!TemplateUtils.isVector(h))
+                                       || (i==1 && h.getDataType().isMatrix() 
&& TemplateUtils.isVector(h))
+                                       || (i==2 && h.getDataType().isScalar() 
&& !tmp.get(h.getHopID()).isLiteral())) {
+                                       sinHops.add(h);
+                               }
                
-               constructTplCplan(compileLiterals);
-               Entry<Long, Pair<Hop[],CNodeTpl>> toplevel = 
TemplateUtils.getTopLevelCpplan(_cpplans);
-               if(toplevel != null)
-                       return toplevel.getValue().getValue().getOutput();
-               else 
-                       return null;
+               //construct template node
+               ArrayList<CNode> inputs = new ArrayList<CNode>();
+               for( Hop in : sinHops )
+                       inputs.add(tmp.get(in.getHopID()));
+               CNode output = tmp.get(hop.getHopID());
+               CNodeCell tpl = new CNodeCell(inputs, output);
+               tpl.setCellType(TemplateUtils.getCellType(hop));
+               
+               // return cplan instance
+               return new Pair<Hop[],CNodeTpl>(sinHops.toArray(new Hop[0]), 
tpl);
        }
        
-       private void rConstructCellCplan(Hop root, Hop hop, HashSet<Long> memo, 
boolean compileLiterals) 
-               throws DMLException
+       private void rConstructCplan(Hop hop, CPlanMemoTable memo, 
HashMap<Long, CNode> tmp, HashSet<Hop> inHops, boolean compileLiterals) 
        {
-               if( memo.contains(hop.getHopID()) )
-                       return;
-               
-               
-               //process childs recursively
-               for( Hop c : hop.getInput() )
-                       rConstructCellCplan(root, c, memo, compileLiterals);
+               //recursively process required childs
+               MemoTableEntry me = memo.getBest(hop.getHopID(), 
TemplateType.CellTpl);
+               for( int i=0; i<hop.getInput().size(); i++ ) {
+                       Hop c = hop.getInput().get(i);
+                       if( me.isPlanRef(i) )
+                               rConstructCplan(c, memo, tmp, inHops, 
compileLiterals);
+                       else {
+                               CNodeData cdata = 
TemplateUtils.createCNodeData(c, compileLiterals);    
+                               tmp.put(c.getHopID(), cdata);
+                               inHops.add(c);
+                       }
+               }
                
-                // first hop to enter here should be _endHop
-               
if(TemplateUtils.inputsAreGenerated(hop,_matrixInputs,_cpplans))  
-               // if direct children are DataGenOps, literals, or already in 
the cpplans then we are ready to generate code
+               //construct cnode for current hop
+               CNode out = null;
+               if(hop instanceof UnaryOp)
                {
-                       CNodeCell cellTmpl = null;
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       if( TemplateUtils.isColVector(cdata1) )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
                        
-                       //Fetch operands
-                       CNode out = null;
-                       ArrayList<CNode> addedCNodes = new ArrayList<CNode>();
-                       ArrayList<Hop> addedHops = new ArrayList<Hop>();
-                       ArrayList<CNode> cnodeData = 
TemplateUtils.fetchOperands(hop, _cpplans, addedCNodes, addedHops, 
_initialCnodes, compileLiterals);
+                       String primitiveOpName = 
((UnaryOp)hop).getOp().toString();
+                       out = new CNodeUnary(cdata1, 
UnaryType.valueOf(primitiveOpName));
+               }
+               else if(hop instanceof BinaryOp)
+               {
+                       BinaryOp bop = (BinaryOp) hop;
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
+                       String primitiveOpName = bop.getOp().toString();
                        
-                       //if operands are scalar or independent from X 
-                       boolean independentOperands = hop != root && 
(hop.getDataType() == DataType.SCALAR || 
TemplateUtils.isOperandsIndependent(cnodeData, addedHops, new String[] 
{_matrixInputs.get(0).getName()}));
-                       if(!independentOperands)
-                       {
-                               if(hop instanceof UnaryOp)
-                               {
-                                       CNode cdata1 = cnodeData.get(0);
-                                       
-                                       //Primitive Operation haas the same 
name as Hop Type OpOp1
-                                       String primitiveOpName = 
((UnaryOp)hop).getOp().toString();
-                                       out = new CNodeUnary(cdata1, 
UnaryType.valueOf(primitiveOpName));
-                               }
-                               else if(hop instanceof BinaryOp)
-                               {
-                                       BinaryOp bop = (BinaryOp) hop;
-                                       CNode cdata1 = cnodeData.get(0);
-                                       CNode cdata2 = cnodeData.get(1);
-                                       
-                                       //Primitive Operation has the same name 
as Hop Type OpOp2
-                                       String primitiveOpName = 
bop.getOp().toString();
-                                       
-                                       //cdata1 is vector
-                                       if( TemplateUtils.isColVector(cdata1) )
-                                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
-                                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
-                                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
-                                       
-                                       //cdata2 is vector
-                                       if( TemplateUtils.isColVector(cdata2) )
-                                               cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_R);
-                                       else if( cdata2 instanceof CNodeData && 
hop.getInput().get(1).getDataType().isMatrix() )
-                                               cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_RC);
-                                       
-                                       if( bop.getOp()==OpOp2.POW && 
cdata2.isLiteral() && cdata2.getVarname().equals("2") )
-                                               out = new CNodeUnary(cdata1, 
UnaryType.POW2);
-                                       else if( bop.getOp()==OpOp2.MULT && 
cdata2.isLiteral() && cdata2.getVarname().equals("2") )
-                                               out = new CNodeUnary(cdata1, 
UnaryType.MULT2);
-                                       else //default binary   
-                                               out = new CNodeBinary(cdata1, 
cdata2, BinType.valueOf(primitiveOpName));
-                               }
-                               else if(hop instanceof TernaryOp) 
-                               {
-                                       TernaryOp top = (TernaryOp) hop;
-                                       CNode cdata1 = cnodeData.get(0);
-                                       CNode cdata2 = cnodeData.get(1);
-                                       CNode cdata3 = cnodeData.get(2);
-                                       
-                                       //cdata1 is vector
-                                       if( TemplateUtils.isColVector(cdata1) )
-                                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
-                                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
-                                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
-                                       
-                                       //cdata3 is vector
-                                       if( TemplateUtils.isColVector(cdata3) )
-                                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_R);
-                                       else if( cdata3 instanceof CNodeData && 
hop.getInput().get(2).getDataType().isMatrix() )
-                                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_RC);
-                                       
-                                       //construct ternary cnode, primitive 
operation derived from OpOp3
-                                       out = new CNodeTernary(cdata1, cdata2, 
cdata3, 
-                                                       
TernaryType.valueOf(top.getOp().toString()));
-                               }
-                               else if (hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getOp() == AggOp.SUM
-                                       && (((AggUnaryOp) hop).getDirection() 
== Direction.RowCol 
-                                       || ((AggUnaryOp) hop).getDirection() == 
Direction.Row) && root == hop)
-                               {
-                                       out = cnodeData.get(0);
-                               }
-                       }
-                       // wire output to the template
-                       if(out != null || independentOperands)
-                       {
-                               if(_cpplans.isEmpty())
-                               {
-                                       //first initialization has to have the 
first variable as input
-                                       ArrayList<CNode> initialInputs = new 
ArrayList<CNode>();                                        
-                                       
-                                       if(independentOperands) // pass the hop 
itself as an input instead of its children
-                                       {
-                                               CNode c =  new CNodeData(hop);
-                                               
initialInputs.addAll(_initialCnodes);
-                                               initialInputs.add(c);
-                                               cellTmpl = new 
CNodeCell(initialInputs, c); 
-                                               
cellTmpl.setDataType(hop.getDataType());
-                                               
cellTmpl.setCellType(CellType.NO_AGG);
-                                               
cellTmpl.setMultipleConsumers(hop.getParent().size()>1);
-                                               
-                                               _cpplans.put(hop.getHopID(), 
new Pair<Hop[],CNodeTpl>(new Hop[] {_matrixInputs.get(0),hop} ,cellTmpl));
-                                       }
-                                       else
-                                       {
-                                               
initialInputs.addAll(_initialCnodes);
-                                               initialInputs.addAll(cnodeData);
-                                               cellTmpl =  new 
CNodeCell(initialInputs, out); 
-                                               
cellTmpl.setDataType(hop.getDataType());
-                                               
cellTmpl.setCellType(CellType.NO_AGG);
-                                               
cellTmpl.setMultipleConsumers(hop.getParent().size()>1);
-                                               
-                                               //Hop[] hopArray = new 
Hop[hop.getInput().size()+1];
-                                               Hop[] hopArray = new 
Hop[addedHops.size()+1];
-                                               hopArray[0] = 
_matrixInputs.get(0);
-                                               
-                                               //System.arraycopy( 
hop.getInput().toArray(), 0, hopArray, 1, hop.getInput().size());
-                                               System.arraycopy( 
addedHops.toArray(), 0, hopArray, 1, addedHops.size());
-                                               
-                                               _cpplans.put(hop.getHopID(), 
new Pair<Hop[],CNodeTpl>(hopArray,cellTmpl));
-                                       }
-                               }
-                               else
-                               {
-                                       if(independentOperands)
-                                       {
-                                               CNode c =  new CNodeData(hop);
-                                               //clear Operands
-                                               addedCNodes.clear();
-                                               addedHops.clear();
-                                               
-                                               //added the current hop as the 
input
-                                               addedCNodes.add(c);
-                                               addedHops.add(hop);
-                                               out = c;
-                                       }
-                                       //wire the output to existing or new 
template   
-                                       
TemplateUtils.setOutputToExistingTemplate(hop, out, _cpplans, addedCNodes, 
addedHops);
-                               }
-                       }
-                       memo.add(hop.getHopID());
+                       //cdata1 is vector
+                       if( TemplateUtils.isColVector(cdata1) )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
+                       
+                       //cdata2 is vector
+                       if( TemplateUtils.isColVector(cdata2) )
+                               cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_R);
+                       else if( cdata2 instanceof CNodeData && 
hop.getInput().get(1).getDataType().isMatrix() )
+                               cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_RC);
+                       
+                       if( bop.getOp()==OpOp2.POW && cdata2.isLiteral() && 
cdata2.getVarname().equals("2") )
+                               out = new CNodeUnary(cdata1, UnaryType.POW2);
+                       else if( bop.getOp()==OpOp2.MULT && cdata2.isLiteral() 
&& cdata2.getVarname().equals("2") )
+                               out = new CNodeUnary(cdata1, UnaryType.MULT2);
+                       else //default binary   
+                               out = new CNodeBinary(cdata1, cdata2, 
BinType.valueOf(primitiveOpName));
                }
+               else if(hop instanceof TernaryOp) 
+               {
+                       TernaryOp top = (TernaryOp) hop;
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
+                       CNode cdata3 = 
tmp.get(hop.getInput().get(2).getHopID());
+                       
+                       //cdata1 is vector
+                       if( TemplateUtils.isColVector(cdata1) )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
+                       
+                       //cdata3 is vector
+                       if( TemplateUtils.isColVector(cdata3) )
+                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_R);
+                       else if( cdata3 instanceof CNodeData && 
hop.getInput().get(2).getDataType().isMatrix() )
+                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_RC);
+                       
+                       //construct ternary cnode, primitive operation derived 
from OpOp3
+                       out = new CNodeTernary(cdata1, cdata2, cdata3, 
+                                       
TernaryType.valueOf(top.getOp().toString()));
+               }
+               else if (hop instanceof AggUnaryOp && ((AggUnaryOp)hop).getOp() 
== AggOp.SUM
+                       && (((AggUnaryOp) hop).getDirection() == 
Direction.RowCol 
+                       || ((AggUnaryOp) hop).getDirection() == Direction.Row) )
+               {
+                       out = tmp.get(hop.getInput().get(0).getHopID());
+               }
+               
+               tmp.put(hop.getHopID(), out);
        }
-
-       private boolean isValidOperation(Hop hop) 
+       
+       private static boolean isValidOperation(Hop hop) 
        {       
                //prepare indicators for binary operations
                boolean isBinaryMatrixScalar = false;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c5eddccf/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java
deleted file mode 100644
index f14981d..0000000
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CplanRegister.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.hops.codegen.template;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedHashMap;
-import java.util.Map.Entry;
-
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.Hop;
-import org.apache.sysml.hops.codegen.cplan.CNode;
-import org.apache.sysml.hops.codegen.cplan.CNodeCell;
-import org.apache.sysml.hops.codegen.cplan.CNodeData;
-import org.apache.sysml.hops.codegen.cplan.CNodeRowAggVector;
-import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
-import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
-import org.apache.sysml.hops.codegen.template.BaseTpl.TemplateType;
-import org.apache.sysml.parser.Expression.DataType;
-import org.apache.sysml.runtime.matrix.data.Pair;
-import org.apache.sysml.utils.Statistics;
-
-public class CplanRegister {
-       
-       //HashMap: key: TemplateType - Value: List of all the patterns fused by 
that template 
-       //LinkedHashMap: key: HopID of the original hop to be fused , Value: 
Input hops to the fused operation 
-               //Note: LinkedHashMap holds intermediate cplans as well (e.g, 
log(exp(round(X))) ) We store in the LinkedHashMao three keys 
-                           //for the three hops (log, exp and round). The key 
that was inserted last is the key of the hop to be fused
-               
-       private HashMap<TemplateType, ArrayList<LinkedHashMap<Long, 
Pair<Hop[],CNodeTpl>>>>  _cplans;
-       
-       public CplanRegister() {
-               _cplans = new HashMap<TemplateType, 
ArrayList<LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>>>();
-       }
-       
-       public void insertCpplans(TemplateType type, LinkedHashMap<Long, 
Pair<Hop[],CNodeTpl>> cplans) {
-               if( !_cplans.containsKey(type) )
-                       _cplans.put(type, new ArrayList<LinkedHashMap<Long, 
Pair<Hop[],CNodeTpl>>>());
-               
-               _cplans.get(type).add(cplans);
-               
-               if( DMLScript.STATISTICS )
-                       Statistics.incrementCodegenCPlanCompile(1); 
-               //note: cplans.size() would also contain all subsets of cpplans
-       }
-
-       public boolean containsHop(TemplateType type, long hopID) {
-               if(!_cplans.containsKey(type))
-                       return false;
-               for (LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> cpplans : 
_cplans.get(type) )
-                       if(cpplans.containsKey(hopID))
-                               return true;
-               
-               return false;
-       }
-       
-       public LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> getTopLevelCplans()
-       {
-               if( _cplans.isEmpty() )
-                       return new LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
-                       
-               //resolve conflicts, i.e., overlap, between template types 
-               resolvePlanConflicts(); 
-               
-               //extract top level (subsuming) cplans per type and operator 
chain
-               LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> ret = new 
LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>();
-               for (TemplateType key : _cplans.keySet()) {
-                       for (LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
intermediateCplans : _cplans.get(key)) {
-                               Entry<Long, Pair<Hop[],CNodeTpl>> cplan = 
TemplateUtils.getTopLevelCpplan(intermediateCplans);
-                               if(cplan !=null)
-                                       ret.put(cplan.getKey(), 
cplan.getValue());                      
-                       }
-               }
-               
-               //merge top level plans if possible //TODO move to rowagg 
template
-               ret = mergeRowAggregateCellwisePlans(ret);
-               
-               return ret;
-       }
-       
-       /**
-        * Resolves conflicts between overlapping cplans of different types.
-        * 
-        */
-       private void resolvePlanConflicts()
-       {
-               //get different plan categories
-               ArrayList<LinkedHashMap<Long, Pair<Hop[], CNodeTpl>>> 
cellwisePlans = _cplans.get(TemplateType.CellTpl);
-               ArrayList<LinkedHashMap<Long, Pair<Hop[], CNodeTpl>>> 
outerprodPlans = _cplans.get(TemplateType.OuterProductTpl);
-               ArrayList<LinkedHashMap<Long, Pair<Hop[], CNodeTpl>>> 
rowaggPlans = _cplans.get(TemplateType.RowAggTpl);
-               
-               //prefer outer product plans over cellwise plans -> remove 
overlap
-               if( cellwisePlans != null && outerprodPlans != null ) {
-                       for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
outerprodCplan : outerprodPlans ) {
-                               for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
map : cellwisePlans )
-                                       for( Long key : outerprodCplan.keySet() 
)
-                                               map.remove(key);
-                       }               
-               }
-               
-               //prefer row aggregate plans over cellwise plans -> remove 
overlap
-               if( cellwisePlans != null && rowaggPlans != null ) {
-                       for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
rowaggCplan : rowaggPlans ) {
-                               for( LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
map : cellwisePlans )
-                                       for( Long key : rowaggCplan.keySet() )
-                                               map.remove(key);
-                       }       
-               }
-       }
-       
-       private static LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> 
mergeRowAggregateCellwisePlans(LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> plans)
-       {
-               LinkedHashMap<Long, Pair<Hop[],CNodeTpl>> ret = new 
LinkedHashMap<Long, Pair<Hop[],CNodeTpl>>(plans);
-               
-               //extract row aggregate templates
-               HashMap<Long, Pair<Hop[],CNodeTpl>> rowaggPlans = new 
HashMap<Long, Pair<Hop[],CNodeTpl>>();
-               for( Entry<Long, Pair<Hop[],CNodeTpl>> e : plans.entrySet() )
-                       if( e.getValue().getValue() instanceof 
CNodeRowAggVector )
-                               rowaggPlans.put(e.getKey(), e.getValue());
-               
-               //probe and merge row aggregate secondary inputs (by definition 
vectors)
-               for( Entry<Long, Pair<Hop[],CNodeTpl>> e : 
rowaggPlans.entrySet() ) {
-                       //check all inputs for existing cell plans
-                       Hop[] inputs = e.getValue().getKey();
-                       for( int i=1; i<inputs.length; i++ ) {
-                               long inhopID = inputs[i].getHopID();
-                               if( ret.containsKey(inhopID) && 
ret.get(inhopID).getValue() instanceof CNodeCell
-                                       && 
!((CNodeCell)ret.get(inhopID).getValue()).hasMultipleConsumers() ) 
-                               {
-                                       //merge row agg template
-                                       CNodeRowAggVector rowaggtpl = 
(CNodeRowAggVector) e.getValue().getValue();
-                                       CNodeCell celltpl = 
(CNodeCell)ret.get(inhopID).getValue();
-                                       
celltpl.getInput().get(0).setDataType(DataType.MATRIX);
-                                       
rowaggtpl.rReplaceDataNode(rowaggtpl.getOutput(), inhopID, celltpl.getOutput());
-                                       
rowaggtpl.rInsertLookupNode(rowaggtpl.getOutput(), 
((CNodeData)celltpl.getInput().get(0)).getHopID(), 
-                                                       new HashMap<Long, 
CNode>(), UnaryType.LOOKUP_R);
-                                       for( CNode input : celltpl.getInput() )
-                                               rowaggtpl.addInput(input);
-                                       HashSet<Long> inputIDs = 
TemplateUtils.rGetInputHopIDs(rowaggtpl.getOutput(), new HashSet<Long>());
-                                       Hop[] hops = 
TemplateUtils.mergeDistinct(inputIDs, inputs, ret.get(inhopID).getKey());
-                                       e.getValue().setKey(hops);
-                                       
-                                       //remove cell template 
-                                       ret.remove(inhopID);
-                               }
-                       }
-               }
-               
-               return ret;
-       }
-}

Reply via email to