[SYSTEMML-1507,1508] Generalized codegen rowwise template (agg types)

So far the codegen row aggregate template only supported rowwise column
aggregations like colSums(X <= rowMins(X)) or t(X) %*% (w * (X %*% v)).
Similar to the existing cellwise template, this patch now generalizes
the row aggregate template to a rowwise template with column
aggregation, row aggregation, or no aggregation. This enables fusion of
complex rowwise patterns, which is important for algorithms like Kmeans
and Mlogreg. For example, we are now able to fuse the following
expressions into a single rowwise operation:

* Example without aggregation (single operator)
P = (D <= rowMins (D));
P = P / rowSums (P);
 
* Example with row aggregation (single operator)
exp_LT = exp (LT - rowMaxs (LT));
V = log (rowSums (exp_LT));

Furthermore, this patch includes the following code generator extensions
and fixes:

* Validity checks for merging rowwise templates (common input)
* Pruning of invalid plans after decisions on materialization points
* New row vector primitives for exp, log, and minus
* New row vector primitives for vector writes 
* Fix min, max, and sparse sum row vector primitives 
* Fix missing import of FastMath for builtins in multi aggregates
* Fix check for supported binary operations in row templates
* Cleanup and extensions of hop rewrite utils
* Various new testcases for rowwise templates, incl. Spark operations
* Refactoring all class names of row aggregate to rowwise


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/eeb4f270
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/eeb4f270
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/eeb4f270

Branch: refs/heads/master
Commit: eeb4f2708d96d2b0741cfcd9a4a03f775b97cdc2
Parents: cfc73fe
Author: Matthias Boehm <mboe...@gmail.com>
Authored: Mon Apr 10 12:53:13 2017 -0700
Committer: Matthias Boehm <mboe...@gmail.com>
Committed: Mon Apr 10 22:53:11 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/SpoofCompiler.java       |   3 +-
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  17 +-
 .../sysml/hops/codegen/cplan/CNodeMultiAgg.java |   1 +
 .../sysml/hops/codegen/cplan/CNodeRow.java      | 173 ++++++++++
 .../sysml/hops/codegen/cplan/CNodeRowAgg.java   | 134 -------
 .../sysml/hops/codegen/cplan/CNodeTernary.java  |   7 +-
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |  32 +-
 .../hops/codegen/template/CPlanMemoTable.java   |   2 +-
 .../hops/codegen/template/PlanSelection.java    |   2 +-
 .../template/PlanSelectionFuseCostBased.java    |  49 +++
 .../hops/codegen/template/TemplateBase.java     |   2 +-
 .../hops/codegen/template/TemplateRow.java      | 345 +++++++++++++++++++
 .../hops/codegen/template/TemplateRowAgg.java   | 325 -----------------
 .../hops/codegen/template/TemplateUtils.java    |  55 ++-
 .../sysml/hops/rewrite/HopRewriteUtils.java     |  14 +-
 .../runtime/codegen/LibSpoofPrimitives.java     | 109 ++++--
 .../runtime/codegen/SpoofRowAggregate.java      | 208 -----------
 .../sysml/runtime/codegen/SpoofRowwise.java     | 285 +++++++++++++++
 .../instructions/spark/SpoofSPInstruction.java  |  50 ++-
 .../functions/codegen/RowAggTmplTest.java       | 174 ++++++++--
 .../scripts/functions/codegen/rowAggPattern11.R |  34 ++
 .../functions/codegen/rowAggPattern11.dml       |  28 ++
 .../scripts/functions/codegen/rowAggPattern12.R |  34 ++
 .../functions/codegen/rowAggPattern12.dml       |  28 ++
 .../scripts/functions/codegen/rowAggPattern13.R |  33 ++
 .../functions/codegen/rowAggPattern13.dml       |  27 ++
 26 files changed, 1410 insertions(+), 761 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java 
b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
index 3dfb452..fdb8d9d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/SpoofCompiler.java
@@ -440,7 +440,8 @@ public class SpoofCompiler
                                Hop input2 = hop.getInput().get(k);
                                if( memo.contains(input2.getHopID()) && 
!memo.get(input2.getHopID()).get(0).closed
                                        && 
TemplateUtils.isType(memo.get(input2.getHopID()).get(0).type, tpl.getType(), 
TemplateType.CellTpl)
-                                       && tpl.merge(hop, input2) ) 
+                                       && tpl.merge(hop, input2) && 
(tpl.getType()!=TemplateType.RowTpl || pos==-1 
+                                               || 
TemplateUtils.hasCommonRowTemplateMatrixInput(hop.getInput().get(pos), input2, 
memo)))
                                        P.crossProduct(k, -1L, 
input2.getHopID());
                                else
                                        P.crossProduct(k, -1L);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index b6b6ce5..4d54cd1 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -29,10 +29,12 @@ public class CNodeBinary extends CNode
 {
        public enum BinType {
                DOT_PRODUCT,
-               VECT_MULT_ADD, VECT_DIV_ADD, VECT_EQUAL_ADD, VECT_NOTEQUAL_ADD, 
-               VECT_LESS_ADD, VECT_LESSEQUAL_ADD, VECT_GREATER_ADD, 
VECT_GREATEREQUAL_ADD,
-               VECT_MULT_SCALAR, VECT_DIV_SCALAR, VECT_EQUAL_SCALAR, 
VECT_NOTEQUAL_SCALAR, 
-               VECT_LESS_SCALAR, VECT_LESSEQUAL_SCALAR, VECT_GREATER_SCALAR, 
VECT_GREATEREQUAL_SCALAR,
+               VECT_MULT_ADD, VECT_DIV_ADD, VECT_MINUS_ADD,
+               VECT_EQUAL_ADD, VECT_NOTEQUAL_ADD, VECT_LESS_ADD, 
+               VECT_LESSEQUAL_ADD, VECT_GREATER_ADD, VECT_GREATEREQUAL_ADD,
+               VECT_MULT_SCALAR, VECT_DIV_SCALAR, VECT_MINUS_SCALAR, 
+               VECT_EQUAL_SCALAR, VECT_NOTEQUAL_SCALAR, VECT_LESS_SCALAR, 
+               VECT_LESSEQUAL_SCALAR, VECT_GREATER_SCALAR, 
VECT_GREATEREQUAL_SCALAR,
                MULT, DIV, PLUS, MINUS, MODULUS, INTDIV, 
                LESS, LESSEQUAL, GREATER, GREATEREQUAL, EQUAL,NOTEQUAL,
                MIN, MAX, AND, OR, LOG, LOG_NZ, POW,
@@ -71,6 +73,7 @@ public class CNodeBinary extends CNode
                                }
                                
                                case VECT_DIV_SCALAR:
+                               case VECT_MINUS_SCALAR:
                                case VECT_MULT_SCALAR:
                                case VECT_EQUAL_SCALAR:
                                case VECT_NOTEQUAL_SCALAR:
@@ -130,7 +133,7 @@ public class CNodeBinary extends CNode
                        }
                }
                public boolean isVectorScalarPrimitive() {
-                       return this == VECT_DIV_SCALAR || this == 
VECT_MULT_SCALAR
+                       return this == VECT_DIV_SCALAR || this == 
VECT_MULT_SCALAR || this == VECT_MINUS_SCALAR
                                || this == VECT_EQUAL_SCALAR || this == 
VECT_NOTEQUAL_SCALAR
                                || this == VECT_LESS_SCALAR || this == 
VECT_LESSEQUAL_SCALAR
                                || this == VECT_GREATER_SCALAR || this == 
VECT_GREATEREQUAL_SCALAR;
@@ -211,6 +214,7 @@ public class CNodeBinary extends CNode
                        case DOT_PRODUCT: return "b(dot)";
                        case VECT_MULT_ADD: return "b(vma)";
                        case VECT_DIV_ADD: return "b(vda)";
+                       case VECT_MINUS_ADD: return "b(vmia)";
                        case VECT_EQUAL_ADD: return "b(veqa)";
                        case VECT_NOTEQUAL_ADD: return "b(vneqa)";
                        case VECT_LESS_ADD: return "b(vlta)";
@@ -219,6 +223,7 @@ public class CNodeBinary extends CNode
                        case VECT_GREATER_ADD: return "b(vgta)";
                        case VECT_MULT_SCALAR:  return "b(vm)";
                        case VECT_DIV_SCALAR:  return "b(vd)";
+                       case VECT_MINUS_SCALAR:  return "b(vmi)";
                        case VECT_EQUAL_SCALAR: return "b(veq)";
                        case VECT_NOTEQUAL_SCALAR: return "b(vneq)";
                        case VECT_LESS_SCALAR: return "b(vlt)";
@@ -253,6 +258,7 @@ public class CNodeBinary extends CNode
                        //VECT
                        case VECT_MULT_ADD: 
                        case VECT_DIV_ADD:
+                       case VECT_MINUS_ADD:
                        case VECT_EQUAL_ADD: 
                        case VECT_NOTEQUAL_ADD: 
                        case VECT_LESS_ADD: 
@@ -266,6 +272,7 @@ public class CNodeBinary extends CNode
                                
                        case VECT_DIV_SCALAR:   
                        case VECT_MULT_SCALAR:
+                       case VECT_MINUS_SCALAR:
                        case VECT_EQUAL_SCALAR: 
                        case VECT_NOTEQUAL_SCALAR: 
                        case VECT_LESS_SCALAR: 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
index d9502be..95e1f75 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeMultiAgg.java
@@ -35,6 +35,7 @@ public class CNodeMultiAgg extends CNodeTpl
                        + "import 
org.apache.sysml.runtime.codegen.SpoofMultiAggregate;\n"
                        + "import 
org.apache.sysml.runtime.codegen.SpoofCellwise;\n"
                        + "import 
org.apache.sysml.runtime.codegen.SpoofCellwise.AggOp;\n"
+                       + "import org.apache.commons.math3.util.FastMath;\n"
                        + "\n"
                        + "public final class %TMP% extends SpoofMultiAggregate 
{ \n"
                        + "  public %TMP%() {\n"

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
new file mode 100644
index 0000000..3cc2e3b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRow.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.cplan;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
+import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;
+
+public class CNodeRow extends CNodeTpl
+{
+       private static final String TEMPLATE = 
+                         "package codegen;\n"
+                       + "import 
org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
+                       + "import 
org.apache.sysml.runtime.codegen.SpoofRowwise;\n"
+                       + "import 
org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;\n"
+                       + "import org.apache.commons.math3.util.FastMath;\n"
+                       + "\n"
+                       + "public final class %TMP% extends SpoofRowwise { \n"
+                       + "  public %TMP%() {\n"
+                       + "    super(RowType.%TYPE%, %VECT_MEM%);\n"
+                       + "  }\n"
+                       + "  protected void genexecRowDense( double[] a, int 
ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
+                       + "%BODY_dense%"
+                       + "  }\n"
+                       + "  protected void genexecRowSparse( double[] avals, 
int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int 
rowIndex ) { \n"
+                       + "%BODY_sparse%"
+                       + "  }\n"                       
+                       + "}\n";
+
+       private static final String TEMPLATE_ROWAGG_OUT = "    c[rowIndex] = 
%IN%;\n";
+       private static final String TEMPLATE_NOAGG_OUT = "    
LibSpoofPrimitives.vectWrite(%IN%, c, rowIndex*len, len);\n";
+       
+       public CNodeRow(ArrayList<CNode> inputs, CNode output ) {
+               super(inputs, output);
+       }
+       
+       private RowType _type = null; //access pattern 
+       private int _numVectors = -1; //number of intermediate vectors
+       
+       public void setNumVectorIntermediates(int num) {
+               _numVectors = num;
+       }
+       
+       public int getNumVectorIntermediates() {
+               return _numVectors;
+       }
+       
+       public void setRowType(RowType type) {
+               _type = type;
+               _hash = 0;
+       }
+       
+       public RowType getRowType() {
+               return _type;
+       }
+       
+       @Override
+       public String codegen(boolean sparse) {
+               // note: ignore sparse flag, generate both
+               String tmp = TEMPLATE;
+               
+               //rename inputs
+               rReplaceDataNode(_output, _inputs.get(0), "a"); // input matrix
+               renameInputs(_inputs, 1);
+               
+               //generate dense/sparse bodies
+               String tmpDense = _output.codegen(false)
+                       + getOutputStatement(_output.getVarname());
+               _output.resetGenerated();
+               String tmpSparse = _output.codegen(true)
+                       + getOutputStatement(_output.getVarname());
+               tmp = tmp.replaceAll("%TMP%", createVarname());
+               tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
+               tmp = tmp.replaceAll("%BODY_sparse%", tmpSparse);
+               
+               //replace outputs 
+               tmp = tmp.replaceAll("%OUT%", "c");
+               tmp = tmp.replaceAll("%POSOUT%", "0");
+               
+               //replace size information
+               tmp = tmp.replaceAll("%LEN%", "len");
+               
+               //replace colvector information and number of vector 
intermediates
+               tmp = tmp.replaceAll("%TYPE%", _type.name());
+               tmp = tmp.replaceAll("%VECT_MEM%", String.valueOf(_numVectors));
+               
+               return tmp;
+       }
+       
+       private String getOutputStatement(String varName) {
+               if( !_type.isColumnAgg() ) {
+                       String tmp = (_type==RowType.NO_AGG) ?
+                               TEMPLATE_NOAGG_OUT : TEMPLATE_ROWAGG_OUT;
+                       return tmp.replace("%IN%", varName);
+               }
+               return "";
+       }
+
+       @Override
+       public void setOutputDims() {
+               // TODO Auto-generated method stub
+               
+       }
+
+       @Override
+       public SpoofOutputDimsType getOutputDimType() {
+               return (_output._cols==1) ? 
+                       SpoofOutputDimsType.COLUMN_DIMS_ROWS : //column vector
+                       SpoofOutputDimsType.COLUMN_DIMS_COLS;  //row vector
+       }
+       
+       @Override
+       public CNodeTpl clone() {
+               CNodeRow tmp = new CNodeRow(_inputs, _output);
+               tmp.setRowType(_type);
+               tmp.setNumVectorIntermediates(_numVectors);
+               return tmp;
+       }
+       
+       @Override
+       public int hashCode() {
+               if( _hash == 0 ) {
+                       int h1 = super.hashCode();
+                       int h2 = _type.hashCode();
+                       int h3 = _numVectors;
+                       _hash = Arrays.hashCode(new int[]{h1,h2,h3});
+               }
+               return _hash;
+       }
+       
+       @Override 
+       public boolean equals(Object o) {
+               if(!(o instanceof CNodeRow))
+                       return false;
+               
+               CNodeRow that = (CNodeRow)o;
+               return super.equals(o)
+                       && _type == that._type
+                       && _numVectors == that._numVectors      
+                       && equalInputReferences(
+                               _output, that._output, _inputs, that._inputs);
+       }
+       
+       @Override
+       public String getTemplateInfo() {
+               StringBuilder sb = new StringBuilder();
+               sb.append("SPOOF ROWAGGREGATE [type=");
+               sb.append(_type.name());
+               sb.append(", reqVectMem=");
+               sb.append(_numVectors);
+               sb.append("]");
+               return sb.toString();
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
deleted file mode 100644
index 846c88d..0000000
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.hops.codegen.cplan;
-
-import java.util.ArrayList;
-
-import org.apache.sysml.hops.codegen.SpoofFusedOp.SpoofOutputDimsType;
-
-public class CNodeRowAgg extends CNodeTpl
-{
-       private static final String TEMPLATE = 
-                         "package codegen;\n"
-                       + "import 
org.apache.sysml.runtime.codegen.LibSpoofPrimitives;\n"
-                       + "import 
org.apache.sysml.runtime.codegen.SpoofRowAggregate;\n"
-                       + "\n"
-                       + "public final class %TMP% extends SpoofRowAggregate { 
\n"
-                       + "  public %TMP%() {\n"
-                       + "    super(%COL_VECTOR%, %VECT_MEM%);\n"
-                       + "  }\n"
-                       + "  protected void genexecRowDense( double[] a, int 
ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
-                       + "%BODY_dense%"
-                       + "  }\n"
-                       + "  protected void genexecRowSparse( double[] avals, 
int[] aix, int ai, double[][] b, double[] scalars, double[] c, int len, int 
rowIndex ) { \n"
-                       + "%BODY_sparse%"
-                       + "  }\n"                       
-                       + "}\n";
-
-       public CNodeRowAgg(ArrayList<CNode> inputs, CNode output ) {
-               super(inputs, output);
-       }
-       
-       //number of intermediate vectors
-       private int _numVectors = -1;
-       
-       public void setNumVectorIntermediates(int num) {
-               _numVectors = num;
-       }
-       
-       public int getNumVectorIntermediates() {
-               return _numVectors;
-       }
-       
-       @Override
-       public String codegen(boolean sparse) {
-               // note: ignore sparse flag, generate both
-               String tmp = TEMPLATE;
-               
-               //rename inputs
-               rReplaceDataNode(_output, _inputs.get(0), "a"); // input matrix
-               renameInputs(_inputs, 1);
-               
-               //generate dense/sparse bodies
-               String tmpDense = _output.codegen(false);
-               _output.resetGenerated();
-               String tmpSparse = _output.codegen(true);
-               tmp = tmp.replaceAll("%TMP%", createVarname());
-               tmp = tmp.replaceAll("%BODY_dense%", tmpDense);
-               tmp = tmp.replaceAll("%BODY_sparse%", tmpSparse);
-               
-               //replace outputs 
-               tmp = tmp.replaceAll("%OUT%", "c");
-               tmp = tmp.replaceAll("%POSOUT%", "0");
-               
-               //replace size information
-               tmp = tmp.replaceAll("%LEN%", "len");
-               
-               //replace colvector information and number of vector 
intermediates
-               tmp = tmp.replaceAll("%COL_VECTOR%", 
String.valueOf(_output._cols==1));
-               tmp = tmp.replaceAll("%VECT_MEM%", String.valueOf(_numVectors));
-               
-               return tmp;
-       }
-
-       @Override
-       public void setOutputDims() {
-               // TODO Auto-generated method stub
-               
-       }
-
-       @Override
-       public SpoofOutputDimsType getOutputDimType() {
-               return (_output._cols==1) ? 
-                       SpoofOutputDimsType.COLUMN_DIMS_ROWS : //column vector
-                       SpoofOutputDimsType.COLUMN_DIMS_COLS;  //row vector
-       }
-       
-       @Override
-       public CNodeTpl clone() {
-               return new CNodeRowAgg(_inputs, _output);
-       }
-       
-       @Override
-       public int hashCode() {
-               return super.hashCode();
-       }
-       
-       @Override 
-       public boolean equals(Object o) {
-               if(!(o instanceof CNodeRowAgg))
-                       return false;
-               
-               CNodeRowAgg that = (CNodeRowAgg)o;
-               return super.equals(o)
-                       && _numVectors == that._numVectors      
-                       && equalInputReferences(
-                               _output, that._output, _inputs, that._inputs);
-       }
-       
-       @Override
-       public String getTemplateInfo() {
-               StringBuilder sb = new StringBuilder();
-               sb.append("SPOOF ROWAGGREGATE [reqVectMem=");
-               sb.append(_numVectors);
-               sb.append("]");
-               return sb.toString();
-       }
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
index b3b1942..2a868f8 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeTernary.java
@@ -54,7 +54,9 @@ public class CNodeTernary extends CNode
                                        return "    double %TMP% = 
Double.isNaN(%IN1%) ? %IN3% : %IN1%;\n";
                                        
                                case LOOKUP_RC1:
-                                       return "    double %TMP% = 
getValue(%IN1%, rowIndex*%IN2%+%IN3%-1);\n"; 
+                                       return sparse ?
+                                                       "    double %TMP% = 
getValue(%IN1v%, rowIndex*%IN2%+%IN3%-1);\n" :      
+                                                       "    double %TMP% = 
getValue(%IN1%, rowIndex*%IN2%+%IN3%-1);\n";        
                                        
                                default: 
                                        throw new RuntimeException("Invalid 
ternary type: "+this.toString());
@@ -94,6 +96,9 @@ public class CNodeTernary extends CNode
                tmp = tmp.replaceAll("%TMP%", var);
                for( int j=1; j<=3; j++ ) {
                        String varj = _inputs.get(j-1).getVarname();
+                       //replace sparse and dense inputs
+                       tmp = tmp.replaceAll("%IN"+j+"v%", 
+                               varj+(varj.startsWith("b")?"":"vals") );
                        tmp = tmp.replaceAll("%IN"+j+"%", varj );
                }
                sb.append(tmp);

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 025033b..437100f 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -30,6 +30,7 @@ public class CNodeUnary extends CNode
        public enum UnaryType {
                LOOKUP_R, LOOKUP_RC, LOOKUP0, //codegen specific
                ROW_SUMS, ROW_MINS, ROW_MAXS, //codegen specific
+               VECT_EXP_SCALAR, VECT_LOG_SCALAR,
                EXP, POW2, MULT2, SQRT, LOG, LOG_NZ,
                ABS, ROUND, CEIL, FLOOR, SIGN, 
                SIN, COS, TAN, ASIN, ACOS, ATAN,
@@ -46,10 +47,19 @@ public class CNodeUnary extends CNode
                        switch( this ) {
                                case ROW_SUMS:
                                case ROW_MINS:
-                               case ROW_MAXS:
+                               case ROW_MAXS: {
                                        String vectName = 
StringUtils.capitalize(this.toString().substring(4,7).toLowerCase());
                                        return sparse ? "    double %TMP% = 
LibSpoofPrimitives.vect"+vectName+"(%IN1v%, %IN1i%, %POS1%, %LEN%);\n": 
                                                                        "    
double %TMP% = LibSpoofPrimitives.vect"+vectName+"(%IN1%, %POS1%, %LEN%);\n"; 
+                               }
+                       
+                               case VECT_EXP_SCALAR:
+                               case VECT_LOG_SCALAR: {
+                                       String vectName = 
getVectorPrimitiveName();
+                                       return sparse ? "    double[] %TMP% = 
LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %POS1%, %LEN%);\n" : 
+                                                                       "    
double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %POS1%, 
%LEN%);\n";
+                               }
+                                       
                                case EXP:
                                        return "    double %TMP% = 
FastMath.exp(%IN1%);\n";
                            case LOOKUP_R:
@@ -101,6 +111,17 @@ public class CNodeUnary extends CNode
                                        throw new RuntimeException("Invalid 
unary type: "+this.toString());
                        }
                }
+               public boolean isVectorScalarPrimitive() {
+                       return this == UnaryType.VECT_EXP_SCALAR 
+                               || this == UnaryType.VECT_LOG_SCALAR;
+               }
+               public UnaryType getVectorAddPrimitive() {
+                       return 
UnaryType.valueOf("VECT_"+getVectorPrimitiveName().toUpperCase()+"_ADD");
+               }
+               public String getVectorPrimitiveName() {
+                       String [] tmp = this.name().split("_");
+                       return StringUtils.capitalize(tmp[1].toLowerCase());
+               }
        }
        
        private UnaryType _type;
@@ -163,6 +184,8 @@ public class CNodeUnary extends CNode
                        case ROW_SUMS:  return "u(R+)";
                        case ROW_MINS:  return "u(Rmin)";
                        case ROW_MAXS:  return "u(Rmax)";
+                       case VECT_EXP_SCALAR: return "u(vexp)";
+                       case VECT_LOG_SCALAR: return "u(vlog)";
                        case LOOKUP_R:  return "u(ixr)";
                        case LOOKUP_RC: return "u(ixrc)";
                        case LOOKUP0:   return "u(ix0)";
@@ -174,6 +197,13 @@ public class CNodeUnary extends CNode
        @Override
        public void setOutputDims() {
                switch(_type) {
+                       case VECT_EXP_SCALAR:
+                       case VECT_LOG_SCALAR:   
+                               _rows = _inputs.get(0)._rows;
+                               _cols = _inputs.get(0)._cols;
+                               _dataType= DataType.MATRIX;
+                               break;
+                       
                        case ROW_SUMS:
                        case ROW_MINS:
                        case ROW_MAXS:

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
index d306f0f..8f0a8fb 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/CPlanMemoTable.java
@@ -253,7 +253,7 @@ public class CPlanMemoTable
        
        public static class MemoTableEntry 
        {
-               public final TemplateType type;
+               public TemplateType type;
                public final long input1; 
                public final long input2;
                public final long input3;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
index 142040b..85126da 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelection.java
@@ -58,7 +58,7 @@ public abstract class PlanSelection
        protected static boolean isValid(MemoTableEntry me, Hop hop) {
                return (me.type == TemplateType.OuterProdTpl 
                                && (me.closed || 
HopRewriteUtils.isBinaryMatrixMatrixOperation(hop)))
-                       || (me.type == TemplateType.RowAggTpl && me.closed)     
+                       || (me.type == TemplateType.RowTpl)     
                        || (me.type == TemplateType.CellTpl)
                        || (me.type == TemplateType.MultiAggTpl);
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
 
b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
index a9cd90e..3c98090 100644
--- 
a/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
+++ 
b/src/main/java/org/apache/sysml/hops/codegen/template/PlanSelectionFuseCostBased.java
@@ -68,6 +68,8 @@ public class PlanSelectionFuseCostBased extends PlanSelection
        private static final double COMPUTE_BANDWIDTH = 2d*1024*1024*1024 
//2GFLOPs/core
                * InfrastructureAnalyzer.getLocalParallelism();
        
+       private final static TemplateRow ROW_TPL = new TemplateRow();
+       
        @Override
        public void selectPlans(CPlanMemoTable memo, ArrayList<Hop> roots) 
        {
@@ -466,6 +468,11 @@ public class PlanSelectionFuseCostBased extends 
PlanSelection
                        for( Long hopID : R )
                                rPruneSuboptimalPlans(memo, 
memo._hopRefs.get(hopID), 
                                        visited, partition, M, bestPlan);
+                       HashSet<Long> visited2 = new HashSet<Long>();
+                       for( Long hopID : R )
+                               rPruneInvalidPlans(memo, 
memo._hopRefs.get(hopID), 
+                                       visited2, partition, M, bestPlan);
+                       
                        for( Long hopID : R )
                                rSelectPlansFuseAll(memo, 
                                        memo._hopRefs.get(hopID), null, 
partition);
@@ -498,6 +505,48 @@ public class PlanSelectionFuseCostBased extends 
PlanSelection
                visited.add(current.getHopID());                
        }
        
+       private static void rPruneInvalidPlans(CPlanMemoTable memo, Hop 
current, HashSet<Long> visited, HashSet<Long> partition, ArrayList<Long> M, 
boolean[] plan) {
+               //memoization (not via hops because in middle of dag)
+               if( visited.contains(current.getHopID()) )
+                       return;
+               
+               //process children recursively
+               for( Hop c : current.getInput() )
+                       rPruneInvalidPlans(memo, c, visited, partition, M, 
plan);
+               
+               //find invalid rowwise template leaf nodes (see TemplateRow.open) 
w/o matrix inputs, 
+               //i.e., plans that become invalid after the previous pruning 
step
+               long hopID = current.getHopID();
+               if( partition.contains(hopID) && memo.contains(hopID, 
TemplateType.RowTpl) ) {
+                       for( MemoTableEntry me : memo.get(hopID) ) {
+                               if( me.type==TemplateType.RowTpl ) {
+                                       //convert leaf node with pure vector 
inputs
+                                       if( !me.hasPlanRef() && 
!TemplateUtils.hasMatrixInput(current) ) {
+                                               me.type = TemplateType.CellTpl;
+                                               if( LOG.isTraceEnabled() )
+                                                       LOG.trace("Converted 
leaf memo table entry from row to cell: "+me);
+                                       }
+                                       
+                                       //convert inner node without row 
template input
+                                       if( me.hasPlanRef() && 
!ROW_TPL.open(current) ) {
+                                               boolean hasRowInput = false;
+                                               for( int i=0; i<3; i++ )
+                                                       if( me.isPlanRef(i) )
+                                                               hasRowInput |= 
memo.contains(me.input(i), TemplateType.RowTpl);
+                                               if( !hasRowInput ) {
+                                                       me.type = 
TemplateType.CellTpl;
+                                                       if( 
LOG.isTraceEnabled() )
+                                                               
LOG.trace("Converted inner memo table entry from row to cell: "+me);    
+                                               }
+                                       }
+                                       
+                               }
+                       }
+               }
+               
+               visited.add(current.getHopID());                
+       }
+       
        private void rSelectPlansFuseAll(CPlanMemoTable memo, Hop current, 
TemplateType currentType, HashSet<Long> partition) 
        {       
                if( isVisited(current.getHopID(), currentType) 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
index 4fceb8a..8ed52f6 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateBase.java
@@ -28,7 +28,7 @@ public abstract class TemplateBase
        public enum TemplateType {
                //ordering specifies type preferences
                MultiAggTpl,
-               RowAggTpl,
+               RowTpl,
                OuterProdTpl,
                CellTpl;
                public int getRank() {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
new file mode 100644
index 0000000..5e48e44
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRow.java
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.codegen.template;
+
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import org.apache.sysml.hops.AggBinaryOp;
+import org.apache.sysml.hops.AggUnaryOp;
+import org.apache.sysml.hops.BinaryOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.IndexingOp;
+import org.apache.sysml.hops.LiteralOp;
+import org.apache.sysml.hops.ParameterizedBuiltinOp;
+import org.apache.sysml.hops.TernaryOp;
+import org.apache.sysml.hops.UnaryOp;
+import org.apache.sysml.hops.codegen.cplan.CNode;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
+import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
+import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
+import org.apache.sysml.hops.codegen.cplan.CNodeData;
+import org.apache.sysml.hops.codegen.cplan.CNodeRow;
+import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
+import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
+import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
+import org.apache.sysml.hops.rewrite.HopRewriteUtils;
+import org.apache.sysml.hops.Hop.AggOp;
+import org.apache.sysml.hops.Hop.Direction;
+import org.apache.sysml.hops.Hop.OpOp1;
+import org.apache.sysml.hops.Hop.OpOp2;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.runtime.matrix.data.Pair;
+
+public class TemplateRow extends TemplateBase 
+{
+       private static final Hop.AggOp[] SUPPORTED_ROW_AGG = new 
AggOp[]{AggOp.SUM, AggOp.MIN, AggOp.MAX};
+       private static final Hop.OpOp1[] SUPPORTED_VECT_UNARY = new 
OpOp1[]{OpOp1.EXP, OpOp1.LOG};
+       private static final Hop.OpOp2[] SUPPORTED_VECT_BINARY = new 
OpOp2[]{OpOp2.MULT, OpOp2.DIV, OpOp2.MINUS, 
+                       OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.LESS, 
OpOp2.LESSEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL};
+       
+       public TemplateRow() {
+               super(TemplateType.RowTpl);
+       }
+       
+       public TemplateRow(boolean closed) {
+               super(TemplateType.RowTpl, closed);
+       }
+       
+       @Override
+       public boolean open(Hop hop) {
+               return (hop instanceof AggBinaryOp && hop.getDim2()==1
+                       && hop.getInput().get(0).getDim1()>1 && 
hop.getInput().get(0).getDim2()>1)
+                       || (hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getDirection()!=Direction.RowCol 
+                               && hop.getInput().get(0).getDim1()>1 && 
hop.getInput().get(0).getDim2()>1);
+       }
+
+       @Override
+       public boolean fuse(Hop hop, Hop input) {
+               return !isClosed() && 
+                       (  (hop instanceof BinaryOp && 
TemplateUtils.isOperationSupported(hop) 
+                               && 
(HopRewriteUtils.isBinaryMatrixColVectorOperation(hop)
+                                       || 
HopRewriteUtils.isBinaryMatrixScalarOperation(hop)) ) 
+                       || ((hop instanceof UnaryOp || hop instanceof 
ParameterizedBuiltinOp) 
+                                       && TemplateCell.isValidOperation(hop))  
        
+                       || (hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getDirection()!=Direction.RowCol)
+                       || (hop instanceof AggBinaryOp && hop.getDim1()>1 
+                               && 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
+       }
+
+       @Override
+       public boolean merge(Hop hop, Hop input) {
+               //merge row tpl with cell tpl if input is a vector
+               return !isClosed() &&
+                       ((hop instanceof BinaryOp && input.getDim2()==1 
//matrix-scalar/vector-vector ops )
+                               && TemplateUtils.isOperationSupported(hop))
+                        ||(hop instanceof AggBinaryOp && input.getDim2()==1
+                               && 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
+       }
+
+       @Override
+       public CloseType close(Hop hop) {
+               //close on column aggregate (e.g., colSums, t(X)%*%y)
+               if( hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getDirection()==Direction.Col
+                       || (hop instanceof AggBinaryOp && 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))) )
+                       return CloseType.CLOSED_VALID;
+               else
+                       return CloseType.OPEN;
+       }
+
+       @Override
+       public Pair<Hop[], CNodeTpl> constructCplan(Hop hop, CPlanMemoTable 
memo, boolean compileLiterals) {
+               //recursively process required cplan output
+               HashSet<Hop> inHops = new HashSet<Hop>();
+               HashMap<String, Hop> inHops2 = new HashMap<String,Hop>();
+               HashMap<Long, CNode> tmp = new HashMap<Long, CNode>();
+               hop.resetVisitStatus();
+               rConstructCplan(hop, memo, tmp, inHops, inHops2, 
compileLiterals);
+               hop.resetVisitStatus();
+               
+               //reorder inputs (ensure matrix is first input, and other 
inputs ordered by size)
+               List<Hop> sinHops = inHops.stream()
+                       .filter(h -> !(h.getDataType().isScalar() && 
tmp.get(h.getHopID()).isLiteral()))
+                       .sorted(new 
HopInputComparator(inHops2.get("X"))).collect(Collectors.toList());
+               
+               //construct template node
+               ArrayList<CNode> inputs = new ArrayList<CNode>();
+               for( Hop in : sinHops )
+                       inputs.add(tmp.get(in.getHopID()));
+               CNode output = tmp.get(hop.getHopID());
+               CNodeRow tpl = new CNodeRow(inputs, output);
+               tpl.setRowType(TemplateUtils.getRowType(hop, sinHops.get(0)));
+               tpl.setNumVectorIntermediates(TemplateUtils
+                       .countVectorIntermediates(output));
+               
+               // return cplan instance
+               return new Pair<Hop[],CNodeTpl>(sinHops.toArray(new Hop[0]), 
tpl);
+       }
+
+       private void rConstructCplan(Hop hop, CPlanMemoTable memo, 
HashMap<Long, CNode> tmp, HashSet<Hop> inHops, HashMap<String, Hop> inHops2, 
boolean compileLiterals) 
+       {       
+               //memoization for common subexpression elimination and to avoid 
redundant work 
+               if( tmp.containsKey(hop.getHopID()) )
+                       return;
+               
+               //recursively process required children
+               MemoTableEntry me = memo.getBest(hop.getHopID(), 
TemplateType.RowTpl);
+               for( int i=0; i<hop.getInput().size(); i++ ) {
+                       Hop c = hop.getInput().get(i);
+                       if( me.isPlanRef(i) )
+                               rConstructCplan(c, memo, tmp, inHops, inHops2, 
compileLiterals);
+                       else {
+                               CNodeData cdata = 
TemplateUtils.createCNodeData(c, compileLiterals);    
+                               tmp.put(c.getHopID(), cdata);
+                               inHops.add(c);
+                       }
+               }
+               
+               //construct cnode for current hop
+               CNode out = null;
+               if(hop instanceof AggUnaryOp)
+               {
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       if( ((AggUnaryOp)hop).getDirection() == Direction.Row 
&& HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
+                               if(hop.getInput().get(0).getDim2()==1)
+                                       out = 
(cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new 
CNodeUnary(cdata1,UnaryType.LOOKUP_R);
+                               else {
+                                       String opcode = 
"ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
+                                       out = new CNodeUnary(cdata1, 
UnaryType.valueOf(opcode));
+                                       inHops2.put("X", hop.getInput().get(0));
+                               }
+                       }
+                       else  if (((AggUnaryOp)hop).getDirection() == 
Direction.Col && ((AggUnaryOp)hop).getOp() == AggOp.SUM ) {
+                               //vector add without temporary copy
+                               if( cdata1 instanceof CNodeBinary && 
((CNodeBinary)cdata1).getType().isVectorScalarPrimitive() )
+                                       out = new 
CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1), 
+                                                       
((CNodeBinary)cdata1).getType().getVectorAddPrimitive());
+                               else    
+                                       out = cdata1;
+                       }
+               }
+               else if(hop instanceof AggBinaryOp)
+               {
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
+                       
+                       if( 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)) )
+                       {
+                               //correct input under transpose
+                               cdata1 = TemplateUtils.skipTranspose(cdata1, 
hop.getInput().get(0), tmp, compileLiterals);
+                               inHops.remove(hop.getInput().get(0)); 
+                               
inHops.add(hop.getInput().get(0).getInput().get(0));
+                               
+                               out = new CNodeBinary(cdata1, cdata2, 
BinType.VECT_MULT_ADD);
+                       }
+                       else
+                       {
+                               if(hop.getInput().get(0).getDim2()==1 && 
hop.getInput().get(1).getDim2()==1)
+                                       out = new 
CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new 
CNodeUnary(cdata1, UnaryType.LOOKUP0),
+                                               
(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, 
UnaryType.LOOKUP0), BinType.MULT);
+                               else {
+                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.DOT_PRODUCT);
+                                       inHops2.put("X", hop.getInput().get(0));
+                               }
+                       }
+               }
+               else if(hop instanceof UnaryOp)
+               {
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       
+                       // if the input is a matrix then we need to use a row 
vector primitive
+                       if(hop.getInput().get(0).getDim1() > 1 && 
hop.getInput().get(0).getDim2() > 1 ) 
+                       {
+                               if( HopRewriteUtils.isUnary(hop, 
SUPPORTED_VECT_UNARY) ) {
+                                       String opname = 
"VECT_"+((UnaryOp)hop).getOp().name()+"_SCALAR";
+                                       out = new CNodeUnary(cdata1, 
UnaryType.valueOf(opname));
+                               }
+                               else 
+                                       throw new RuntimeException("Unsupported 
unary matrix "
+                                                       + "operation: " + 
((UnaryOp)hop).getOp().name());
+                       }
+                       else //general scalar case
+                       {
+                               if( TemplateUtils.isColVector(cdata1) )
+                                       cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                               else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
+                                       cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
+                               
+                               String primitiveOpName = 
((UnaryOp)hop).getOp().toString();
+                               out = new CNodeUnary(cdata1, 
UnaryType.valueOf(primitiveOpName));
+                       }
+               }
+               else if(hop instanceof BinaryOp)
+               {
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
+                       
+                       // if one input is a matrix then we need to do vector 
by scalar operations
+                       if(hop.getInput().get(0).getDim1() > 1 && 
hop.getInput().get(0).getDim2() > 1 )
+                       {
+                               if( HopRewriteUtils.isBinary(hop, 
SUPPORTED_VECT_BINARY) ) {
+                                       String opname = 
"VECT_"+((BinaryOp)hop).getOp().name()+"_SCALAR";
+                                       if( TemplateUtils.isColVector(cdata2) )
+                                               cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_R);
+                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.valueOf(opname));
+                               }
+                               else 
+                                       throw new RuntimeException("Unsupported 
binary matrix "
+                                                       + "operation: " + 
((BinaryOp)hop).getOp().name());
+                       }
+                       else //one input is a vector/scalar other is a scalar
+                       {
+                               String primitiveOpName = 
((BinaryOp)hop).getOp().toString();
+                               if( TemplateUtils.isColVector(cdata1) )
+                                       cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                               if( TemplateUtils.isColVector(cdata2) )
+                                       cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_R);
+                               out = new CNodeBinary(cdata1, cdata2, 
BinType.valueOf(primitiveOpName));        
+                       }
+               }
+               else if(hop instanceof TernaryOp) 
+               {
+                       TernaryOp top = (TernaryOp) hop;
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
+                       CNode cdata3 = 
tmp.get(hop.getInput().get(2).getHopID());
+                       
+                       //cdata1 is vector
+                       if( TemplateUtils.isColVector(cdata1) )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
+                       
+                       //cdata3 is vector
+                       if( TemplateUtils.isColVector(cdata3) )
+                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_R);
+                       else if( cdata3 instanceof CNodeData && 
hop.getInput().get(2).getDataType().isMatrix() )
+                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_RC);
+                       
+                       //construct ternary cnode, primitive operation derived 
from OpOp3
+                       out = new CNodeTernary(cdata1, cdata2, cdata3, 
+                                       
TernaryType.valueOf(top.getOp().toString()));
+               }
+               else if( hop instanceof ParameterizedBuiltinOp ) 
+               {
+                       CNode cdata1 = 
tmp.get(((ParameterizedBuiltinOp)hop).getTargetHop().getHopID());
+                       if( TemplateUtils.isColVector(cdata1) )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
+                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
+                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
+                       
+                       CNode cdata2 = 
tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("pattern").getHopID());
+                       CNode cdata3 = 
tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("replacement").getHopID());
+                       TernaryType ttype = (cdata2.isLiteral() && 
cdata2.getVarname().equals("Double.NaN")) ? 
+                                       TernaryType.REPLACE_NAN : 
TernaryType.REPLACE;
+                       out = new CNodeTernary(cdata1, cdata2, cdata3, ttype);
+               }
+               else if( hop instanceof IndexingOp ) 
+               {
+                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
+                       out = new CNodeTernary(cdata1, 
+                                       TemplateUtils.createCNodeData(new 
LiteralOp(hop.getInput().get(0).getDim2()), true), 
+                                       
TemplateUtils.createCNodeData(hop.getInput().get(4), true),
+                                       TernaryType.LOOKUP_RC1);
+               }
+               
+               if( out == null ) {
+                       throw new RuntimeException(hop.getHopID()+" 
"+hop.getOpString());
+               }
+               
+               if( out.getDataType().isMatrix() ) {
+                       out.setNumRows(hop.getDim1());
+                       out.setNumCols(hop.getDim2());
+               }
+               
+               tmp.put(hop.getHopID(), out);
+       }
+       
+       /**
+        * Comparator to order input hops of the rowwise template. We try 
+        * to order matrices-vectors-scalars via sorting by number of cells but 
+        * we keep the given main input always at the first position.
+        */
+       public static class HopInputComparator implements Comparator<Hop> 
+       {
+               private final Hop _X;
+               
+               public HopInputComparator(Hop X) {
+                       _X = X;
+               }
+               
+               @Override
+               public int compare(Hop h1, Hop h2) {
+                       long ncells1 = h1.getDataType()==DataType.SCALAR ? 
Long.MIN_VALUE : 
+                               (h1==_X) ? Long.MAX_VALUE : 
+                               h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : 
Long.MAX_VALUE-1;
+                       long ncells2 = h2.getDataType()==DataType.SCALAR ? 
Long.MIN_VALUE : 
+                               (h2==_X) ? Long.MAX_VALUE : 
+                               h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : 
Long.MAX_VALUE-1;
+                       return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 
1 : 0; 
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
deleted file mode 100644
index 49d0cb8..0000000
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.hops.codegen.template;
-
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.stream.Collectors;
-
-import org.apache.sysml.hops.AggBinaryOp;
-import org.apache.sysml.hops.AggUnaryOp;
-import org.apache.sysml.hops.BinaryOp;
-import org.apache.sysml.hops.Hop;
-import org.apache.sysml.hops.IndexingOp;
-import org.apache.sysml.hops.LiteralOp;
-import org.apache.sysml.hops.ParameterizedBuiltinOp;
-import org.apache.sysml.hops.TernaryOp;
-import org.apache.sysml.hops.UnaryOp;
-import org.apache.sysml.hops.codegen.cplan.CNode;
-import org.apache.sysml.hops.codegen.cplan.CNodeBinary;
-import org.apache.sysml.hops.codegen.cplan.CNodeBinary.BinType;
-import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
-import org.apache.sysml.hops.codegen.cplan.CNodeData;
-import org.apache.sysml.hops.codegen.cplan.CNodeRowAgg;
-import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
-import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
-import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
-import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
-import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
-import org.apache.sysml.hops.rewrite.HopRewriteUtils;
-import org.apache.sysml.hops.Hop.AggOp;
-import org.apache.sysml.hops.Hop.Direction;
-import org.apache.sysml.hops.Hop.OpOp2;
-import org.apache.sysml.parser.Expression.DataType;
-import org.apache.sysml.runtime.matrix.data.Pair;
-
-public class TemplateRowAgg extends TemplateBase 
-{
-       private static final Hop.AggOp[] SUPPORTED_ROW_AGG = new 
AggOp[]{AggOp.SUM, AggOp.MIN, AggOp.MAX};
-       private static final Hop.OpOp2[] SUPPORTED_VECT_BINARY = new 
OpOp2[]{OpOp2.MULT, OpOp2.DIV, 
-                       OpOp2.EQUAL, OpOp2.NOTEQUAL, OpOp2.LESS, 
OpOp2.LESSEQUAL, OpOp2.GREATER, OpOp2.GREATEREQUAL};
-       
-       public TemplateRowAgg() {
-               super(TemplateType.RowAggTpl);
-       }
-       
-       public TemplateRowAgg(boolean closed) {
-               super(TemplateType.RowAggTpl, closed);
-       }
-       
-       @Override
-       public boolean open(Hop hop) {
-               return (hop instanceof AggBinaryOp && hop.getDim2()==1
-                       && hop.getInput().get(0).getDim1()>1 && 
hop.getInput().get(0).getDim2()>1)
-                       || (hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getDirection()!=Direction.RowCol 
-                               && hop.getInput().get(0).getDim1()>1 && 
hop.getInput().get(0).getDim2()>1);
-       }
-
-       @Override
-       public boolean fuse(Hop hop, Hop input) {
-               return !isClosed() && 
-                       (  (hop instanceof BinaryOp && 
(HopRewriteUtils.isBinaryMatrixColVectorOperation(hop)
-                                       || 
HopRewriteUtils.isBinaryMatrixScalarOperation(hop)) ) 
-                       || ((hop instanceof UnaryOp || hop instanceof 
ParameterizedBuiltinOp) 
-                                       && TemplateCell.isValidOperation(hop))  
        
-                       || (hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getDirection()!=Direction.RowCol)
-                       || (hop instanceof AggBinaryOp && hop.getDim1()>1 
-                               && 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
-       }
-
-       @Override
-       public boolean merge(Hop hop, Hop input) {
-               //merge rowagg tpl with cell tpl if input is a vector
-               return !isClosed() &&
-                       ((hop instanceof BinaryOp && input.getDim2()==1) 
//matrix-scalar/vector-vector ops )
-                        ||(hop instanceof AggBinaryOp && input.getDim2()==1
-                               && 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))));
-       }
-
-       @Override
-       public CloseType close(Hop hop) {
-               //close on column aggregate (e.g., colSums, t(X)%*%y)
-               if( hop instanceof AggUnaryOp && 
((AggUnaryOp)hop).getDirection()==Direction.Col
-                       || (hop instanceof AggBinaryOp && 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0))) )
-                       return CloseType.CLOSED_VALID;
-               else
-                       return CloseType.OPEN;
-       }
-
-       @Override
-       public Pair<Hop[], CNodeTpl> constructCplan(Hop hop, CPlanMemoTable 
memo, boolean compileLiterals) {
-               //recursively process required cplan output
-               HashSet<Hop> inHops = new HashSet<Hop>();
-               HashMap<String, Hop> inHops2 = new HashMap<String,Hop>();
-               HashMap<Long, CNode> tmp = new HashMap<Long, CNode>();
-               hop.resetVisitStatus();
-               rConstructCplan(hop, memo, tmp, inHops, inHops2, 
compileLiterals);
-               hop.resetVisitStatus();
-               
-               //reorder inputs (ensure matrix is first input, and other 
inputs ordered by size)
-               List<Hop> sinHops = inHops.stream()
-                       .filter(h -> !(h.getDataType().isScalar() && 
tmp.get(h.getHopID()).isLiteral()))
-                       .sorted(new 
HopInputComparator(inHops2.get("X"))).collect(Collectors.toList());
-               
-               //construct template node
-               ArrayList<CNode> inputs = new ArrayList<CNode>();
-               for( Hop in : sinHops )
-                       inputs.add(tmp.get(in.getHopID()));
-               CNode output = tmp.get(hop.getHopID());
-               CNodeRowAgg tpl = new CNodeRowAgg(inputs, output);
-               tpl.setNumVectorIntermediates(TemplateUtils
-                       .countVectorIntermediates(output));
-               
-               // return cplan instance
-               return new Pair<Hop[],CNodeTpl>(sinHops.toArray(new Hop[0]), 
tpl);
-       }
-
-       private void rConstructCplan(Hop hop, CPlanMemoTable memo, 
HashMap<Long, CNode> tmp, HashSet<Hop> inHops, HashMap<String, Hop> inHops2, 
boolean compileLiterals) 
-       {       
-               //memoization for common subexpression elimination and to avoid 
redundant work 
-               if( tmp.containsKey(hop.getHopID()) )
-                       return;
-               
-               //recursively process required childs
-               MemoTableEntry me = memo.getBest(hop.getHopID(), 
TemplateType.RowAggTpl);
-               for( int i=0; i<hop.getInput().size(); i++ ) {
-                       Hop c = hop.getInput().get(i);
-                       if( me.isPlanRef(i) )
-                               rConstructCplan(c, memo, tmp, inHops, inHops2, 
compileLiterals);
-                       else {
-                               CNodeData cdata = 
TemplateUtils.createCNodeData(c, compileLiterals);    
-                               tmp.put(c.getHopID(), cdata);
-                               inHops.add(c);
-                       }
-               }
-               
-               //construct cnode for current hop
-               CNode out = null;
-               if(hop instanceof AggUnaryOp)
-               {
-                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
-                       if( ((AggUnaryOp)hop).getDirection() == Direction.Row 
&& HopRewriteUtils.isAggUnaryOp(hop, SUPPORTED_ROW_AGG) ) {
-                               if(hop.getInput().get(0).getDim2()==1)
-                                       out = 
(cdata1.getDataType()==DataType.SCALAR) ? cdata1 : new 
CNodeUnary(cdata1,UnaryType.LOOKUP_R);
-                               else {
-                                       String opcode = 
"ROW_"+((AggUnaryOp)hop).getOp().name().toUpperCase()+"S";
-                                       out = new CNodeUnary(cdata1, 
UnaryType.valueOf(opcode));
-                                       inHops2.put("X", hop.getInput().get(0));
-                               }
-                       }
-                       else  if (((AggUnaryOp)hop).getDirection() == 
Direction.Col && ((AggUnaryOp)hop).getOp() == AggOp.SUM ) {
-                               //vector add without temporary copy
-                               if( cdata1 instanceof CNodeBinary && 
((CNodeBinary)cdata1).getType().isVectorScalarPrimitive() )
-                                       out = new 
CNodeBinary(cdata1.getInput().get(0), cdata1.getInput().get(1), 
-                                                       
((CNodeBinary)cdata1).getType().getVectorAddPrimitive());
-                               else    
-                                       out = cdata1;
-                       }
-               }
-               else if(hop instanceof AggBinaryOp)
-               {
-                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
-                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
-                       
-                       if( 
HopRewriteUtils.isTransposeOperation(hop.getInput().get(0)) )
-                       {
-                               //correct input under transpose
-                               cdata1 = TemplateUtils.skipTranspose(cdata1, 
hop.getInput().get(0), tmp, compileLiterals);
-                               inHops.remove(hop.getInput().get(0)); 
-                               
inHops.add(hop.getInput().get(0).getInput().get(0));
-                               
-                               out = new CNodeBinary(cdata1, cdata2, 
BinType.VECT_MULT_ADD);
-                       }
-                       else
-                       {
-                               if(hop.getInput().get(0).getDim2()==1 && 
hop.getInput().get(1).getDim2()==1)
-                                       out = new 
CNodeBinary((cdata1.getDataType()==DataType.SCALAR)? cdata1 : new 
CNodeUnary(cdata1, UnaryType.LOOKUP0),
-                                               
(cdata2.getDataType()==DataType.SCALAR)? cdata2 : new CNodeUnary(cdata2, 
UnaryType.LOOKUP0), BinType.MULT);
-                               else {
-                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.DOT_PRODUCT);
-                                       inHops2.put("X", hop.getInput().get(0));
-                               }
-                       }
-               }
-               else if(hop instanceof UnaryOp)
-               {
-                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
-                       if( TemplateUtils.isColVector(cdata1) )
-                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
-                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
-                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
-                       
-                       String primitiveOpName = 
((UnaryOp)hop).getOp().toString();
-                       out = new CNodeUnary(cdata1, 
UnaryType.valueOf(primitiveOpName));
-               }
-               else if(hop instanceof BinaryOp)
-               {
-                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
-                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
-                       
-                       // if one input is a matrix then we need to do vector 
by scalar operations
-                       if(hop.getInput().get(0).getDim1() > 1 && 
hop.getInput().get(0).getDim2() > 1 )
-                       {
-                               if( HopRewriteUtils.isBinary(hop, 
SUPPORTED_VECT_BINARY) ) {
-                                       String opname = 
"VECT_"+((BinaryOp)hop).getOp().name()+"_SCALAR";
-                                       if( TemplateUtils.isColVector(cdata2) )
-                                               cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_R);
-                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.valueOf(opname));
-                               }
-                               else 
-                                       throw new RuntimeException("Unsupported 
binary matrix "
-                                                       + "operation: " + 
((BinaryOp)hop).getOp().name());
-                       }
-                       else //one input is a vector/scalar other is a scalar
-                       {
-                               String primitiveOpName = 
((BinaryOp)hop).getOp().toString();
-                               if( TemplateUtils.isColVector(cdata1) )
-                                       cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
-                               if( TemplateUtils.isColVector(cdata2) )
-                                       cdata2 = new CNodeUnary(cdata2, 
UnaryType.LOOKUP_R);
-                               out = new CNodeBinary(cdata1, cdata2, 
BinType.valueOf(primitiveOpName));        
-                       }
-               }
-               else if(hop instanceof TernaryOp) 
-               {
-                       TernaryOp top = (TernaryOp) hop;
-                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
-                       CNode cdata2 = 
tmp.get(hop.getInput().get(1).getHopID());
-                       CNode cdata3 = 
tmp.get(hop.getInput().get(2).getHopID());
-                       
-                       //cdata1 is vector
-                       if( TemplateUtils.isColVector(cdata1) )
-                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
-                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
-                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
-                       
-                       //cdata3 is vector
-                       if( TemplateUtils.isColVector(cdata3) )
-                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_R);
-                       else if( cdata3 instanceof CNodeData && 
hop.getInput().get(2).getDataType().isMatrix() )
-                               cdata3 = new CNodeUnary(cdata3, 
UnaryType.LOOKUP_RC);
-                       
-                       //construct ternary cnode, primitive operation derived 
from OpOp3
-                       out = new CNodeTernary(cdata1, cdata2, cdata3, 
-                                       
TernaryType.valueOf(top.getOp().toString()));
-               }
-               else if( hop instanceof ParameterizedBuiltinOp ) 
-               {
-                       CNode cdata1 = 
tmp.get(((ParameterizedBuiltinOp)hop).getTargetHop().getHopID());
-                       if( TemplateUtils.isColVector(cdata1) )
-                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_R);
-                       else if( cdata1 instanceof CNodeData && 
hop.getInput().get(0).getDataType().isMatrix() )
-                               cdata1 = new CNodeUnary(cdata1, 
UnaryType.LOOKUP_RC);
-                       
-                       CNode cdata2 = 
tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("pattern").getHopID());
-                       CNode cdata3 = 
tmp.get(((ParameterizedBuiltinOp)hop).getParameterHop("replacement").getHopID());
-                       TernaryType ttype = (cdata2.isLiteral() && 
cdata2.getVarname().equals("Double.NaN")) ? 
-                                       TernaryType.REPLACE_NAN : 
TernaryType.REPLACE;
-                       out = new CNodeTernary(cdata1, cdata2, cdata3, ttype);
-               }
-               else if( hop instanceof IndexingOp ) 
-               {
-                       CNode cdata1 = 
tmp.get(hop.getInput().get(0).getHopID());
-                       out = new CNodeTernary(cdata1, 
-                                       TemplateUtils.createCNodeData(new 
LiteralOp(hop.getInput().get(0).getDim2()), true), 
-                                       
TemplateUtils.createCNodeData(hop.getInput().get(4), true),
-                                       TernaryType.LOOKUP_RC1);
-               }
-               
-               if( out == null ) {
-                       throw new RuntimeException(hop.getHopID()+" 
"+hop.getOpString());
-               }
-               
-               if( out.getDataType().isMatrix() ) {
-                       out.setNumRows(hop.getDim1());
-                       out.setNumCols(hop.getDim2());
-               }
-               
-               tmp.put(hop.getHopID(), out);
-       }
-       
-       /**
-        * Comparator to order input hops of the row aggregate template. We try 
-        * to order matrices-vectors-scalars via sorting by number of cells but 
-        * we keep the given main input always at the first position.
-        */
-       public static class HopInputComparator implements Comparator<Hop> 
-       {
-               private final Hop _X;
-               
-               public HopInputComparator(Hop X) {
-                       _X = X;
-               }
-               
-               @Override
-               public int compare(Hop h1, Hop h2) {
-                       long ncells1 = h1.getDataType()==DataType.SCALAR ? 
Long.MIN_VALUE : 
-                               (h1==_X) ? Long.MAX_VALUE : 
-                               h1.dimsKnown() ? h1.getDim1()*h1.getDim2() : 
Long.MAX_VALUE-1;
-                       long ncells2 = h2.getDataType()==DataType.SCALAR ? 
Long.MIN_VALUE : 
-                               (h2==_X) ? Long.MAX_VALUE : 
-                               h2.dimsKnown() ? h2.getDim1()*h2.getDim2() : 
Long.MAX_VALUE-1;
-                       return (ncells1 > ncells2) ? -1 : (ncells1 < ncells2) ? 
1 : 0; 
-               }
-       }
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index e8d2086..502e0ef 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -43,6 +43,7 @@ import org.apache.sysml.hops.codegen.cplan.CNodeData;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary;
 import org.apache.sysml.hops.codegen.cplan.CNodeUnary.UnaryType;
+import org.apache.sysml.hops.codegen.template.CPlanMemoTable.MemoTableEntry;
 import org.apache.sysml.hops.codegen.template.TemplateBase.TemplateType;
 import org.apache.sysml.hops.rewrite.HopRewriteUtils;
 import org.apache.sysml.hops.codegen.cplan.CNodeTernary.TernaryType;
@@ -50,11 +51,12 @@ import org.apache.sysml.hops.codegen.cplan.CNodeTpl;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.runtime.codegen.SpoofCellwise.CellType;
 import org.apache.sysml.runtime.codegen.SpoofOuterProduct.OutProdType;
+import org.apache.sysml.runtime.codegen.SpoofRowwise.RowType;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class TemplateUtils 
 {
-       public static final TemplateBase[] TEMPLATES = new TemplateBase[]{new 
TemplateRowAgg(), new TemplateCell(), new TemplateOuterProduct()};
+       public static final TemplateBase[] TEMPLATES = new TemplateBase[]{new 
TemplateRow(), new TemplateCell(), new TemplateOuterProduct()};
        
        public static boolean isVector(Hop hop) {
                return (hop.getDataType() == DataType.MATRIX 
@@ -99,6 +101,13 @@ public class TemplateUtils
                        && left.getDataType().isMatrix() && 
right.getDataType().isMatrix()
                        && left.getDim2() > right.getDim2();
        }
+       
+       public static boolean hasMatrixInput( Hop hop ) {
+               for( Hop c : hop.getInput() )
+                       if( isMatrix(c) )
+                               return true;
+               return false;
+       }
 
        public static boolean isOperationSupported(Hop h) {
                if(h instanceof  UnaryOp)
@@ -187,7 +196,7 @@ public class TemplateUtils
                TemplateBase tpl = null;
                switch( type ) {
                        case CellTpl: tpl = new TemplateCell(closed); break;
-                       case RowAggTpl: tpl = new TemplateRowAgg(closed); break;
+                       case RowTpl: tpl = new TemplateRow(closed); break;
                        case MultiAggTpl: tpl = new TemplateMultiAgg(closed); 
break;
                        case OuterProdTpl: tpl = new 
TemplateOuterProduct(closed); break;
                }
@@ -197,8 +206,8 @@ public class TemplateUtils
        public static TemplateBase[] createCompatibleTemplates(TemplateType 
type, boolean closed) {
                TemplateBase[] tpl = null;
                switch( type ) {
-                       case CellTpl: tpl = new TemplateBase[]{new 
TemplateCell(closed), new TemplateRowAgg(closed)}; break;
-                       case RowAggTpl: tpl = new TemplateBase[]{new 
TemplateRowAgg(closed)}; break;
+                       case CellTpl: tpl = new TemplateBase[]{new 
TemplateCell(closed), new TemplateRow(closed)}; break;
+                       case RowTpl: tpl = new TemplateBase[]{new 
TemplateRow(closed)}; break;
                        case MultiAggTpl: tpl = new TemplateBase[]{new 
TemplateMultiAgg(closed)}; break;
                        case OuterProdTpl: tpl = new TemplateBase[]{new 
TemplateOuterProduct(closed)}; break;
                }
@@ -211,6 +220,17 @@ public class TemplateUtils
                        CellType.FULL_AGG : CellType.ROW_AGG) : CellType.NO_AGG;
        }
        
+       public static RowType getRowType(Hop output, Hop input) {
+               if( HopRewriteUtils.isEqualSize(output, input) )
+                       return RowType.NO_AGG;
+               else if( output.getDim1()==input.getDim1() && 
output.getDim2()==1 )
+                       return RowType.ROW_AGG;
+               else if( output.getDim1()==input.getDim2() && 
output.getDim2()==1 )
+                       return RowType.COL_AGG_T;
+               else
+                       return RowType.COL_AGG;
+       }
+       
        public static AggOp getAggOp(Hop hop) {
                return (hop instanceof AggUnaryOp) ? ((AggUnaryOp)hop).getOp() :
                        (hop instanceof AggBinaryOp) ? AggOp.SUM : null;
@@ -292,11 +312,36 @@ public class TemplateUtils
                int ret = 0;
                for( CNode c : node.getInput() )
                        ret += countVectorIntermediates(c);
-               return ret + ((node instanceof CNodeBinary 
+               int cntBin = ((node instanceof CNodeBinary 
                        && 
((CNodeBinary)node).getType().isVectorScalarPrimitive()) ? 1 : 0);
+               int cntUn = ((node instanceof CNodeUnary
+                               && 
((CNodeUnary)node).getType().isVectorScalarPrimitive()) ? 1 : 0);
+               return ret + cntBin + cntUn;
        }
 
        public static boolean isType(TemplateType type, TemplateType... 
validTypes) {
                return ArrayUtils.contains(validTypes, type);
        }
+       
+       public static boolean hasCommonRowTemplateMatrixInput(Hop input1, Hop 
input2, CPlanMemoTable memo) {
+               //if second input has no row template, it's always true
+               if( !memo.contains(input2.getHopID(), TemplateType.RowTpl) )
+                       return true;
+               //check for common row template input
+               return getRowTemplateMatrixInput(input1, memo)
+                       == getRowTemplateMatrixInput(input2, memo);
+       }
+       
+       public static long getRowTemplateMatrixInput(Hop current, 
CPlanMemoTable memo) {
+               MemoTableEntry me = memo.getBest(current.getHopID(), 
TemplateType.RowTpl);
+               long ret = -1;
+               for( int i=0; ret<0 && i<current.getInput().size(); i++ ) {
+                       Hop input = current.getInput().get(i);
+                       if( me.isPlanRef(i) && memo.contains(input.getHopID(), 
TemplateType.RowTpl) )
+                               ret = getRowTemplateMatrixInput(input, memo);
+                       else if( !me.isPlanRef(i) && isMatrix(input) )
+                               ret = input.getHopID();
+               }
+               return ret;
+       }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java 
b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
index a4b6ec1..0bad2f6 100644
--- a/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
+++ b/src/main/java/org/apache/sysml/hops/rewrite/HopRewriteUtils.java
@@ -792,13 +792,8 @@ public class HopRewriteUtils
        }
        
        public static boolean isBinary(Hop hop, OpOp2... types) {
-               if( hop instanceof BinaryOp ) {
-                       BinaryOp bop = (BinaryOp) hop;
-                       for( OpOp2 type : types )
-                               if( type == bop.getOp() )
-                                       return true;
-               }
-               return false;
+               return ( hop instanceof BinaryOp 
+                       && ArrayUtils.contains(types, ((BinaryOp) 
hop).getOp()));
        }
        
        public static boolean isBinary(Hop hop, OpOp2 type, int maxParents) {
@@ -832,6 +827,11 @@ public class HopRewriteUtils
                return isUnary(hop, type) && hop.getParent().size() <= 
maxParents;
        }
        
+       public static boolean isUnary(Hop hop, OpOp1... types) {
+               return ( hop instanceof UnaryOp 
+                       && ArrayUtils.contains(types, ((UnaryOp) hop).getOp()));
+       }
+       
        public static boolean isMatrixMultiply(Hop hop) {
                return hop instanceof AggBinaryOp && 
((AggBinaryOp)hop).isMatrixMultiply();
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java 
b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 6907b0b..9283c46 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -22,6 +22,7 @@ package org.apache.sysml.runtime.codegen;
 import java.util.Arrays;
 import java.util.LinkedList;
 
+import org.apache.commons.math3.util.FastMath;
 import org.apache.sysml.runtime.functionobjects.IntegerDivide;
 import org.apache.sysml.runtime.functionobjects.Modulus;
 import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
@@ -73,6 +74,10 @@ public class LibSpoofPrimitives
                LibMatrixMult.vectMultiplyAdd(bval, a, c, bix, bi, 0, len);
                return c;
        }
+       
+       public static void vectWrite(double[] a, double[] c, int ci, int len) {
+               System.arraycopy(a, 0, c, ci, len);
+       }
 
        // custom vector sums, mins, maxs
        
@@ -113,32 +118,14 @@ public class LibSpoofPrimitives
         * @return sum value
         */
        public static double vectSum(double[] avals, int[] aix, int ai, int 
len) {
-               double val = 0;
-               final int bn = len%8;
-                               
-               //compute rest
-               for( int i = ai; i < ai+bn; i++ )
-                       val += avals[ aix[i] ];
-               
-               //unrolled 8-block (for better instruction-level parallelism)
-               for( int i = ai+bn; i < ai+len; i+=8 )
-               {
-                       //read 64B of a via 'gather'
-                       //compute cval' = sum(a) + cval
-                       val += avals[ aix[i+0] ] + avals[ aix[i+1] ]
-                            + avals[ aix[i+2] ] + avals[ aix[i+3] ]
-                            + avals[ aix[i+4] ] + avals[ aix[i+5] ]
-                            + avals[ aix[i+6] ] + avals[ aix[i+7] ];
-               }
-               
-               //scalar result
-               return val; 
+               //forward to dense as column indexes not required here
+               return vectSum(avals, ai, len);
        }
        
        public static double vectMin(double[] a, int ai, int len) { 
                double val = Double.MAX_VALUE;
                for( int i = ai; i < ai+len; i++ )
-                       val = Math.min(a[ai], val);
+                       val = Math.min(a[i], val);
                return val; 
        } 
        
@@ -152,7 +139,7 @@ public class LibSpoofPrimitives
        public static double vectMax(double[] a, int ai, int len) { 
                double val = -Double.MAX_VALUE;
                for( int i = ai; i < ai+len; i++ )
-                       val = Math.max(a[ai], val);
+                       val = Math.max(a[i], val);
                return val; 
        } 
        
@@ -189,6 +176,84 @@ public class LibSpoofPrimitives
                return c;
        }
        
+       //custom vector minus
+       
+       public static void vectMinusAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
+               for( int j = ai; j < ai+len; j++, ci++)
+                       c[ci] +=  a[j] - bval;
+       } 
+
+       public static void vectMinusAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int len) {
+               for( int j = ai; j < ai+len; j++ )
+                       c[ci + aix[j]] += a[j] - bval;
+       }
+       
+       public static double[] vectMinusWrite(double[] a, double bval, int ai, 
int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = a[ai] - bval;
+               return c;
+       }
+
+       public static double[] vectMinusWrite(double[] a, double bval, int[] 
aix, int ai, int len) {
+               double[] c = allocVector(len, true);
+               for( int j = ai; j < ai+len; j++ )
+                       c[aix[j]] = a[j] - bval;
+               return c;
+       }
+
+       //custom exp
+       
+       public static void vectExpAdd(double[] a, double[] c, int ai, int ci, 
int len) {
+               for( int j = ai; j < ai+len; j++, ci++)
+                       c[ci] +=  FastMath.exp(a[j]);
+       } 
+
+       public static void vectExpAdd(double[] a, double[] c, int[] aix, int 
ai, int ci, int len) {
+               for( int j = ai; j < ai+len; j++ )
+                       c[ci + aix[j]] += FastMath.exp(a[j]);
+       }
+       
+       public static double[] vectExpWrite(double[] a, int ai, int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = FastMath.exp(a[ai]);
+               return c;
+       }
+
+       public static double[] vectExpWrite(double[] a, int[] aix, int ai, int 
len) {
+               double[] c = allocVector(len, true);
+               for( int j = ai; j < ai+len; j++ )
+                       c[aix[j]] = FastMath.exp(a[j]);
+               return c;
+       }
+
+       //custom log
+       
+       public static void vectLogAdd(double[] a, double[] c, int ai, int ci, 
int len) {
+               for( int j = ai; j < ai+len; j++, ci++)
+                       c[ci] +=  FastMath.log(a[j]);
+       } 
+
+       public static void vectLogAdd(double[] a, double[] c, int[] aix, int 
ai, int ci, int len) {
+               for( int j = ai; j < ai+len; j++ )
+                       c[ci + aix[j]] += FastMath.log(a[j]);
+       }
+       
+       public static double[] vectLogWrite(double[] a, int ai, int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = FastMath.log(a[ai]);
+               return c;
+       }
+
+       public static double[] vectLogWrite(double[] a, int[] aix, int ai, int 
len) {
+               double[] c = allocVector(len, true);
+               for( int j = ai; j < ai+len; j++ )
+                       c[aix[j]] = FastMath.log(a[j]);
+               return c;
+       }
+       
        //custom vector equal
        
        public static void vectEqualAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
deleted file mode 100644
index 0224b9a..0000000
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.runtime.codegen;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.instructions.cp.ScalarObject;
-import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
-import org.apache.sysml.runtime.matrix.data.SparseBlock;
-import org.apache.sysml.runtime.util.UtilFunctions;
-
-
-public abstract class SpoofRowAggregate extends SpoofOperator
-{
-       private static final long serialVersionUID = 6242910797139642998L;
-       private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M 
elements
-       
-       protected final boolean _colVector;
-       protected final int _reqVectMem;
-       
-       public SpoofRowAggregate(boolean colVector, int reqVectMem) {
-               _colVector = colVector;
-               _reqVectMem = reqVectMem;
-       }
-
-       @Override
-       public String getSpoofType() {
-               return "RA" +  getClass().getName().split("\\.")[1];
-       }
-       
-       @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)      
-               throws DMLRuntimeException
-       {
-               //sanity check
-               if( inputs==null || inputs.size() < 1 || out==null )
-                       throw new RuntimeException("Invalid input arguments.");
-               
-               //result allocation and preparations
-               out.reset(_colVector ? inputs.get(0).getNumColumns() : 1, 
-                       _colVector ? 1 : inputs.get(0).getNumColumns(), false);
-               out.allocateDenseBlock();
-               double[] c = out.getDenseBlock();
-               
-               //input preparation
-               double[][] b = prepInputMatrices(inputs);
-               double[] scalars = prepInputScalars(scalarObjects);
-               
-               //core sequential execute
-               final int m = inputs.get(0).getNumRows();
-               final int n = inputs.get(0).getNumColumns();            
-               LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
out.getNumColumns());
-               if( !inputs.get(0).isInSparseFormat() )
-                       executeDense(inputs.get(0).getDenseBlock(), b, scalars, 
c, n, 0, m);
-               else
-                       executeSparse(inputs.get(0).getSparseBlock(), b, 
scalars, c, n, 0, m);
-       
-               //post-processing
-               LibSpoofPrimitives.cleanupThreadLocalMemory();
-               out.recomputeNonZeros();        
-       }
-       
-       @Override
-       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)       
-               throws DMLRuntimeException
-       {
-               //redirect to serial execution
-               if( k <= 1 || 
(long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD
 ) {
-                       execute(inputs, scalarObjects, out);
-                       return;
-               }
-               
-               //sanity check
-               if( inputs==null || inputs.size() < 1 || out==null )
-                       throw new RuntimeException("Invalid input arguments.");
-               
-               //result allocation and preparations
-               out.reset(_colVector ? inputs.get(0).getNumColumns() : 1, 
-                       _colVector ? 1 : inputs.get(0).getNumColumns(), false);
-               out.allocateDenseBlock();
-               
-               //input preparation
-               double[][] b = prepInputMatrices(inputs);
-               double[] scalars = prepInputScalars(scalarObjects);
-               
-               //core parallel execute
-               final int m = inputs.get(0).getNumRows();
-               final int n = inputs.get(0).getNumColumns();            
-               try {
-                       ExecutorService pool = Executors.newFixedThreadPool( k 
);
-                       ArrayList<ParExecTask> tasks = new 
ArrayList<ParExecTask>();
-                       int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), 
k);
-                       int blklen = (int)(Math.ceil((double)m/nk));
-                       for( int i=0; i<nk & i*blklen<m; i++ )
-                               tasks.add(new ParExecTask(inputs.get(0), b, 
scalars, n, i*blklen, Math.min((i+1)*blklen, m)));
-                       //execute tasks
-                       List<Future<double[]>> taskret = pool.invokeAll(tasks); 
-                       pool.shutdown();
-                       //aggregate partial results
-                       for( Future<double[]> task : taskret )
-                               LibMatrixMult.vectAdd(task.get(), 
out.getDenseBlock(), 0, 0, n);
-               }
-               catch(Exception ex) {
-                       throw new DMLRuntimeException(ex);
-               }
-               
-               //post-processing
-               out.recomputeNonZeros();        
-       }
-       
-       private void executeDense(double[] a, double[][] b, double[] scalars, 
double[] c, int n, int rl, int ru) 
-       {
-               if( a == null )
-                       return;
-               
-               for( int i=rl, aix=rl*n; i<ru; i++, aix+=n ) {
-                       //call generated method
-                       genexecRowDense( a, aix, b, scalars, c, n, i );
-               }
-       }
-       
-       private void executeSparse(SparseBlock sblock, double[][] b, double[] 
scalars, double[] c, int n, int rl, int ru) 
-       {
-               if( sblock == null )
-                       return;
-                       
-               for( int i=rl; i<ru; i++ ) {
-                       if( !sblock.isEmpty(i) ) {
-                               double[] avals = sblock.values(i);
-                               int[] aix = sblock.indexes(i);
-                               int apos = sblock.pos(i);
-                               int alen = sblock.size(i);
-                               
-                               //call generated method
-                               genexecRowSparse(avals, aix, apos, b, scalars, 
c, alen, i);
-                       }
-               }
-       }
-       
-       //methods to be implemented by generated operators of type 
SpoofRowAggrgate 
-       
-       protected abstract void genexecRowDense( double[] a, int ai, double[][] 
b, double[] scalars, double[] c, int len, int rowIndex );
-       
-       protected abstract void genexecRowSparse( double[] avals, int[] aix, 
int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
-
-       
-       /**
-        * Task for multi-threaded operations.
-        */
-       private class ParExecTask implements Callable<double[]> 
-       {
-               private final MatrixBlock _a;
-               private final double[][] _b;
-               private final double[] _scalars;
-               private final int _clen;
-               private final int _rl;
-               private final int _ru;
-
-               protected ParExecTask( MatrixBlock a, double[][] b, double[] 
scalars, int clen, int rl, int ru ) {
-                       _a = a;
-                       _b = b;
-                       _scalars = scalars;
-                       _clen = clen;
-                       _rl = rl;
-                       _ru = ru;
-               }
-               
-               @Override
-               public double[] call() throws DMLRuntimeException {
-                       
-                       //allocate vector intermediates and partial output
-                       LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
_clen);
-                       double[] c = new double[_clen];
-                       
-                       if( !_a.isInSparseFormat() )
-                               executeDense(_a.getDenseBlock(), _b, _scalars, 
c, _clen, _rl, _ru);
-                       else
-                               executeSparse(_a.getSparseBlock(), _b, 
_scalars, c, _clen, _rl, _ru);
-                       
-                       LibSpoofPrimitives.cleanupThreadLocalMemory();
-                       return c;
-               }
-       }
-}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/eeb4f270/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
new file mode 100644
index 0000000..b100a89
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowwise.java
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.codegen;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.cp.ScalarObject;
+import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+
+public abstract class SpoofRowwise extends SpoofOperator
+{
+       private static final long serialVersionUID = 6242910797139642998L;
+       private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M 
elements
+       
+       public enum RowType {
+               NO_AGG,    //no aggregation
+               ROW_AGG,   //row aggregation (e.g., rowSums() or X %*% v)
+               COL_AGG,   //col aggregation (e.g., colSums() or t(y) %*% X)
+               COL_AGG_T; //transposed col aggregation (e.g., t(X) %*% y)
+               
+               public boolean isColumnAgg() {
+                       return (this == COL_AGG || this == COL_AGG_T);
+               }
+       }
+       
+       protected final RowType _type;
+       protected final int _reqVectMem;
+       
+       public SpoofRowwise(RowType type, int reqVectMem) {
+               _type = type;
+               _reqVectMem = reqVectMem;
+       }
+       
+       public RowType getRowType() {
+               return _type;
+       }
+
+       @Override
+       public String getSpoofType() {
+               return "RA" +  getClass().getName().split("\\.")[1];
+       }
+       
+       @Override
+       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out)      
+               throws DMLRuntimeException
+       {
+               //sanity check
+               if( inputs==null || inputs.size() < 1 || out==null )
+                       throw new RuntimeException("Invalid input arguments.");
+               
+               //result allocation and preparations
+               final int m = inputs.get(0).getNumRows();
+               final int n = inputs.get(0).getNumColumns();
+               allocateOutputMatrix(m, n, out);
+               double[] c = out.getDenseBlock();
+               
+               //input preparation
+               double[][] b = prepInputMatrices(inputs);
+               double[] scalars = prepInputScalars(scalarObjects);
+               
+               //core sequential execute
+               
+               LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, n);
+               if( !inputs.get(0).isInSparseFormat() )
+                       executeDense(inputs.get(0).getDenseBlock(), b, scalars, 
c, n, 0, m);
+               else
+                       executeSparse(inputs.get(0).getSparseBlock(), b, 
scalars, c, n, 0, m);
+       
+               //post-processing
+               LibSpoofPrimitives.cleanupThreadLocalMemory();
+               out.recomputeNonZeros();
+               out.examSparsity();
+       }
+       
+       @Override
+       public void execute(ArrayList<MatrixBlock> inputs, 
ArrayList<ScalarObject> scalarObjects, MatrixBlock out, int k)       
+               throws DMLRuntimeException
+       {
+               //redirect to serial execution
+               if( k <= 1 || 
(long)inputs.get(0).getNumRows()*inputs.get(0).getNumColumns()<PAR_NUMCELL_THRESHOLD
 ) {
+                       execute(inputs, scalarObjects, out);
+                       return;
+               }
+               
+               //sanity check
+               if( inputs==null || inputs.size() < 1 || out==null )
+                       throw new RuntimeException("Invalid input arguments.");
+               
+               //result allocation and preparations
+               final int m = inputs.get(0).getNumRows();
+               final int n = inputs.get(0).getNumColumns();
+               allocateOutputMatrix(m, n, out);
+               
+               //input preparation
+               double[][] b = prepInputMatrices(inputs);
+               double[] scalars = prepInputScalars(scalarObjects);
+               
+               //core parallel execute
+               ExecutorService pool = Executors.newFixedThreadPool( k );
+               int nk = UtilFunctions.roundToNext(Math.min(8*k,m/32), k);
+               int blklen = (int)(Math.ceil((double)m/nk));
+               try
+               {
+                       if( _type.isColumnAgg() ) {
+                               //execute tasks
+                               ArrayList<ParColAggTask> tasks = new 
ArrayList<ParColAggTask>();
+                               for( int i=0; i<nk & i*blklen<m; i++ )
+                                       tasks.add(new 
ParColAggTask(inputs.get(0), b, scalars, n, i*blklen, Math.min((i+1)*blklen, 
m)));
+                               List<Future<double[]>> taskret = 
pool.invokeAll(tasks); 
+                               //aggregate partial results
+                               for( Future<double[]> task : taskret )
+                                       LibMatrixMult.vectAdd(task.get(), 
out.getDenseBlock(), 0, 0, n);
+                               out.recomputeNonZeros();
+                       }
+                       else {
+                               //execute tasks
+                               ArrayList<ParExecTask> tasks = new 
ArrayList<ParExecTask>();
+                               for( int i=0; i<nk & i*blklen<m; i++ )
+                                       tasks.add(new 
ParExecTask(inputs.get(0), b, out, scalars, n, i*blklen, Math.min((i+1)*blklen, 
m)));
+                               List<Future<Long>> taskret = 
pool.invokeAll(tasks);
+                               //aggregate nnz, no need to aggregate results
+                               long nnz = 0;
+                               for( Future<Long> task : taskret )
+                                       nnz += task.get();
+                               out.setNonZeros(nnz);
+                       }
+                       
+                       pool.shutdown();
+                       out.examSparsity();
+               }
+               catch(Exception ex) {
+                       throw new DMLRuntimeException(ex);
+               }       
+       }
+       
+       private void allocateOutputMatrix(int m, int n, MatrixBlock out) {
+               switch( _type ) {
+                       case NO_AGG: out.reset(m, n, false); break;
+                       case ROW_AGG: out.reset(m, 1, false); break;
+                       case COL_AGG: out.reset(1, n, false); break;
+                       case COL_AGG_T: out.reset(n, 1, false); break;
+               }
+               out.allocateDenseBlock();
+       }
+       
+       private void executeDense(double[] a, double[][] b, double[] scalars, 
double[] c, int n, int rl, int ru) 
+       {
+               if( a == null )
+                       return;
+               
+               for( int i=rl, aix=rl*n; i<ru; i++, aix+=n ) {
+                       //call generated method
+                       genexecRowDense( a, aix, b, scalars, c, n, i );
+               }
+       }
+       
+       private void executeSparse(SparseBlock sblock, double[][] b, double[] 
scalars, double[] c, int n, int rl, int ru) 
+       {
+               if( sblock == null )
+                       return;
+                       
+               for( int i=rl; i<ru; i++ ) {
+                       if( !sblock.isEmpty(i) ) {
+                               double[] avals = sblock.values(i);
+                               int[] aix = sblock.indexes(i);
+                               int apos = sblock.pos(i);
+                               int alen = sblock.size(i);
+                               
+                               //call generated method
+                               genexecRowSparse(avals, aix, apos, b, scalars, 
c, alen, i);
+                       }
+               }
+       }
+       
+       //methods to be implemented by generated operators of type 
SpoofRowwise 
+       
+       protected abstract void genexecRowDense( double[] a, int ai, double[][] 
b, double[] scalars, double[] c, int len, int rowIndex );
+       
+       protected abstract void genexecRowSparse( double[] avals, int[] aix, 
int ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex );
+
+       
+       /**
+        * Task for multi-threaded column aggregation operations.
+        */
+       private class ParColAggTask implements Callable<double[]> 
+       {
+               private final MatrixBlock _a;
+               private final double[][] _b;
+               private final double[] _scalars;
+               private final int _clen;
+               private final int _rl;
+               private final int _ru;
+
+               protected ParColAggTask( MatrixBlock a, double[][] b, double[] 
scalars, int clen, int rl, int ru ) {
+                       _a = a;
+                       _b = b;
+                       _scalars = scalars;
+                       _clen = clen;
+                       _rl = rl;
+                       _ru = ru;
+               }
+               
+               @Override
+               public double[] call() throws DMLRuntimeException {
+                       
+                       //allocate vector intermediates and partial output
+                       LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
_clen);
+                       double[] c = new double[_clen];
+                       
+                       if( !_a.isInSparseFormat() )
+                               executeDense(_a.getDenseBlock(), _b, _scalars, 
c, _clen, _rl, _ru);
+                       else
+                               executeSparse(_a.getSparseBlock(), _b, 
_scalars, c, _clen, _rl, _ru);
+                       
+                       LibSpoofPrimitives.cleanupThreadLocalMemory();
+                       return c;
+               }
+       }
+       
+       /**
+        * Task for multi-threaded execution with no or row aggregation.
+        */
+       private class ParExecTask implements Callable<Long> 
+       {
+               private final MatrixBlock _a;
+               private final double[][] _b;
+               private final MatrixBlock _c;
+               private final double[] _scalars;
+               private final int _clen;
+               private final int _rl;
+               private final int _ru;
+
+               protected ParExecTask( MatrixBlock a, double[][] b, MatrixBlock 
c, double[] scalars, int clen, int rl, int ru ) {
+                       _a = a;
+                       _b = b;
+                       _c = c;
+                       _scalars = scalars;
+                       _clen = clen;
+                       _rl = rl;
+                       _ru = ru;
+               }
+               
+               @Override
+               public Long call() throws DMLRuntimeException {
+                       //allocate vector intermediates
+                       LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
_clen);
+                       
+                       if( !_a.isInSparseFormat() )
+                               executeDense(_a.getDenseBlock(), _b, _scalars, 
_c.getDenseBlock(), _clen, _rl, _ru);
+                       else
+                               executeSparse(_a.getSparseBlock(), _b, 
_scalars, _c.getDenseBlock(), _clen, _rl, _ru);
+                       LibSpoofPrimitives.cleanupThreadLocalMemory();
+                       
+                       //maintain nnz for row partition
+                       return _c.recomputeNonZeros(_rl, _ru-1, 0, 
_c.getNumColumns()-1);
+               }
+       }
+}


Reply via email to