[SYSTEMML-1362] Handling of temporary vectors in generated operators

This patch adds proper handling and allocation of temporary vectors in
codegen row aggregate templates. So far, these scenarios crashed if the
vector operations were not condensed into the partial aggregation. For
example, colSums(X<=rowSums(X)) can be condensed, but
colSums((X<=rowSums(X))==1) cannot because X<=rowSums(X) creates a row
intermediate that is used by the subsequent operation. 

However, an initial implementation which allocated temporary vector
intermediates showed significant performance problems. Hence, this patch
also includes a very lean memory management of intermediates for reuse.
The basic idea is to (1) count the number of vector intermediates, (2)
establish a thread-local ring buffer of pre-allocated row vectors, and
(3) reuse these intermediates for sparse and dense vector operations w/
and w/o reset. On a scenario of a 1M x 1K (8GB) input matrix and
colSums((X<=rowSums(X))==1), this temporary reuse improved performance
from 743ms to 315ms, i.e., it saturated peak memory bandwidth of 25GB/s.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/7491a680
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/7491a680
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/7491a680

Branch: refs/heads/master
Commit: 7491a680b645ea205faf296c924358014c273892
Parents: 70e5aa1
Author: Matthias Boehm <[email protected]>
Authored: Tue Mar 28 22:28:53 2017 -0700
Committer: Matthias Boehm <[email protected]>
Committed: Tue Mar 28 22:29:53 2017 -0700

----------------------------------------------------------------------
 .../sysml/hops/codegen/cplan/CNodeBinary.java   |  38 ++--
 .../sysml/hops/codegen/cplan/CNodeRowAgg.java   |  23 ++-
 .../sysml/hops/codegen/cplan/CNodeUnary.java    |   4 +-
 .../codegen/template/TemplateOuterProduct.java  |   4 +-
 .../hops/codegen/template/TemplateRowAgg.java   |   6 +-
 .../hops/codegen/template/TemplateUtils.java    |   8 +
 .../runtime/codegen/LibSpoofPrimitives.java     | 196 +++++++++++++------
 .../runtime/codegen/SpoofRowAggregate.java      |  17 +-
 .../functions/codegen/RowAggTmplTest.java       |  15 +-
 .../scripts/functions/codegen/rowAggPattern8.R  |  29 +++
 .../functions/codegen/rowAggPattern8.dml        |  26 +++
 11 files changed, 270 insertions(+), 96 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
index 36b0354..41f99c9 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeBinary.java
@@ -56,7 +56,7 @@ public class CNodeBinary extends CNode
                                case DOT_PRODUCT:   
                                        return sparse ? "    double %TMP% = 
LibSpoofPrimitives.dotProduct(%IN1v%, %IN2%, %IN1i%, %POS1%, %POS2%, %LEN%);\n" 
:
                                                                        "    
double %TMP% = LibSpoofPrimitives.dotProduct(%IN1%, %IN2%, %POS1%, %POS2%, 
%LEN%);\n";
-                                       
+                               
                                case VECT_MULT_ADD:
                                case VECT_DIV_ADD:
                                case VECT_EQUAL_ADD:
@@ -66,9 +66,10 @@ public class CNodeBinary extends CNode
                                case VECT_GREATER_ADD:
                                case VECT_GREATEREQUAL_ADD: {
                                        String vectName = 
getVectorPrimitiveName();
-                                       return sparse ? "    
LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2v%, %OUT%, %IN2i%, %POS2%, 
%POSOUT%, %LEN%);\n" : 
-                                                                       "    
LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS2%, %POSOUT%, 
%LEN%);\n";
+                                       return sparse ? "    
LibSpoofPrimitives.vect"+vectName+"Add(%IN1v%, %IN2%, %OUT%, %IN1i%, %POS1%, 
%POSOUT%, %LEN%);\n" : 
+                                                                       "    
LibSpoofPrimitives.vect"+vectName+"Add(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, 
%LEN%);\n";
                                }
+                               
                                case VECT_DIV_SCALAR:
                                case VECT_MULT_SCALAR:
                                case VECT_EQUAL_SCALAR:
@@ -78,9 +79,10 @@ public class CNodeBinary extends CNode
                                case VECT_GREATER_SCALAR:
                                case VECT_GREATEREQUAL_SCALAR: {
                                        String vectName = 
getVectorPrimitiveName();
-                                       return sparse ? "    
LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN1i%, %IN2%,  %OUT%, %POS1%, 
%POSOUT%, %LEN%);\n" : 
-                                                                       "    
LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %OUT%, %POS1%, %POSOUT%, 
%LEN%);\n";
+                                       return sparse ? "    double[] %TMP% = 
LibSpoofPrimitives.vect"+vectName+"Write(%IN1v%, %IN2%, %IN1i%, %POS1%, 
%LEN%);\n" : 
+                                                                       "    
double[] %TMP% = LibSpoofPrimitives.vect"+vectName+"Write(%IN1%, %IN2%, %POS1%, 
%LEN%);\n";
                                }
+                               
                                /*Can be replaced by function objects*/
                                case MULT:
                                        return "    double %TMP% = %IN1% * 
%IN2%;\n" ;
@@ -171,23 +173,25 @@ public class CNodeBinary extends CNode
                sb.append(_inputs.get(0).codegen(sparse));
                sb.append(_inputs.get(1).codegen(sparse));
                
-               //generate binary operation
+               //generate binary operation (use sparse template, if data input)
+               boolean lsparse = sparse && (_inputs.get(0) instanceof 
CNodeData);
                String var = createVarname();
-               String tmp = _type.getTemplate(sparse);
+               String tmp = _type.getTemplate(lsparse);
                tmp = tmp.replaceAll("%TMP%", var);
+               
+               //replace input references and start indexes
                for( int j=1; j<=2; j++ ) {
                        String varj = _inputs.get(j-1).getVarname();
-                       if( sparse && !tmp.contains("%IN"+j+"%") ) {
-                               tmp = tmp.replaceAll("%IN"+j+"v%", varj+"vals");
-                               tmp = tmp.replaceAll("%IN"+j+"i%", varj+"ix");
-                       }
-                       else
-                               tmp = tmp.replaceAll("%IN"+j+"%", varj );
                        
-                       if(varj.startsWith("b")  ) //i.e. b.get(index)
-                               tmp = tmp.replaceAll("%POS"+j+"%", "bi");
-                       else
-                               tmp = tmp.replaceAll("%POS"+j+"%", varj+"i");
+                       //replace sparse and dense inputs
+                       tmp = tmp.replaceAll("%IN"+j+"v%", varj+"vals");
+                       tmp = tmp.replaceAll("%IN"+j+"i%", varj+"ix");
+                       tmp = tmp.replaceAll("%IN"+j+"%", varj );
+                       
+                       //replace start position of main input
+                       tmp = tmp.replaceAll("%POS"+j+"%", 
(!varj.startsWith("b") 
+                               && _inputs.get(j-1) instanceof CNodeData 
+                               && _inputs.get(j-1).getDataType().isMatrix()) ? 
varj+"i" : "0");
                }
                sb.append(tmp);
                

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
index 7073141..846c88d 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeRowAgg.java
@@ -32,7 +32,7 @@ public class CNodeRowAgg extends CNodeTpl
                        + "\n"
                        + "public final class %TMP% extends SpoofRowAggregate { 
\n"
                        + "  public %TMP%() {\n"
-                       + "    _colVector = %FLAG%;\n"
+                       + "    super(%COL_VECTOR%, %VECT_MEM%);\n"
                        + "  }\n"
                        + "  protected void genexecRowDense( double[] a, int 
ai, double[][] b, double[] scalars, double[] c, int len, int rowIndex ) { \n"
                        + "%BODY_dense%"
@@ -46,6 +46,16 @@ public class CNodeRowAgg extends CNodeTpl
                super(inputs, output);
        }
        
+       //number of intermediate vectors
+       private int _numVectors = -1;
+       
+       public void setNumVectorIntermediates(int num) {
+               _numVectors = num;
+       }
+       
+       public int getNumVectorIntermediates() {
+               return _numVectors;
+       }
        
        @Override
        public String codegen(boolean sparse) {
@@ -71,9 +81,9 @@ public class CNodeRowAgg extends CNodeTpl
                //replace size information
                tmp = tmp.replaceAll("%LEN%", "len");
                
-               //replace colvector information and start position
-               tmp = tmp.replaceAll("%FLAG%", 
String.valueOf(_output._cols==1));
-               tmp = tmp.replaceAll("bi", "0");
+               //replace colvector information and number of vector 
intermediates
+               tmp = tmp.replaceAll("%COL_VECTOR%", 
String.valueOf(_output._cols==1));
+               tmp = tmp.replaceAll("%VECT_MEM%", String.valueOf(_numVectors));
                
                return tmp;
        }
@@ -108,6 +118,7 @@ public class CNodeRowAgg extends CNodeTpl
                
                CNodeRowAgg that = (CNodeRowAgg)o;
                return super.equals(o)
+                       && _numVectors == that._numVectors      
                        && equalInputReferences(
                                _output, that._output, _inputs, that._inputs);
        }
@@ -115,7 +126,9 @@ public class CNodeRowAgg extends CNodeTpl
        @Override
        public String getTemplateInfo() {
                StringBuilder sb = new StringBuilder();
-               sb.append("SPOOF ROWAGGREGATE");
+               sb.append("SPOOF ROWAGGREGATE [reqVectMem=");
+               sb.append(_numVectors);
+               sb.append("]");
                return sb.toString();
        }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java 
b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
index 5d4c63e..5fcb7bc 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/cplan/CNodeUnary.java
@@ -43,8 +43,8 @@ public class CNodeUnary extends CNode
                public String getTemplate(boolean sparse) {
                        switch( this ) {
                                case ROW_SUMS:
-                                       return sparse ? "    double %TMP% = 
LibSpoofPrimitives.vectSum( %IN1v%, %IN1i%, %POS1%, %LEN%);\n": 
-                                                                       "    
double %TMP% = LibSpoofPrimitives.vectSum( %IN1%, %POS1%,  %LEN%);\n"; 
+                                       return sparse ? "    double %TMP% = 
LibSpoofPrimitives.vectSum(%IN1v%, %IN1i%, %POS1%, %LEN%);\n": 
+                                                                       "    
double %TMP% = LibSpoofPrimitives.vectSum(%IN1%, %POS1%, %LEN%);\n"; 
                                case EXP:
                                        return "    double %TMP% = 
FastMath.exp(%IN1%);\n";
                            case LOOKUP_R:

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
index 68aaa2a..5918e5e 100644
--- 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
+++ 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateOuterProduct.java
@@ -193,9 +193,9 @@ public class TemplateOuterProduct extends TemplateBase {
                        //final left/right matrix mult, see close
                        else {
                                if( cdata1.getDataType().isScalar() )
-                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.VECT_MULT_ADD);   
-                               else
                                        out = new CNodeBinary(cdata2, cdata1, 
BinType.VECT_MULT_ADD);   
+                               else
+                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.VECT_MULT_ADD);   
                        }
                }
                else if( HopRewriteUtils.isTransposeOperation(hop) ) 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
index 31c0f79..b384f52 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateRowAgg.java
@@ -118,6 +118,8 @@ public class TemplateRowAgg extends TemplateBase
                        inputs.add(tmp.get(in.getHopID()));
                CNode output = tmp.get(hop.getHopID());
                CNodeRowAgg tpl = new CNodeRowAgg(inputs, output);
+               tpl.setNumVectorIntermediates(TemplateUtils
+                       .countVectorIntermediates(output));
                
                // return cplan instance
                return new Pair<Hop[],CNodeTpl>(sinHops.toArray(new Hop[0]), 
tpl);
@@ -172,7 +174,7 @@ public class TemplateRowAgg extends TemplateBase
                                inHops.remove(hop.getInput().get(0)); 
                                
inHops.add(hop.getInput().get(0).getInput().get(0));
                                
-                               out = new CNodeBinary(cdata2, cdata1, 
BinType.VECT_MULT_ADD);
+                               out = new CNodeBinary(cdata1, cdata2, 
BinType.VECT_MULT_ADD);
                        }
                        else
                        {
@@ -206,7 +208,7 @@ public class TemplateRowAgg extends TemplateBase
                        {
                                if( HopRewriteUtils.isBinary(hop, 
SUPPORTED_VECT_BINARY) ) {
                                        String opname = 
"VECT_"+((BinaryOp)hop).getOp().name()+"_SCALAR";
-                                       out = new CNodeBinary(cdata2, cdata1, 
BinType.valueOf(opname));
+                                       out = new CNodeBinary(cdata1, cdata2, 
BinType.valueOf(opname));
                                }
                                else 
                                        throw new RuntimeException("Unsupported 
binary matrix "

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java 
b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
index a172af5..adab46c 100644
--- a/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
+++ b/src/main/java/org/apache/sysml/hops/codegen/template/TemplateUtils.java
@@ -272,4 +272,12 @@ public class TemplateUtils
                                        || 
((CNodeUnary)c).getType()==UnaryType.LOOKUP_RC)));
                return ret;
        }
+       
+       public static int countVectorIntermediates(CNode node) {
+               int ret = 0;
+               for( CNode c : node.getInput() )
+                       ret += countVectorIntermediates(c);
+               return ret + ((node instanceof CNodeBinary 
+                       && 
((CNodeBinary)node).getType().isVectorScalarPrimitive()) ? 1 : 0);
+       }
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java 
b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
index 9b83006..52e9893 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/LibSpoofPrimitives.java
@@ -19,6 +19,9 @@
 
 package org.apache.sysml.runtime.codegen;
 
+import java.util.Arrays;
+import java.util.LinkedList;
+
 import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
 
 /**
@@ -30,6 +33,12 @@ import org.apache.sysml.runtime.matrix.data.LibMatrixMult;
  */
 public class LibSpoofPrimitives 
 {
+       //global pool of reusable vectors, individual operations set up their 
own thread-local
+       //ring buffers of reusable vectors with specific number of vectors and 
vector sizes 
+       private static ThreadLocal<LinkedList<double[]>> memPool = new 
ThreadLocal<LinkedList<double[]>>() {
+               @Override protected LinkedList<double[]> initialValue() { 
return new LinkedList<double[]>(); }
+       };
+       
        // forwarded calls to LibMatrixMult
        
        public static double dotProduct(double[] a, double[] b, int ai, int bi, 
int len) {
@@ -40,20 +49,24 @@ public class LibSpoofPrimitives
                return LibMatrixMult.dotProduct(a, b, aix, ai, bi, len);
        }
        
-       public static void vectMultAdd(double aval, double[] b, double[] c, int 
bi, int ci, int len) {
-               LibMatrixMult.vectMultiplyAdd(aval, b, c, bi, ci, len);
+       public static void vectMultAdd(double[] a, double bval, double[] c, int 
bi, int ci, int len) {
+               LibMatrixMult.vectMultiplyAdd(bval, a, c, bi, ci, len);
        }
        
-       public static void vectMultAdd(double aval, double[] b, double[] c, 
int[] bix, int bi, int ci, int len) {
-               LibMatrixMult.vectMultiplyAdd(aval, b, c, bix, bi, ci, len);
+       public static void vectMultAdd(double[] a, double bval, double[] c, 
int[] bix, int bi, int ci, int len) {
+               LibMatrixMult.vectMultiplyAdd(bval, a, c, bix, bi, ci, len);
        }
        
-       public static void vectMultWrite(double aval, double[] b, double[] c, 
int bi, int ci, int len) {
-               LibMatrixMult.vectMultiplyWrite(aval, b, c, bi, ci, len);
+       public static double[] vectMultWrite(double[] a, double bval, int bi, 
int len) {
+               double[] c = allocVector(len, false);
+               LibMatrixMult.vectMultiplyWrite(bval, a, c, bi, 0, len);
+               return c;
        }
        
-       public static void vectMultWrite(double aval, double[] b, double[] c, 
int[] bix, int bi, int ci, int len) {
-               LibMatrixMult.vectMultiplyAdd(aval, b, c, bix, bi, ci, len);
+       public static double[] vectMultWrite(double[] a, double bval, int[] 
bix, int bi, int len) {
+               double[] c = allocVector(len, true);
+               LibMatrixMult.vectMultiplyAdd(bval, a, c, bix, bi, 0, len);
+               return c;
        }
 
        // custom vector sums
@@ -119,155 +132,214 @@ public class LibSpoofPrimitives
        
        //custom vector div
        
-       public static void vectDivAdd(double bval, double[] a, double[] c, int 
ai, int ci, int len) {
+       public static void vectDivAdd(double[] a, double bval, double[] c, int 
ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] +=  a[j] / bval;
        } 
 
-       public static void vectDivAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int len) {
+       public static void vectDivAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += a[j] / bval;
        }
        
-       public static void vectDivWrite(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = a[j] / bval;
+       public static double[] vectDivWrite(double[] a, double bval, int ai, 
int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = a[ai] / bval;
+               return c;
        }
 
-       public static void vectDivWrite(double bval, double[] a, int[] aix, 
double[] c, int ai, int ci, int len) {
+       public static double[] vectDivWrite(double[] a, double bval, int[] aix, 
int ai, int len) {
+               double[] c = allocVector(len, true);
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = a[j] / bval;
+                       c[aix[j]] = a[j] / bval;
+               return c;
        }
        
        //custom vector equal
        
-       public static void vectEqualAdd(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
+       public static void vectEqualAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] += (a[j] == bval) ? 1 : 0;
        } 
 
-       public static void vectEqualAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int len) {
+       public static void vectEqualAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += (a[j] == bval) ? 1 : 0;
        }
        
-       public static void vectEqualWrite(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = (a[j] == bval) ? 1 : 0;
+       public static double[] vectEqualWrite(double[] a, double bval, int ai, 
int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = (a[ai] == bval) ? 1 : 0;
+               return c;
        }
 
-       public static void vectEqualWrite(double bval, double[] a, int[] aix, 
double[] c, int ai, int ci, int len) {
+       public static double[] vectEqualWrite(double[] a, double bval, int[] 
aix, int ai, int len) {
+               double[] c = allocVector(len, true);
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = (a[j] == bval) ? 1 : 0;
-       }
+                       c[aix[j]] = (a[j] == bval) ? 1 : 0;
+               return c;
+       }       
        
-       //custom vector notequal
+       //custom vector not equal
        
-       public static void vectNotequalAdd(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
+       public static void vectNotequalAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] += (a[j] != bval) ? 1 : 0;
        } 
 
-       public static void vectNotequalAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int len) {
+       public static void vectNotequalAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += (a[j] != bval) ? 1 : 0;
        }
        
-       public static void vectNotequalWrite(double bval, double[] a, double[] 
c, int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = (a[j] != bval) ? 1 : 0;
+       public static double[] vectNotequalWrite(double[] a, double bval, int 
ai, int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = (a[j] != bval) ? 1 : 0;
+               return c;
        }
 
-       public static void vectNotequalWrite(double bval, double[] a, int[] 
aix, double[] c, int ai, int ci, int len) {
+       public static double[] vectNotequalWrite(double[] a, double bval, int[] 
aix, int ai, int len) {
+               double[] c = allocVector(len, true);
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = (a[j] != bval) ? 1 : 0;
+                       c[aix[j]] = (a[j] != bval) ? 1 : 0;
+               return c;
        }
        
        //custom vector less
        
-       public static void vectLessAdd(double bval, double[] a, double[] c, int 
ai, int ci, int len) {
+       public static void vectLessAdd(double[] a, double bval, double[] c, int 
ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] += (a[j] < bval) ? 1 : 0;
        } 
 
-       public static void vectLessAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int len) {
+       public static void vectLessAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += (a[j] < bval) ? 1 : 0;
        }
        
-       public static void vectLessWrite(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = (a[j] < bval) ? 1 : 0;
+       public static double[] vectLessWrite(double[] a, double bval, int ai, 
int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = (a[j] < bval) ? 1 : 0;
+               return c;
        }
 
-       public static void vectLessWrite(double bval, double[] a, int[] aix, 
double[] c, int ai, int ci, int len) {
+       public static double[] vectLessWrite(double[] a, double bval, int[] 
aix, int ai, int len) {
+               double[] c = allocVector(len, true);
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = (a[j] < bval) ? 1 : 0;
+                       c[aix[j]] = (a[j] < bval) ? 1 : 0;
+               return c;
        }
        
-       //custom vector lessequal
+       //custom vector less equal
        
-       public static void vectLessequalAdd(double bval, double[] a, double[] 
c, int ai, int ci, int len) {
+       public static void vectLessequalAdd(double[] a, double bval, double[] 
c, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] += (a[j] <= bval) ? 1 : 0;
        } 
 
-       public static void vectLessequalAdd(double bval, double[] a, double[] 
c, int[] aix, int ai, int ci, int len) {
+       public static void vectLessequalAdd(double[] a, double bval, double[] 
c, int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += (a[j] <= bval) ? 1 : 0;
        }
        
-       public static void vectLessequalWrite(double bval, double[] a, double[] 
c, int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = (a[j] <= bval) ? 1 : 0;
+       public static double[] vectLessequalWrite(double[] a, double bval, int 
ai, int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = (a[j] <= bval) ? 1 : 0;
+               return c;
        }
 
-       public static void vectLessequalWrite(double bval, double[] a, int[] 
aix, double[] c, int ai, int ci, int len) {
+       public static double[] vectLessequalWrite(double[] a, double bval, 
int[] aix, int ai, int len) {
+               double[] c = allocVector(len, true);
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = (a[j] <= bval) ? 1 : 0;
+                       c[aix[j]] = (a[j] <= bval) ? 1 : 0;
+               return c;
        }
 
        //custom vector greater
        
-       public static void vectGreaterAdd(double bval, double[] a, double[] c, 
int ai, int ci, int len) {
+       public static void vectGreaterAdd(double[] a, double bval, double[] c, 
int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] += (a[j] > bval) ? 1 : 0;
        } 
 
-       public static void vectGreaterAdd(double bval, double[] a, double[] c, 
int[] aix, int ai, int ci, int len) {
+       public static void vectGreaterAdd(double[] a, double bval, double[] c, 
int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += (a[j] > bval) ? 1 : 0;
        }
        
-       public static void vectGreaterWrite(double bval, double[] a, double[] 
c, int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = (a[j] > bval) ? 1 : 0;
+       public static double[] vectGreaterWrite(double[] a, double bval, int 
ai, int len) {
+               double[] c = allocVector(len, false);
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = (a[j] > bval) ? 1 : 0;
+               return c;
        }
 
-       public static void vectGreaterWrite(double bval, double[] a, int[] aix, 
double[] c, int ai, int ci, int len) {
+       public static double[] vectGreaterWrite(double[] a, double bval, int[] 
aix, int ai, int len) {
+               double[] c = allocVector(len, true);
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = (a[j] > bval) ? 1 : 0;
-       }
+                       c[aix[j]] = (a[j] > bval) ? 1 : 0;
+               return c;
+       }       
        
-       //custom vector greaterequal
+       //custom vector greater equal
        
-       public static void vectGreaterequalAdd(double bval, double[] a, 
double[] c, int ai, int ci, int len) {
+       public static void vectGreaterequalAdd(double[] a, double bval, 
double[] c, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++, ci++)
                        c[ci] += (a[j] >= bval) ? 1 : 0;
        } 
 
-       public static void vectGreaterequalAdd(double bval, double[] a, 
double[] c, int[] aix, int ai, int ci, int len) {
+       public static void vectGreaterequalAdd(double[] a, double bval, 
double[] c, int[] aix, int ai, int ci, int len) {
                for( int j = ai; j < ai+len; j++ )
                        c[ci + aix[j]] += (a[j] >= bval) ? 1 : 0;
        }
        
-       public static void vectGreaterequalWrite(double bval, double[] a, 
double[] c, int ai, int ci, int len) {
-               for( int j = ai; j < ai+len; j++, ci++)
-                       c[ci] = (a[j] >= bval) ? 1 : 0;
+       public static double[] vectGreaterequalWrite(double[] a, double bval, 
int ai, int len) {
+               double[] c = allocVector(len, false); //no reset needed: cells 0..len-1 are all overwritten below
+               for( int j = 0; j < len; j++, ai++)
+                       c[j] = (a[ai] >= bval) ? 1 : 0; //read input at running offset ai (a[j] would ignore the offset)
+               return c;
        }
 
-       public static void vectGreaterequalWrite(double bval, double[] a, int[] 
aix, double[] c, int ai, int ci, int len) {
+       public static double[] vectGreaterequalWrite(double[] a, double bval, 
int[] aix, int ai, int len) {
+               double[] c = allocVector(len, true); //reset to zeros: only the cells c[aix[j]] are written below
                for( int j = ai; j < ai+len; j++ )
-                       c[ci + aix[j]] = (a[j] >= bval) ? 1 : 0;
+                       c[aix[j]] = (a[j] >= bval) ? 1 : 0; //NOTE(review): aix[j] is not bounded by len here; relies on c being long enough -- confirm
+               return c;
+       }
+       
+       //dynamic memory management
+       
+       public static void setupThreadLocalMemory(int numVectors, int len) {
+               LinkedList<double[]> list = new LinkedList<double[]>();
+               for( int i=0; i<numVectors; i++ )
+                       list.addLast(new double[len]); //pre-allocate numVectors arrays, each of length len
+               memPool.set(list); //install the list that allocVector later obtains via memPool.get()
+       }
+       
+       public static void cleanupThreadLocalMemory() {
+               memPool.remove(); //drop the list installed by setupThreadLocalMemory
+       }
+       
+       private static double[] allocVector(int len, boolean reset) {
+               LinkedList<double[]> list = memPool.get(); //NOTE(review): presumably never null; memPool is declared outside this view -- confirm
+               
+               //sanity check for missing setup: without pooled vectors, allocate a fresh array of length len
+               if( list.isEmpty() )
+                       return new double[len];
+               
+               //get and re-queue first entry, so the pooled vectors are handed out in rotation
+               double[] tmp = list.removeFirst();
+               list.addLast(tmp);
+               
+               //reset vector if required (the entire pooled array is zeroed)
+               if( reset )
+                       Arrays.fill(tmp, 0);
+               return tmp; //NOTE(review): len is not checked against tmp.length on this path
        }
 }
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java 
b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
index c0b58a8..6df9b67 100644
--- a/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
+++ b/src/main/java/org/apache/sysml/runtime/codegen/SpoofRowAggregate.java
@@ -39,10 +39,12 @@ public abstract class SpoofRowAggregate extends 
SpoofOperator
        private static final long serialVersionUID = 6242910797139642998L;
        private static final long PAR_NUMCELL_THRESHOLD = 1024*1024;   //Min 1M 
elements
        
-       protected boolean _colVector = false;
+       protected final boolean _colVector;
+       protected final int _reqVectMem;
        
-       public SpoofRowAggregate() {
-
+       public SpoofRowAggregate(boolean colVector, int reqVectMem) {
+               _colVector = colVector;
+               _reqVectMem = reqVectMem; //later passed as numVectors to LibSpoofPrimitives.setupThreadLocalMemory
        }
 
        @Override
@@ -66,12 +68,14 @@ public abstract class SpoofRowAggregate extends 
SpoofOperator
                //core sequential execute
                final int m = inputs.get(0).getNumRows();
                final int n = inputs.get(0).getNumColumns();            
+               LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
out.getNumColumns());
                if( !inputs.get(0).isInSparseFormat() )
                        executeDense(inputs.get(0).getDenseBlock(), b, scalars, 
c, n, 0, m);
                else
                        executeSparse(inputs.get(0).getSparseBlock(), b, 
scalars, c, n, 0, m);
        
                //post-processing
+               LibSpoofPrimitives.cleanupThreadLocalMemory();
                out.recomputeNonZeros();        
        }
        
@@ -176,12 +180,17 @@ public abstract class SpoofRowAggregate extends 
SpoofOperator
                
                @Override
                public double[] call() throws DMLRuntimeException {
+                       
+                       //allocate vector intermediates (_reqVectMem vectors of length _clen) and partial output
+                       LibSpoofPrimitives.setupThreadLocalMemory(_reqVectMem, 
_clen);
                        double[] c = new double[_clen];
+                       
                        if( !_a.isInSparseFormat() )
                                executeDense(_a.getDenseBlock(), _b, _scalars, 
c, _clen, _rl, _ru);
                        else
                                executeSparse(_a.getSparseBlock(), _b, 
_scalars, c, _clen, _rl, _ru);
-                               
+                       
+                       LibSpoofPrimitives.cleanupThreadLocalMemory(); //NOTE(review): not in a finally block, so it is skipped if execute* throws
                        return c;
                }
        }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
----------------------------------------------------------------------
diff --git 
a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
 
b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
index 0d9bc1d..c2ae38b 100644
--- 
a/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
+++ 
b/src/test/java/org/apache/sysml/test/integration/functions/codegen/RowAggTmplTest.java
@@ -43,7 +43,8 @@ public class RowAggTmplTest extends AutomatedTestBase
        private static final String TEST_NAME5 = TEST_NAME+"5";
        private static final String TEST_NAME6 = TEST_NAME+"6";
        private static final String TEST_NAME7 = TEST_NAME+"7";
-
+       private static final String TEST_NAME8 = TEST_NAME+"8"; 
//colSums((X/rowSums(X))>0.7)
+       
        private static final String TEST_DIR = "functions/codegen/";
        private static final String TEST_CLASS_DIR = TEST_DIR + 
RowAggTmplTest.class.getSimpleName() + "/";
        private final static String TEST_CONF = "SystemML-config-codegen.xml";
@@ -54,7 +55,7 @@ public class RowAggTmplTest extends AutomatedTestBase
        @Override
        public void setUp() {
                TestUtils.clearAssertionInformation();
-               for(int i=1; i<=7; i++)
+               for(int i=1; i<=8; i++) //register one configuration per test name suffix 1..8
                        addTestConfiguration( TEST_NAME+i, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME+i, new String[] { String.valueOf(i) 
}) );
        }
        
@@ -93,6 +94,11 @@ public class RowAggTmplTest extends AutomatedTestBase
                testCodegenIntegration( TEST_NAME7, true, ExecType.CP );        
        }
        
+       @Test
+       public void testCodegenRowAggRewrite8() {
+               testCodegenIntegration( TEST_NAME8, true, ExecType.CP ); //second argument is the 'rewrites' flag: enabled
+       }
+       
        @Test   
        public void testCodegenRowAgg1() {
                testCodegenIntegration( TEST_NAME1, false, ExecType.CP );
@@ -128,6 +134,11 @@ public class RowAggTmplTest extends AutomatedTestBase
                testCodegenIntegration( TEST_NAME7, false, ExecType.CP );       
        }
        
+       @Test
+       public void testCodegenRowAgg8() {
+               testCodegenIntegration( TEST_NAME8, false, ExecType.CP ); //second argument is the 'rewrites' flag: disabled
+       }
+       
        private void testCodegenIntegration( String testname, boolean rewrites, 
ExecType instType )
        {       
                boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/test/scripts/functions/codegen/rowAggPattern8.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern8.R 
b/src/test/scripts/functions/codegen/rowAggPattern8.R
new file mode 100644
index 0000000..5030636
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern8.R
@@ -0,0 +1,29 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args<-commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+
+X = matrix(seq(1,15), 5, 3, byrow=TRUE);
+S = colSums((X/rowSums(X))>0.7) # NOTE(review): for this X the largest ratio is 3/6 = 0.5 (first row), so no cell exceeds 0.7 and S is all zeros
+
+writeMM(as(S, "CsparseMatrix"), paste(args[2], "S", sep="")); 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/7491a680/src/test/scripts/functions/codegen/rowAggPattern8.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/rowAggPattern8.dml 
b/src/test/scripts/functions/codegen/rowAggPattern8.dml
new file mode 100644
index 0000000..b4ede87
--- /dev/null
+++ b/src/test/scripts/functions/codegen/rowAggPattern8.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = matrix(seq(1,15), rows=5, cols=3);
+S = colSums((X/rowSums(X))>0.7) # NOTE(review): no cell of X/rowSums(X) exceeds 0.7 for this X (max 0.5 row-wise, about 0.61 column-wise), so S is all zeros
+
+write(S,$1)
+

Reply via email to