Repository: incubator-systemml
Updated Branches:
  refs/heads/master 5f4b8bc3f -> 256deb4c4


[SYSTEMML-446][SYSTEMML-727][SYSTEMML-742] maxpool (+bwd), tsmm GPU inst and GPU eviction policy

Closes #206.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/256deb4c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/256deb4c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/256deb4c

Branch: refs/heads/master
Commit: 256deb4c45cb5fe8ab4eadd18dd6d509608f0752
Parents: 5f4b8bc
Author: taasawat <[email protected]>
Authored: Thu Aug 11 10:36:26 2016 -0700
Committer: Deron Eriksson <[email protected]>
Committed: Thu Aug 11 10:36:26 2016 -0700

----------------------------------------------------------------------
 .../java/org/apache/sysml/hops/AggBinaryOp.java |  10 +-
 .../org/apache/sysml/hops/ConvolutionOp.java    |  30 ++-
 .../controlprogram/caching/CacheableData.java   |  20 +-
 .../context/ExecutionContext.java               |   4 +-
 .../instructions/GPUInstructionParser.java      |   8 +-
 .../gpu/AggregateBinaryGPUInstruction.java      |   5 +-
 .../gpu/ConvolutionGPUInstruction.java          | 135 +++++++++---
 .../instructions/gpu/GPUInstruction.java        |   2 +-
 .../instructions/gpu/MMTSJGPUInstruction.java   | 123 +++++++++++
 .../instructions/gpu/context/GPUObject.java     | 142 ++++++------
 .../instructions/gpu/context/JCudaObject.java   |  47 +++-
 .../runtime/matrix/data/LibMatrixCUDA.java      | 216 +++++++++++++++++++
 .../sysml/runtime/util/ConvolutionUtils.java    |  73 ++++---
 .../TransposeSelfMatrixMultiplication.java      | 175 +++++++++++++++
 .../matrix/TransposeSelfMatrixMultiplication.R  |  37 ++++
 .../TransposeSelfMatrixMultiplication.dml       |  28 +++
 16 files changed, 901 insertions(+), 154 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java 
b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
index ea58ebd..fa08f60 100644
--- a/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
+++ b/src/main/java/org/apache/sysml/hops/AggBinaryOp.java
@@ -555,8 +555,16 @@ public class AggBinaryOp extends Hop implements 
MultiThreadedHop
                throws HopsException, LopsException
        {
                int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads);
+               
+               ExecType et = ExecType.CP;
+//             if(DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET)) {
+               //TODO: Fix me. Currently forces the instruction to GPU whenever the gpu flag is set
+               if(DMLScript.USE_ACCELERATOR) {
+                       et = ExecType.GPU;
+               }
+               
                Lop matmultCP = new 
MMTSJ(getInput().get(mmtsj.isLeft()?1:0).constructLops(),
-                                                getDataType(), getValueType(), 
ExecType.CP, mmtsj, k);
+                                                getDataType(), getValueType(), 
et, mmtsj, k);
        
                matmultCP.getOutputParameters().setDimensions(getDim1(), 
getDim2(), getRowsInBlock(), getColsInBlock(), getNnz());
                setLineNumbers( matmultCP );

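For reference, the commented-out guard above sketches the eventual dispatch once the
memory estimates are trusted. A minimal sketch of that check (DMLScript.FORCE_ACCELERATOR
and OptimizerUtils.GPU_MEMORY_BUDGET are taken from the commented line and are assumptions
here, not active in this commit):

    // Sketch: choose GPU only when the accelerator is enabled and either forced
    // or the hop's memory estimate fits within the assumed GPU memory budget.
    ExecType et = ExecType.CP;
    if( DMLScript.USE_ACCELERATOR && (DMLScript.FORCE_ACCELERATOR
            || getMemEstimate() < OptimizerUtils.GPU_MEMORY_BUDGET) ) {
        et = ExecType.GPU;
    }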
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java 
b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
index 3da2cd2..fe277d1 100644
--- a/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
+++ b/src/main/java/org/apache/sysml/hops/ConvolutionOp.java
@@ -21,6 +21,7 @@ package org.apache.sysml.hops;
 
 import java.util.ArrayList;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.MultiThreadedHop;
 import org.apache.sysml.lops.ConvolutionTransform;
@@ -111,8 +112,6 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                        case RESHAPE_COL:
                        case ROTATE180:
                        case COL2IM:
-                       case MAX_POOLING:
-                       case MAX_POOLING_BACKWARD:
                        {       
                                et = ExecType.CP; // TODO: max_backwards and the other convolution ops are currently only implemented for CP
                                
@@ -127,7 +126,27 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
                                        throw new HopsException("Unimplemented 
ConvolutionOp for execution type: " + et.name());
                                }
                                // break;
-                       }       
+                       }
+                       case MAX_POOLING:
+                       case MAX_POOLING_BACKWARD:
+                       {       
+                               //TODO: Fix me. Currently forces the instruction to GPU whenever the gpu flag is set
+                               if(DMLScript.USE_ACCELERATOR) {
+                                       et = ExecType.GPU;
+                                       setLops(constructConvolutionLops(et, 
inputs));
+                                       break;
+                               }
+                               else if(et == ExecType.CP) {
+                                       setLops(constructConvolutionLops(et, 
inputs));
+                                       break;
+                               }                       
+                               else {
+                                       // TODO: Add support for SPARK/MR backends once we are happy with the performance of
+                                       // the single-node Lenet script. 
+                                       throw new HopsException("Unimplemented ConvolutionOp for execution type: " + et.name());
+                               }
+                               // break;
+                       }
                        case DIRECT_CONV2D:
                        case DIRECT_CONV2D_BACKWARD_DATA:
                        case DIRECT_CONV2D_BACKWARD_FILTER:
@@ -385,6 +404,11 @@ public class ConvolutionOp extends Hop  implements 
MultiThreadedHop
        protected ExecType optFindExecType() throws HopsException {
                
                checkAndSetForcedPlatform();
+               
+               //TODO: Remove this once memEstimate is fixed for these instructions 
+               if((op == ConvOp.MAX_POOLING || op == ConvOp.MAX_POOLING_BACKWARD) && DMLScript.USE_ACCELERATOR) {
+                       return ExecType.GPU;
+               }
        
                ExecType REMOTE = OptimizerUtils.isSparkExecutionMode() ? 
ExecType.SPARK : ExecType.MR;
                

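Condensing the new case block above, the max-pooling lop construction reduces to the
following sketch (eliding the other convolution ops; names are the same as in the diff):

    // Sketch of the dispatch added for MAX_POOLING / MAX_POOLING_BACKWARD:
    if( DMLScript.USE_ACCELERATOR )            // forced to GPU for now, see TODO
        setLops(constructConvolutionLops(ExecType.GPU, inputs));
    else if( et == ExecType.CP )
        setLops(constructConvolutionLops(ExecType.CP, inputs));
    else                                       // SPARK/MR backends not yet supported
        throw new HopsException("Unimplemented ConvolutionOp for execution type: " + et.name());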
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
index c7425c1..d043879 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/caching/CacheableData.java
@@ -428,10 +428,13 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                //get object from cache
                if( _data == null )
                        getCache();
-               
-               if( _gpuHandle != null )
+                       
+               //call acquireHostRead if the gpuHandle is set and allocated
+               if( _gpuHandle != null && _gpuHandle.isAllocated()) {
                        _gpuHandle.acquireHostRead();
-               
+                       if( _data == null )
+                               getCache();
+               }
                //read data from HDFS/RDD if required
                //(probe data for cache_nowrite / jvm_reuse)  
                if( isEmpty(true) && _data==null ) 
@@ -446,7 +449,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                                        //check filename
                                        if( _hdfsFileName == null )
                                                throw new 
CacheException("Cannot read matrix for empty filename.");
-
+                                       
                                        //read cacheable data from hdfs
                                        _data = readBlobFromHDFS( _hdfsFileName 
);
                                        
@@ -462,7 +465,7 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                                        //mark for initial local write (prevent 
repeated execution of rdd operations)
                                        if( writeStatus.booleanValue() )
                                                _requiresLocalWrite = 
CACHING_WRITE_CACHE_ON_READ;
-                                       else            
+                                       else
                                                _requiresLocalWrite = true;
                                }
                                
@@ -571,18 +574,19 @@ public abstract class CacheableData<T extends CacheBlock> 
extends Data
                if (! isAvailableToModify ())
                        throw new CacheException ("CacheableData not available 
to modify.");
                
-               //clear old data 
-               clearData(); 
+               //clear old data
+               clearData();
                
                //cache status maintenance
                acquire (true, false); //no need to load evicted matrix
+               
                setDirty(true);
                _isAcquireFromEmpty = false;
                
                //set references to new data
                if (newData == null)
                        throw new CacheException("acquireModify with empty 
cache block.");
-               _data = newData; 
+               _data = newData;
                updateStatusPinned(true);
                
                if( DMLScript.STATISTICS ){

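The change above makes the GPU copy participate in the usual acquire-read path. A
simplified sketch of the resulting control flow (surrounding method bodies elided):

    // Sketch: probe the software cache, then sync a live GPU copy back before
    // falling through to the existing HDFS/RDD restore logic.
    if( _data == null )
        getCache();
    if( _gpuHandle != null && _gpuHandle.isAllocated() ) {
        _gpuHandle.acquireHostRead();   // copies device data back if modified
        if( _data == null )
            getCache();                 // the copy-back may have repopulated the cache
    }
    // ...existing HDFS/RDD read if still empty...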
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
index 3e85a76..70a5b4f 100644
--- 
a/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
+++ 
b/src/main/java/org/apache/sysml/runtime/controlprogram/context/ExecutionContext.java
@@ -309,7 +309,7 @@ public class ExecutionContext
                throws DMLRuntimeException 
        {
                MatrixObject mo = getMatrixObject(varName);
-               mo.getGPUObject().release(false);
+               mo.getGPUObject().releaseInput();
        }
        
        /**
@@ -383,7 +383,7 @@ public class ExecutionContext
                if(mo.getGPUObject() == null || !mo.getGPUObject().isAllocated) 
{
                        throw new DMLRuntimeException("No output is allocated 
on GPU");
                }
-               mo.getGPUObject().release(true);
+               mo.getGPUObject().releaseOutput();
        }
        
        /**

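With release(boolean) split into releaseInput()/releaseOutput(), a GPU instruction's
variable lifecycle looks like this sketch (variable names are illustrative):

    // Sketch of the acquire/release pairing used by GPU instructions:
    MatrixObject in  = ec.getMatrixInputForGPUInstruction("X");         // pins X on device
    MatrixObject out = ec.getMatrixOutputForGPUInstruction("Y", false);
    // ...launch kernels on in/out...
    ec.releaseMatrixInputForGPUInstruction("X");    // -> GPUObject.releaseInput()
    ec.releaseMatrixOutputForGPUInstruction("Y");   // -> releaseOutput(), marks device copy dirty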
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 48e67e1..20527df 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -25,6 +25,7 @@ import 
org.apache.sysml.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import 
org.apache.sysml.runtime.instructions.gpu.GPUInstruction.GPUINSTRUCTION_TYPE;
+import org.apache.sysml.runtime.instructions.gpu.MMTSJGPUInstruction;
 
 public class GPUInstructionParser  extends InstructionParser 
 {
@@ -34,8 +35,10 @@ public class GPUInstructionParser  extends InstructionParser
                String2GPUInstructionType.put( "conv2d",                 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_filter", 
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "conv2d_backward_data",   
GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "maxpooling",             
GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "maxpooling_backward",    
GPUINSTRUCTION_TYPE.Convolution);
                String2GPUInstructionType.put( "ba+*",                   
GPUINSTRUCTION_TYPE.AggregateBinary);
-               
+               String2GPUInstructionType.put( "tsmm",                   
GPUINSTRUCTION_TYPE.MMTSJ);
        }
        
        public static GPUInstruction parseSingleInstruction (String str ) 
@@ -68,6 +71,9 @@ public class GPUInstructionParser  extends InstructionParser
                        case Convolution:
                                return 
ConvolutionGPUInstruction.parseInstruction(str);
                                
+                       case MMTSJ:
+                               return 
MMTSJGPUInstruction.parseInstruction(str);
+                               
                        default: 
                                throw new DMLRuntimeException("Invalid GPU 
Instruction Type: " + gputype );
                }

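As a hypothetical example of the new mapping (the '°' operand delimiter and the
name·DATATYPE·VALUETYPE operand encoding are assumptions based on SystemML's usual
instruction format, not shown in this diff):

    // Sketch: a tsmm instruction string now dispatches to MMTSJGPUInstruction.
    String str = "GPU°tsmm°X·MATRIX·DOUBLE°Y·MATRIX·DOUBLE°LEFT";
    GPUInstruction inst = GPUInstructionParser.parseSingleInstruction(str);
    // "tsmm" -> GPUINSTRUCTION_TYPE.MMTSJ -> MMTSJGPUInstruction.parseInstruction(str)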
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
index 3dc98ba..9c413d0 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/AggregateBinaryGPUInstruction.java
@@ -68,16 +68,15 @@ public class AggregateBinaryGPUInstruction extends 
GPUInstruction
                String opcode = parts[0];
 
                if ( !opcode.equalsIgnoreCase("ba+*")) {
-                       throw new 
DMLRuntimeException("AggregateBinaryInstruction.parseInstruction():: Unknown 
opcode " + opcode);
+                       throw new 
DMLRuntimeException("AggregateBinaryInstruction.parseInstruction():: Unknown 
opcode " + opcode);
                }
                
                InstructionUtils.checkNumFields( parts, 5 );
                CPOperand in1 = new CPOperand(parts[1]);
                CPOperand in2 = new CPOperand(parts[2]);
-               CPOperand out = new CPOperand(parts[3]);                
+               CPOperand out = new CPOperand(parts[3]);
                boolean isLeftTransposed = Boolean.parseBoolean(parts[4]);
                boolean isRightTransposed = Boolean.parseBoolean(parts[5]);
-               
                AggregateOperator agg = new AggregateOperator(0, 
Plus.getPlusFnObject());
                AggregateBinaryOperator aggbin = new 
AggregateBinaryOperator(Multiply.getMultiplyFnObject(), agg, 1);
                return new AggregateBinaryGPUInstruction(aggbin, in1, in2, out, 
opcode, str, isLeftTransposed, isRightTransposed);      

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index ab6d7a4..e489f1c 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -23,9 +23,12 @@ import java.util.ArrayList;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import 
org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
+import org.apache.sysml.runtime.instructions.Instruction;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.instructions.cp.ConvolutionCPInstruction;
 import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
@@ -64,43 +67,88 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                String[] parts = 
InstructionUtils.getInstructionPartsWithValueType(str);
                String opcode = parts[0];
                
-               if( !( opcode.equalsIgnoreCase("conv2d")
+               if( ( opcode.equalsIgnoreCase("conv2d")
                         || opcode.equalsIgnoreCase("conv2d_backward_filter")
-                        || opcode.equalsIgnoreCase("conv2d_backward_data")) ) 
-               {
-                       throw new DMLRuntimeException("Unknown opcode while 
parsing a ConvolutionGPUInstruction: " + str);      
+                        || opcode.equalsIgnoreCase("conv2d_backward_data")
+                        || opcode.equalsIgnoreCase("maxpooling_backward")) ) {
+                       InstructionUtils.checkNumFields(parts, 15);
+                       CPOperand in1 = new CPOperand(parts[1]);
+                       CPOperand in2 = new CPOperand(parts[2]);
+                       CPOperand out = new CPOperand(parts[15]);
+               
+                       ArrayList<CPOperand> stride = new 
ArrayList<CPOperand>();
+                       ArrayList<CPOperand> padding = new 
ArrayList<CPOperand>();
+                       ArrayList<CPOperand> input_shape = new 
ArrayList<CPOperand>();
+                       ArrayList<CPOperand> filter_shape = new 
ArrayList<CPOperand>();
+                       stride.add(new CPOperand(parts[3]));
+                       stride.add(new CPOperand(parts[4]));
+                       padding.add(new CPOperand(parts[5]));
+                       padding.add(new CPOperand(parts[6]));
+                       input_shape.add(new CPOperand(parts[7]));
+                       input_shape.add(new CPOperand(parts[8]));
+                       input_shape.add(new CPOperand(parts[9]));
+                       input_shape.add(new CPOperand(parts[10]));
+                       filter_shape.add(new CPOperand(parts[11]));
+                       filter_shape.add(new CPOperand(parts[12]));
+                       filter_shape.add(new CPOperand(parts[13]));
+                       filter_shape.add(new CPOperand(parts[14]));
+
+                       return new ConvolutionGPUInstruction(in1, in2, out, 
opcode, str, stride,
+                                       padding, input_shape, filter_shape);
                }
+               else if (opcode.equalsIgnoreCase("maxpooling")) {
+                       InstructionUtils.checkNumFields(parts, 14);
+                       CPOperand in1 = new CPOperand(parts[1]);
+                       CPOperand out = new CPOperand(parts[14]);
                
-               InstructionUtils.checkNumFields(parts, 15);
-               CPOperand in1 = new CPOperand(parts[1]);
-               CPOperand in2 = new CPOperand(parts[2]);
-               CPOperand out = new CPOperand(parts[15]);
-       
-               ArrayList<CPOperand> stride = new ArrayList<CPOperand>();
-               ArrayList<CPOperand> padding = new ArrayList<CPOperand>();
-               ArrayList<CPOperand> input_shape = new ArrayList<CPOperand>();
-               ArrayList<CPOperand> filter_shape = new ArrayList<CPOperand>();
-               stride.add(new CPOperand(parts[3]));
-               stride.add(new CPOperand(parts[4]));
-               padding.add(new CPOperand(parts[5]));
-               padding.add(new CPOperand(parts[6]));
-               input_shape.add(new CPOperand(parts[7]));
-               input_shape.add(new CPOperand(parts[8]));
-               input_shape.add(new CPOperand(parts[9]));
-               input_shape.add(new CPOperand(parts[10]));
-               filter_shape.add(new CPOperand(parts[11]));
-               filter_shape.add(new CPOperand(parts[12]));
-               filter_shape.add(new CPOperand(parts[13]));
-               filter_shape.add(new CPOperand(parts[14]));
+                       ArrayList<CPOperand> stride = new 
ArrayList<CPOperand>();
+                       ArrayList<CPOperand> padding = new 
ArrayList<CPOperand>();
+                       ArrayList<CPOperand> input_shape = new 
ArrayList<CPOperand>();
+                       ArrayList<CPOperand> filter_shape = new 
ArrayList<CPOperand>();
+                       stride.add(new CPOperand(parts[2]));
+                       stride.add(new CPOperand(parts[3]));
+                       padding.add(new CPOperand(parts[4]));
+                       padding.add(new CPOperand(parts[5]));
+                       input_shape.add(new CPOperand(parts[6]));
+                       input_shape.add(new CPOperand(parts[7]));
+                       input_shape.add(new CPOperand(parts[8]));
+                       input_shape.add(new CPOperand(parts[9]));
+                       filter_shape.add(new CPOperand(parts[10]));
+                       filter_shape.add(new CPOperand(parts[11]));
+                       filter_shape.add(new CPOperand(parts[12]));
+                       filter_shape.add(new CPOperand(parts[13]));
 
-               return new ConvolutionGPUInstruction(in1, in2, out, opcode, 
str, stride,
-                               padding, input_shape, filter_shape);
+                       return new ConvolutionGPUInstruction(in1, null, out, 
opcode, str, stride,
+                                       padding, input_shape, filter_shape);
+               }
+               else {
+                       throw new DMLRuntimeException("Unknown opcode while 
parsing a ConvolutionGPUInstruction: " + str);      
+               }
+       }
+       
+       private boolean isSparse(ExecutionContext ec, String var) throws 
DMLRuntimeException {
+               MatrixObject mo = ec.getMatrixObject(var);
+               return LibMatrixCUDA.isInSparseFormat(mo);
        }
        
        @Override
        public void processInstruction(ExecutionContext ec) 
                        throws DMLRuntimeException 
        {
+               // TODO: Fix me. Currently falls back to CP if the data is sparse
+               if (instOpcode.equalsIgnoreCase("maxpooling")) {
+                       if(     isSparse(ec, _input1.getName())) {
+                               
ConvolutionCPInstruction.parseInstruction(this.toString() + 
Instruction.OPERAND_DELIM + 
InfrastructureAnalyzer.getLocalParallelism()).processInstruction(ec);
+                               return;
+                       }
+               }
+               else {
+                       if(     isSparse(ec, _input1.getName()) || isSparse(ec, 
_input2.getName())) {
+                               
ConvolutionCPInstruction.parseInstruction(this.toString() + 
Instruction.OPERAND_DELIM + 
InfrastructureAnalyzer.getLocalParallelism()).processInstruction(ec);
+                               return;
+                       }
+               }
+               
                Statistics.incrementNoOfExecutedGPUInst();
                                        
                int pad_h = getScalarInput(ec, _padding, 0);
@@ -136,7 +184,6 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        MatrixObject out = 
ec.getMatrixOutputForGPUInstruction(_output.getName(), false);
                        LibMatrixCUDA.conv2d(image, filter, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
-                       
                }
                else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) 
{
                        MatrixObject image = 
ec.getMatrixInputForGPUInstruction(_input1.getName());
@@ -172,13 +219,43 @@ public class ConvolutionGPUInstruction extends 
GPUInstruction
                        LibMatrixCUDA.conv2d_backward_data(filter, dout, out, 
N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
                }
+               else if (instOpcode.equalsIgnoreCase("maxpooling")) {
+                       MatrixObject image = 
ec.getMatrixInputForGPUInstruction(_input1.getName());
+                       if(LibMatrixCUDA.isInSparseFormat(image))
+                               throw new DMLRuntimeException("Sparse 
maxpooling not implemented");
+                       if(image.getNumRows() != N || image.getNumColumns() != 
C*H*W) 
+                               throw new DMLRuntimeException("Incorrect 
dimensions for image in maxpooling: " + 
+                                               image.getNumRows() + " != " +  
N + " || " + image.getNumColumns() + " != " + C*H*W);
+                       
+                       ec.setMetaData(_output.getName(), N, C * P * Q);
+                       MatrixObject out = 
ec.getMatrixOutputForGPUInstruction(_output.getName(), false);
+                       LibMatrixCUDA.maxpooling(image, out, N, C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+               }
+               else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
+                       MatrixObject image = 
ec.getMatrixInputForGPUInstruction(_input1.getName());
+                       MatrixObject dout = 
ec.getMatrixInputForGPUInstruction(_input2.getName());
+                       if(LibMatrixCUDA.isInSparseFormat(image) || LibMatrixCUDA.isInSparseFormat(dout))
+                               throw new DMLRuntimeException("Sparse maxpooling_backward not implemented");
+                       if(dout.getNumRows() != N || dout.getNumColumns() != 
C*P*Q) 
+                               throw new DMLRuntimeException("Incorrect 
dimensions for dout in maxpooling_backward");
+                       if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
+                               throw new DMLRuntimeException("Incorrect dimensions for image in maxpooling_backward: " + 
+                                               image.getNumRows() + " != " +  N + " || " + image.getNumColumns() + " != " + C*H*W);
+                       
+                       ec.setMetaData(_output.getName(), N, C * H * W);
+                       MatrixObject out = 
ec.getMatrixOutputForGPUInstruction(_output.getName(), false);
+                       LibMatrixCUDA.maxpooling_backward(image, dout, out, N, 
C, H, W,
+                                       K, R, S, pad_h, pad_w, stride_h, 
stride_w, P, Q);
+               }
                else {
                        throw new DMLRuntimeException("Unsupported GPU context 
for " + instOpcode);
                }
                
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
-               ec.releaseMatrixInputForGPUInstruction(_input2.getName());
+               if (!instOpcode.equalsIgnoreCase("maxpooling"))
+                       
ec.releaseMatrixInputForGPUInstruction(_input2.getName());
                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
        }
        

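The sparse fallback above re-parses the same instruction as its CP variant; the pattern,
shown for one input (sketch, same calls as in the diff):

    // Sketch: rebuild the CP instruction string by appending the degree of
    // parallelism as an extra operand, then execute it on the host instead.
    if( isSparse(ec, _input1.getName()) ) {
        String cpStr = this.toString() + Instruction.OPERAND_DELIM
                + InfrastructureAnalyzer.getLocalParallelism();
        ConvolutionCPInstruction.parseInstruction(cpStr).processInstruction(ec);
        return;
    }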
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
index ce9646b..d842ac8 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/GPUInstruction.java
@@ -28,7 +28,7 @@ import org.apache.sysml.runtime.matrix.operators.Operator;
 
 public abstract class GPUInstruction extends Instruction 
 {
-       public enum GPUINSTRUCTION_TYPE { AggregateBinary, Convolution }; 
+       public enum GPUINSTRUCTION_TYPE { AggregateBinary, Convolution, MMTSJ 
}; 
        
        protected GPUINSTRUCTION_TYPE _gputype;
        protected Operator _optr;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
new file mode 100644
index 0000000..4709085
--- /dev/null
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/MMTSJGPUInstruction.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.instructions.gpu;
+
+/*
+ * Parses and processes the MMTSJ (tsmm) GPU instruction: see parseInstruction(...),
+ * processInstruction(...) and getMMTSJType(...) below.
+ */
+import org.apache.sysml.lops.MMTSJ.MMTSJType;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
+import org.apache.sysml.runtime.instructions.InstructionUtils;
+import org.apache.sysml.runtime.instructions.cp.CPOperand;
+import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA;
+import org.apache.sysml.runtime.matrix.operators.Operator;
+import org.apache.sysml.utils.Statistics;
+
+public class MMTSJGPUInstruction extends GPUInstruction
+{
+
+        private MMTSJType _type = null;
+        
+        CPOperand _input;
+        CPOperand _output;
+
+        /**
+         * @param op   operator
+         * @param in1  input
+         * @param type left/right, left-> A' %*% A, right-> A %*% A'
+         * @param out  output
+         * @param opcode
+         * @param istr
+         */
+        public MMTSJGPUInstruction(Operator op, CPOperand in1, MMTSJType type, 
CPOperand out,  String opcode, String istr)
+        {
+                super(op, opcode, istr);
+                _gputype = GPUINSTRUCTION_TYPE.MMTSJ;
+                _type = type;
+                _input = in1;
+                _output = out;
+        }
+
+        /**
+         * parse MMTSJ GPU instruction
+         * @param str  instruction string
+         * @return the parsed MMTSJGPUInstruction
+         * @throws DMLRuntimeException
+         */
+        public static MMTSJGPUInstruction parseInstruction ( String str )
+               throws DMLRuntimeException
+        {
+                String[] parts = 
InstructionUtils.getInstructionPartsWithValueType(str);
+                InstructionUtils.checkNumFields ( parts, 3 );
+
+                String opcode = parts[0];
+                CPOperand in1 = new CPOperand(parts[1]);
+                CPOperand out = new CPOperand(parts[2]);
+                MMTSJType titype = MMTSJType.valueOf(parts[3]);
+
+                if(!opcode.equalsIgnoreCase("tsmm"))
+                        throw new DMLRuntimeException("Unknown opcode while 
parsing an MMTSJGPUInstruction: " + str);
+                else
+                        return new MMTSJGPUInstruction(new Operator(true), 
in1, titype, out, opcode, str);
+        }
+
+        /**
+         * process MMTSJ GPU instruction 
+         * @param ec   execution context
+         * @throws DMLRuntimeException
+         */
+        @Override
+        public void processInstruction(ExecutionContext ec)
+                throws DMLRuntimeException
+        {
+                Statistics.incrementNoOfExecutedGPUInst();
+
+                //get input
+                MatrixObject mat = 
ec.getMatrixInputForGPUInstruction(_input.getName());
+               
+                boolean isLeftTransposed = ( _type == MMTSJType.LEFT);
+
+                int rlen = (int) (isLeftTransposed? mat.getNumColumns() : 
mat.getNumRows());
+                int clen = rlen;
+
+                //execute operations 
+                ec.setMetaData(_output.getName(), rlen, clen);
+                MatrixObject out = 
ec.getMatrixOutputForGPUInstruction(_output.getName(), false);
+                LibMatrixCUDA.matmultTSMM(mat, out, isLeftTransposed);
+                
+                ec.releaseMatrixInputForGPUInstruction(_input.getName());
+                ec.releaseMatrixOutputForGPUInstruction(_output.getName());
+        }
+
+        /**
+         * returns left/right depending on the type of MMTSJ instruction
+         * @return _type
+         */
+        public MMTSJType getMMTSJType()
+        {
+                return _type;
+        }
+}
\ No newline at end of file

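To illustrate the output-shape logic in processInstruction above: the tsmm result is
always square, sized by the dimension that survives the transpose (a worked sketch):

    // X is 1000 x 50
    boolean isLeftTransposed = (_type == MMTSJType.LEFT);   // LEFT: t(X) %*% X
    int rlen = isLeftTransposed ? 50 : 1000;                // ncol(X) vs nrow(X)
    int clen = rlen;                                        // LEFT -> 50 x 50 result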
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
index 45b8c5b..33f7099 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/GPUObject.java
@@ -21,6 +21,7 @@ package org.apache.sysml.runtime.instructions.gpu.context;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.caching.CacheException;
@@ -30,8 +31,14 @@ import org.apache.sysml.utils.Statistics;
 //FIXME merge JCudaObject into GPUObject to avoid unnecessary complexity
 public abstract class GPUObject 
 {
+       public enum EvictionPolicy {
+               LRU, LFU, MIN_EVICT
+       }
+       public static final EvictionPolicy evictionPolicy = EvictionPolicy.LRU;
        protected boolean isDeviceCopyModified = false;
        protected AtomicInteger numLocks = new AtomicInteger(0);
+       AtomicLong timestamp = new AtomicLong(0);
+       
        protected boolean isInSparseFormat = false;
        public boolean isAllocated = false;
        protected MatrixObject mat = null;
@@ -52,8 +59,8 @@ public abstract class GPUObject
        public abstract void acquireDenseDeviceModify(int numElemsToAllocate) 
throws DMLRuntimeException;
        public abstract void acquireHostRead() throws CacheException;
        public abstract void acquireHostModify() throws CacheException;
-       public abstract void release(boolean isGPUCopyModified) throws 
CacheException;
-       
+       public abstract void releaseInput() throws CacheException;
+       public abstract void releaseOutput() throws CacheException;
        
        // package-level visibility as these methods are guarded by underlying 
GPUContext
        abstract void allocateMemoryOnDevice(int numElemToAllocate) throws 
DMLRuntimeException;
@@ -64,72 +71,73 @@ public abstract class GPUObject
        
        
        /**
-        * It finds matrix toBeRemoved such that toBeRemoved.GPUSize >= size
-        * // TODO: it is the smallest matrix size that satisfy the above 
condition. For now just evicting the largest pointer.
-        * Then returns toBeRemoved. 
-        * 
+        * Finds and evicts GPU objects until GPUSize bytes can be allocated; unlocked
+        * objects are ordered for eviction according to the configured eviction policy.
+        * // TODO: update it with a hybrid policy
+        * @param GPUSize the size in bytes that needs to fit on the device
         */
-       protected void evict(long GPUSize) throws DMLRuntimeException {
-               if(GPUContext.allocatedPointers.size() == 0) {
-                       throw new DMLRuntimeException("There is not enough 
memory on device for this matrix!");
-               }
-               
-               Statistics.cudaEvictionCount.addAndGet(1);
-               
-               synchronized(evictionLock) {
-                       Collections.sort(GPUContext.allocatedPointers, new 
Comparator<GPUObject>() {
-       
-                               @Override
-                               public int compare(GPUObject p1, GPUObject p2) {
-                                       int p1Val = p1.numLocks.get();
-                                       int p2Val = p2.numLocks.get();
-                                       
-                                       if(p1Val < 0 || p2Val < 0) {
-                                               throw new 
RuntimeException("Number of locks cannot be negative");
-                                       }
-                                       else if(p1Val == 0 && p2Val == 0) {
-                                               // Both p1 and p2 are unlocked, 
return largest object
-                                               // TODO: Modify this !!
-                                               long p1Size = 0; long p2Size = 
0;
-                                               try {
-                                                       p1Size = 
p1.getSizeOnDevice();
-                                                       p2Size = 
p2.getSizeOnDevice();
-                                               } catch (DMLRuntimeException e) 
{
-                                                       throw new 
RuntimeException(e);
-                                               }
-                                               if(p1Size == p2Size) {
-                                                       return 0;
-                                               }
-                                               else if(p1Size < p2Size) {
-                                                       return 1;
-                                               }
-                                               else {
-                                                       return -1;
-                                               }
-                                       }
-                                       else if(p1Val > p2Val) {
-                                               // There are more locks on p1
-                                               return 1;
-                                       }
-                                       else {
-                                               // There are more locks on p2
-                                               return -1;
-                                       }
-                               }
-                       });
-                       
-                       
-                       while(GPUSize > getAvailableMemory() && 
GPUContext.allocatedPointers.size() > 0) {
-                               GPUObject toBeRemoved = 
GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
-                               if(toBeRemoved.numLocks.get() != 0) {
-                                       throw new DMLRuntimeException("There is 
not enough memory on device for this matrix!");
-                               }
-                               if(toBeRemoved.isDeviceCopyModified) {
-                                       toBeRemoved.copyFromDeviceToHost();
-                               }
-                               toBeRemoved.clearData();
-                       }
-               }
+       protected void evict(final long GPUSize) throws DMLRuntimeException {
+               if(GPUContext.allocatedPointers.size() == 0) {
+                       throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+               }
+               
+               Statistics.cudaEvictionCount.addAndGet(1);
+               
+               synchronized(evictionLock) {
+                       Collections.sort(GPUContext.allocatedPointers, new Comparator<GPUObject>() {
+                               @Override
+                               public int compare(GPUObject p1, GPUObject p2) {
+                                       long p1Val = p1.numLocks.get();
+                                       long p2Val = p2.numLocks.get();
+                                       
+                                       if(p1Val > 0 && p2Val > 0) {
+                                               // Both are locked, so don't sort
+                                               return 0;
+                                       }
+                                       else if(p1Val > 0 || p2Val > 0) {
+                                               // Put the unlocked one to the right-hand side
+                                               return Long.compare(p2Val, p1Val);
+                                       }
+                                       else {
+                                               // Both are unlocked
+                                               if(evictionPolicy == EvictionPolicy.MIN_EVICT) {
+                                                       long p1Size = 0; long p2Size = 0;
+                                                       try {
+                                                               p1Size = p1.getSizeOnDevice() - GPUSize;
+                                                               p2Size = p2.getSizeOnDevice() - GPUSize;
+                                                       } catch (DMLRuntimeException e) {
+                                                               throw new RuntimeException(e);
+                                                       }
+                                                       if(p1Size >= 0 && p2Size >= 0) {
+                                                               return Long.compare(p2Size, p1Size);
+                                                       }
+                                                       else {
+                                                               return Long.compare(p1Size, p2Size);
+                                                       }
+                                               }
+                                               else if(evictionPolicy == EvictionPolicy.LRU || evictionPolicy == EvictionPolicy.LFU) {
+                                                       return Long.compare(p2.timestamp.get(), p1.timestamp.get());
+                                               }
+                                               else {
+                                                       throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy.name());
+                                               }
+                                       }
+                               }
+                       });
+                       
+                       while(GPUSize > getAvailableMemory() && GPUContext.allocatedPointers.size() > 0) {
+                               GPUObject toBeRemoved = GPUContext.allocatedPointers.get(GPUContext.allocatedPointers.size() - 1);
+                               if(toBeRemoved.numLocks.get() > 0) {
+                                       throw new DMLRuntimeException("There is not enough memory on device for this matrix!");
+                               }
+                               if(toBeRemoved.isDeviceCopyModified) {
+                                       toBeRemoved.copyFromDeviceToHost();
+                               }
+                               toBeRemoved.clearData();
+                       }
+               }
        }
        
        public void clearData() throws CacheException {

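The comparator above sorts locked objects to the front and unlocked ones to the back, so
the eviction loop can pop victims from the tail. A self-contained illustration of the LRU
ordering using {numLocks, timestamp} pairs (sketch, not SystemML code):

    java.util.List<long[]> objs = new java.util.ArrayList<>();
    objs.add(new long[] {1, 100});   // locked
    objs.add(new long[] {0, 300});   // unlocked, recently used
    objs.add(new long[] {0, 200});   // unlocked, least recently used
    objs.sort((p1, p2) -> {
        if( p1[0] > 0 && p2[0] > 0 ) return 0;                          // both locked
        if( p1[0] > 0 || p2[0] > 0 ) return Long.compare(p2[0], p1[0]); // unlocked to the right
        return Long.compare(p2[1], p1[1]);                              // LRU: oldest last
    });
    // tail of objs -> {0, 200}: the least-recently-used unlocked object is evicted first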
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
index 811f2dd..7f0b26b 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/gpu/context/JCudaObject.java
@@ -91,6 +91,9 @@ public class JCudaObject extends GPUObject {
                                throw new CacheException(e);
                        }
                }
+               else {
+                       throw new CacheException("Cannot perform 
acquireHostRead as the GPU data is not allocated:" + mat.getVarName());
+               }
        }
        
        @Override
@@ -108,11 +111,47 @@ public class JCudaObject extends GPUObject {
                }
        }
        
-       public void release(boolean isGPUCopyModified) throws CacheException {
+       /**
+        * updates the locks depending on the eviction policy selected
+        * @throws CacheException if there is no locked GPU Object
+        */
+       private void updateReleaseLocks() throws CacheException {
                if(numLocks.addAndGet(-1) < 0) {
-                       throw new CacheException("Redundant release of GPU 
object");
+                       throw new CacheException("Redundant release of GPU object");
+               }
+               if(evictionPolicy == EvictionPolicy.LRU) {
+                       timestamp.set(System.nanoTime());
+               }
+               else if(evictionPolicy == EvictionPolicy.LFU) {
+                       timestamp.addAndGet(1);
+               }
+               else if(evictionPolicy == EvictionPolicy.MIN_EVICT) {
+                       // Do nothing
+               }
+               else {
+                       throw new CacheException("The eviction policy is not supported:" + evictionPolicy.name());
                }
-               isDeviceCopyModified = isGPUCopyModified;
+       }
+       
+       /**
+        * releases input allocated on GPU
+        * @throws CacheException if data is not allocated
+        */
+       public void releaseInput() throws CacheException {
+               updateReleaseLocks();
+               if(!isAllocated)
+                       throw new CacheException("Attempting to release an 
input before allocating it");
+       }
+       
+       /**
+        * releases output allocated on GPU
+        * @throws CacheException if data is not allocated
+        */
+       public void releaseOutput() throws CacheException {
+               updateReleaseLocks();
+               isDeviceCopyModified = true;
+               if(!isAllocated)
+                       throw new CacheException("Attempting to release an 
output before allocating it");
        }
 
        @Override
@@ -214,7 +253,7 @@ public class JCudaObject extends GPUObject {
                                double [] data = tmp.getDenseBlock();
                                
                                cudaMemcpy(Pointer.to(data), jcudaPointer, 
data.length * Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-                               
+
                                tmp.recomputeNonZeros();
                                mat.acquireModify(tmp);
                                mat.release();

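The shared timestamp field does double duty across policies, as set in
updateReleaseLocks() above (condensed sketch):

    switch( evictionPolicy ) {
        case LRU:       timestamp.set(System.nanoTime()); break; // last-use time
        case LFU:       timestamp.addAndGet(1);           break; // use count
        case MIN_EVICT: break;  // timestamp unused; sizes are compared at eviction
    }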
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 4a94f6a..52272a0 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -22,29 +22,39 @@ package org.apache.sysml.runtime.matrix.data;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
 import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
+import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
 import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
 import static 
jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
 import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
 import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
 import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
 import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
 import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
 import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
 import jcuda.jcudnn.cudnnConvolutionFwdPreference;
+import static jcuda.runtime.JCuda.cudaMalloc;
 import static jcuda.runtime.JCuda.cudaFree;
 import jcuda.Pointer;
+import jcuda.Sizeof;
 import jcuda.jcublas.JCublas;
+import jcuda.jcublas.JCublas2;
 import jcuda.jcublas.cublasHandle;
 import jcuda.jcudnn.cudnnConvolutionDescriptor;
 import jcuda.jcudnn.cudnnFilterDescriptor;
 import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcudnn.cudnnPoolingDescriptor;
 import jcuda.jcudnn.cudnnTensorDescriptor;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -175,6 +185,23 @@ public class LibMatrixCUDA {
                return filterDesc;
        }
        
+       /**
+        * allocates pooling descriptor, used in poolingForward and 
poolingBackward
+        * @param R                     pooling window height
+        * @param S                     pooling window width
+        * @param pad_h         vertical padding
+        * @param pad_w         horizontal padding
+        * @param stride_h      pooling vertical stride
+        * @param stride_w      pooling horizontal stride
+        * @return
+        */
+       private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, 
int S, int pad_h, int pad_w, int stride_h, int stride_w) {
+               cudnnPoolingDescriptor poolingDesc = new 
cudnnPoolingDescriptor();
+               cudnnCreatePoolingDescriptor(poolingDesc);
+               cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, R, 
S, pad_h, pad_w, stride_h, stride_w);
+               return poolingDesc;
+       }
+       
        public static void conv2d_backward_filter(MatrixObject image, 
MatrixObject dout,
                        MatrixObject outputBlock, int N, int C, int H, int W, 
int K, int R,
                        int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
@@ -240,6 +267,48 @@ public class LibMatrixCUDA {
                
        }
 
+       /**
+        * Performs tsmm, A %*% A' or A' %*% A, on the GPU by exploiting cublasDsyrk(...)
+        * @param left  the input matrix A of the tsmm expression; only whether it appears transposed matters, hence the name 'left'
+        * @param output
+        * @param isLeftTransposed
+        * @throws DMLRuntimeException
+        */
+       public static void matmultTSMM(MatrixObject left, MatrixObject output,
+            boolean isLeftTransposed) throws DMLRuntimeException {
+           if(isInSparseFormat(left)) {
+                   throw new DMLRuntimeException("Sparse GPU TSMM is not 
implemented");
+           }
+       
+           // Since CuBLAS expects inputs in column-major format,
+           // reverse the order of matrix-multiplication and take care of 
dimension mismatch.      
+           char transa = isLeftTransposed ? 'N' : 'T';
+           // Note: the dimensions are swapped
+           int m = (int) (isLeftTransposed ? left.getNumColumns() : 
left.getNumRows());
+           int k = (int) (isLeftTransposed ? left.getNumRows() : 
left.getNumColumns());
+       
+           if(m == -1)
+                   throw new DMLRuntimeException("Incorrect dimensions");
+       
+           double alpha = 1.0d;
+           double beta = 0.0d;
+       
+           int lda = (int) (isLeftTransposed ? m : k);
+           int ldc = m;
+       
+           if(!left.getGPUObject().isAllocated)
+                   throw new DMLRuntimeException("Input is not allocated:" + 
left.getGPUObject().isAllocated);
+           if(!output.getGPUObject().isAllocated)
+                   throw new DMLRuntimeException("Output is not allocated:" + 
output.getGPUObject().isAllocated);
+       
+           Pointer A = ((JCudaObject)left.getGPUObject()).jcudaPointer;
+           Pointer C = ((JCudaObject)output.getGPUObject()).jcudaPointer;
+           
+           //TODO: Fix it if there is a cuBLAS API to do flipping
+           JCublas.cublasDsyrk('U',transa, m, k, alpha, A, lda, beta, C, ldc);
+           JCublas.cublasDsyrk('L',transa, m, k, alpha, A, lda, beta, C, ldc);
+       }
+       
        public static void matmult(MatrixObject left1, MatrixObject right1, 
MatrixObject output, 
                        boolean isLeftTransposed1, boolean isRightTransposed1) 
throws DMLRuntimeException {
                if(isInSparseFormat(left1) || isInSparseFormat(right1)) {
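On the double cublasDsyrk call in matmultTSMM above: syrk computes only one triangle of
the symmetric result, so invoking it for 'U' and then 'L' materializes the full matrix,
at the cost of performing the rank-k update twice. Restating the LEFT case, t(X) %*% X
with X of size n x m, as a concrete sketch: the row-major X is seen by cuBLAS as its
column-major transpose, which is why transa flips to 'N':

    // m x m result of t(X) %*% X; X is n x m row-major (cuBLAS sees m x n column-major)
    JCublas.cublasDsyrk('U', 'N', m, n, alpha, A, m, beta, C, m);  // upper triangle
    JCublas.cublasDsyrk('L', 'N', m, n, alpha, A, m, beta, C, m);  // lower triangle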
@@ -349,6 +418,153 @@ public class LibMatrixCUDA {
                }
        }
        
+       /**
+        * performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
+        * @param image
+        * @param outputBlock
+        * @param N                             batch size
+        * @param C                             number of channels
+        * @param H                             height of image
+        * @param W                             width of image
+        * @param K                             number of filters
+        * @param R                             height of filter
+        * @param S                             width of filter
+        * @param pad_h                 vertical padding
+        * @param pad_w                 horizontal padding
+        * @param stride_h              vertical stride
+        * @param stride_w              horizontal stride
+        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
+        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
+        * @throws DMLRuntimeException
+        */
+       public static void maxpooling(MatrixObject image,
+                       MatrixObject outputBlock, int N, int C, int H, int W, 
int K, int R,
+                       int S, int pad_h, int pad_w, int stride_h, int 
stride_w, int P,
+                       int Q) throws DMLRuntimeException {
+               Pointer alpha = null;
+               Pointer beta = null;
+               cudnnTensorDescriptor xDesc = null;
+               cudnnTensorDescriptor yDesc = null;
+               cudnnPoolingDescriptor poolingDesc = null;
+
+               try {
+                       // Allocate descriptors
+                       yDesc = allocateTensorDescriptor(N, C, P, Q);
+                       xDesc = allocateTensorDescriptor(N, C, H, W);
+                       poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
+                       
+                       // Allocate data
+                       Pointer x = ((JCudaObject)image.getGPUObject()).jcudaPointer;
+                       Pointer y = ((JCudaObject)outputBlock.getGPUObject()).jcudaPointer;
+                       
+                       alpha = pointerTo(1.0);
+                       beta = pointerTo(0.0);
+                       
+                       int status = cudnnPoolingForward(cudnnHandle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+                       
+                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       if(alpha != null)
+                               cudaFree(alpha);
+                       if(beta != null)
+                               cudaFree(beta);
+                       if(yDesc != null)
+                               cudnnDestroyTensorDescriptor(yDesc);
+                       if(xDesc != null)
+                               cudnnDestroyTensorDescriptor(xDesc);
+                       if(poolingDesc != null)
+                               cudnnDestroyPoolingDescriptor(poolingDesc);
+               }
+       }
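
As a sanity check on the P and Q passed in above, the standard cuDNN pooling output dimension is 1 + (in + 2*pad - window)/stride with integer division; a one-line helper with a hypothetical name, shown for illustration only:

    // Pooling output dimension: 1 + (in + 2*pad - window)/stride (integer division).
    static int poolOutDim(int in, int pad, int window, int stride) {
        return 1 + (in + 2 * pad - window) / stride;
    }
    // Example: a 28x28 image with a 2x2 window, stride 2, no padding:
    // poolOutDim(28, 0, 2, 2) == 14, so P = Q = 14.
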
+       
+       /**
+        * Performs the backward pass of max pooling on the GPU via cudnnPoolingBackward(...)
+        * @param image                 input image of the forward pass
+        * @param dout                  gradient w.r.t. the pooled output, propagated from the layer above
+        * @param outputBlock           gradient w.r.t. the input image
+        * @param N                             batch size
+        * @param C                             number of channels
+        * @param H                             height of image
+        * @param W                             width of image
+        * @param K                             number of filters
+        * @param R                             height of pooling window
+        * @param S                             width of pooling window
+        * @param pad_h                 vertical padding
+        * @param pad_w                 horizontal padding
+        * @param stride_h              vertical stride
+        * @param stride_w              horizontal stride
+        * @param P                             output height: 1 + (H + 2*pad_h - R)/stride_h
+        * @param Q                             output width: 1 + (W + 2*pad_w - S)/stride_w
+        * @throws DMLRuntimeException if a cuDNN call fails
+        */
+       public static void maxpooling_backward(MatrixObject image, MatrixObject dout,
+                       MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+                       int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+                       int Q) throws DMLRuntimeException {
+               Pointer alpha = null;
+               Pointer beta = null;
+               Pointer y = null;
+               cudnnTensorDescriptor xDesc = null;
+               cudnnTensorDescriptor yDesc = null;
+               cudnnTensorDescriptor dyDesc = null;
+               cudnnTensorDescriptor dxDesc = null;
+               cudnnPoolingDescriptor poolingDesc = null;
+
+               try {
+                       // Allocate descriptors
+                       xDesc = allocateTensorDescriptor(N, C, H, W);
+                       yDesc = allocateTensorDescriptor(N, C, P, Q);
+                       dxDesc = allocateTensorDescriptor(N, C, H, W);
+                       dyDesc = allocateTensorDescriptor(N, C, P, Q);
+                       
+                       poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
+                       
+                       // Call poolForward first; its output y is an input to poolBackward
+                       // TODO: Remove this poolForward call once the language level passes y to poolBackward directly
+                       y = new Pointer();
+                       long numBytes = (long)N * C * P * Q * Sizeof.DOUBLE;
+                       cudaMalloc(y, numBytes);
+                       
+                       // Allocate data
+                       Pointer x = ((JCudaObject)image.getGPUObject()).jcudaPointer;
+                       Pointer dx = ((JCudaObject)outputBlock.getGPUObject()).jcudaPointer;
+                       Pointer dy = ((JCudaObject)dout.getGPUObject()).jcudaPointer;
+                       
+                       alpha = pointerTo(1.0);
+                       beta = pointerTo(0.0);
+                       
+                       int status = cudnnPoolingForward(cudnnHandle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
+                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+                       
+                       status = cudnnPoolingBackward(cudnnHandle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
+                       
+                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       if(alpha != null)
+                               cudaFree(alpha);
+                       if(beta != null)
+                               cudaFree(beta);
+                       // free the temporary y here so it is released even if a cuDNN call fails
+                       if(y != null)
+                               cudaFree(y);
+                       if(yDesc != null)
+                               cudnnDestroyTensorDescriptor(yDesc);
+                       if(xDesc != null)
+                               cudnnDestroyTensorDescriptor(xDesc);
+                       if(dyDesc != null)
+                               cudnnDestroyTensorDescriptor(dyDesc);
+                       if(dxDesc != null)
+                               cudnnDestroyTensorDescriptor(dxDesc);
+                       if(poolingDesc != null)
+                               cudnnDestroyPoolingDescriptor(poolingDesc);
+               }
+       }
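
The backward pass above recomputes the forward output y because cudnnPoolingBackward needs it as an input. For intuition, here is a minimal CPU sketch of what max-pooling backward computes, routing each output gradient to the argmax of its window (illustrative only; single channel, no padding, hypothetical method name):

    // CPU reference for max-pooling backward: dx gets dy routed to each window's argmax.
    static double[] maxpoolBackwardRef(double[] x, double[] dy,
            int H, int W, int R, int S, int strideH, int strideW) {
        int P = 1 + (H - R) / strideH;
        int Q = 1 + (W - S) / strideW;
        double[] dx = new double[H * W];
        for (int p = 0; p < P; p++)
            for (int q = 0; q < Q; q++) {
                int maxI = p * strideH, maxJ = q * strideW;  // argmax of the window
                for (int i = p * strideH; i < p * strideH + R; i++)
                    for (int j = q * strideW; j < q * strideW + S; j++)
                        if (x[i * W + j] > x[maxI * W + maxJ]) { maxI = i; maxJ = j; }
                dx[maxI * W + maxJ] += dy[p * Q + q];        // accumulate the gradient
            }
        return dx;
    }
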
        public static boolean isInSparseFormat(MatrixObject mo) {
                if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
                        return mo.getGPUObject().isInSparseFormat();

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index f2482e1..db244ff 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -190,42 +190,45 @@ public class ConvolutionUtils {
        }
        
        public static Lop constructConvolutionBackwardDataLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
-               if(DMLScript.USE_ACCELERATOR)
-                       et = ExecType.GPU; // TODO: Add memory estimate checks
-               else
-                       return null;
-               
-               if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
-                       Hop temp = currentHop.getInput().get(0);
-                       if(temp != null && isTranspose(temp)) {
-                               Hop matMult = temp.getInput().get(0);
-                               if(matMult != null && isMatMult(matMult)) {
-                                       Hop rotate180 = matMult.getInput().get(0);
-                                       Hop filter = matMult.getInput().get(1);
-                                       if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
-                                               ArrayList<Hop> inputs = new ArrayList<Hop>();
-                                               inputs.add(filter);
-                                               inputs.add(rotate180.getInput().get(0));
-                                               for(int i = 1; i < rotate180.getInput().size(); i++) {
-                                                       inputs.add(rotate180.getInput().get(i));
-                                               }
-                                               
-                                               // N, C * H * W
-                                               long N = currentHop.computeSizeInformation(inputs.get(6));
-                                               long C = currentHop.computeSizeInformation(inputs.get(7));
-                                               long H = currentHop.computeSizeInformation(inputs.get(8));
-                                               long W = currentHop.computeSizeInformation(inputs.get(9));
-                                               long rlen = N;
-                                               long clen = ConvolutionOp.getExtractedVal(C, H, W);
-                                               return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen);
-                                               
-                                               
-                                       }
-                               }
-                       }
-               }
+               return null; // Until we add CP conv2d_backward_data
                
-               return null;
+               //TODO: uncomment the following after CP conv2d_backward_data is added
+//             if(DMLScript.USE_ACCELERATOR)
+//                     et = ExecType.GPU; // TODO: Add memory estimate checks
+//             else
+//                     return null;
+//             
+//             if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
+//                     Hop temp = currentHop.getInput().get(0);
+//                     if(temp != null && isTranspose(temp)) {
+//                             Hop matMult = temp.getInput().get(0);
+//                             if(matMult != null && isMatMult(matMult)) {
+//                                     Hop rotate180 = matMult.getInput().get(0);
+//                                     Hop filter = matMult.getInput().get(1);
+//                                     if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
+//                                             ArrayList<Hop> inputs = new ArrayList<Hop>();
+//                                             inputs.add(filter);
+//                                             inputs.add(rotate180.getInput().get(0));
+//                                             for(int i = 1; i < rotate180.getInput().size(); i++) {
+//                                                     inputs.add(rotate180.getInput().get(i));
+//                                             }
+//                                             
+//                                             // N, C * H * W
+//                                             long N = currentHop.computeSizeInformation(inputs.get(6));
+//                                             long C = currentHop.computeSizeInformation(inputs.get(7));
+//                                             long H = currentHop.computeSizeInformation(inputs.get(8));
+//                                             long W = currentHop.computeSizeInformation(inputs.get(9));
+//                                             long rlen = N;
+//                                             long clen = ConvolutionOp.getExtractedVal(C, H, W);
+//                                             return ConvolutionOp.constructFusedConvolutionLops(et, inputs, ConvOp.DIRECT_CONV2D_BACKWARD_DATA, (ConvolutionOp) rotate180, rlen, clen);
+//                                             
+//                                             
+//                                     }
+//                             }
+//                     }
+//             }
+//             
+//             return null;
        }
        
        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/TransposeSelfMatrixMultiplication.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/TransposeSelfMatrixMultiplication.java b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/TransposeSelfMatrixMultiplication.java
new file mode 100644
index 0000000..b69248d
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/TransposeSelfMatrixMultiplication.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.test.integration.functions.binary.matrix;
+
+import java.util.HashMap;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+
+/**
+ * This test verifies the transpose-self matrix multiplication patterns
+ * t(X) %*% X and X %*% t(X) against an equivalent R implementation.
+ */
+public class TransposeSelfMatrixMultiplication extends AutomatedTestBase
+{
+
+        private final static String TEST_NAME1 = "TransposeSelfMatrixMultiplication";
+        private final static String TEST_DIR = "functions/binary/matrix/";
+        private final static String TEST_CLASS_DIR = TEST_DIR + TransposeSelfMatrixMultiplication.class.getSimpleName() + "/";
+        private final static double eps = 1e-10;
+
+        //small input, single block
+        private final static int rowsA1 = 3;
+        private final static int colsA1 = 3;
+
+        //larger input, multiple blocks
+        private final static int rowsA2 = 2407;
+        private final static int colsA2 = 73;
+
+
+        private final static double sparsity1 = 0.7;
+        private final static double sparsity2 = 0.1;
+
+
+        @Override
+        public void setUp()
+        {
+                addTestConfiguration( TEST_NAME1,
+                        new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "C" }) );
+
+                if (TEST_CACHE_ENABLED) {
+                        setOutAndExpectedDeletionDisabled(true);
+                }
+        }
+
+        @BeforeClass
+        public static void init()
+        {
+                TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+        }
+
+        @AfterClass
+        public static void cleanUp()
+        {
+                if (TEST_CACHE_ENABLED) {
+                        TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR);
+                }
+        }
+
+        /**
+         * Tests the patterns X %*% t(X) and t(X) %*% X.
+         * Arguments to runTransposeSelfMatrixMultiplication:
+         * isSparse, ExecType, isVector, isLeftTransposed
+         * (isLeftTransposed is true for t(A) %*% A, false for A %*% t(A)).
+         */
+        @Test
+        public void testTransposeMMDenseDenseCP1()
+        {
+                runTransposeSelfMatrixMultiplication(false, ExecType.CP, false, true);
+        }
+
+        @Test
+        public void testTransposeMMDenseDenseCP2()
+        {
+                runTransposeSelfMatrixMultiplication(false, ExecType.CP, false, false);
+        }
+        
+        /**
+         * @param sparseM1         whether to use the sparse input
+         * @param instType         execution type (CP, SPARK, MR)
+         * @param vectorM2         whether to use the larger input dimensions
+         * @param isLeftTransposed true for t(A) %*% A, false for A %*% t(A)
+         */
+        private void runTransposeSelfMatrixMultiplication( boolean sparseM1, ExecType instType, boolean vectorM2, boolean isLeftTransposed)
+        {
+                //select runtime platform for the requested exec type
+                RUNTIME_PLATFORM platformOld = rtplatform;
+                switch( instType ){
+                        case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+                        case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+                        default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+                }
+
+                boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+                if( rtplatform == RUNTIME_PLATFORM.SPARK )
+                       DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+                int rowsA = vectorM2 ? rowsA2 : rowsA1;
+                int colsA = vectorM2 ? colsA2 : colsA1;
+
+                String TEST_NAME = TEST_NAME1;
+
+                try
+                {
+                        TestConfiguration config = getTestConfiguration(TEST_NAME);
+
+                        double sparsityM1 = sparseM1?sparsity2:sparsity1;
+
+                        String TEST_CACHE_DIR = "";
+                        if (TEST_CACHE_ENABLED)
+                        {
+                                TEST_CACHE_DIR = sparsityM1 + "_" + vectorM2 + "_" + isLeftTransposed + "/";
+                        }
+
+                        loadTestConfiguration(config, TEST_CACHE_DIR);
+
+                        /* This is for running the junit test the new way, i.e., construct the arguments directly */
+                        String HOME = SCRIPT_DIR + TEST_DIR;
+                        fullDMLScriptName = HOME + TEST_NAME + ".dml";
+                        programArgs = new String[]{"-explain","-args",
+                                input("A"), Integer.toString(rowsA), Integer.toString(colsA),
+                                ("" + isLeftTransposed).toUpperCase(),
+                                output("C")};
+
+                        fullRScriptName = HOME + TEST_NAME + ".R";
+                        rCmd = "Rscript" + " " + fullRScriptName + " " +
+                                inputDir() + " " + isLeftTransposed + " " + expectedDir();
+
+                        //generate actual dataset
+                        double[][] A = getRandomMatrix(rowsA, colsA, 0, 1, sparsityM1, 7);
+                        writeInputMatrix("A", A, true);
+
+                        boolean exceptionExpected = false;
+                        runTest(true, exceptionExpected, null, -1);
+                        runRScript(true);
+
+                        //compare matrices
+                        HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("C");
+                        HashMap<CellIndex, Double> rfile  = readRMatrixFromFS("C");
+                        TestUtils.compareMatrices(dmlfile, rfile, eps, "Stat-DML", "Stat-R");
+                }
+                finally
+                {
+                        rtplatform = platformOld;
+                        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+                }
+        }
+}
\ No newline at end of file
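
Both tests above exercise the small single-block input; a natural follow-up (a sketch, not part of this patch) would pass vectorM2=true to cover the larger multi-block 2407x73 input defined in the class constants:

    // Hypothetical additional test: multi-block input, pattern t(A) %*% A.
    @Test
    public void testTransposeMMDenseDenseCP3() {
        runTransposeSelfMatrixMultiplication(false, ExecType.CP, true, true);
    }
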

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.R b/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.R
new file mode 100644
index 0000000..1695d40
--- /dev/null
+++ b/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.R
@@ -0,0 +1,37 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+
+A <- as.matrix(readMM(paste(args[1], "A.mtx", sep="")))
+
+if(args[2] == "true") {
+        C = t(A) %*% A;
+} else {
+        C = A %*% t(A);
+}
+
+
+writeMM(as(C, "CsparseMatrix"), paste(args[3], "C", sep=""));
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/256deb4c/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.dml b/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.dml
new file mode 100644
index 0000000..2825876
--- /dev/null
+++ b/src/test/scripts/functions/binary/matrix/TransposeSelfMatrixMultiplication.dml
@@ -0,0 +1,28 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+A = read($1, rows=$2, cols=$3, format="text");
+isLeftTransposed = $4;
+if(isLeftTransposed)
+        C = t(A) %*% A;
+else
+        C = A %*% t(A);
+write(C, $5, format="text");
\ No newline at end of file
