http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java 
b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
deleted file mode 100644
index bccb1ea..0000000
--- a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.lops;
-
-import org.apache.sysml.lops.LopProperties.ExecLocation;
-import org.apache.sysml.lops.LopProperties.ExecType;
-import org.apache.sysml.lops.compile.JobType;
-import org.apache.sysml.parser.Expression.DataType;
-import org.apache.sysml.parser.Expression.ValueType;
-
-public class ConvolutionTransform extends Lop
-{
-       public enum OperationTypes {
-               MAX_POOL, MAX_POOL_BACKWARD, AVG_POOL, AVG_POOL_BACKWARD,
-               RELU_MAX_POOLING, RELU_MAX_POOLING_BACKWARD, RELU_BACKWARD,
-               CONV2D, CONV2D_BACKWARD_FILTER, CONV2D_BACKWARD_DATA,
-               BIAS_ADD, CONV2D_BIAS_ADD, BIAS_MULTIPLY, CHANNEL_SUMS
-       }
-       
-       private OperationTypes operation;
-       private double intermediateMemBudget;
-       private final int numThreads;
-       
-       /**
-        * Constructor when we have one input.
-        * 
-        * @param input low-level operator
-        * @param op convolution transform operation type
-        * @param dt data type
-        * @param vt value type
-        * @param et execution type
-        * @param k number of threads
-        * @param intermediateMemBudget intermediate memory budget
-        */
-       public ConvolutionTransform(Lop input, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k, double intermediateMemBudget) 
-       {
-               super(Lop.Type.Transform, dt, vt);
-               init(input, op, dt, vt, et);
-               numThreads = k;
-               this.intermediateMemBudget = intermediateMemBudget;
-       }
-       
-       public ConvolutionTransform(Lop input1, Lop input2, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k) 
-       {
-               super(Lop.Type.Transform, dt, vt);
-               init(input1, op, dt, vt, et);
-               numThreads = k;
-               this.addInput(input2);
-               input2.addOutput(this);
-               setLevel();
-       }
-       
-       public ConvolutionTransform(Lop input1, Lop input2, Lop input3, 
ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, 
int k) 
-       {
-               super(Lop.Type.Transform, dt, vt);
-               init(input1, op, dt, vt, et);
-               numThreads = k;
-               this.addInput(input2);
-               input2.addOutput(this);
-               this.addInput(input3);
-               input3.addOutput(this);
-               setLevel();
-       }
-
-       private void init (Lop input, ConvolutionTransform.OperationTypes op, 
DataType dt, ValueType vt, ExecType et) 
-       {
-               operation = op;
- 
-               this.addInput(input);
-               input.addOutput(this);
-
-               boolean breaksAlignment = true;
-               boolean aligner = false;
-               boolean definesMRJob = false;
-               if ( et == ExecType.MR ) {
-                       throw new RuntimeException("The execution type is not 
supported: " + et.name());
-               }
-               else //CP/SPARK
-               {
-                       // <code>breaksAlignment</code> is not meaningful when 
<code>Transform</code> executes in CP. 
-                       breaksAlignment = false;
-                       lps.addCompatibility(JobType.INVALID);
-                       lps.setProperties( inputs, et, 
ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
-               }
-       }
-       
-       public void updateLopProperties() {
-               lps.setLevel(inputs);
-       }
-
-       @Override
-       public String toString() {
-
-               return " Operation: " + operation;
-       }
-
-       /**
-        * method to get operation type
-        * @return operation type
-        */
-        
-       public OperationTypes getOperationType()
-       {
-               return operation;
-       }
-
-       private String getOpcode() {
-               switch(operation) {
-                               
-               case MAX_POOL:
-                       return "maxpooling";
-                       
-               case RELU_MAX_POOLING:
-                       return "relu_maxpooling";
-                       
-               case RELU_MAX_POOLING_BACKWARD:
-                       return "relu_maxpooling_backward";
-                       
-               case RELU_BACKWARD:
-                       return "relu_backward";
-                       
-               case MAX_POOL_BACKWARD:
-                       return "maxpooling_backward";
-               
-               case AVG_POOL:
-                       return "avgpooling";
-                       
-               case AVG_POOL_BACKWARD:
-                       return "avgpooling_backward";
-               
-               case CONV2D:
-                       return "conv2d";
-               
-               case CONV2D_BIAS_ADD:
-                       return "conv2d_bias_add";
-               
-               case BIAS_ADD:
-                       return "bias_add";
-               
-               case BIAS_MULTIPLY:
-                       return "bias_multiply";
-                       
-               case CONV2D_BACKWARD_FILTER:
-                       return "conv2d_backward_filter";
-                       
-               case CONV2D_BACKWARD_DATA:
-                       return "conv2d_backward_data";
-                       
-               case CHANNEL_SUMS:
-                       return "channel_sums";
-                       
-               default:
-                       throw new 
UnsupportedOperationException(this.printErrorLocation() + "Instruction is not 
defined for Transform operation " + operation);
-                               
-               }
-       }
-       
-       @Override
-       public String getInstructions(String input, String bias, String output) 
{
-               if(operation == OperationTypes.BIAS_ADD || operation == 
OperationTypes.BIAS_MULTIPLY || operation == OperationTypes.RELU_BACKWARD) {
-                       StringBuilder sb = new StringBuilder();
-                       sb.append( getExecType() );
-                       
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getOpcode() );
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getInputs().get(0).prepInputOperand(input));
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getInputs().get(0).prepInputOperand(bias));
-                       //output
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( this.prepOutputOperand(output));
-                       
-                       //append degree of parallelism
-                       if( getExecType()==ExecType.CP ) {
-                               sb.append( OPERAND_DELIMITOR );
-                               sb.append( numThreads );
-                       }
-                       
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( intermediateMemBudget );
-                       return sb.toString();
-               }
-               else {
-                       throw new LopsException("The operation is not supported 
with two operands:" + operation.name());
-               }
-       }
-       
-       @Override
-       public String getInstructions(String input, String C, String HW, String 
output) {
-               if(operation == OperationTypes.CHANNEL_SUMS) {
-                       StringBuilder sb = new StringBuilder();
-                       sb.append( getExecType() );
-                       
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getOpcode() );
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getInputs().get(0).prepInputOperand(input));
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getInputs().get(1).prepInputOperand(C));
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( getInputs().get(2).prepInputOperand(HW));
-                       //output
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( this.prepOutputOperand(output));
-                       
-                       return sb.toString();
-               }
-               else {
-                       throw new LopsException("The operation is not supported 
with three operands:" + operation.name());
-               }
-       }
-       
-       @Override
-       public String getInstructions(String[] inputs, String output) {
-               StringBuilder sb = new StringBuilder();
-               appendOpcode(sb);
-               
-               for( int i=0; i<inputs.length-12; i++ ) {
-                       if( i > 0 )
-                               sb.append( OPERAND_DELIMITOR );
-                       sb.append( 
getInputs().get(i).prepInputOperand(inputs[i]));
-               }
-               appendOperands(inputs.length-12, inputs.length, output, sb);
-               
-               return sb.toString();
-       }
-
-       public void appendOpcode(StringBuilder sb) {
-               sb.append( getExecType() );
-               sb.append( OPERAND_DELIMITOR );
-               sb.append( getOpcode() );
-               sb.append( OPERAND_DELIMITOR );
-       }
-       
-       public void appendOperands(int startInputIndex, int endInputIndex, 
String output, StringBuilder sb) {
-               for( int i=startInputIndex; i < endInputIndex; i++ ) {
-                       Lop ltmp = getInputs().get(i);
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( ltmp.prepScalarInputOperand(getExecType()));
-               }
-               
-               //output
-               sb.append( OPERAND_DELIMITOR );
-               sb.append( this.prepOutputOperand(output));
-               
-               //append degree of parallelism
-               if( getExecType()==ExecType.CP ) {
-                       sb.append( OPERAND_DELIMITOR );
-                       sb.append( numThreads );
-               }
-               
-               sb.append( OPERAND_DELIMITOR );
-               sb.append( intermediateMemBudget );
-       }
-
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/lops/DnnTransform.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/lops/DnnTransform.java 
b/src/main/java/org/apache/sysml/lops/DnnTransform.java
new file mode 100644
index 0000000..02dcec1
--- /dev/null
+++ b/src/main/java/org/apache/sysml/lops/DnnTransform.java
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.lops;
+
+import org.apache.sysml.lops.LopProperties.ExecLocation;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.lops.compile.JobType;
+import org.apache.sysml.parser.Expression.DataType;
+import org.apache.sysml.parser.Expression.ValueType;
+
+/**
+ * Low-level operator (lop) for the DNN builtins enumerated in {@link OperationTypes}.
+ * It builds the textual instruction for these operations; {@code ExecType.MR} is
+ * rejected at construction time.
+ */
+public class DnnTransform extends Lop
+{
+       /** Supported operations; {@code getOpcode()} maps each one to its opcode string. */
+       public enum OperationTypes {
+               MAX_POOL, MAX_POOL_BACKWARD, AVG_POOL, AVG_POOL_BACKWARD,
+               RELU_MAX_POOLING, RELU_MAX_POOLING_BACKWARD, RELU_BACKWARD,
+               CONV2D, CONV2D_BACKWARD_FILTER, CONV2D_BACKWARD_DATA,
+               BIAS_ADD, CONV2D_BIAS_ADD, BIAS_MULTIPLY, CHANNEL_SUMS
+       }
+
+       private OperationTypes operation;
+       // Only assigned by the one-input constructor; the other constructors leave it at 0.0.
+       private double intermediateMemBudget;
+       private final int numThreads;
+
+       /**
+        * Constructor when we have one input.
+        *
+        * @param input low-level operator
+        * @param op DNN transform operation type
+        * @param dt data type
+        * @param vt value type
+        * @param et execution type
+        * @param k number of threads
+        * @param intermediateMemBudget intermediate memory budget
+        */
+       public DnnTransform(Lop input, DnnTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k, double intermediateMemBudget)
+       {
+               super(Lop.Type.Transform, dt, vt);
+               init(input, op, dt, vt, et);
+               numThreads = k;
+               this.intermediateMemBudget = intermediateMemBudget;
+       }
+
+       /**
+        * Constructor when we have two inputs.
+        *
+        * @param input1 first low-level operator
+        * @param input2 second low-level operator
+        * @param op DNN transform operation type
+        * @param dt data type
+        * @param vt value type
+        * @param et execution type
+        * @param k number of threads
+        */
+       public DnnTransform(Lop input1, Lop input2, DnnTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k)
+       {
+               super(Lop.Type.Transform, dt, vt);
+               init(input1, op, dt, vt, et);
+               numThreads = k;
+               this.addInput(input2);
+               input2.addOutput(this);
+               setLevel();
+       }
+
+       /**
+        * Constructor when we have three inputs.
+        *
+        * @param input1 first low-level operator
+        * @param input2 second low-level operator
+        * @param input3 third low-level operator
+        * @param op DNN transform operation type
+        * @param dt data type
+        * @param vt value type
+        * @param et execution type
+        * @param k number of threads
+        */
+       public DnnTransform(Lop input1, Lop input2, Lop input3, DnnTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k)
+       {
+               super(Lop.Type.Transform, dt, vt);
+               init(input1, op, dt, vt, et);
+               numThreads = k;
+               this.addInput(input2);
+               input2.addOutput(this);
+               this.addInput(input3);
+               input3.addOutput(this);
+               setLevel();
+       }
+
+       /**
+        * Records the operation, links the first input to this lop, and sets the lop properties.
+        *
+        * @param input first input lop
+        * @param op DNN transform operation type
+        * @param dt data type (not used by this method)
+        * @param vt value type (not used by this method)
+        * @param et execution type
+        * @throws RuntimeException if the execution type is MR
+        */
+       private void init (Lop input, DnnTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et)
+       {
+               operation = op;
+
+               this.addInput(input);
+               input.addOutput(this);
+
+               if ( et == ExecType.MR ) {
+                       throw new RuntimeException("The execution type is not supported: " + et.name());
+               }
+
+               //CP/SPARK: <code>breaksAlignment</code> is not meaningful when <code>Transform</code> executes in CP.
+               boolean breaksAlignment = false;
+               boolean aligner = false;
+               boolean definesMRJob = false;
+               lps.addCompatibility(JobType.INVALID);
+               lps.setProperties( inputs, et, ExecLocation.ControlProgram, breaksAlignment, aligner, definesMRJob );
+       }
+
+       /** Calls {@code lps.setLevel(inputs)} with the current input list. */
+       public void updateLopProperties() {
+               lps.setLevel(inputs);
+       }
+
+       @Override
+       public String toString() {
+               return " Operation: " + operation;
+       }
+
+       /**
+        * method to get operation type
+        * @return operation type
+        */
+       public OperationTypes getOperationType()
+       {
+               return operation;
+       }
+
+       /**
+        * Maps the operation to the opcode string placed in the generated instruction.
+        *
+        * @return opcode string
+        * @throws UnsupportedOperationException if no opcode is defined for the operation
+        */
+       private String getOpcode() {
+               switch(operation) {
+               case MAX_POOL:
+                       return "maxpooling";
+               case RELU_MAX_POOLING:
+                       return "relu_maxpooling";
+               case RELU_MAX_POOLING_BACKWARD:
+                       return "relu_maxpooling_backward";
+               case RELU_BACKWARD:
+                       return "relu_backward";
+               case MAX_POOL_BACKWARD:
+                       return "maxpooling_backward";
+               case AVG_POOL:
+                       return "avgpooling";
+               case AVG_POOL_BACKWARD:
+                       return "avgpooling_backward";
+               case CONV2D:
+                       return "conv2d";
+               case CONV2D_BIAS_ADD:
+                       return "conv2d_bias_add";
+               case BIAS_ADD:
+                       return "bias_add";
+               case BIAS_MULTIPLY:
+                       return "bias_multiply";
+               case CONV2D_BACKWARD_FILTER:
+                       return "conv2d_backward_filter";
+               case CONV2D_BACKWARD_DATA:
+                       return "conv2d_backward_data";
+               case CHANNEL_SUMS:
+                       return "channel_sums";
+               default:
+                       throw new UnsupportedOperationException(this.printErrorLocation() + "Instruction is not defined for Transform operation " + operation);
+               }
+       }
+
+       /**
+        * Builds the instruction for the two-operand operations (BIAS_ADD, BIAS_MULTIPLY,
+        * RELU_BACKWARD): exec type, opcode, both inputs, output, the thread count (CP only)
+        * and the intermediate memory budget, separated by OPERAND_DELIMITOR.
+        *
+        * @param input label of the first input
+        * @param bias label of the second input
+        * @param output label of the output
+        * @return instruction string
+        * @throws LopsException if the operation does not take two operands
+        */
+       @Override
+       public String getInstructions(String input, String bias, String output) {
+               if(operation == OperationTypes.BIAS_ADD || operation == OperationTypes.BIAS_MULTIPLY || operation == OperationTypes.RELU_BACKWARD) {
+                       StringBuilder sb = new StringBuilder();
+                       sb.append( getExecType() );
+
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( getOpcode() );
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( getInputs().get(0).prepInputOperand(input));
+                       sb.append( OPERAND_DELIMITOR );
+                       // the second operand is prepared by the second input lop, matching the
+                       // one-lop-per-operand pattern of the three-operand overload
+                       sb.append( getInputs().get(1).prepInputOperand(bias));
+                       //output
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( this.prepOutputOperand(output));
+
+                       //append degree of parallelism
+                       if( getExecType()==ExecType.CP ) {
+                               sb.append( OPERAND_DELIMITOR );
+                               sb.append( numThreads );
+                       }
+
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( intermediateMemBudget );
+                       return sb.toString();
+               }
+               else {
+                       throw new LopsException("The operation is not supported with two operands:" + operation.name());
+               }
+       }
+
+       /**
+        * Builds the CHANNEL_SUMS instruction: exec type, opcode, the three inputs and the
+        * output, separated by OPERAND_DELIMITOR. Neither the thread count nor the memory
+        * budget is appended here.
+        *
+        * @param input label of the first input
+        * @param C label of the second input
+        * @param HW label of the third input
+        * @param output label of the output
+        * @return instruction string
+        * @throws LopsException if the operation is not CHANNEL_SUMS
+        */
+       @Override
+       public String getInstructions(String input, String C, String HW, String output) {
+               if(operation == OperationTypes.CHANNEL_SUMS) {
+                       StringBuilder sb = new StringBuilder();
+                       sb.append( getExecType() );
+
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( getOpcode() );
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( getInputs().get(0).prepInputOperand(input));
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( getInputs().get(1).prepInputOperand(C));
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( getInputs().get(2).prepInputOperand(HW));
+                       //output
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( this.prepOutputOperand(output));
+
+                       return sb.toString();
+               }
+               else {
+                       throw new LopsException("The operation is not supported with three operands:" + operation.name());
+               }
+       }
+
+       /**
+        * Builds the instruction for an array of input labels. All but the last 12 entries
+        * of {@code inputs} are emitted as labelled operands of the input lop at the same
+        * index; the last 12 input lops are emitted by {@code appendOperands}, which also
+        * adds the output, the thread count (CP only) and the intermediate memory budget.
+        * NOTE(review): the trailing 12 inputs are presumably the shape, stride and padding
+        * scalars - confirm against the caller.
+        *
+        * @param inputs labels of all inputs
+        * @param output label of the output
+        * @return instruction string
+        */
+       @Override
+       public String getInstructions(String[] inputs, String output) {
+               StringBuilder sb = new StringBuilder();
+               appendOpcode(sb);
+
+               for( int i=0; i<inputs.length-12; i++ ) {
+                       if( i > 0 )
+                               sb.append( OPERAND_DELIMITOR );
+                       sb.append( getInputs().get(i).prepInputOperand(inputs[i]));
+               }
+               appendOperands(inputs.length-12, inputs.length, output, sb);
+
+               return sb.toString();
+       }
+
+       /**
+        * Appends the exec type and the opcode, each followed by OPERAND_DELIMITOR.
+        *
+        * @param sb builder receiving the instruction prefix
+        */
+       public void appendOpcode(StringBuilder sb) {
+               sb.append( getExecType() );
+               sb.append( OPERAND_DELIMITOR );
+               sb.append( getOpcode() );
+               sb.append( OPERAND_DELIMITOR );
+       }
+
+       /**
+        * Appends the input lops with index in [startInputIndex, endInputIndex) via
+        * {@code prepScalarInputOperand}, then the output, the thread count (CP only) and
+        * the intermediate memory budget, each preceded by OPERAND_DELIMITOR.
+        *
+        * @param startInputIndex index of the first input lop to append (inclusive)
+        * @param endInputIndex index after the last input lop to append (exclusive)
+        * @param output label of the output
+        * @param sb builder receiving the operands
+        */
+       public void appendOperands(int startInputIndex, int endInputIndex, String output, StringBuilder sb) {
+               for( int i=startInputIndex; i < endInputIndex; i++ ) {
+                       Lop ltmp = getInputs().get(i);
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( ltmp.prepScalarInputOperand(getExecType()));
+               }
+
+               //output
+               sb.append( OPERAND_DELIMITOR );
+               sb.append( this.prepOutputOperand(output));
+
+               //append degree of parallelism
+               if( getExecType()==ExecType.CP ) {
+                       sb.append( OPERAND_DELIMITOR );
+                       sb.append( numThreads );
+               }
+
+               sb.append( OPERAND_DELIMITOR );
+               sb.append( intermediateMemBudget );
+       }
+
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java 
b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
index 3ca8e1d..ca78106 100644
--- a/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
+++ b/src/main/java/org/apache/sysml/parser/BuiltinFunctionExpression.java
@@ -29,7 +29,7 @@ import org.apache.commons.lang.ArrayUtils;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.parser.LanguageException.LanguageErrorCodes;
 import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.runtime.util.DnnUtils;
 import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class BuiltinFunctionExpression extends DataIdentifier 
@@ -40,7 +40,7 @@ public class BuiltinFunctionExpression extends DataIdentifier
        public BuiltinFunctionExpression(ParserRuleContext ctx, 
BuiltinFunctionOp bifop, ArrayList<ParameterExpression> args, String fname) {
                _opcode = bifop;
                setCtxValuesAndFilename(ctx, fname);
-               args = expandConvolutionArguments(args);
+               args = expandDnnArguments(args);
                _args = new Expression[args.size()];
                for(int i=0; i < args.size(); i++) {
                        _args[i] = args.get(i).getExpr();
@@ -391,7 +391,7 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
                out.setBlockDimensions(exp.getOutput().getRowsInBlock(), 
exp.getOutput().getColumnsInBlock());
        }
        
-       private static ArrayList<ParameterExpression> 
orderConvolutionParams(ArrayList<ParameterExpression> paramExpression, int 
skip) {
+       private static ArrayList<ParameterExpression> 
orderDnnParams(ArrayList<ParameterExpression> paramExpression, int skip) {
                ArrayList<ParameterExpression> newParams = new ArrayList<>();
 
                for(int i = 0; i < skip; i++)
@@ -458,14 +458,14 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
                return newParamExpressions;
        }
        
-       private ArrayList<ParameterExpression> 
expandConvolutionArguments(ArrayList<ParameterExpression> paramExpression) {
+       private ArrayList<ParameterExpression> 
expandDnnArguments(ArrayList<ParameterExpression> paramExpression) {
                try {
                        if(_opcode == BuiltinFunctionOp.CONV2D || _opcode == 
BuiltinFunctionOp.CONV2D_BACKWARD_FILTER 
                                        || _opcode == 
BuiltinFunctionOp.CONV2D_BACKWARD_DATA) {
                                HashSet<String> expand = new HashSet<>();
                                expand.add("input_shape"); 
expand.add("filter_shape"); expand.add("stride"); expand.add("padding");
                                paramExpression = 
expandListParams(paramExpression, expand);
-                               paramExpression = 
orderConvolutionParams(paramExpression, 2);
+                               paramExpression = 
orderDnnParams(paramExpression, 2);
                        }
                        else if(_opcode == BuiltinFunctionOp.MAX_POOL || 
_opcode == BuiltinFunctionOp.AVG_POOL ||  
                                        _opcode == 
BuiltinFunctionOp.MAX_POOL_BACKWARD || _opcode == 
BuiltinFunctionOp.AVG_POOL_BACKWARD) {
@@ -476,9 +476,9 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
                                paramExpression.add(new 
ParameterExpression("filter_shape2", new IntIdentifier(1, this)));
                                paramExpression = 
replaceListParams(paramExpression, "pool_size", "filter_shape", 3);
                                if(_opcode == 
BuiltinFunctionOp.MAX_POOL_BACKWARD || _opcode == 
BuiltinFunctionOp.AVG_POOL_BACKWARD)
-                                       paramExpression = 
orderConvolutionParams(paramExpression, 2);
+                                       paramExpression = 
orderDnnParams(paramExpression, 2);
                                else
-                                       paramExpression = 
orderConvolutionParams(paramExpression, 1);
+                                       paramExpression = 
orderDnnParams(paramExpression, 1);
                        }
                }
                catch(LanguageException e) {
@@ -1393,8 +1393,8 @@ public class BuiltinFunctionExpression extends 
DataIdentifier
                                                output.setDimensions(N, C*H*W);
                                        }
                                        else if(H > 0 && W > 0 && stride_h > 0 
&& stride_w > 0 && pad_h >= 0 && pad_w >= 0 && R > 0 && S > 0) {
-                                               long P = 
ConvolutionUtils.getP(H, R, stride_h, pad_h);
-                                               long Q = 
ConvolutionUtils.getQ(W, S, stride_w, pad_w);
+                                               long P = DnnUtils.getP(H, R, 
stride_h, pad_h);
+                                               long Q = DnnUtils.getQ(W, S, 
stride_w, pad_w);
                                                
                                                // Try to set both rows and 
columns
                                                if(this.getOpCode() == 
BuiltinFunctionOp.CONV2D) 

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/parser/DMLTranslator.java 
b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
index d29a8f4..08df72f 100644
--- a/src/main/java/org/apache/sysml/parser/DMLTranslator.java
+++ b/src/main/java/org/apache/sysml/parser/DMLTranslator.java
@@ -34,14 +34,14 @@ import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.AggBinaryOp;
 import org.apache.sysml.hops.AggUnaryOp;
 import org.apache.sysml.hops.BinaryOp;
-import org.apache.sysml.hops.ConvolutionOp;
+import org.apache.sysml.hops.DnnOp;
 import org.apache.sysml.hops.DataGenOp;
 import org.apache.sysml.hops.DataOp;
 import org.apache.sysml.hops.FunctionOp;
 import org.apache.sysml.hops.FunctionOp.FunctionType;
 import org.apache.sysml.hops.Hop;
 import org.apache.sysml.hops.Hop.AggOp;
-import org.apache.sysml.hops.Hop.ConvOp;
+import org.apache.sysml.hops.Hop.OpOpDnn;
 import org.apache.sysml.hops.Hop.DataGenMethod;
 import org.apache.sysml.hops.Hop.DataOpTypes;
 import org.apache.sysml.hops.Hop.Direction;
@@ -2689,30 +2689,30 @@ public class DMLTranslator
                        ArrayList<Hop> inHops1 = new ArrayList<>();
                        inHops1.add(expr);
                        inHops1.add(expr2);
-                       currBuiltinOp = new ConvolutionOp(target.getName(), 
target.getDataType(), target.getValueType(),
-                               ConvOp.valueOf(source.getOpCode().name()), 
inHops1);
+                       currBuiltinOp = new DnnOp(target.getName(), 
target.getDataType(), target.getValueType(),
+                               OpOpDnn.valueOf(source.getOpCode().name()), 
inHops1);
                        setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp);
                        break;
                }
                case AVG_POOL:
                case MAX_POOL: {
-                       currBuiltinOp = new ConvolutionOp(target.getName(), 
target.getDataType(), target.getValueType(),
-                               ConvOp.valueOf(source.getOpCode().name()), 
getALHopsForPoolingForwardIM2COL(expr, source, 1, hops));
+                       currBuiltinOp = new DnnOp(target.getName(), 
target.getDataType(), target.getValueType(),
+                               OpOpDnn.valueOf(source.getOpCode().name()), 
getALHopsForPoolingForwardIM2COL(expr, source, 1, hops));
                        setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp);
                        break;
                }
                case AVG_POOL_BACKWARD:
                case MAX_POOL_BACKWARD: {
-                       currBuiltinOp = new ConvolutionOp(target.getName(), 
target.getDataType(), target.getValueType(),
-                               ConvOp.valueOf(source.getOpCode().name()), 
getALHopsForConvOpPoolingCOL2IM(expr, source, 1, hops));
+                       currBuiltinOp = new DnnOp(target.getName(), 
target.getDataType(), target.getValueType(),
+                               OpOpDnn.valueOf(source.getOpCode().name()), 
getALHopsForConvOpPoolingCOL2IM(expr, source, 1, hops));
                        setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp);
                        break;
                }
                case CONV2D:
                case CONV2D_BACKWARD_FILTER:
                case CONV2D_BACKWARD_DATA: {
-                       currBuiltinOp = new ConvolutionOp(target.getName(), 
target.getDataType(), target.getValueType(),
-                               ConvOp.valueOf(source.getOpCode().name()), 
getALHopsForConvOp(expr, source, 1, hops));
+                       currBuiltinOp = new DnnOp(target.getName(), 
target.getDataType(), target.getValueType(),
+                               OpOpDnn.valueOf(source.getOpCode().name()), 
getALHopsForConvOp(expr, source, 1, hops));
                        setBlockSizeAndRefreshSizeInfo(expr, currBuiltinOp);
                        break;
                }

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
index f01d3ae..fcc27e9 100644
--- 
a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
+++ 
b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java
@@ -39,7 +39,7 @@ import org.apache.sysml.runtime.instructions.cp.CPInstruction;
 import org.apache.sysml.runtime.instructions.cp.CPInstruction.CPType;
 import org.apache.sysml.runtime.instructions.cp.CentralMomentCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.CompressionCPInstruction;
-import org.apache.sysml.runtime.instructions.cp.ConvolutionCPInstruction;
+import org.apache.sysml.runtime.instructions.cp.DnnCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.CovarianceCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.DataGenCPInstruction;
 import org.apache.sysml.runtime.instructions.cp.DataPartitionCPInstruction;
@@ -235,22 +235,22 @@ public class CPInstructionParser extends InstructionParser
                String2CPInstructionType.put( "rsort"      , CPType.Reorg);
 
                // Opcodes related to convolutions
-               String2CPInstructionType.put( "relu_backward"      , CPType.Convolution);
-               String2CPInstructionType.put( "relu_maxpooling"      , CPType.Convolution);
-               String2CPInstructionType.put( "relu_maxpooling_backward"      , CPType.Convolution);
-               String2CPInstructionType.put( "maxpooling"      , CPType.Convolution);
-               String2CPInstructionType.put( "maxpooling_backward"      , CPType.Convolution);
-               String2CPInstructionType.put( "avgpooling"      , CPType.Convolution);
-               String2CPInstructionType.put( "avgpooling_backward"      , CPType.Convolution);
-               String2CPInstructionType.put( "conv2d"      , CPType.Convolution);
-               String2CPInstructionType.put( "conv2d_bias_add"      , CPType.Convolution);
-               String2CPInstructionType.put( "conv2d_backward_filter"      , CPType.Convolution);
-               String2CPInstructionType.put( "conv2d_backward_data"      , CPType.Convolution);
-               String2CPInstructionType.put( "bias_add"      , CPType.Convolution);
-               String2CPInstructionType.put( "bias_multiply"      , CPType.Convolution);
-               String2CPInstructionType.put( "channel_sums"      , CPType.Convolution);
-               String2CPInstructionType.put( "batch_norm2d",           CPType.Convolution);
-               String2CPInstructionType.put( "batch_norm2d_backward",  CPType.Convolution);
+               String2CPInstructionType.put( "relu_backward"      , CPType.Dnn);
+               String2CPInstructionType.put( "relu_maxpooling"      , CPType.Dnn);
+               String2CPInstructionType.put( "relu_maxpooling_backward"      , CPType.Dnn);
+               String2CPInstructionType.put( "maxpooling"      , CPType.Dnn);
+               String2CPInstructionType.put( "maxpooling_backward"      , CPType.Dnn);
+               String2CPInstructionType.put( "avgpooling"      , CPType.Dnn);
+               String2CPInstructionType.put( "avgpooling_backward"      , CPType.Dnn);
+               String2CPInstructionType.put( "conv2d"      , CPType.Dnn);
+               String2CPInstructionType.put( "conv2d_bias_add"      , CPType.Dnn);
+               String2CPInstructionType.put( "conv2d_backward_filter"      , CPType.Dnn);
+               String2CPInstructionType.put( "conv2d_backward_data"      , CPType.Dnn);
+               String2CPInstructionType.put( "bias_add"      , CPType.Dnn);
+               String2CPInstructionType.put( "bias_multiply"      , CPType.Dnn);
+               String2CPInstructionType.put( "channel_sums"      , CPType.Dnn);
+               String2CPInstructionType.put( "batch_norm2d",           CPType.Dnn);
+               String2CPInstructionType.put( "batch_norm2d_backward",  CPType.Dnn);
                
                // Quaternary instruction opcodes
                String2CPInstructionType.put( "wsloss"  , CPType.Quaternary);
@@ -344,8 +344,8 @@ public class CPInstructionParser extends InstructionParser
                        case Reorg:
                                return ReorgCPInstruction.parseInstruction(str);
                                
-                       case Convolution:
-                                return ConvolutionCPInstruction.parseInstruction(str);
+                       case Dnn:
+                                return DnnCPInstruction.parseInstruction(str);
                                
                        case UaggOuterChain:
                                return UaggOuterChainCPInstruction.parseInstruction(str);

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
index 8e9bb47..59c7350 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java
@@ -26,7 +26,7 @@ import org.apache.sysml.runtime.instructions.gpu.AggregateBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.ArithmeticBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.BuiltinBinaryGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.BuiltinUnaryGPUInstruction;
-import org.apache.sysml.runtime.instructions.gpu.ConvolutionGPUInstruction;
+import org.apache.sysml.runtime.instructions.gpu.DnnGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.MatrixIndexingGPUInstruction;
 import org.apache.sysml.runtime.instructions.gpu.MatrixMatrixAxpyGPUInstruction;
@@ -44,22 +44,22 @@ public class GPUInstructionParser  extends InstructionParser
                String2GPUInstructionType = new HashMap<>();
 
                // Neural Network Operators
-               String2GPUInstructionType.put( "relu_backward",          GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "conv2d",                 GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "conv2d_bias_add",        GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "conv2d_backward_data",   GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "maxpooling",             GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "maxpooling_backward",    GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "avgpooling",             GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "avgpooling_backward",    GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "bias_add",               GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "bias_multiply",          GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "channel_sums",          GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "lstm",                  GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "lstm_backward",         GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "batch_norm2d",           GPUINSTRUCTION_TYPE.Convolution);
-               String2GPUInstructionType.put( "batch_norm2d_backward",  GPUINSTRUCTION_TYPE.Convolution);
+               String2GPUInstructionType.put( "relu_backward",          GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "conv2d",                 GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "conv2d_bias_add",        GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "conv2d_backward_filter", GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "conv2d_backward_data",   GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "maxpooling",             GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "maxpooling_backward",    GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "avgpooling",             GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "avgpooling_backward",    GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "bias_add",               GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "bias_multiply",          GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "channel_sums",          GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "lstm",                  GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "lstm_backward",         GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "batch_norm2d",           GPUINSTRUCTION_TYPE.Dnn);
+               String2GPUInstructionType.put( "batch_norm2d_backward",  GPUINSTRUCTION_TYPE.Dnn);
                
                // Matrix Multiply Operators
                String2GPUInstructionType.put( "ba+*",  GPUINSTRUCTION_TYPE.AggregateBinary);
@@ -182,8 +182,8 @@ public class GPUInstructionParser  extends InstructionParser
                        case Append:
                                return MatrixAppendGPUInstruction.parseInstruction(str);
 
-                       case Convolution:
-                               return ConvolutionGPUInstruction.parseInstruction(str);
+                       case Dnn:
+                               return DnnGPUInstruction.parseInstruction(str);
                                
                        case MMTSJ:
                                return MMTSJGPUInstruction.parseInstruction(str);

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
index dd91b9f..efec463 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/SPInstructionParser.java
@@ -51,7 +51,7 @@ import org.apache.sysml.runtime.instructions.spark.CastSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.CentralMomentSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.CheckpointSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.CompressionSPInstruction;
-import org.apache.sysml.runtime.instructions.spark.ConvolutionSPInstruction;
+import org.apache.sysml.runtime.instructions.spark.DnnSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.CovarianceSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.CpmmSPInstruction;
 import org.apache.sysml.runtime.instructions.spark.CumulativeAggregateSPInstruction;
@@ -138,10 +138,10 @@ public class SPInstructionParser extends InstructionParser
                String2SPInstructionType.put( "tack+*"     , SPType.AggregateTernary);
 
                // Neural network operators
-               String2SPInstructionType.put( "conv2d",                 SPType.Convolution);
-               String2SPInstructionType.put( "conv2d_bias_add", SPType.Convolution);
-               String2SPInstructionType.put( "maxpooling",             SPType.Convolution);
-               String2SPInstructionType.put( "relu_maxpooling",          SPType.Convolution);
+               String2SPInstructionType.put( "conv2d",                 SPType.Dnn);
+               String2SPInstructionType.put( "conv2d_bias_add", SPType.Dnn);
+               String2SPInstructionType.put( "maxpooling",             SPType.Dnn);
+               String2SPInstructionType.put( "relu_maxpooling",          SPType.Dnn);
                
                String2SPInstructionType.put( RightIndex.OPCODE, SPType.MatrixIndexing);
                String2SPInstructionType.put( LeftIndex.OPCODE, SPType.MatrixIndexing);
@@ -370,8 +370,8 @@ public class SPInstructionParser extends InstructionParser
                        case AggregateTernary:
                                return AggregateTernarySPInstruction.parseInstruction(str);
                                
-                       case Convolution:
-                                return ConvolutionSPInstruction.parseInstruction(str);
+                       case Dnn:
+                                return DnnSPInstruction.parseInstruction(str);
 
                        case MatrixIndexing:
                                return IndexingSPInstruction.parseInstruction(str);

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
index afad85f..52da951 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/CPInstruction.java
@@ -34,7 +34,7 @@ public abstract class CPInstruction extends Instruction
                MultiReturnParameterizedBuiltin, ParameterizedBuiltin, MultiReturnBuiltin,
                Builtin, Reorg, Variable, External, Append, Rand, QSort, QPick,
                MatrixIndexing, MMTSJ, PMMJ, MMChain, MatrixReshape, Partition, Compression, SpoofFused,
-               StringInit, CentralMoment, Covariance, UaggOuterChain, Convolution }
+               StringInit, CentralMoment, Covariance, UaggOuterChain, Dnn }
        
        protected final CPType _cptype;
        protected final Operator _optr;

http://git-wip-us.apache.org/repos/asf/systemml/blob/9fa5a09b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
deleted file mode 100644
index 97d050d..0000000
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ /dev/null
@@ -1,645 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * 
- *   http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.sysml.runtime.instructions.cp;
-
-import java.util.ArrayList;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.sysml.api.DMLScript;
-import org.apache.sysml.hops.OptimizerUtils;
-import org.apache.sysml.runtime.DMLRuntimeException;
-import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
-import org.apache.sysml.runtime.instructions.InstructionUtils;
-import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
-import org.apache.sysml.runtime.matrix.data.LibMatrixDNN.PoolingType;
-import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
-import org.apache.sysml.runtime.matrix.data.MatrixBlock;
-import org.apache.sysml.runtime.util.ConvolutionUtils;
-import org.apache.sysml.utils.NativeHelper;
-
-public class ConvolutionCPInstruction extends UnaryCPInstruction {
-       private static final Log LOG = LogFactory.getLog(ConvolutionCPInstruction.class.getName());
-       private static boolean warnedUnderUtilitization = false;
-       
-       private final CPOperand _in2;
-       private final CPOperand _in3;
-       private final CPOperand _in4;
-       private final CPOperand _in5;
-       private final CPOperand _in6;
-       private final CPOperand _in7;
-       private final CPOperand _in8;
-       private final CPOperand _out2;
-       private final CPOperand _out3;
-       private final CPOperand _out4;
-       private final CPOperand _out5;
-       private final ArrayList<CPOperand> _input_shape;
-       private final ArrayList<CPOperand> _filter_shape;
-       private final ArrayList<CPOperand> _stride;
-       private final ArrayList<CPOperand> _padding;
-       private final int _numThreads;
-       private final double _intermediateMemoryBudget;
-       
-       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out,
-                       ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget, String opcode, String istr) {
-               super(CPType.Convolution, null, in, out, opcode, istr);
-               _in2 = in2;
-               _in3 = in3;
-               _in4 = null; _in5 = null; _in6 = null; _in7 = null; _in8 = null;
-               _out2 = null; _out3 = null; _out4 = null; _out5 = null;
-               _stride = stride;
-               _padding = padding;
-               _input_shape = input_shape;
-               _filter_shape = filter_shape;
-               _numThreads = numThreads;
-               _intermediateMemoryBudget = intermediateMemoryBudget;
-       }
-       
-       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) {
-               this(in, in2, null, out, null, null, null, null, numThreads, intermediateMemoryBudget, opcode, istr);
-               if( !(opcode.equals("bias_add") || opcode.equals("relu_backward") || opcode.equals("bias_multiply") ) ) {
-                       throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be bias_add or bias_multiply or relu_backward, but found " + opcode);
-               }
-       }
-       
-       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) {
-               this(in, in2, in3, out, null, null, null, null, numThreads, intermediateMemoryBudget, opcode, istr);
-               if( !opcode.equals("channel_sums") ) {
-                       throw new DMLRuntimeException("Incorrect usage. Expected the opcode to be channel_sums, but found " + opcode);
-               }
-       }
-
-       private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr,
-                       ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
-               this(in, null, null, out, stride, padding, input_shape, filter_shape, numThreads, intermediateMemoryBudget, opcode, istr);
-       }
-       
-       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand out, String opcode,
-                       String istr, ArrayList<CPOperand> stride,
-                       ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
-               this(in, in2, null, out, stride, padding, input_shape, filter_shape, numThreads, intermediateMemoryBudget, opcode, istr);
-       }
-       
-       public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode,
-                       String istr, ArrayList<CPOperand> stride,
-                       ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape,
-                       ArrayList<CPOperand> filter_shape, int numThreads, double intermediateMemoryBudget) {
-               this(in, in2, in3, out, stride, padding, input_shape, filter_shape, numThreads, intermediateMemoryBudget, opcode, istr);
-       }
-       
-       public ConvolutionCPInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand in4, CPOperand in5,
-                       CPOperand in6, CPOperand in7, CPOperand in8,
-                       CPOperand out, CPOperand out2, CPOperand out3, CPOperand out4, CPOperand out5, String opcode, String istr,
-                       double intermediateMemoryBudget) throws DMLRuntimeException {
-               super(CPType.Convolution, null, in1, out, opcode, istr);
-               _in2 = in2;
-               _in3 = in3;
-               _in4 = in4;
-               _in5 = in5;
-               _in6 = in6;
-               _in7 = in7;
-               _in8 = in8;
-               _out2 = out2;
-               _out3 = out3;
-               _out4 = out4;
-               _out5 = out5;
-               _stride = null;
-               _padding = null;
-               _input_shape = null;
-               _filter_shape = null;
-               _numThreads = 0;
-               _intermediateMemoryBudget = intermediateMemoryBudget;
-       }
-
-       public static ConvolutionCPInstruction parseInstruction(String str) {
-
-               String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
-               String opcode = parts[0];
-               if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling") ||
-                       opcode.equalsIgnoreCase("avgpooling")) {
-                       InstructionUtils.checkNumFields(parts, 16);
-                       // stride1, stride2, padding1, padding2
-                       // input_shape1, input_shape2, input_shape3, input_shape4,
-                       // filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
-                       CPOperand in = new CPOperand(parts[1]);
-                       CPOperand out = new CPOperand(parts[14]);
-
-                       ArrayList<CPOperand> stride = new ArrayList<>();
-                       ArrayList<CPOperand> padding = new ArrayList<>();
-                       ArrayList<CPOperand> input_shape = new ArrayList<>();
-                       ArrayList<CPOperand> filter_shape = new ArrayList<>();
-                       stride.add(new CPOperand(parts[2]));
-                       stride.add(new CPOperand(parts[3]));
-                       padding.add(new CPOperand(parts[4]));
-                       padding.add(new CPOperand(parts[5]));
-                       input_shape.add(new CPOperand(parts[6]));
-                       input_shape.add(new CPOperand(parts[7]));
-                       input_shape.add(new CPOperand(parts[8]));
-                       input_shape.add(new CPOperand(parts[9]));
-                       filter_shape.add(new CPOperand(parts[10]));
-                       filter_shape.add(new CPOperand(parts[11]));
-                       filter_shape.add(new CPOperand(parts[12]));
-                       filter_shape.add(new CPOperand(parts[13]));
-                       int k = Integer.parseInt(parts[15]);
-
-                       return new ConvolutionCPInstruction(in, out, opcode, str, stride,
-                                       padding, input_shape, filter_shape, k, Double.parseDouble(parts[16]));
-               }
-               else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
-                               || opcode.equalsIgnoreCase("avgpooling_backward")
-                               || opcode.equalsIgnoreCase("conv2d")
-                               || opcode.equalsIgnoreCase("conv2d_backward_filter")
-                               || opcode.equalsIgnoreCase("conv2d_backward_data")) {
-                       InstructionUtils.checkNumFields(parts, 17);
-                       // dout, stride1, stride2, padding1, padding2
-                       // input_shape1, input_shape2, input_shape3, input_shape4,
-                       // filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
-                       CPOperand in = new CPOperand(parts[1]);
-                       CPOperand in2 = new CPOperand(parts[2]);
-                       CPOperand out = new CPOperand(parts[15]);
-
-                       ArrayList<CPOperand> stride = new ArrayList<>();
-                       ArrayList<CPOperand> padding = new ArrayList<>();
-                       ArrayList<CPOperand> input_shape = new ArrayList<>();
-                       ArrayList<CPOperand> filter_shape = new ArrayList<>();
-                       stride.add(new CPOperand(parts[3]));
-                       stride.add(new CPOperand(parts[4]));
-                       padding.add(new CPOperand(parts[5]));
-                       padding.add(new CPOperand(parts[6]));
-                       input_shape.add(new CPOperand(parts[7]));
-                       input_shape.add(new CPOperand(parts[8]));
-                       input_shape.add(new CPOperand(parts[9]));
-                       input_shape.add(new CPOperand(parts[10]));
-                       filter_shape.add(new CPOperand(parts[11]));
-                       filter_shape.add(new CPOperand(parts[12]));
-                       filter_shape.add(new CPOperand(parts[13]));
-                       filter_shape.add(new CPOperand(parts[14]));
-                       int k = Integer.parseInt(parts[16]);
-
-                       return new ConvolutionCPInstruction(in, in2, out, opcode, str, stride,
-                                       padding, input_shape, filter_shape, k, Double.parseDouble(parts[17]));
-               }
-               else if (opcode.equalsIgnoreCase("conv2d_bias_add")) {
-                       InstructionUtils.checkNumFields(parts, 18);
-                       // dout, stride1, stride2, padding1, padding2
-                       // input_shape1, input_shape2, input_shape3, input_shape4,
-                       // filter_shape1, filter_shape2, filter_shape3, filter_shape4, k
-                       CPOperand in = new CPOperand(parts[1]);
-                       CPOperand in2 = new CPOperand(parts[2]);
-                       CPOperand in3 = new CPOperand(parts[3]);
-                       CPOperand out = new CPOperand(parts[16]);
-
-                       ArrayList<CPOperand> stride = new ArrayList<>();
-                       ArrayList<CPOperand> padding = new ArrayList<>();
-                       ArrayList<CPOperand> input_shape = new ArrayList<>();
-                       ArrayList<CPOperand> filter_shape = new ArrayList<>();
-                       stride.add(new CPOperand(parts[4]));
-                       stride.add(new CPOperand(parts[5]));
-                       padding.add(new CPOperand(parts[6]));
-                       padding.add(new CPOperand(parts[7]));
-                       input_shape.add(new CPOperand(parts[8]));
-                       input_shape.add(new CPOperand(parts[9]));
-                       input_shape.add(new CPOperand(parts[10]));
-                       input_shape.add(new CPOperand(parts[11]));
-                       filter_shape.add(new CPOperand(parts[12]));
-                       filter_shape.add(new CPOperand(parts[13]));
-                       filter_shape.add(new CPOperand(parts[14]));
-                       filter_shape.add(new CPOperand(parts[15]));
-                       int k = Integer.parseInt(parts[17]);
-
-                       return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, stride,
-                                       padding, input_shape, filter_shape, k, Double.parseDouble(parts[18]));
-               }
-               else if (opcode.equalsIgnoreCase("bias_add") || opcode.equals("relu_backward") || opcode.equalsIgnoreCase("bias_multiply") ) {
-                       InstructionUtils.checkNumFields(parts, 5);
-                       CPOperand in = new CPOperand(parts[1]);
-                       CPOperand in2 = new CPOperand(parts[2]);
-                       CPOperand out = new CPOperand(parts[3]);
-                       int k = Integer.parseInt(parts[4]);
-                       return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5]));
-               }
-               else if (opcode.equalsIgnoreCase("channel_sums")) {
-                       InstructionUtils.checkNumFields(parts, 4);
-                       CPOperand in = new CPOperand(parts[1]);
-                       CPOperand in2 = new CPOperand(parts[2]);
-                       CPOperand in3 = new CPOperand(parts[3]);
-                       CPOperand out = new CPOperand(parts[4]);
-                       return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, -1, 0);
-               }
-               else if (opcode.equalsIgnoreCase("batch_norm2d")) {
-                       InstructionUtils.checkNumFields(parts, 13);
-                       CPOperand in1 = new CPOperand(parts[1]); // image
-                       CPOperand in2 = new CPOperand(parts[2]); // scale
-                       CPOperand in3 = new CPOperand(parts[3]); // bias
-                       CPOperand in4 = new CPOperand(parts[4]); // runningMean
-                       CPOperand in5 = new CPOperand(parts[5]); // runningVar
-                       CPOperand in6 = new CPOperand(parts[6]); // mode
-                       CPOperand in7 = new CPOperand(parts[7]); // epsilon
-                       CPOperand in8 = new CPOperand(parts[8]); // exponentialAverageFactor
-                       CPOperand out = new CPOperand(parts[9]);  // ret
-                       CPOperand out2 = new CPOperand(parts[10]); // retRunningMean
-                       CPOperand out3 = new CPOperand(parts[11]); // retRunningVar
-                       CPOperand out4 = new CPOperand(parts[12]); // resultSaveMean
-                       CPOperand out5 = new CPOperand(parts[13]); // resultSaveInvVariance
-                       return new ConvolutionCPInstruction(in1, in2, in3, in4, in5, in6, in7, in8, out, out2, out3, out4, out5, opcode, str, 0);
-               }
-               else if (opcode.equalsIgnoreCase("batch_norm2d_backward")) {
-                       InstructionUtils.checkNumFields(parts, 9);
-                       CPOperand in1 = new CPOperand(parts[1]); // image
-                       CPOperand in2 = new CPOperand(parts[2]); // dout
-                       CPOperand in3 = new CPOperand(parts[3]); // scale
-                       CPOperand in4 = new CPOperand(parts[4]); // epsilon
-                       CPOperand in5 = new CPOperand(parts[5]); // resultSaveMean
-                       CPOperand in6 = new CPOperand(parts[6]); // resultSaveInvVariance
-                       CPOperand out = new CPOperand(parts[7]);  // dX
-                       CPOperand out2 = new CPOperand(parts[8]); // dScale
-                       CPOperand out3 = new CPOperand(parts[9]); // dBias
-                       return new ConvolutionCPInstruction(in1, in2, in3, in4, in5, in6, null, null, out, out2, out3, null, null, opcode, str, 0);
-               }
-               else {
-                       throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str);
-               }
-       }
-
-       /**
-        * Reads the scalar operand stored at the given position of an operand list
-        * and narrows its long value to an int.
-        *
-        * @param ec execution context used to resolve the scalar
-        * @param aL list of scalar operands
-        * @param index position of the requested operand within {@code aL}
-        * @return the long value of the operand, cast to int
-        */
-       private static int getScalarInput(ExecutionContext ec, ArrayList<CPOperand> aL, int index) {
-               CPOperand operand = aL.get(index);
-               long value = ec.getScalarInput(operand.getName(),
-                       operand.getValueType(), operand.isLiteral()).getLongValue();
-               return (int) value;
-       }
-       
-       public void processReluBackwardInstruction(ExecutionContext ec) {
-               // (X > 0) * dout
-               MatrixBlock input = ec.getMatrixInput(input1.getName(), 
getExtendedOpcode());
-               MatrixBlock dout = ec.getMatrixInput(_in2.getName(), 
getExtendedOpcode());
-               MatrixBlock outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(),
-                       input.isInSparseFormat() || dout.isInSparseFormat() );
-               
-               if( !input.isEmpty() && !dout.isEmpty() ) { //sparse-safe
-                       outputBlock.allocateBlock();
-                       LibMatrixDNN.reluBackward(input, dout, outputBlock, 
_numThreads);
-               }
-               
-               // release inputs/outputs
-               ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               ec.setMatrixOutput(getOutputVariableName(), outputBlock, 
getExtendedOpcode());
-       }
-       
-       public void processBiasAddInstruction(ExecutionContext ec) {
-               MatrixBlock input = ec.getMatrixInput(input1.getName(), 
getExtendedOpcode());
-               MatrixBlock bias = ec.getMatrixInput(_in2.getName(), 
getExtendedOpcode());
-               MatrixBlock outputBlock = null;
-               
-               if(bias.getNumColumns() != 1) {
-                       throw new DMLRuntimeException("Expected the number of 
columns of bias matrix to be 1, but found " + bias.getNumColumns());
-               }
-               
-               if(input.isEmpty() && bias.isEmpty()) {
-                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), true);
-               }
-               else if(bias.isEmpty()) {
-                       outputBlock = new MatrixBlock(input);
-               }
-               else {
-                       // As we always fill the output first with bias
-                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), false);
-                       outputBlock.allocateDenseBlock();
-                       LibMatrixDNN.biasAdd(input, bias, outputBlock, 
_numThreads);
-               }
-               
-               // release inputs/outputs
-               ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               ec.setMatrixOutput(getOutputVariableName(), outputBlock, 
getExtendedOpcode());
-       }
-       
-       public void processBiasMultiplyInstruction(ExecutionContext ec) {
-               MatrixBlock input = ec.getMatrixInput(input1.getName(), 
getExtendedOpcode());
-               MatrixBlock bias = ec.getMatrixInput(_in2.getName(), 
getExtendedOpcode());
-               MatrixBlock outputBlock = null;
-               
-               if(bias.getNumColumns() != 1) {
-                       throw new DMLRuntimeException("Expected the number of 
columns of bias matrix to be 1, but found " + bias.getNumColumns());
-               }
-               
-               if(bias.isEmpty()) {
-                       // Anything multiplied by zero is zero
-                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), true);
-               }
-               else {
-                       // As we always fill the output first with bias
-                       outputBlock = new MatrixBlock(input.getNumRows(), 
input.getNumColumns(), 
-                               input.isInSparseFormat()).allocateBlock();
-                       LibMatrixDNN.biasMultiply(input, bias, outputBlock, 
_numThreads);
-               }
-               
-               // release inputs/outputs
-               ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               ec.setMatrixOutput(getOutputVariableName(), outputBlock, 
getExtendedOpcode());
-       }
-       
-       /**
-        * Executes channel_sums: reads the input matrix and the scalars C and HW,
-        * checks that C*HW matches the number of input columns, and binds a C x 1
-        * result to the output variable.
-        *
-        * @param ec execution context providing the inputs and receiving the output
-        * @throws DMLRuntimeException if C*HW differs from the number of columns of the input
-        */
-       public void processChannelSumsInstruction(ExecutionContext ec) {
-               MatrixBlock input = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
-               int C = (int) ec.getScalarInput(_in2.getName(), _in2.getValueType(), _in2.isLiteral()).getLongValue();
-               int HW = (int) ec.getScalarInput(_in3.getName(), _in3.getValueType(), _in3.isLiteral()).getLongValue();
-               // multiply in long arithmetic so that an overflowing int product cannot
-               // accidentally match the column count
-               long expectedCols = (long) C * HW;
-               if(expectedCols != input.getNumColumns()) {
-                       throw new DMLRuntimeException("Expected C*HW (" + C + "*" + HW + ") to be equal to number of columns of input "
-                               + input.getNumColumns());
-               }
-               MatrixBlock outputBlock = null;
-               if(input.isEmpty()) {
-                       // empty input: empty C x 1 output, no kernel call
-                       outputBlock = new MatrixBlock(C, 1, true);
-               }
-               else {
-                       outputBlock = new MatrixBlock(C, 1, false).allocateBlock();
-                       LibMatrixDNN.channelSums(input, outputBlock, C, HW);
-               }
-               
-               // release inputs/outputs
-               ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
-       }
-       
-       
-       
-       /**
-        * Executes batch_norm2d: acquires image, scale, bias, running mean and
-        * running variance as matrices and mode, epsilon and the exponential
-        * average factor as scalars, calls LibMatrixDNN.batchNorm2D, and binds the
-        * five results to the output operands.
-        *
-        * @param ec execution context providing the inputs and receiving the outputs
-        */
-       public void processBatchNorm2dInstruction(ExecutionContext ec) {
-               // matrix operands
-               MatrixBlock imgBlock = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
-               MatrixBlock scaleBlock = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-               MatrixBlock biasBlock = ec.getMatrixInput(_in3.getName(), getExtendedOpcode());
-               MatrixBlock runMeanBlock = ec.getMatrixInput(_in4.getName(), getExtendedOpcode());
-               MatrixBlock runVarBlock = ec.getMatrixInput(_in5.getName(), getExtendedOpcode());
-               // scalar operands
-               String mode = ec.getScalarInput(_in6.getName(), _in6.getValueType(), _in6.isLiteral()).getStringValue();
-               double eps = ec.getScalarInput(_in7.getName(), _in7.getValueType(), _in7.isLiteral()).getDoubleValue();
-               double avgFactor = ec.getScalarInput(_in8.getName(), _in8.getValueType(), _in8.isLiteral()).getDoubleValue();
-               
-               // dense outputs: the normalized image has the shape of the image, the
-               // mean/variance outputs have the shape of the running mean/variance
-               int meanRows = runMeanBlock.getNumRows(), meanCols = runMeanBlock.getNumColumns();
-               int varRows = runVarBlock.getNumRows(), varCols = runVarBlock.getNumColumns();
-               MatrixBlock normalized = new MatrixBlock(imgBlock.getNumRows(), imgBlock.getNumColumns(), false).allocateBlock();
-               MatrixBlock newRunMean = new MatrixBlock(meanRows, meanCols, false).allocateBlock();
-               MatrixBlock newRunVar = new MatrixBlock(varRows, varCols, false).allocateBlock();
-               MatrixBlock savedMean = new MatrixBlock(meanRows, meanCols, false).allocateBlock();
-               MatrixBlock savedInvVar = new MatrixBlock(varRows, varCols, false).allocateBlock();
-               
-               LibMatrixDNN.batchNorm2D(imgBlock, scaleBlock, biasBlock, runMeanBlock, runVarBlock,
-                       mode, eps, avgFactor, normalized, newRunMean, newRunVar, savedMean, savedInvVar);
-               
-               // hand the matrix inputs back
-               ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in3.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in4.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in5.getName(), getExtendedOpcode());
-               // publish the results
-               ec.setMatrixOutput(output.getName(), normalized, getExtendedOpcode());
-               ec.setMatrixOutput(_out2.getName(), newRunMean, getExtendedOpcode());
-               ec.setMatrixOutput(_out3.getName(), newRunVar, getExtendedOpcode());
-               ec.setMatrixOutput(_out4.getName(), savedMean, getExtendedOpcode());
-               ec.setMatrixOutput(_out5.getName(), savedInvVar, getExtendedOpcode());
-       }
-       
-       /**
-        * Executes batch_norm2d_backward: acquires image, dout, scale, the saved
-        * mean and the saved inverse variance as matrices and epsilon as scalar,
-        * calls LibMatrixDNN.batchNorm2DBackward, and binds dX, dScale and dBias
-        * to the output operands.
-        *
-        * @param ec execution context providing the inputs and receiving the outputs
-        */
-       public void processBatchNorm2dBackwardInstruction(ExecutionContext ec) {
-               MatrixBlock imgBlock = ec.getMatrixInput(input1.getName(), getExtendedOpcode());
-               MatrixBlock doutBlock = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-               MatrixBlock scaleBlock = ec.getMatrixInput(_in3.getName(), getExtendedOpcode());
-               double eps = ec.getScalarInput(_in4.getName(), _in4.getValueType(), _in4.isLiteral()).getDoubleValue();
-               MatrixBlock savedMean = ec.getMatrixInput(_in5.getName(), getExtendedOpcode());
-               MatrixBlock savedInvVar = ec.getMatrixInput(_in6.getName(), getExtendedOpcode());
-               
-               // dense outputs: dX has the shape of the image, dScale and dBias the shape of scale
-               int scaleRows = scaleBlock.getNumRows(), scaleCols = scaleBlock.getNumColumns();
-               MatrixBlock gradX = new MatrixBlock(imgBlock.getNumRows(), imgBlock.getNumColumns(), false).allocateBlock();
-               MatrixBlock gradScale = new MatrixBlock(scaleRows, scaleCols, false).allocateBlock();
-               MatrixBlock gradBias = new MatrixBlock(scaleRows, scaleCols, false).allocateBlock();
-               
-               LibMatrixDNN.batchNorm2DBackward(imgBlock, doutBlock, scaleBlock, eps,
-                       savedMean, savedInvVar, gradX, gradScale, gradBias);
-               
-               // hand the matrix inputs back (_in4 is a scalar and needs no release)
-               ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in3.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in5.getName(), getExtendedOpcode());
-               ec.releaseMatrixInput(_in6.getName(), getExtendedOpcode());
-               // publish the results
-               ec.setMatrixOutput(output.getName(), gradX, getExtendedOpcode());
-               ec.setMatrixOutput(_out2.getName(), gradScale, getExtendedOpcode());
-               ec.setMatrixOutput(_out3.getName(), gradBias, getExtendedOpcode());
-       }
-       
-       
-       /**
-        * Reports whether the filter is in sparse format, after first converting a
-        * sufficiently small sparse filter to dense format in place. The conversion
-        * increases the number of native calls, for example in the cases where the
-        * filter is sparse but the input is dense.
-        * <p>
-        * Assumption: {@code enableNative && NativeHelper.isNativeLibraryLoaded()} is true.
-        *
-        * @param filter filter block; may be converted to dense as a side effect
-        * @return true if the filter is (still) in sparse format after the optional conversion
-        */
-       private static boolean isFilterSparse(MatrixBlock filter) {
-               // widen before multiplying: an int product can overflow for large filters
-               // and would then wrongly pass the size check below
-               long numElems = (long) filter.getNumRows() * filter.getNumColumns();
-               // if filter is less than 10 MB in dense format (which handles almost all the cases).
-               // In fact, using threshold of 1 MB is still sufficient for common CNNs.
-               // NOTE(review): the threshold is compared against the number of cells, not bytes.
-               if(filter.isInSparseFormat() && numElems < 10e+6)
-                       filter.sparseToDense();
-               return filter.isInSparseFormat();
-       }
-       
-       
-       /**
-        * Executes the operation selected by {@code instOpcode}.
-        * <p>
-        * bias_add, bias_multiply, relu_backward, channel_sums, batch_norm2d and
-        * batch_norm2d_backward are delegated to their dedicated methods. For the
-        * pooling and conv2d opcodes (and their backward variants) padding, stride,
-        * input shape and filter shape are read from the scalar operand lists and
-        * passed on as {@code ConvolutionParameters}. Pooling always runs in
-        * LibMatrixDNN; the conv2d variants use LibMatrixNative when the native
-        * library is loaded and the operands are dense, and LibMatrixDNN otherwise.
-        * If the relevant inputs are empty, an empty sparse output of the expected
-        * dimensions is created without calling a kernel (for conv2d_bias_add with a
-        * non-empty bias, the output is filled with the bias instead).
-        *
-        * @param ec execution context providing the inputs and receiving the output
-        * @throws DMLRuntimeException if the opcode is not supported or if the bias of
-        *         conv2d_bias_add does not have shape [K, 1]
-        */
-       @Override
-       public void processInstruction(ExecutionContext ec) {
-               
-               if (instOpcode.equalsIgnoreCase("bias_add")) {
-                       processBiasAddInstruction(ec);
-                       return;
-               }
-               else if (instOpcode.equalsIgnoreCase("bias_multiply")) {
-                       processBiasMultiplyInstruction(ec);
-                       return;
-               }
-               else if (instOpcode.equalsIgnoreCase("relu_backward")) {
-                       processReluBackwardInstruction(ec);
-                       return;
-               }
-               else if (instOpcode.equalsIgnoreCase("channel_sums")) {
-                       processChannelSumsInstruction(ec);
-                       return;
-               }
-               else if (instOpcode.equalsIgnoreCase("batch_norm2d")) {
-                       processBatchNorm2dInstruction(ec);
-                       return;
-               }
-               else if (instOpcode.equalsIgnoreCase("batch_norm2d_backward")) {
-                       processBatchNorm2dBackwardInstruction(ec);
-                       return;
-               }
-               
-               // acquire inputs
-               // (input1 is not acquired for avgpooling_backward; matBlock stays null there
-               // and is consequently not released at the end of this method)
-               MatrixBlock outputBlock = null;
-               MatrixBlock matBlock = instOpcode.equalsIgnoreCase("avgpooling_backward") ? null :
-                       ec.getMatrixInput(input1.getName(), getExtendedOpcode());
-               int pad_h = getScalarInput(ec, _padding, 0);
-               int pad_w = getScalarInput(ec, _padding, 1);
-               int stride_h = getScalarInput(ec, _stride, 0);
-               int stride_w = getScalarInput(ec, _stride, 1);
-
-               int N = getScalarInput(ec, _input_shape, 0);
-               int C = getScalarInput(ec, _input_shape, 1);
-               int H = getScalarInput(ec, _input_shape, 2);
-               int W = getScalarInput(ec, _input_shape, 3);
-
-               int K = getScalarInput(ec, _filter_shape, 0);
-               
-               int R = getScalarInput(ec, _filter_shape, 2);
-               int S = getScalarInput(ec, _filter_shape, 3);
-               int P = (int) ConvolutionUtils.getP(H, R, stride_h, pad_h);
-               int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
-               
-               ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
-               params.enableNative = NativeHelper.isNativeLibraryLoaded();
-               if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling") ||
-                       instOpcode.equalsIgnoreCase("avgpooling")) {
-                       if(matBlock.isEmpty()) {
-                               outputBlock = new MatrixBlock(N, C*P*Q, true);
-                       }
-                       else {
-                               outputBlock = new MatrixBlock(N, C*P*Q, false).allocateBlock();
-                               
-                               PoolingType poolType = (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) ? PoolingType.MAX : PoolingType.AVG;
-                               if(instOpcode.equalsIgnoreCase("relu_maxpooling"))
-                                       params.minValForMaxPoolOperations = 0;
-                               LibMatrixDNN.pooling(matBlock, outputBlock, params, poolType);
-                       }
-               }
-               else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward") ||
-                               instOpcode.equalsIgnoreCase("avgpooling_backward")) {
-                       MatrixBlock dout = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-                       // matBlock is null for avgpooling_backward, so only dout is inspected in that case
-                       boolean isEmpty = instOpcode.equalsIgnoreCase("avgpooling_backward") ? dout.isEmpty() : (matBlock.isEmpty() || dout.isEmpty());
-                       if(isEmpty) {
-                               outputBlock = new MatrixBlock(N, C*H*W, true);
-                       }
-                       else {
-                               outputBlock = new MatrixBlock(N, C*H*W, false).allocateBlock();
-                               PoolingType poolType = (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) ? PoolingType.MAX : PoolingType.AVG;
-                               boolean performReLUBackward = instOpcode.equalsIgnoreCase("relu_maxpooling_backward");
-                               if(performReLUBackward)
-                                       params.minValForMaxPoolOperations = 0;
-                               LibMatrixDNN.poolingBackward(matBlock, dout, outputBlock, params, performReLUBackward, poolType);
-                       }
-                       ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               }
-               else if (instOpcode.equalsIgnoreCase("conv2d")) {
-                       resetNumThreads(params, C*R*S, P*Q, computeSparsity(matBlock));
-                       MatrixBlock filter = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-                       if(filter.isEmpty() || matBlock.isEmpty()) {
-                               outputBlock = new MatrixBlock(N, K*P*Q, true);
-                       }
-                       else {
-                               boolean sparse = matBlock.isUltraSparse(false) && params.bias == null
-                                       && matBlock.getInMemorySize() < MatrixBlock.estimateSizeDenseInMemory(N, K*P*Q);
-                               outputBlock = new MatrixBlock(N, K*P*Q, sparse).allocateBlock();
-                               if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
-                                       LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
-                               else
-                                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
-                       }
-                       ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               }
-               else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
-                       resetNumThreads(params, C*R*S, P*Q, computeSparsity(matBlock));
-                       // note the operand order: _in2 holds the bias and _in3 the filter
-                       MatrixBlock filter = ec.getMatrixInput(_in3.getName(), getExtendedOpcode());
-                       MatrixBlock bias = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-                       if(bias.getNumRows() != params.K || bias.getNumColumns() != 1) {
-                               throw new DMLRuntimeException("Incorrect shape of bias matrix: [" + bias.getNumRows() + " " + bias.getNumColumns() + "]. "
-                                               + "Expected: [" + params.K + ", 1]");
-                       }
-                       boolean isOutputConvEmpty = filter.isEmpty() || matBlock.isEmpty();
-                       if(isOutputConvEmpty && bias.isEmpty()) {
-                               // bias_add(empty mb, empty mb) = empty mb
-                               outputBlock = new MatrixBlock(N, K*P*Q, true);
-                       }
-                       else if(isOutputConvEmpty && !bias.isEmpty()) {
-                               // Add bias to empty output block
-                               // bias_add(empty mb, bias)
-                               outputBlock = new MatrixBlock(N, K*P*Q, false).allocateBlock();
-                               for(int n = 0;  n < params.N; n++)
-                                       ConvolutionUtils.fillBias(bias, outputBlock.getDenseBlockValues(),
-                                               n, n+1, params.N, params.K, params.P*params.Q);
-                       }
-                       else {
-                               outputBlock = new MatrixBlock(N, K*P*Q, false).allocateBlock();
-                               if(!bias.isEmpty()) {
-                                       // Handle situation where both input and filter are non empty, but bias is empty
-                                       params.bias = bias;
-                               }
-                               if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
-                                       LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
-                               else
-                                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
-                       }
-                       ec.releaseMatrixInput(_in3.getName(), getExtendedOpcode());
-                       ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               }
-               else if (instOpcode.equalsIgnoreCase("conv2d_backward_filter")) {
-                       MatrixBlock dout = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-                       if(dout.isEmpty() || matBlock.isEmpty()) {
-                               outputBlock = new MatrixBlock(K, C*R*S, true);
-                       }
-                       else {
-                               outputBlock = new MatrixBlock(K, C*R*S, false).allocateBlock();
-                               if(params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat())
-                                       LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
-                               else
-                                       LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
-                       }
-                       ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               }
-               else if (instOpcode.equalsIgnoreCase("conv2d_backward_data")) {
-                       MatrixBlock dout = ec.getMatrixInput(_in2.getName(), getExtendedOpcode());
-                       if(dout.isEmpty() || matBlock.isEmpty()) {
-                               outputBlock = new MatrixBlock(N, C * H * W, true);
-                       }
-                       else {
-                               outputBlock = new MatrixBlock(N, C * H * W, false).allocateBlock();
-                               // matBlock is passed to isFilterSparse here, i.e. it is treated as the filter
-                               if(params.enableNative && !isFilterSparse(matBlock) && !dout.isInSparseFormat())
-                                       LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params);
-                               else
-                                       LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
-                       }
-                       ec.releaseMatrixInput(_in2.getName(), getExtendedOpcode());
-               }
-               else {
-                       throw new DMLRuntimeException("Unsupported op code " + instOpcode);
-               }
-               
-               // release inputs/outputs
-               if(!instOpcode.equalsIgnoreCase("avgpooling_backward"))
-                       ec.releaseMatrixInput(input1.getName(), getExtendedOpcode());
-               ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode());
-       }
-       
-       /**
-        * Computes the fraction of non-zero cells of a matrix block in floating
-        * point arithmetic. An integer division of the non-zero count by rows*cols
-        * would truncate every ratio below 1 to 0, the int product rows*cols can
-        * overflow, and a block without cells would cause a division by zero.
-        *
-        * @param mb matrix block to inspect
-        * @return number of non-zeros divided by rows*cols, or 0 if the block has no cells
-        */
-       private static double computeSparsity(MatrixBlock mb) {
-               long cells = (long) mb.getNumRows() * mb.getNumColumns();
-               return (cells > 0) ? ((double) mb.getNonZeros()) / cells : 0;
-       }
-       
-       /**
-        * Reset the number of threads to respect the intermediate CP memory budget.
-        * The limit is only applied if DMLScript.USE_ACCELERATOR is set, and the
-        * thread count is never reduced below one. A warning is logged the first
-        * time the thread count is reduced.
-        * 
-        * @param params convolution parameters whose numThreads may be lowered
-        * @param numRows number of rows of intermediate matrix used per thread
-        * @param numCols number of columns of intermediate matrix used per thread
-        * @param sparsity sparsity of intermediate matrix used per thread
-        */
-       private void resetNumThreads(ConvolutionParameters params, int numRows, int numCols, double sparsity) {
-               if(DMLScript.USE_ACCELERATOR) {
-                       double memBudget1Thread = OptimizerUtils.estimateSizeExactSparsity(numRows, numCols, sparsity);
-                       // keep at least one thread: if the budget is smaller than the footprint of a
-                       // single thread, the quotient floors to 0, which is not a usable thread count
-                       int limitedDegreeOfParallelism = Math.max(1,
-                               (int) Math.floor(_intermediateMemoryBudget / memBudget1Thread));
-                       if(params.numThreads > limitedDegreeOfParallelism) {
-                               params.numThreads = limitedDegreeOfParallelism;
-                               if(!warnedUnderUtilitization)
-                                       LOG.warn("CPU Under-utilization to respect the intermediate memory budget. To avoid this, please try reducing the mini-batch or forcing gpu execution.");
-                               warnedUnderUtilitization = true;
-                       }
-               }
-       }
-}

Reply via email to