http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
index f839990..c794932 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java
@@ -21,15 +21,20 @@ package org.apache.sysml.runtime.instructions.cp;
 
 import java.util.ArrayList;
 import java.util.Arrays;
+
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
 import org.apache.sysml.runtime.functionobjects.SwapIndex;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
 
 public class ConvolutionCPInstruction extends UnaryCPInstruction 
 {      
@@ -131,7 +136,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                        return new ConvolutionCPInstruction(in, out, opcode, str, stride,
                                        padding, input_shape, filter_shape, k);
                } 
-               else if (opcode.equalsIgnoreCase("maxpooling_backward")
+               else if (opcode.equalsIgnoreCase("maxpooling_backward") || opcode.equalsIgnoreCase("relu_maxpooling_backward")
                                || opcode.equalsIgnoreCase("conv2d")
                                || opcode.equalsIgnoreCase("conv2d_backward_filter")
                                || opcode.equalsIgnoreCase("conv2d_backward_data")) {
@@ -288,6 +293,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                ec.setMatrixOutput(getOutputVariableName(), outputBlock);
        }
        
+       // Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true.
+       // This increases the number of native calls, for example in the cases where the filter is sparse but the input is dense.
+       private boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException {
+               long numElems = filter.getNumRows()*filter.getNumColumns();
+               // Convert a sparse filter to dense if it has fewer than 10e+6 entries (i.e. less than ~80 MB in dense format), which handles almost all the cases.
+               // In fact, an even smaller threshold (e.g. 1 MB) would still be sufficient for common CNNs.
+               if(filter.isInSparseFormat() && numElems < 10e+6)
+                       filter.sparseToDense(); 
+               return filter.isInSparseFormat();
+       }
+       
        
        @Override
        public void processInstruction(ExecutionContext ec)
@@ -326,6 +342,7 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
                
                ConvolutionParameters params = new ConvolutionParameters(N, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, _numThreads);
+               params.enableNative = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS) && NativeHelper.isNativeLibraryLoaded();
                if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
                        if(matBlock.isEmptyBlock()) {
                                outputBlock = new MatrixBlock(N, C*P*Q, true);
@@ -337,14 +354,17 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                                LibMatrixDNN.maxpooling(matBlock, outputBlock, params);
                        }
                }
-               else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
+               else if (instOpcode.equalsIgnoreCase("maxpooling_backward") || instOpcode.equalsIgnoreCase("relu_maxpooling_backward")) {
                        MatrixBlock dout = ec.getMatrixInput(_in2.getName());
                        if(matBlock.isEmptyBlock() || dout.isEmptyBlock()) {
                                outputBlock = new MatrixBlock(N, C*H*W, true);
                        }
                        else {
                                outputBlock = getDenseOutputBlock(N, C*H*W);
-                               LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params);
+                               if(instOpcode.equalsIgnoreCase("maxpooling_backward"))
+                                       LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, false);
+                               else
+                                       LibMatrixDNN.maxpoolingBackward(matBlock, dout, outputBlock, params, true);
                        }
                        ec.releaseMatrixInput(_in2.getName());
                }
@@ -355,7 +375,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                        }
                        else {
                                outputBlock = getDenseOutputBlock(N, K*P*Q);
-                               LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+                               if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+                                       LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+                               else
+                                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
                }
@@ -367,9 +390,13 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                        }
                        else {
                                outputBlock = getDenseOutputBlock(N, K*P*Q);
-                               if(!bias.isEmptyBlock())
+                               if(!bias.isEmptyBlock()) {
                                        params.bias = bias;
-                               LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+                               }
+                               if(params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat())
+                                       LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+                               else
+                                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in3.getName());
                        ec.releaseMatrixInput(_in2.getName());
@@ -381,7 +408,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                        }
                        else {
                                outputBlock = getDenseOutputBlock(K, C*R*S);
-                               LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+                               if(params.enableNative && !matBlock.isInSparseFormat() && !dout.isInSparseFormat())
+                                       LibMatrixNative.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
+                               else
+                                       LibMatrixDNN.conv2dBackwardFilter(matBlock, dout, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
                }
@@ -392,7 +422,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction
                        }
                        else {
                                outputBlock = getDenseOutputBlock(N, C * H * W);
-                               LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
+                               if(params.enableNative && !isFilterSparse(matBlock) && !dout.isInSparseFormat())
+                                       LibMatrixNative.conv2dBackwardData(matBlock, dout, outputBlock, params);
+                               else
+                                       LibMatrixDNN.conv2dBackwardData(matBlock, dout, outputBlock, params);
                        }
                        ec.releaseMatrixInput(_in2.getName());
                }

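The CP-side dispatch introduced above reduces to one predicate: call into LibMatrixNative only when native BLAS is enabled in the DML configuration, the native library actually loaded, and both operands end up dense (isFilterSparse may first densify a small sparse filter so that more calls qualify). A minimal, self-contained sketch of that rule follows; the class and field names are illustrative stand-ins for ConfigurationManager/NativeHelper state, not SystemML APIs:

    public class NativeDispatchSketch {
            // Stand-ins for the two runtime checks combined into params.enableNative
            // (assumptions for illustration only).
            static boolean nativeBlasEnabled = true;   // cf. DMLConfig.NATIVE_BLAS
            static boolean nativeLibraryLoaded = true; // cf. NativeHelper.isNativeLibraryLoaded()

            // Mirrors: params.enableNative && !isFilterSparse(filter) && !matBlock.isInSparseFormat()
            static boolean useNativeConv2d(boolean filterSparse, boolean inputSparse) {
                    boolean enableNative = nativeBlasEnabled && nativeLibraryLoaded;
                    return enableNative && !filterSparse && !inputSparse;
            }

            public static void main(String[] args) {
                    System.out.println(useNativeConv2d(false, false)); // true  -> LibMatrixNative.conv2d
                    System.out.println(useNativeConv2d(true, false));  // false -> LibMatrixDNN.conv2d fallback
            }
    }
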
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
index 497b182..9d4cd1f 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java
@@ -141,7 +141,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        return new ConvolutionGPUInstruction(in1, in2, in3, out, opcode, str, stride,
                                        padding, input_shape, filter_shape);
                }
-               else if (opcode.equalsIgnoreCase("maxpooling") || opcode.equalsIgnoreCase("relu_maxpooling")) {
+               else if (opcode.equalsIgnoreCase("maxpooling")) {
                        InstructionUtils.checkNumFields(parts, 14);
                        CPOperand in1 = new CPOperand(parts[1]);
                        CPOperand out = new CPOperand(parts[14]);
@@ -303,7 +303,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        LibMatrixCUDA.conv2dBackwardData(ec.getGPUContext(), getExtendedOpcode(), filter, dout, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
-               else if (instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {
+               else if (instOpcode.equalsIgnoreCase("maxpooling")) {
                        MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
 
                        if(image.getNumRows() != N || image.getNumColumns() != C*H*W) 
@@ -315,9 +315,6 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                        if(instOpcode.equalsIgnoreCase("maxpooling"))
                                LibMatrixCUDA.maxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
                                        K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-                       else
-                               LibMatrixCUDA.reluMaxpooling(ec.getGPUContext(), getExtendedOpcode(), image, out, N, C, H, W,
-                                               K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
                }
                else if (instOpcode.equalsIgnoreCase("maxpooling_backward")) {
                        MatrixObject image = getMatrixInputForGPUInstruction(ec, _input1.getName());
@@ -341,7 +338,7 @@ public class ConvolutionGPUInstruction extends GPUInstruction
                // release inputs/outputs
                ec.releaseMatrixInputForGPUInstruction(_input1.getName());
 
-               if (!( instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) )
+               if ( !instOpcode.equalsIgnoreCase("maxpooling") )
                        ec.releaseMatrixInputForGPUInstruction(_input2.getName());
 
                if (instOpcode.equalsIgnoreCase("conv2d_bias_add"))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
index b485233..f687cc0 100644
--- a/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
+++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/ConvolutionSPInstruction.java
@@ -25,6 +25,8 @@ import java.util.Iterator;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.broadcast.Broadcast;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.parser.Expression.DataType;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.DMLRuntimeException;
@@ -41,6 +43,7 @@ import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
 import org.apache.sysml.runtime.matrix.data.ConvolutionParameters;
 import org.apache.sysml.runtime.matrix.data.InputInfo;
 import org.apache.sysml.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysml.runtime.matrix.data.LibMatrixNative;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
 import org.apache.sysml.runtime.matrix.data.OutputInfo;
@@ -281,7 +284,8 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
                        int Q = (int) ConvolutionUtils.getQ(W, S, stride_w, pad_w);
                        
                        ConvolutionParameters params = new ConvolutionParameters(numRowsPerBlock, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, 1);
-                       JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows()), true);
+                       boolean enableNativeBLAS = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); 
+                       JavaPairRDD<MatrixIndexes,MatrixBlock> out = inputRDD.mapPartitionsToPair(new RDDConv2dMapMMFunction(filterBroadcast, params, instOpcode, biasBroadcast, mcRdd.getRows(), enableNativeBLAS), true);
                        
                        //put output RDD handle into symbol table
                        sec.setRDDHandleForVariable(output.getName(), out);
@@ -316,15 +320,16 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
                Broadcast<MatrixBlock> filterBroadcast = null;
                Broadcast<MatrixBlock> biasBroadcast = null;
                ConvolutionParameters params = null;
-               String instOpcode = null;
+               String instOpcode = null; boolean enableNative;
                long numRows = 0;
                public RDDConv2dMapMMFunction(Broadcast<MatrixBlock> filterBroadcast, 
-                               ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows) {
+                               ConvolutionParameters params, String instOpcode, Broadcast<MatrixBlock> biasBroadcast, long numRows, boolean enableNativeBLAS) {
                        this.filterBroadcast = filterBroadcast;
                        this.params = params;
                        this.instOpcode = instOpcode;
                        this.biasBroadcast = biasBroadcast;
                        this.numRows = numRows;
+                       this.enableNative = enableNativeBLAS;
                }
                
                private MatrixBlock processRectangularBlock(MatrixBlock matBlock) throws Exception {
@@ -336,7 +341,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
                                }
                                else {
                                        outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
-                                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+                                       if(enableNative)
+                                               LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+                                       else
+                                               LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
                                }
                        }
                        else if (instOpcode.equalsIgnoreCase("conv2d_bias_add")) {
@@ -349,7 +357,10 @@ public class ConvolutionSPInstruction extends UnarySPInstruction {
                                        outputBlock = getDenseOutputBlock(params.N, params.K*params.P*params.Q);
                                        if(!bias.isEmptyBlock())
                                                params.bias = bias;
-                                       LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
+                                       if(enableNative)
+                                               LibMatrixNative.conv2d(matBlock, filter, outputBlock, params);
+                                       else
+                                               LibMatrixDNN.conv2d(matBlock, filter, outputBlock, params);
                                }
                        }
                        else if(instOpcode.equalsIgnoreCase("maxpooling") || instOpcode.equalsIgnoreCase("relu_maxpooling")) {

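One detail worth noting in the Spark path above: the NATIVE_BLAS flag is read once on the driver and handed to RDDConv2dMapMMFunction through its constructor, so the decision travels to the executors inside the serialized closure (presumably because the DML configuration is not reliably available on executor JVMs). A minimal, self-contained sketch of that pattern, with illustrative names rather than SystemML APIs:

    import java.io.Serializable;

    // Capture a driver-side configuration flag in a serializable function object.
    class NativeFlagCarrier implements Serializable {
            private static final long serialVersionUID = 1L;
            private final boolean enableNative; // set on the driver, shipped with the closure

            NativeFlagCarrier(boolean enableNative) {
                    this.enableNative = enableNative;
            }

            // Executors consult the captured flag instead of re-reading configuration.
            String dispatch(String op) {
                    return enableNative ? "LibMatrixNative." + op : "LibMatrixDNN." + op;
            }
    }
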
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
index 3f0437f..11e74ca 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/ConvolutionParameters.java
@@ -34,7 +34,7 @@ public class ConvolutionParameters implements Serializable {
        public int K; public int R; public int S; public int stride_h; public int stride_w; public int pad_h; public int pad_w;
        public int P; public int Q; public int numThreads;
        
-       
+       public boolean enableNative = false;
        public MatrixBlock input1; public MatrixBlock input2; public MatrixBlock output;
        
        public MatrixBlock bias;

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 9ddd87a..56360f8 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -1043,41 +1043,6 @@ public class LibMatrixCUDA {
        }
        
        /**
-        * performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
-        * @param gCtx   a valid {@link GPUContext}
-        * @param instName the invoking instruction's name for record {@link Statistics}.
-        * @param image image as matrix object
-        * @param outputBlock output matrix
-        * @param N                             batch size
-        * @param C                             number of channels
-        * @param H                             height of image
-        * @param W                             width of image
-        * @param K                             number of filters
-        * @param R                             height of filter
-        * @param S                             width of filter
-        * @param pad_h                 vertical padding
-        * @param pad_w                 horizontal padding
-        * @param stride_h              horizontal stride
-        * @param stride_w              vertical stride
-        * @param P                             (H - R + 1 + 2*pad_h)/stride_h
-        * @param Q                             (W - S + 1 + 2*pad_w)/stride_w
-        * @throws DMLRuntimeException if DMLRuntimeException occurs
-        */
-       public static void reluMaxpooling(GPUContext gCtx, String instName, MatrixObject image,
-                       MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
-                       int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
-                       int Q) throws DMLRuntimeException {
-        LOG.trace("GPU : reluMaxpooling" + ", GPUContext=" + gCtx);
-        cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W);
-               long size  = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
-               Pointer tmp = gCtx.allocate(size);
-               performCuDNNReLU(gCtx, instName, image, tmp, srcTensorDesc);
-               //cudaDeviceSynchronize; // It seemed like the cudnn operation in performCuDNNReLU was being done aysnchronously, this adds the neccesary sync
-               performMaxpooling(gCtx, instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
-               gCtx.cudaFreeHelper(tmp);
-       }
-
-       /**
         * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
         * This method computes the backpropogation errors for previous layer of maxpooling operation
         * @param gCtx   a valid {@link GPUContext}

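The deleted reluMaxpooling kernel above materialized relu(image) into a temporary buffer before pooling. The same result is obtainable without the temporary, because max-pooling of relu(x) over a non-empty window equals the max search with each value clamped at zero; this is the same identity the relu_maxpooling_backward CPU path in LibMatrixDNN (next file) exploits via its performReluBackward flag. A small sketch of the identity; reluMaxPoolWindow is an illustrative helper, not a SystemML API:

    // maxpool(relu(x)) over one pooling window == max over values clamped at 0
    static double reluMaxPoolWindow(double[] in, int[] windowIdx) {
            double max = 0; // relu output is non-negative, so 0 is a safe identity element
            for (int idx : windowIdx) {
                    double v = Math.max(in[idx], 0); // apply relu on the fly
                    if (v > max) max = v;
            }
            return max;
    }
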
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index cd88b95..6a43917 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -36,6 +36,8 @@ import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.util.ConvolutionUtils;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
 
 /**
  * This class allows users to invoke deep learning related operations 
@@ -57,10 +59,10 @@ public class LibMatrixDNN {
        public static boolean DISPLAY_STATISTICS = false; //conv2d summaries in 
stats output
 
        private enum TaskType {
-               MaxPooling_Forward, MaxPooling_Backward, 
+               MaxPooling_Forward, MaxPooling_Backward, MaxPooling_Relu_Backward,
                // Alternate approaches that we tried but the performance was unsatisfactory be included: direct, non-looped im2col
                LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData,
-               BiasAdd, ReluBackward, BiasMultiply
+               ReluBackward
        }
        
        // ------------------------------------------------------------------------------------------------
@@ -141,6 +143,30 @@ public class LibMatrixDNN {
        // ------------------------------------------------------------------------------------------------
        
        /**
+        * This method performs the convolution (i.e. cross-correlation) operation on the input
+        * 
+        * @param input input batch 
+        * @param filter filter
+        * @param outputBlock output of convolution
+        * @param params convolution parameters
+        * @throws DMLRuntimeException if DMLRuntimeException occurs
+        */
+       public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+               LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, params);
+               
+               if(params.bias != null && params.bias.isInSparseFormat())
+                       params.bias.sparseToDense(); // since the bias is an extremely small array
+               
+               if(isEligibleForConv2dSparse(params))
+                       Statistics.numNativeLibMatrixDNNCalls.increment();
+               
+               runConvTask(TaskType.LoopedIm2ColConv2d, params);
+               
+               //post-processing: maintain nnz
+               outputBlock.recomputeNonZeros();
+       }
+       
+       /**
         * This method computes the backpropogation errors for previous layer of convolution operation
         * 
         * @param filter filter used in conv2d 
@@ -150,25 +176,10 @@ public class LibMatrixDNN {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-               params.input1 = filter;
-               params.input2 = dout;
-               params.output = outputBlock;
-               if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S || 
-                               dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
-                       throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
-               }
-               if(params.stride_h <= 0 || params.stride_w <= 0) {
-                       throw new DMLRuntimeException("Only positive strides supported");
-               }
+               checkInputsConv2dBackwardData(filter, dout, outputBlock, params);
                
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
-                       if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
-                               conv2dBwdDataSparseCount.addAndGet(1);
-                       }
-                       else {
-                               conv2dBwdDataDenseCount.addAndGet(1);
-                       }
-               }
+               if(isEligibleForConv2dBackwardDataDense(params))
+                       Statistics.numNativeLibMatrixDNNCalls.increment();
                
                runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
                
@@ -186,17 +197,71 @@ public class LibMatrixDNN {
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
        public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
-               params.input1 = input;
+               checkInputsConv2dBackwardFilter(input, dout, outputBlock, params);
+               
+               if(isEligibleForConv2dBackwardFilterSparseDense(params))
+                       Statistics.numNativeLibMatrixDNNCalls.increment();
+               
+               runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
+               
+               //post-processing: maintain nnz
+               outputBlock.recomputeNonZeros();
+       }
+       
+       
+       private static void checkOrThrowException(String msg, long lhs, long rhs) throws DMLRuntimeException {
+               if(lhs != rhs)
+                       throw new DMLRuntimeException(msg + ":" + lhs + " != " + rhs);
+       }
+       private static void checkOrThrowException(String msg, long lhs, long rhs1, long rhs2, long rhs3) throws DMLRuntimeException {
+               if(lhs != (rhs1*rhs2*rhs3))
+                       throw new DMLRuntimeException(msg + ":" + lhs + " != (" + rhs1 + " * " + rhs2 + " * " + rhs3 + ")");
+       }
+       
+       static void checkInputsConv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  throws DMLRuntimeException {
+               params.input1 = filter;
                params.input2 = dout;
                params.output = outputBlock;
-               if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || 
-                               dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
-                       throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
-               }
-               if(params.stride_h <= 0 || params.stride_w <= 0) {
-                       throw new DMLRuntimeException("Only positive strides supported");
+               checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input filter != "
+                               + "number of filters in filter_shape", filter.getNumRows(), params.K);
+               checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input filter != "
+                               + "channels*filter_height*filter_width in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+               checkOrThrowException("Incorrect input to conv2d_backward_data: Number of rows of input errors != "
+                               + "batch size in input_shape", dout.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to conv2d_backward_data: Number of columns of input errors != "
+                               + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+               if(params.stride_h <= 0 || params.stride_w <= 0) 
+                       throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+               
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
+                       if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
+                               conv2dBwdDataSparseCount.addAndGet(1);
+                       }
+                       else {
+                               conv2dBwdDataDenseCount.addAndGet(1);
+                       }
                }
                
+               int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if (!(ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1))
+                       params.numThreads = 1;
+       }
+       
+       static void checkInputsConv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params)  throws DMLRuntimeException {
+               params.input1 = input;
+               params.input2 = dout;
+               params.output = outputBlock;
+               checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input data != "
+                               + "batch size in input_shape", input.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input data != "
+                               + "channels*input_height*input_width in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+               checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of rows of input errors != "
+                               + "batch size in input_shape", dout.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to conv2d_backward_filter: Number of columns of input errors != "
+                               + "expected input error channels*height*width", dout.getNumColumns(), params.K, params.P, params.Q);
+               if(params.stride_h <= 0 || params.stride_w <= 0) 
+                       throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
+               
                if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        if(input.isInSparseFormat() || dout.isInSparseFormat()) {
                                conv2dBwdFilterSparseCount.addAndGet(1);
@@ -205,11 +270,6 @@ public class LibMatrixDNN {
                                conv2dBwdFilterDenseCount.addAndGet(1);
                        }
                }
-               
-               runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
-               
-               //post-processing: maintain nnz
-               outputBlock.recomputeNonZeros();
        }
        
        /**
@@ -252,11 +312,10 @@ public class LibMatrixDNN {
                MatrixBlock filter = params.input1;
                MatrixBlock dout = params.input2;
                doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
-               dout_reshaped.recomputeNonZeros();
                
                MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
                long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
-               LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
+               singleThreadedMatMult(dout_reshaped, filter, temp, true, false, params);
                long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
                doCol2imOverSingleImage(n, temp, params);
                long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
@@ -270,15 +329,13 @@ public class LibMatrixDNN {
                        MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params, double []  tempIm2ColArr) throws DMLRuntimeException {
                long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
                doIm2col(n, im2ColOutBlock, params, tempIm2ColArr);
-               im2ColOutBlock.recomputeNonZeros();
                long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
                
                doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
-               dout_reshaped.recomputeNonZeros();
                
                MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
                long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
-               LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
+               singleThreadedMatMult(im2ColOutBlock, dout_reshaped, temp, true, true, params);
                long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
                if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
@@ -298,15 +355,21 @@ public class LibMatrixDNN {
                ret[2] = j % W;
        }
        
-       public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+       static void checkInputsConv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
                params.input1 = input;
                params.input2 = filter;
                params.output = outputBlock;
                
-               if(input.getNumRows() != params.N || input.getNumColumns() != params.C*params.H*params.W || 
-                               filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S) {
-                       throw new DMLRuntimeException("Incorrect input to conv2d: " + input.getNumRows());
-               }
+               checkOrThrowException("Incorrect input to conv2d: Number of rows of input filter != "
+                               + "number of filters in filter_shape", filter.getNumRows(), params.K);
+               checkOrThrowException("Incorrect input to conv2d: Number of columns of input filter != "
+                               + "channels*filter_height*filter_width in filter_shape", filter.getNumColumns(), params.C, params.R, params.S);
+               checkOrThrowException("Incorrect input to conv2d: Number of rows of input data != "
+                               + "batch size in input_shape", input.getNumRows(), params.N);
+               checkOrThrowException("Incorrect input to conv2d: Number of columns of input data != "
+                               + "channels*input_height*input_width in input_shape", input.getNumColumns(), params.C, params.H, params.W);
+               if(params.stride_h <= 0 || params.stride_w <= 0) 
+                       throw new DMLRuntimeException("Only positive strides supported:" + params.stride_h + ", " + params.stride_w);
                
                if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        if(input.isInSparseFormat() || filter.isInSparseFormat()) {
@@ -316,21 +379,36 @@ public class LibMatrixDNN {
                                conv2dDenseCount.addAndGet(1);
                        }
                }
-               
-               runConvTask(TaskType.LoopedIm2ColConv2d, params);
-               
-               //post-processing: maintain nnz
-               outputBlock.recomputeNonZeros();
+       }
+       
+       // Single-threaded matrix multiplication
+       private static void singleThreadedMatMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, 
+                       boolean recomputeNNZM1, boolean recomputeNNZM2, ConvolutionParameters params) throws DMLRuntimeException {
+               if(!params.enableNative || m1.isInSparseFormat() || m2.isInSparseFormat()) {
+                       if(recomputeNNZM1)
+                               m1.recomputeNonZeros();
+                       if(recomputeNNZM2)
+                               m2.recomputeNonZeros();
+                       LibMatrixMult.matrixMult(m1, m2, ret, false);
+               }
+               else {
+                       ret.sparse = false;
+                       if(ret.getDenseBlock() == null)
+                               ret.allocateDenseBlock();
+                       NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, 
+                                       ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), 1);
+                       ret.recomputeNonZeros();
+               }
        }
        
        private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params, double []  temp) throws DMLRuntimeException {
                long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
                doIm2col(n, im2ColOutBlock, params, temp);
-               im2ColOutBlock.recomputeNonZeros();
                long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
                
                MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
-               LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
+               singleThreadedMatMult(params.input2, im2ColOutBlock, matMultOutBlock, false, true, params);
+               
                long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
                
                if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
@@ -380,9 +458,11 @@ public class LibMatrixDNN {
         * @param dout dout matrix
         * @param outputBlock output matrix
         * @param params convolution parameters
+        * @param performReluBackward perform ReLU backward
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
-       public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
+       public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, 
+                       ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
                params.input1 = input;
                params.input2 = dout;
                params.output = outputBlock;
@@ -407,7 +487,10 @@ public class LibMatrixDNN {
                        throw new DMLRuntimeException("Sparse 
maxpooling_backward is not supported");
 
                fillIndexesArray(params);
-               runConvTask(TaskType.MaxPooling_Backward, params);
+               if(performReluBackward)
+                       runConvTask(TaskType.MaxPooling_Relu_Backward, params);
+               else
+                       runConvTask(TaskType.MaxPooling_Backward, params);
                
                //post-processing: maintain nnz 
                outputBlock.recomputeNonZeros();
@@ -440,7 +523,7 @@ public class LibMatrixDNN {
                }
        }
        
-       private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
+       private static void doPoolingBackward(int n, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
                double [] inputArray = null;
                if (!params.input1.isInSparseFormat())
                        inputArray = params.input1.getDenseBlock();
@@ -455,19 +538,19 @@ public class LibMatrixDNN {
                        
                if(inputArray != null) {
                        if(doutArray != null)
-                               doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params);
+                               doPoolingBackwardDenseDense(n, inputArray, doutArray, outputArray, params, performReluBackward);
                        else
-                               doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params);
+                               doPoolingBackwardDenseSparse(n, inputArray, params.input2, outputArray, params, performReluBackward);
                }
                else {
                        if(doutArray != null)
-                               doPoolingBackwardSparseDense(n, doutArray, outputArray, params);
+                               doPoolingBackwardSparseDense(n, doutArray, outputArray, params, performReluBackward);
                        else
-                               doPoolingBackwardSparseSparse(n, outputArray, params);
+                               doPoolingBackwardSparseSparse(n, outputArray, params, performReluBackward);
                }
        }
        
-       private static void doPoolingBackwardSparseDense(int n, double [] doutArray,  double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+       private static void doPoolingBackwardSparseDense(int n, double [] doutArray,  double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
                if (!params.input1.isInSparseFormat())
                        throw new DMLRuntimeException("Incorrect usage: Call 
optimized versions");
                
@@ -477,7 +560,7 @@ public class LibMatrixDNN {
                                        double inVal = doutArray[n*params.C*params.P*params.Q + c*params.P*params.Q +  p * params.Q + q];
                                        if(inVal != 0) {
                                                final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-                                               int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+                                               int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
                                                if(maxIndex != -1)
                                                        outputArray[maxIndex] += inVal;
                                        }
@@ -486,7 +569,7 @@ public class LibMatrixDNN {
                }
        }
        
-       private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+       private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
                if (!params.input1.isInSparseFormat())
                        throw new DMLRuntimeException("Incorrect usage: Call 
optimized versions");
                
@@ -502,7 +585,7 @@ public class LibMatrixDNN {
                                int p = tensorIndexes[1];
                                int q = tensorIndexes[2];
                                final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-                               int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
+                               int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params, performReluBackward);
                                if(maxIndex != -1)
                                        outputArray[maxIndex] += avals[j];
                        }
@@ -510,7 +593,7 @@ public class LibMatrixDNN {
        }
        
        private static void doPoolingBackwardDenseSparse(int n, double [] inputArray, 
-                       MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
+                       MatrixBlock dout, double [] outputArray, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
                if( !dout.sparseBlock.isEmpty(n) ) {
                        int [] tensorIndexes = new int[3];
                        int apos = dout.sparseBlock.pos(n);
@@ -523,7 +606,7 @@ public class LibMatrixDNN {
                                int p = tensorIndexes[1];
                                int q = tensorIndexes[2];
                                final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
-                               int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+                               int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
                                if(maxIndex != -1)
                                        outputArray[maxIndex] += avals[j];
                        }
@@ -531,14 +614,14 @@ public class LibMatrixDNN {
        }
        
        private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray, 
-                       double [] outputArray, ConvolutionParameters params) {
+                       double [] outputArray, ConvolutionParameters params, boolean performReluBackward) {
                for (int c = 0; c < params.C; c++) {
                        final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
                        final int outputOffset = n*params.C*params.P*params.Q + c*params.P*params.Q;
                        
                        for (int p = 0; p < params.P; p++) {
                                for (int q = 0; q < params.Q; q++) {
-                                       int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
+                                       int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params, performReluBackward);
                                        if(maxIndex != -1)
                                                outputArray[maxIndex] += doutArray[outputOffset +  p * params.Q + q];
                                }
@@ -556,10 +639,11 @@ public class LibMatrixDNN {
         * @param c number of channels 
         * @param input input matrix
         * @param params convolution parameters
+        * @param performReluBackward perform ReLU on input
         * @return index of the cell with maximum value
         * @throws DMLRuntimeException if error occurs
         */
-       private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
+       private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params, boolean performReluBackward) throws DMLRuntimeException {
                if(!input.isInSparseFormat())
                        throw new DMLRuntimeException("Incorrect usage: Only 
sparse format supported");
                
@@ -591,9 +675,10 @@ public class LibMatrixDNN {
                                int h = tensorIndexes[1];
                                int w = tensorIndexes[2];
                                if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
-                                       if(maxVal < avals[j]) {
+                                       double val = performReluBackward && avals[j] < 0 ? 0 : avals[j]; 
+                                       if(maxVal < val) {
                                                maxIndex = inputOffset +  h*params.W + w;
-                                               maxVal = avals[j];
+                                               maxVal = val;
                                        }
                                }
                        }
@@ -612,9 +697,10 @@ public class LibMatrixDNN {
         * @param inputOffset offset to be used for input index
         * @param inputArray input array
         * @param params convolution parameters
+        * @param performReluBackward perform ReLU backward
         * @return index of cell with maximum value
         */
-       private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
+       private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params, boolean performReluBackward) {
                int start_index_h = params.start_indexes_h[p];
                int end_index_h = params.end_indexes_h[p];
                int start_index_w = params.start_indexes_w[q];
@@ -632,6 +718,7 @@ public class LibMatrixDNN {
                for (int h = start_index_h; h < end_index_h; h++) {
                        for (int w = start_index_w; w < end_index_w; w++) {
                                currDoutVal = inputArray[inputOffset +  h*params.W + w];
+                               currDoutVal = performReluBackward && currDoutVal < 0 ? 0 : currDoutVal;
                                if(maxVal < currDoutVal) {
                                        maxIndex = inputOffset +  h*params.W + w;
                                        maxVal = currDoutVal;
@@ -894,11 +981,15 @@ public class LibMatrixDNN {
        private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params, 
                        ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
                        ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
+               boolean isEligibleForConv2dSparse = (type == TaskType.LoopedIm2ColConv2d) && isEligibleForConv2dSparse(params);
+               boolean isEligibleForConv2dBackwardFilterSparseDense = (type == TaskType.LoopedIm2ColConv2dBwdFilter) && isEligibleForConv2dBackwardFilterSparseDense(params) ;
                for(int i = 0; i < poolSize; i++) {
                        if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
-                               MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
-                               im2ColOutBlock.allocateDenseBlock();
-                               im2ColOutBlocks.add(im2ColOutBlock);
+                               if(!isEligibleForConv2dSparse && !isEligibleForConv2dBackwardFilterSparseDense) {
+                                       MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
+                                       im2ColOutBlock.allocateDenseBlock();
+                                       im2ColOutBlocks.add(im2ColOutBlock);
+                               }
                        }
                        
                        if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
@@ -962,6 +1053,21 @@ public class LibMatrixDNN {
        }
        // ----------------------------------------------------------------------------------------------------------------
        
+       private static boolean isEligibleForConv2dBackwardFilterSparseDense(ConvolutionParameters params) {
+               // NativeHelper.conv2dBackwardFilterSparseDense only if the input (params.input1) is sparse. 
+               // dout converted to dense if sparse.
+               return params.enableNative && params.input1.isInSparseFormat();
+       }
+       private static boolean isEligibleForConv2dSparse(ConvolutionParameters params) {
+               // NativeHelper.conv2dSparse only if filter is dense and input is sparse
+               return params.enableNative && params.input1.isInSparseFormat() && !params.input2.isInSparseFormat();
+       }
+       private static boolean isEligibleForConv2dBackwardDataDense(ConvolutionParameters params) {
+               // NativeHelper.conv2dBackwardDataDense only if filter is dense. 
+               // dout converted to dense if sparse.
+               return params.enableNative && !params.input1.isInSparseFormat();
+       }
+       
        /**
         * The ConvTask allows the convolution operations (such s conv2d, conv2d_backward, maxpooling, etc)
         * to be executed in multi-thread manner.
@@ -1001,55 +1107,108 @@ public class LibMatrixDNN {
                                        break;
                                case MaxPooling_Backward:
                                        for(int n = _rl; n < _ru; n++) 
-                                               doPoolingBackward(n, _params);
-                                       break;
-                               case BiasAdd:
-                               {
-                                       double [] dest = _params.output.getDenseBlock();
-                                       ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, 
-                                                       _rl, _ru, _binaryElementWiseAddition);
+                                               doPoolingBackward(n, _params, false);
                                        break;
-                               }
-                               case BiasMultiply:
-                               {
-                                       double [] dest = _params.output.getDenseBlock();
-                                       ConvolutionUtils.binaryBiasOperations(_params.input1, _params.bias, dest, _params.K, _params.P*_params.Q, 
-                                                       _rl, _ru, _binaryElementWiseMultiplication);
+                               case MaxPooling_Relu_Backward:
+                                       for(int n = _rl; n < _ru; n++) 
+                                               doPoolingBackward(n, _params, true);
                                        break;
-                               }
                                case ReluBackward:
                                        lnnz = doReluBackward(_params, _rl, _ru);
                                        break;
                                case LoopedIm2ColConv2d:
                                {       
-                                       MatrixBlock im2ColOutBlock = 
_im2ColOutBlocks.remove();
-                                       double [] temp = 
_params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] 
: null;
-                                       for(int n = _rl; n < _ru; n++) 
-                                               doLoopedIm2ColConv2d(n, 
im2ColOutBlock, _params, temp);
-                                       _im2ColOutBlocks.add(im2ColOutBlock);
-                                       if(_params.bias != null)
-                                               
ConvolutionUtils.binaryBiasOperationInPlace(_params.bias, 
_params.output.getDenseBlock(), _params.K, 
-                                                               
_params.P*_params.Q, _rl, _ru, _binaryElementWiseAddition);
+                                       if(isEligibleForConv2dSparse(_params)) {
+                                               // NativeHelper.conv2dSparse 
only if filter is dense and input is sparse
+                                               int KPQ = 
_params.K*_params.P*_params.Q;
+                                               double[] temp = new double[KPQ];
+                                               for(int n = _rl; n < _ru; n++)  
{
+                                                       if( 
!_params.input1.getSparseBlock().isEmpty(n) ) {
+                                                               int apos = 
_params.input1.getSparseBlock().pos(n);
+                                                               int alen = 
_params.input1.getSparseBlock().size(n);
+                                                               int[] aix = 
_params.input1.getSparseBlock().indexes(n);
+                                                               double[] avals 
= _params.input1.getSparseBlock().values(n);
+                                                               
NativeHelper.conv2dSparse(apos, alen, aix, avals, 
_params.input2.getDenseBlock(), temp, 
+                                                                               
1, _params.C, _params.H, _params.W, _params.K, _params.R, _params.S, 
+                                                                               
_params.stride_h, _params.stride_w, _params.pad_h, _params.pad_w, _params.P, 
_params.Q, 1);
+                                                               
System.arraycopy(temp, 0, _params.output.denseBlock, n*KPQ, KPQ);
+                                                       }
+                                               }
+                                       }
+                                       else {
+                                               // In all other cases, perform im2col in Java + matmult (either native or Java).
+                                               MatrixBlock im2ColOutBlock = 
_im2ColOutBlocks.remove();
+                                               double [] temp = 
_params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] 
: null;
+                                               for(int n = _rl; n < _ru; n++) 
+                                                       doLoopedIm2ColConv2d(n, 
im2ColOutBlock, _params, temp);
+                                               
_im2ColOutBlocks.add(im2ColOutBlock);
+                                       }
+                                       if(_params.bias != null) {
+                                               // bias is always converted to 
dense format
+                                               double [] biasArr = 
_params.bias.getDenseBlock();
+                                               int PQ = _params.P*_params.Q;
+                                               int index = _rl*_params.K*PQ;
+                                               for(int n = _rl; n < _ru; n++) {
+                                                       for(int k = 0; k < 
_params.K; k++) {
+                                                               for(int pq = 0; 
pq < PQ; pq++, index++) {
+                                                                       
_params.output.denseBlock[index] += biasArr[k];
+                                                               }
+                                                       }
+                                               }
+                                       }
                                        break;
                                }
                                case LoopedIm2ColConv2dBwdFilter:
                                {
-                                       MatrixBlock im2ColOutBlock = 
_im2ColOutBlocks.remove();
                                        MatrixBlock partialRetBlock = 
_partialRetBlocks.remove();
                                        MatrixBlock doutReshapedBlock = 
_doutReshapedBlocks.remove();
-                                       double [] temp = 
_params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] 
: null;
-                                       for(int n = _rl; n < _ru; n++) 
-                                               partialRetBlock = 
doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, 
partialRetBlock, _params, temp);
-                                       _im2ColOutBlocks.add(im2ColOutBlock);
-                                       _partialRetBlocks.add(partialRetBlock);
+                                       
if(isEligibleForConv2dBackwardFilterSparseDense(_params)) {
+                                               double [] dout_n = 
doutReshapedBlock.getDenseBlock();
+                                               for(int n = _rl; n < _ru; n++) {
+                                                       if( 
!_params.input1.getSparseBlock().isEmpty(n) ) {
+                                                               doRotate180(n, 
0, _params.input2, dout_n, _params, true);
+                                                               int apos = 
_params.input1.getSparseBlock().pos(n);
+                                                               int alen = 
_params.input1.getSparseBlock().size(n);
+                                                               int[] aix = 
_params.input1.getSparseBlock().indexes(n);
+                                                               double[] avals 
= _params.input1.getSparseBlock().values(n);
+                                                               
NativeHelper.conv2dBackwardFilterSparseDense(apos, alen, aix, avals, 
+                                                                               
dout_n, partialRetBlock.getDenseBlock(), 1, _params.C, _params.H, _params.W, 
_params.K, 
+                                                                               
_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, 
_params.pad_w, _params.P, _params.Q, 1);
+                                                       }
+                                               }
+                                       }
+                                       else {
+                                               MatrixBlock im2ColOutBlock = 
_im2ColOutBlocks.remove();
+                                               double [] temp = 
_params.input1.isInSparseFormat() ? new double[_params.input1.getNumColumns()] 
: null;
+                                               for(int n = _rl; n < _ru; n++) 
+                                                       partialRetBlock = 
doLoopedIm2ColConv2dBwdFilter(n, im2ColOutBlock, doutReshapedBlock, 
partialRetBlock, _params, temp);
+                                               
_im2ColOutBlocks.add(im2ColOutBlock);
+                                       }
                                        
_doutReshapedBlocks.add(doutReshapedBlock);
+                                       _partialRetBlocks.add(partialRetBlock);
                                        break;
                                }
                                case LoopedIm2ColConv2dBwdData:
                                {
                                        MatrixBlock doutReshapedBlock = 
_doutReshapedBlocks.remove();
-                                       for(int n = _rl; n < _ru; n++) 
-                                               doLoopedIm2ColConv2dBwdData(n, 
doutReshapedBlock, _params);
+                                       
if(isEligibleForConv2dBackwardDataDense(_params)) {
+                                               int CHW = 
_params.C*_params.H*_params.W;
+                                               double [] ret = new double[CHW];
+                                               double [] filterArr = 
_params.input1.getDenseBlock();
+                                               for(int n = _rl; n < _ru; n++) {
+                                                       double [] dout_n = 
getRowInDenseFormat(_params.input2, n, doutReshapedBlock.getDenseBlock());
+                                                       if(n > _rl)
+                                                               
Arrays.fill(ret, 0);
+                                                       
NativeHelper.conv2dBackwardDataDense(filterArr, dout_n, ret, 1, 
+                                                                       
_params.C, _params.H, _params.W, _params.K, 
+                                                                       
_params.R, _params.S, _params.stride_h, _params.stride_w, _params.pad_h, 
_params.pad_w, _params.P, _params.Q, 1);
+                                                       System.arraycopy(ret, 
0, _params.output.getDenseBlock(), n*CHW, CHW);
+                                               }
+                                       }
+                                       else {
+                                               for(int n = _rl; n < _ru; n++) 
+                                                       
doLoopedIm2ColConv2dBwdData(n, doutReshapedBlock, _params);
+                                       }
                                        
_doutReshapedBlocks.add(doutReshapedBlock);
                                        break;
                                }
@@ -1192,16 +1351,24 @@ public class LibMatrixDNN {
        }
        
        // Returns the row of matrix in dense format
-       private static double [] getRowInDenseFormat(MatrixBlock input, int n, 
double []  temp) {
+       private static double [] getRowInDenseFormat(MatrixBlock input, int n, 
double []  temp) throws DMLRuntimeException {
+               if(input.getNumColumns() != temp.length) {
+                       throw new DMLRuntimeException("Invalid parameters: temp buffer length " + temp.length
+                               + " does not match the number of columns " + input.getNumColumns());
+               }
                // Use temporary array to avoid binary search
-               Arrays.fill(temp, 0);
-               if( !input.sparseBlock.isEmpty(n) ) {
-                       int apos = input.sparseBlock.pos(n);
-                       int alen = input.sparseBlock.size(n);
-                       int[] aix = input.sparseBlock.indexes(n);
-                       double[] avals = input.sparseBlock.values(n);
-                       for(int j=apos; j<apos+alen; j++)
-                               temp[ aix[j] ] = avals[j];
+               if(input.isInSparseFormat()) {
+                       Arrays.fill(temp, 0);
+                       if( !input.sparseBlock.isEmpty(n) ) {
+                               int apos = input.sparseBlock.pos(n);
+                               int alen = input.sparseBlock.size(n);
+                               int[] aix = input.sparseBlock.indexes(n);
+                               double[] avals = input.sparseBlock.values(n);
+                               for(int j=apos; j<apos+alen; j++)
+                                       temp[ aix[j] ] = avals[j];
+                       }
+               }
+               else {
+                       System.arraycopy(input.getDenseBlock(), 
n*input.getNumColumns(), temp, 0, input.getNumColumns());
                }
                return temp;
        }
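
For illustration, the same row-densification pattern on plain CSR-style arrays; a minimal sketch, with array names mirroring the apos/alen/aix/avals convention used throughout this file:

    // Scatter one sparse row (indexes aix, values avals) into a zeroed temp buffer,
    // avoiding the per-column binary search a direct lookup would need (illustrative).
    static double[] rowToDense(int apos, int alen, int[] aix, double[] avals, double[] temp) {
            java.util.Arrays.fill(temp, 0);
            for (int j = apos; j < apos + alen; j++)
                    temp[aix[j]] = avals[j];
            return temp;
    }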

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 5c7c2ef..983ce53 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -17,7 +17,6 @@
  * under the License.
  */
 
-
 package org.apache.sysml.runtime.matrix.data;
 
 import java.util.ArrayList;
@@ -123,7 +122,7 @@ public class LibMatrixMult
        {
                matrixMult(m1, m2, ret, rl, ru, true);
        }
-
+       
        public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, int rl, int ru, boolean examSparsity) 
                throws DMLRuntimeException
        {       
@@ -132,9 +131,9 @@ public class LibMatrixMult
                        ret.examSparsity(); //turn empty dense into sparse
                        return;
                }
-               
+                       
                //Timing time = new Timing(true);
-               
+                       
                //pre-processing: output allocation
                boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
                m2 = prepMatrixMultRightInput(m1, m2);
@@ -166,6 +165,7 @@ public class LibMatrixMult
                if(examSparsity)
                        ret.examSparsity();
                
+               
                //System.out.println("MM 
("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x"
 +
                //                            
"("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+")
 in "+time.stop());
        }
@@ -188,7 +188,7 @@ public class LibMatrixMult
                        ret.examSparsity(); //turn empty dense into sparse
                        return;
                }
-               
+                       
                //check too high additional vector-matrix memory requirements 
(fallback to sequential)
                //check too small workload in terms of flops (fallback to 
sequential too)
                if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD 
|| !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || 
m2.isUltraSparse()) 
@@ -247,6 +247,7 @@ public class LibMatrixMult
                        throw new DMLRuntimeException(ex);
                }
                
+               
                //post-processing (nnz maintained in parallel)
                ret.examSparsity();
                
@@ -3960,4 +3961,4 @@ public class LibMatrixMult
                        return _ret.recomputeNonZeros(_rl, _ru-1, 0, 
_ret.getNumColumns()-1);
                }
        }
-}
\ No newline at end of file
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
new file mode 100644
index 0000000..4b12596
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.runtime.matrix.data;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixNative {
+       
+       // We could encapsulate more heuristics in this function.
+       // For now, we only consider matrix-vector operations to be memory bound.
+       private static boolean isMatMultMemoryBound(int m1Rlen, int m1Clen, int 
m2Clen) {
+               return m1Rlen == 1 || m1Clen == 1 || m2Clen == 1;
+       }
+
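
The heuristic rests on arithmetic intensity: a matrix-vector product performs O(1) flops per byte moved, so the fixed JNI transfer cost is never amortized. A rough back-of-the-envelope sketch (illustrative, not part of the patch):

    // GEMM does 2*m*k*n flops over roughly 8*(m*k + k*n + m*n) bytes of doubles.
    // For n == 1 (matrix-vector) the ratio stays near 0.25 flops/byte, so native
    // BLAS cannot recoup the JNI copy overhead.
    static double flopsPerByte(long m, long k, long n) {
            double flops = 2.0 * m * k * n;
            double bytes = 8.0 * (m * k + k * n + m * n);
            return flops / bytes;
    }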
+       /**
+        * Performs matrix multiplication using the native library if BLAS is available,
+        * or else falls back to the Java implementation.
+        * 
+        * @param m1 lhs matrix block
+        * @param m2 rhs matrix block
+        * @param ret output matrix block
+        * @param k number of threads
+        * @throws DMLRuntimeException if an error occurs
+        */
+       public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, int k) throws DMLRuntimeException {
+               matrixMult(m1, m2, ret, k, true);
+       }
+       
+       public static void matrixMult(MatrixBlock m1, MatrixBlock m2, 
MatrixBlock ret, int k, boolean examSparsity) throws DMLRuntimeException {
+               // Sanity check:
+               k = k <= 0 ? NativeHelper.getMaxNumThreads() : k;
+               
+               // check inputs / outputs
+               if (m1.isEmptyBlock() || m2.isEmptyBlock()) {
+                       ret.setNonZeros(0);
+                       if(examSparsity)
+                               ret.examSparsity(); // turn empty dense into 
sparse
+                       return;
+               }
+               if (NativeHelper.isNativeLibraryLoaded() && 
+                               !isMatMultMemoryBound(m1.rlen, m1.clen, 
m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
+                       ret.sparse = false;
+                       ret.allocateDenseBlock();
+                       if (NativeHelper.matrixMultDenseDense(m1.denseBlock, 
m2.denseBlock, 
+                                       ret.denseBlock, m1.getNumRows(), 
m1.getNumColumns(), m2.getNumColumns(), k)) {
+                               
Statistics.numNativeLibMatrixMultCalls.increment();
+                               ret.recomputeNonZeros();
+                               // post-processing (nnz maintained in parallel)
+                               if(examSparsity)
+                                       ret.examSparsity();
+                               return;
+                       } else {
+                               // Else fall back to Java
+                               Statistics.incrementNativeFailuresCounter();
+                       }
+               }
+               if (k == 1)
+                       LibMatrixMult.matrixMult(m1, m2, ret, examSparsity);
+               else
+                       LibMatrixMult.matrixMult(m1, m2, ret, k);
+       }
+       
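
A minimal caller-side sketch (dimensions illustrative): with dense, non-vector inputs the call routes to native BLAS, otherwise it falls through to LibMatrixMult. The three-argument MatrixBlock constructor (rows, columns, sparse flag) is assumed here:

    MatrixBlock a = new MatrixBlock(1024, 512, false);  // dense lhs
    MatrixBlock b = new MatrixBlock(512, 256, false);   // dense rhs
    MatrixBlock c = new MatrixBlock(1024, 256, false);  // output
    LibMatrixNative.matrixMult(a, b, c, 8);             // 8 threads; falls back to Java if native BLAS is unavailable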
+       /**
+        * This method performs the convolution (i.e. cross-correlation) operation on the input.
+        * 
+        * @param input input batch 
+        * @param filter filter
+        * @param outputBlock output of convolution
+        * @param params convolution parameters
+        * @throws DMLRuntimeException if an error occurs
+        */
+       public static void conv2d(MatrixBlock input, MatrixBlock filter, 
MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
+               LibMatrixDNN.checkInputsConv2d(input, filter, outputBlock, 
params);
+               params.numThreads = params.numThreads <= 0 ? 
NativeHelper.getMaxNumThreads() : params.numThreads;
+               if(NativeHelper.isNativeLibraryLoaded() && 
!input.isInSparseFormat() && !filter.isInSparseFormat()) {
+                       setNumThreads(params);
+                       if(params.bias == null) {
+                               if(NativeHelper.conv2dDense(input.denseBlock, 
filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, 
params.W, 
+                                               params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+                                               params.P, params.Q, 
params.numThreads)) {
+                                       
Statistics.numNativeLibMatrixDNNCalls.increment();
+                                       // post-processing: maintain nnz
+                                       outputBlock.recomputeNonZeros();
+                                       return;
+                               }
+                               else {
+                                       // Fall back to Java on failure
+                                       
Statistics.incrementNativeFailuresCounter();
+                               }
+                       }
+                       else {
+                               if(params.bias.isInSparseFormat())
+                                       params.bias.sparseToDense(); // Bias 
matrix is usually extremely small
+                               
if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, 
filter.denseBlock, outputBlock.denseBlock, 
+                                               params.N, params.C, params.H, 
params.W, 
+                                               params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+                                               params.P, params.Q, 
params.numThreads)) {
+                                       
Statistics.numNativeLibMatrixDNNCalls.increment();
+                                       // post-processing: maintain nnz
+                                       outputBlock.recomputeNonZeros();
+                                       return;
+                               }
+                               else {
+                                       // Fall back to Java on failure
+                                       
Statistics.incrementNativeFailuresCounter();
+                               }
+                       }
+               }
+               
+               // Fall back to Java on failure or for sparse inputs
+               LibMatrixDNN.conv2d(input, filter, outputBlock, params);
+       }
+       
+       private static void setNumThreads(ConvolutionParameters params) {
+               params.numThreads = 
OptimizerUtils.getConstrainedNumThreads(params.numThreads);
+               if (!(params.isOutputThreadSafe() && params.numThreads > 1))
+                       params.numThreads = 1;
+       }
+       
+       /**
+        * This method computes the backpropagation errors for the filter of the convolution operation.
+        * 
+        * @param input input image 
+        * @param dout errors from next layer
+        * @param outputBlock  output errors
+        * @param params convolution parameters
+        * @throws DMLRuntimeException if an error occurs
+        */
+       public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock 
dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
+               LibMatrixDNN.checkInputsConv2dBackwardFilter(input, dout, 
outputBlock, params);
+               params.numThreads = params.numThreads <= 0 ? 
NativeHelper.getMaxNumThreads() : params.numThreads;
+               if(NativeHelper.isNativeLibraryLoaded() && 
!dout.isInSparseFormat() && !input.isInSparseFormat()) {
+                       setNumThreads(params);
+                       
if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+                                               params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+                                               params.P, params.Q, 
params.numThreads)) {
+                               
Statistics.numNativeLibMatrixDNNCalls.increment();
+                               // post-processing: maintain nnz
+                               outputBlock.recomputeNonZeros();
+                               return;
+                       }
+                       else {
+                               // Fall back to Java on failure
+                               Statistics.incrementNativeFailuresCounter();
+                       }
+               }
+               // Fall back to Java on failure or for sparse inputs
+               LibMatrixDNN.conv2dBackwardFilter(input, dout, outputBlock, 
params);
+       }
+       
+       /**
+        * This method computes the backpropagation errors for the previous layer of the convolution operation.
+        * 
+        * @param filter filter used in conv2d 
+        * @param dout errors from next layer
+        * @param outputBlock  output errors
+        * @param params convolution parameters
+        * @throws DMLRuntimeException if an error occurs
+        */
+       public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock 
dout, MatrixBlock outputBlock, ConvolutionParameters params) throws 
DMLRuntimeException {
+               LibMatrixDNN.checkInputsConv2dBackwardData(filter, dout, 
outputBlock, params);
+               params.numThreads = params.numThreads <= 0 ? 
NativeHelper.getMaxNumThreads() : params.numThreads;
+               if(NativeHelper.isNativeLibraryLoaded() && 
!dout.isInSparseFormat() && !filter.isInSparseFormat()) {
+                       setNumThreads(params);
+                       
if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+                                               params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+                                               params.P, params.Q, 
params.numThreads)) {
+                               
Statistics.numNativeLibMatrixDNNCalls.increment();
+                               // post-processing: maintain nnz
+                               outputBlock.recomputeNonZeros();
+                               return;
+                       }
+                       else {
+                               // Fall back to Java on failure
+                               Statistics.incrementNativeFailuresCounter();
+                       }
+               }
+               // Fall back to Java on failure or for sparse inputs
+               LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, 
params);
+       }
+}
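
Every entry point in this class follows the same shape: attempt the native kernel, which returns false on failure, and keep a Java fallback. A distilled sketch of that pattern (names illustrative; the statistics counters are elided):

    // Run the native kernel; if it signals failure, run the Java implementation instead.
    static void callWithFallback(java.util.function.BooleanSupplier nativeKernel, Runnable javaFallback) {
            if (!nativeKernel.getAsBoolean())
                    javaFallback.run();
    }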

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index c458b07..8fac947 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -35,6 +35,7 @@ import java.util.stream.LongStream;
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
 import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.lops.MMTSJ.MMTSJType;
@@ -4873,15 +4874,26 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                
                return sum_wt;
        }
+       
+       public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, 
MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, 
+                       MatrixValue result, AggregateBinaryOperator op ) throws 
DMLRuntimeException
+       {
+               boolean enableNativeBLAS = 
ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS);
+               return aggregateBinaryOperations(m1Value, m2Value, result, op, 
enableNativeBLAS);
+       }
 
        public MatrixValue aggregateBinaryOperations(MatrixIndexes m1Index, 
MatrixValue m1Value, MatrixIndexes m2Index, MatrixValue m2Value, 
-                       MatrixValue result, AggregateBinaryOperator op ) 
-               throws DMLRuntimeException
+                       MatrixValue result, AggregateBinaryOperator op, boolean 
enableNativeBLAS ) throws DMLRuntimeException
        {
-               return aggregateBinaryOperations(m1Value, m2Value, result, op);
+               return aggregateBinaryOperations(m1Value, m2Value, result, op, 
enableNativeBLAS);
+       }
+       
+       public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, 
MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) throws 
DMLRuntimeException {
+               boolean enableNativeBLAS = 
ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.NATIVE_BLAS); 
+               return aggregateBinaryOperations(m1Value, m2Value, result, op, 
enableNativeBLAS);
        }
 
-       public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, 
MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op) 
+       public MatrixValue aggregateBinaryOperations(MatrixValue m1Value, 
MatrixValue m2Value, MatrixValue result, AggregateBinaryOperator op, boolean 
nativeMatMult) 
                throws DMLRuntimeException
        {
                //check input types, dimensions, configuration
@@ -4907,7 +4919,9 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                        ret.reset(rl, cl, sp.sparse, sp.estimatedNonZeros);
                
                //compute matrix multiplication (only supported binary 
aggregate operation)
-               if( op.getNumThreads() > 1 )
+               if( nativeMatMult )
+                       LibMatrixNative.matrixMult(m1, m2, ret, 
op.getNumThreads());
+               else if( op.getNumThreads() > 1 )
                        LibMatrixMult.matrixMult(m1, m2, ret, 
op.getNumThreads());
                else
                        LibMatrixMult.matrixMult(m1, m2, ret);
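
A caller-side sketch of the new overload (illustrative; construction of the AggregateBinaryOperator op is omitted). Passing the flag explicitly bypasses the DMLConfig lookup, e.g. to force the pure-Java path:

    // Force the Java path regardless of the native.blas configuration.
    MatrixBlock res = (MatrixBlock) m1.aggregateBinaryOperations(m1, m2, new MatrixBlock(), op, false);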

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java 
b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index b988546..8e51405 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -22,6 +22,8 @@ package org.apache.sysml.runtime.util;
 import java.util.Arrays;
 
 import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.functionobjects.Multiply;
+import org.apache.sysml.runtime.functionobjects.Plus;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
 import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
 import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
@@ -58,12 +60,20 @@ public class ConvolutionUtils {
                }
                return ret;
        }
+
        
-       // Performs dest[destPos ...] <- src[src_rl:src_ru, ]
-       //Assumes that dest is zeroed-out before calling
-       public static void copy(MatrixBlock src, double [] dest, int destPos, 
int destNumCols, int src_rl, int src_ru) {
+       // Performs dest[destPos...] op= thatValue[src_rl:src_ru,]
+       public static void binaryOperationInPlace(MatrixBlock src, double [] 
dest, 
+                       int destPos, int destNumCols, int src_rl, int src_ru, 
BinaryOperator op) throws DMLRuntimeException {
                if(src.isInSparseFormat()) {
-                       if(!src.isEmptyBlock()) {
+                       if(src.isEmptyBlock() && op.fn == 
Plus.getPlusFnObject()) {
+                               // Do nothing: in-place addition of zero is a no-op
+                       }
+                       else if(src.isEmptyBlock() && op.fn == 
Multiply.getMultiplyFnObject()) {
+                               // In-place multiplication by zero: zero out the destination
+                               Arrays.fill(dest, destPos, destPos + 
(src_ru-src_rl)*destNumCols, 0);
+                       }
+                       else if(op.fn == Plus.getPlusFnObject()) {
                                for(int i = src_rl, cix = destPos; i < src_ru; 
i++, cix += destNumCols) {
                                        if( !src.getSparseBlock().isEmpty(i) ) {
                                                int apos = 
src.getSparseBlock().pos(i);
@@ -71,37 +81,54 @@ public class ConvolutionUtils {
                                                int[] aix = 
src.getSparseBlock().indexes(i);
                                                double[] avals = 
src.getSparseBlock().values(i);
                                                for(int j = apos; j < 
apos+alen; j++) {
-                                                       dest[ cix+aix[j] ] = 
avals[j];
+                                                       dest[ cix+aix[j] ] += 
avals[j];
                                                }
                                        }
                                }
                        }
-               }
-               else {
-                       System.arraycopy(src.getDenseBlock(), 
src_rl*src.getNumColumns(), dest, destPos, (src_ru-src_rl)*src.getNumColumns());
-               }
-       }
-       
-       // Performs dest[destPos...] op= thatValue[src_rl:src_ru,]
-       public static void binaryOperationInPlace(MatrixBlock src, double [] 
dest, 
-                       int destPos, int destNumCols, int src_rl, int src_ru, 
BinaryOperator op) throws DMLRuntimeException {
-               if(src.isInSparseFormat()) {
-                       for(int i = src_rl, cix = destPos; i < src_ru; i++, cix 
+= destNumCols) {
-                               if( !src.getSparseBlock().isEmpty(i) ) {
-                                       int apos = src.getSparseBlock().pos(i);
-                                       int alen = src.getSparseBlock().size(i);
-                                       int[] aix = 
src.getSparseBlock().indexes(i);
-                                       double[] avals = 
src.getSparseBlock().values(i);
-                                       for(int j = apos; j < apos+alen; j++) {
-                                               dest[ cix+aix[j] ] = 
op.fn.execute(dest[ cix+aix[j] ], avals[j]);
+                       else if(op.fn == Multiply.getMultiplyFnObject()) {
+                               // Unsafe operation: implicit zeros in src must zero out the corresponding dest entries
+                               for(int i = src_rl, cix = destPos; i < src_ru; 
i++, cix += destNumCols) {
+                                       if( !src.getSparseBlock().isEmpty(i) ) {
+                                               int apos = 
src.getSparseBlock().pos(i);
+                                               int alen = 
src.getSparseBlock().size(i);
+                                               int[] aix = 
src.getSparseBlock().indexes(i);
+                                               double[] avals = 
src.getSparseBlock().values(i);
+                                               int prevDestIndex = 0;
+                                               for(int j = apos; j < 
apos+alen; j++) {
+                                                       // Multiplication by 
zero. Assumption: aix is sorted.
+                                                       Arrays.fill(dest, 
cix+prevDestIndex, aix[j], 0);
+                                                       prevDestIndex = 
aix[j]+1;
+                                                       dest[ cix+aix[j] ] *= 
avals[j];
+                                               }
+                                               Arrays.fill(dest, 
cix+prevDestIndex, cix+destNumCols, 0);
+                                       }
+                                       else {
+                                               Arrays.fill(dest, cix, cix + 
destNumCols, 0);
                                        }
                                }
                        }
+                       else {
+                               // Other operations could be safe or unsafe; fail fast so this is caught at development time.
+                               throw new DMLRuntimeException("Unimplemented 
sparse operation");
+                       }
                }
                else {
                        double [] inputArr = src.getDenseBlock();
-                       for(int i = destPos; i < src_ru*destNumCols; i++) {
-                               dest[i] = op.fn.execute(dest[i], inputArr[i]);
+                       if(op.fn == Plus.getPlusFnObject()) {
+                               for(int i = destPos; i < src_ru*destNumCols; 
i++) {
+                                       dest[i] += inputArr[i];
+                               }
+                       }
+                       else if(op.fn == Multiply.getMultiplyFnObject()) {
+                               for(int i = destPos; i < src_ru*destNumCols; 
i++) {
+                                       dest[i] *= inputArr[i];
+                               }
+                       }
+                       else {
+                               for(int i = destPos; i < src_ru*destNumCols; 
i++) {
+                                       dest[i] = op.fn.execute(dest[i], 
inputArr[i]);
+                               }
                        }
                }
        }
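
Because multiply is an unsafe operation, the sparse branch above must also clear the gaps between stored nonzeros: implicit zeros in src annihilate dest rather than leaving it unchanged. A self-contained demo of those semantics on one row (values illustrative):

    public class SparseMultiplyInPlaceDemo {
            public static void main(String[] args) {
                    double[] dest = {1, 2, 3, 4, 5};
                    int[] aix = {1, 3};         // stored column indexes of the sparse row
                    double[] avals = {10, 20};  // stored values
                    int prev = 0;
                    for (int j = 0; j < aix.length; j++) {
                            java.util.Arrays.fill(dest, prev, aix[j], 0);  // implicit zeros
                            dest[aix[j]] *= avals[j];
                            prev = aix[j] + 1;
                    }
                    java.util.Arrays.fill(dest, prev, dest.length, 0);
                    System.out.println(java.util.Arrays.toString(dest));  // [0.0, 20.0, 0.0, 80.0, 0.0]
            }
    }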
@@ -130,45 +157,6 @@ public class ConvolutionUtils {
                }
        }
        
-       // dest (of size N x KPQ) = input (of size N x KPQ) op bias (of size K 
x 1)
-       public static void binaryBiasOperations(MatrixBlock input, MatrixBlock 
bias, double [] dest, 
-                       int K, int PQ, int rl, int ru, BinaryOperator op) 
throws DMLRuntimeException {
-               copy(input, dest, rl*K*PQ, K*PQ, rl, ru);
-               binaryBiasOperationInPlace(bias, dest, K, PQ, rl, ru, op);
-       }
-       
-       // dest (of size N x KPQ) op= bias (of size K x 1)
-       public static void binaryBiasOperationInPlace(MatrixBlock bias, double 
[] dest, 
-                       int K, int PQ, int rl, int ru, BinaryOperator op) 
throws DMLRuntimeException {
-               // bias.getNumColumns() == 1 checked outside
-               if(!bias.isInSparseFormat()) {
-                       double [] biasArr = bias.getDenseBlock();
-                       int index = rl*K*PQ;
-                       for(int n = rl; n < ru; n++) {
-                               for(int k = 0; k < K; k++) {
-                                       for(int pq = 0; pq < PQ; pq++, index++) 
{
-                                               dest[index] = 
op.fn.execute(dest[index], biasArr[k]);
-                                       }
-                               }
-                       }
-               }
-               else {
-                       for(int k = 0; k < K; k++) {
-                               if( !bias.getSparseBlock().isEmpty(k) ) {
-                                       int apos = bias.getSparseBlock().pos(k);
-                                       double[] avals = 
bias.getSparseBlock().values(k);
-                                       double val = avals[apos];
-                                       for(int n = rl; n < ru; n++) {
-                                               int index = n*K*PQ + k*PQ;
-                                               for(int pq = 0; pq < PQ; pq++, 
index++) {
-                                                       dest[index] = 
op.fn.execute(dest[index], val);
-                                               }
-                                       }
-                               }
-                       }
-               }
-       }
-       
        public static void fillBias(MatrixBlock bias, double [] outputArray, 
int src_rl, int src_ru, int N, int K, int PQ) throws DMLRuntimeException {
                // bias.getNumColumns() == 1 checked outside
                if(bias.isInSparseFormat()) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java 
b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
new file mode 100644
index 0000000..c1b4c3e
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/EnvironmentHelper.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.utils;
+
+
+/**
+ * This class is used to set environment variables required for loading the MKL library (done by NativeHelper).
+ */
+public class EnvironmentHelper {
+       public static native void setEnv(String key, String value);
+}
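
Intended use, as in NativeHelper.isMKLAvailable below (a sketch; the native preload library that implements setEnv must itself be loaded first):

    EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU");  // must happen before mkl_rt is loaded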

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java 
b/src/main/java/org/apache/sysml/utils/NativeHelper.java
new file mode 100644
index 0000000..2b997ed
--- /dev/null
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.utils;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.HashMap;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.File;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.SystemUtils;
+import org.apache.sysml.conf.ConfigurationManager;
+import org.apache.sysml.conf.DMLConfig;
+import org.apache.sysml.hops.OptimizerUtils;
+
+/**
+ * This class helps load the native BLAS library.
+ * By default, it first tries to load Intel MKL and, failing that, tries to load OpenBLAS.
+ */
+public class NativeHelper {
+       private static boolean isSystemMLLoaded = false;
+       private static final Log LOG = 
LogFactory.getLog(NativeHelper.class.getName());
+       private static HashMap<String, String> supportedArchitectures = new 
HashMap<String, String>();
+       public static String blasType;
+       private static int maxNumThreads = -1;
+       private static boolean setMaxNumThreads = false;
+       static {
+               // Note: we only support 64-bit Java on x86_64 and amd64 machines
+               supportedArchitectures.put("x86_64", "x86_64");
+               supportedArchitectures.put("amd64", "x86_64");
+       }
+       
+       private static boolean attemptedLoading = false;
+       
+       // Performing the loading in a method instead of a static block yields a detailed stack trace in case of fatal errors.
+       private static void init() {
+               // Only Linux supported for BLAS
+               if(!SystemUtils.IS_OS_LINUX)
+                       return;
+               
+               // The attemptedLoading variable ensures that we don't try to load SystemML and other dependencies
+               // again and again, especially in parfor (hence the double-checked locking with synchronized).
+               if(!attemptedLoading) {
+                       DMLConfig dmlConfig = 
ConfigurationManager.getDMLConfig();
+                       String userSpecifiedBLAS = 
System.getenv("SYSTEMML_BLAS");
+                       userSpecifiedBLAS = (userSpecifiedBLAS == null) ? "" :  
userSpecifiedBLAS.trim().toLowerCase();
+                       // -------------------------------------------------------------------------------------
+                       // We allow BLAS to be enabled, disabled, or explicitly selected in one of two ways:
+                       // 1. DML configuration: native.blas (boolean flag)
+                       // 2. Environment variable: SYSTEMML_BLAS (can be set to mkl, openblas or none)
+                       // Option 1 will be removed in later SystemML versions.
+                       // Option 2 is useful for two reasons:
+                       // - Developer testing of different BLAS implementations
+                       // - Fine-grained control: certain machines could use mkl while others use openblas, etc.
+                       boolean enabledViaConfig = (dmlConfig == null) ? true : 
dmlConfig.getBooleanValue(DMLConfig.NATIVE_BLAS);
+                       boolean enabledViaEnvironmentVariable = 
userSpecifiedBLAS.equals("") || userSpecifiedBLAS.equals("mkl") || 
userSpecifiedBLAS.equals("openblas");
+                       
+                       if(enabledViaConfig && enabledViaEnvironmentVariable) {
+                               long start = System.nanoTime();
+                               
if(!supportedArchitectures.containsKey(SystemUtils.OS_ARCH)) {
+                                       LOG.warn("Unsupported architecture for 
native BLAS:" + SystemUtils.OS_ARCH);
+                                       return;
+                               }
+               synchronized(NativeHelper.class) {
+                       if(!attemptedLoading) {
+                               // =============================================================================
+                               // By default, native.blas=true and we attempt to load MKL first.
+                               // If MKL is not available, we try to load OpenBLAS.
+                               // If neither MKL nor OpenBLAS is available, we fall back to Java BLAS.
+                               if(userSpecifiedBLAS.equalsIgnoreCase("")) {
+                                       blasType = isMKLAvailable() ? "mkl" : 
isOpenBLASAvailable() ? "openblas" : null;
+                                       if(blasType == null)
+                                               LOG.warn("Unable to load either 
MKL or OpenBLAS");
+                               }
+                               else 
if(userSpecifiedBLAS.equalsIgnoreCase("mkl")) {
+                                       blasType = isMKLAvailable() ? "mkl" : 
null;
+                                       if(blasType == null)
+                                               LOG.warn("Unable to load MKL");
+                               }
+                               else 
if(userSpecifiedBLAS.equalsIgnoreCase("openblas")) {
+                                       blasType = isOpenBLASAvailable() ? 
"openblas" : null;
+                                       if(blasType == null)
+                                               LOG.warn("Unable to load 
OpenBLAS");
+                               }
+                               else {
+                                       LOG.warn("Unsupported BLAS:" + 
userSpecifiedBLAS);
+                               }
+                               // =============================================================================
+                               if(blasType != null && loadLibraryHelper("libsystemml_" + blasType + "-Linux-x86_64.so")) {
+                                       LOG.info("Using native blas: " + blasType);
+                                       isSystemMLLoaded = true;
+                               }
+                       }
+               }
+               double timeToLoadInMilliseconds = 
(System.nanoTime()-start)*1e-6;
+               if(timeToLoadInMilliseconds > 100) 
+                       LOG.warn("Time to load native blas: " + 
timeToLoadInMilliseconds + " milliseconds.");
+                       }
+                       else {
+                               if(enabledViaConfig)
+                                       LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the environment variable 'SYSTEMML_BLAS=" + userSpecifiedBLAS + "'.");
+                               else
+                                       LOG.warn("Using internal Java BLAS as native BLAS support is disabled by the configuration 'native.blas'.");
+                       }
+                       attemptedLoading = true;
+               }
+       }
+       
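
init() guards the one-time native loading with an unsynchronized fast check plus a synchronized re-check. A distilled, self-contained sketch of that idiom (illustrative; in the textbook form the flag is declared volatile):

    private static volatile boolean attempted = false;
    static void initOnce(Runnable loader) {
            if (!attempted) {                        // fast path, no lock
                    synchronized (NativeHelper.class) {
                            if (!attempted) {        // re-check under the lock
                                    loader.run();
                                    attempted = true;
                            }
                    }
            }
    }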
+       public static boolean isNativeLibraryLoaded() {
+               init();
+               if(maxNumThreads == -1)
+                       maxNumThreads = 
OptimizerUtils.getConstrainedNumThreads(-1);
+               if(isSystemMLLoaded && !setMaxNumThreads && maxNumThreads != 
-1) {
+                       // setMaxNumThreads helps us decide whether to use GetPrimitiveArrayCritical or
+                       // GetDoubleArrayElements in JNI, as each has different tradeoffs.
+                       // In the current implementation, we always use GetPrimitiveArrayCritical as it has proven to be the fastest.
+                       // We may revisit this decision later, so this method should not be removed.
+                       setMaxNumThreads(maxNumThreads);
+                       setMaxNumThreads = true;
+               }
+               return isSystemMLLoaded;
+       }
+       
+       public static int getMaxNumThreads() {
+               if(maxNumThreads == -1)
+                       maxNumThreads = 
OptimizerUtils.getConstrainedNumThreads(-1);
+               return maxNumThreads;
+       }
+       
+       
+       private static boolean isMKLAvailable() {
+               // ------------------------------------------------------------
+               // Set environment variable MKL_THREADING_LAYER to GNU on Linux 
for performance
+               if(!loadLibraryHelper("libpreload_systemml-Linux-x86_64.so")) {
+                       LOG.warn("Unable to load preload_systemml (required for 
loading MKL-enabled SystemML library)");
+                       return false;
+               }
+               // In our investigation, the most reliable way to ensure that MKL runs smoothly with OpenMP (used by conv2d*)
+               // is to set the environment variable MKL_THREADING_LAYER to GNU.
+               EnvironmentHelper.setEnv("MKL_THREADING_LAYER", "GNU");
+               if(!loadBLAS("gomp", "gomp required for loading MKL-enabled 
SystemML library")) 
+                       return false;
+               
+               // ------------------------------------------------------------
+               return loadBLAS("mkl_rt", null);
+       }
+       
+       private static boolean isOpenBLASAvailable() {
+               if(!loadBLAS("gomp", "gomp required for loading 
OpenBLAS-enabled SystemML library")) 
+                       return false;
+               return loadBLAS("openblas", null);
+       }
+       
+       private static boolean loadBLAS(String blas, String optionalMsg) {
+               try {
+                        System.loadLibrary(blas);
+                        return true;
+               }
+               catch (UnsatisfiedLinkError e) {
+                       if(optionalMsg != null)
+                               LOG.warn("Unable to load " + blas + "(" + 
optionalMsg + "):" + e.getMessage());
+                       else
+                               LOG.warn("Unable to load " + blas + ":" + 
e.getMessage());
+                       return false;
+               }
+       }
+
+       private static boolean loadLibraryHelper(String path)  {
+               InputStream in = null; OutputStream out = null;
+               try {
+                       // This logic is needed because Java does not allow loading a native library directly from a jar resource;
+                       // the library is copied to a temporary file and loaded from there.
+                       in = 
NativeHelper.class.getResourceAsStream("/lib/"+path);
+                       if(in != null) {
+                               File temp = File.createTempFile(path, "");
+                               temp.deleteOnExit();
+                               out = FileUtils.openOutputStream(temp);
+                               IOUtils.copy(in, out);
+                               in.close(); in = null;
+                               out.close(); out = null;
+                               System.load(temp.getAbsolutePath());
+                               return true;
+                       }
+                       else
+                               LOG.warn("No lib available in the jar:" + path);
+               } catch(IOException e) {
+                       LOG.warn("Unable to load library " + path + " from 
resource:" + e.getMessage());
+               } finally {
+                       if(out != null)
+                               try {
+                                       out.close();
+                               } catch (IOException e) {}
+                       if(in != null)
+                               try {
+                                       in.close();
+                               } catch (IOException e) {}
+               }
+               return false;
+       }
+       
+       // TODO: Add pmm, wsloss, mmchain, etc.
+       public static native boolean matrixMultDenseDense(double [] m1, double 
[] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads);
+       private static native boolean tsmm(double [] m1, double [] ret, int 
m1rlen, int m1clen, boolean isLeftTranspose, int numThreads);
+       
+       // 
----------------------------------------------------------------------------------------------------------------
+       // LibMatrixDNN operations:
+       // N = number of images, C = number of channels, H = image height, W = 
image width
+       // K = number of filters, R = filter height, S = filter width
+       // TODO: case not handled: sparse filters (which will only be executed in Java). Since filters are relatively small, this is low priority.
+       
+       // Called by ConvolutionCPInstruction if both input and filter are dense
+       public static native boolean conv2dDense(double [] input, double [] 
filter, double [] ret, int N, int C, int H, int W, 
+                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
+       public static native boolean conv2dBiasAddDense(double [] input, double 
[] bias, double [] filter, double [] ret, int N, int C, int H, int W, 
+                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
+       // Called by ConvolutionCPInstruction if both input and dout are dense
+       public static native boolean conv2dBackwardFilterDense(double [] input, 
double [] dout, double [] ret, int N, int C, int H, int W, 
+                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
+       // If both filter and dout are dense, then called by 
ConvolutionCPInstruction
+       // Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is 
converted to dense if sparse.
+       public static native boolean conv2dBackwardDataDense(double [] filter, 
double [] dout, double [] ret, int N, int C, int H, int W, 
+                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
+       
+       // Currently only supported with numThreads = 1 and sparse input
+       // Called by LibMatrixDNN's thread if input is sparse. dout[n] is 
converted to dense if sparse.
+       public static native boolean conv2dBackwardFilterSparseDense(int apos, 
int alen, int[] aix, double[] avals, double [] rotatedDoutPtr, double [] ret, 
int N, int C, int H, int W, 
+                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
+       // Called by LibMatrixDNN's thread if input is sparse and filter is 
dense
+       public static native boolean conv2dSparse(int apos, int alen, int[] 
aix, double[] avals, double [] filter, double [] ret, int N, int C, int H, int 
W, 
+                       int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
+       // 
----------------------------------------------------------------------------------------------------------------
+       
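The P and Q parameters in the signatures above are the output height and width of the convolution; they follow the standard formula for a strided, padded 2D convolution:

    // Standard output dimensions of a strided, padded 2D convolution.
    int P = (H + 2 * pad_h - R) / stride_h + 1; // output height
    int Q = (W + 2 * pad_w - S) / stride_w + 1; // output width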
+       // This method helps us decide whether to use GetPrimitiveArrayCritical or GetDoubleArrayElements in JNI, as each has different tradeoffs.
+       // In the current implementation, we always use GetPrimitiveArrayCritical as it has proven to be the fastest.
+       // We may revisit this decision later, so we do not recommend removing this method.
+       private static native void setMaxNumThreads(int numThreads);
+}

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/39a37ae4/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index 097f58e..97888cb 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -112,6 +112,17 @@ public class Statistics
                return numExecutedMRJobs.longValue();
        }
        
+       private static LongAdder numNativeFailures = new LongAdder();
+       public static LongAdder numNativeLibMatrixMultCalls = new LongAdder();
+       public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder();
+       public static void incrementNativeFailuresCounter() {
+               numNativeFailures.increment();
+               // This is very rare and may not be possible at all; our initial experiments never encountered this case.
+               // Note: all native calls have a fallback to Java, so if desired, one can recompile SystemML with
+               // this exception commented out and everything should still work fine.
+               throw new RuntimeException("Unexpected ERROR: OOM caused during JNI transfer. Please disable native BLAS by setting the environment variable: SYSTEMML_BLAS=none");
+       }
+       
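LongAdder is a good fit for these counters because increments may arrive concurrently from many worker threads, while reads only happen when statistics are printed. A usage sketch (the call sites are an assumption; this hunk only defines the counters):

    // On each successful native DNN call:
    Statistics.numNativeLibMatrixDNNCalls.increment();
    // At statistics-print time:
    long dnnCalls = Statistics.numNativeLibMatrixDNNCalls.longValue();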
        public static void incrementNoOfExecutedMRJobs() {
                numExecutedMRJobs.increment();
        }
@@ -366,6 +377,9 @@ public class Statistics
                resetCPHeavyHitters();
 
                GPUStatistics.reset();
+               numNativeLibMatrixMultCalls.reset();
+               numNativeLibMatrixDNNCalls.reset();
+               numNativeFailures.reset();
                LibMatrixDNN.resetStatistics();
        }
 
@@ -621,6 +635,11 @@ public class Statistics
                //show extended caching/compilation statistics
                if( DMLScript.STATISTICS ) 
                {
+                       if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 || 
+                                       numNativeLibMatrixDNNCalls.longValue() > 0)) {
+                               String blas = NativeHelper.blasType; // non-null, checked above
+                               sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue() + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n");
+                       }
                        sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + 
CacheStatistics.displayHits() + ".\n");
                        sb.append("Cache writes (WB, FS, HDFS):\t" + 
CacheStatistics.displayWrites() + ".\n");
                        sb.append("Cache times (ACQr/m, RLS, EXP):\t" + 
CacheStatistics.displayTime() + " sec.\n");

