Repository: incubator-systemml
Updated Branches:
  refs/heads/master 8c37e2e1e -> f306b0b1e


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
new file mode 100644
index 0000000..b293ece
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
+import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnSetConvolutionNdDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetFilterNdDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.runtime.JCuda.cudaFree;
+import jcuda.Pointer;
+import jcuda.jcublas.JCublas;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnConvolutionDescriptor;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.JCudaObject;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixCUDA {
+       
+       public static cudnnHandle cudnnHandle;
+       public static cublasHandle cublasHandle;
+       
+       public static void conv2d(MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+                       int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+                       throws DMLRuntimeException {
+               cudnnTensorDescriptor srcTensorDesc = null;
+               cudnnTensorDescriptor dstTensorDesc = null;
+               cudnnFilterDescriptor filterDesc = null;
+               cudnnConvolutionDescriptor convDesc = null;
+               Pointer workSpace = null;
+               long sizeInBytes = 0;
+               Pointer alpha = null;
+               Pointer beta = null;
+               try {
+                       // Allocate descriptors
+                       srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
+                       dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+                       filterDesc = allocateFilterDescriptor(K, C, R, S);
+                       
+                       // Allocate data
+                       // (Pointer) gpuCtx.prepare(image, true, true);
+                       // (Pointer) gpuCtx.prepare(filter, true, true);
+                       
+                       Pointer imagePointer = ((JCudaObject)image._gpuHandle).jcudaPointer; 
+                       Pointer filterPointer = ((JCudaObject)filter._gpuHandle).jcudaPointer; 
+                       Pointer dstPointer = ((JCudaObject)outputBlock._gpuHandle).jcudaPointer; 
+                       
+                       int padding [] = { pad_h, pad_w }; 
+                       int strides [] = { stride_h, stride_w };
+                       convDesc = allocateConvolutionDescriptor(padding, strides);
+                       
+                       // TODO: Select the best algorithm depending on the data and supported CUDA
+                       int algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+                       
+                       long sizeInBytesArray[] = { 0 };
+            workSpace = new Pointer();
+            cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, 
+                    srcTensorDesc, filterDesc, convDesc, dstTensorDesc, 
+                    algo, sizeInBytesArray);
+            
+                       alpha = pointerTo(1.0); // TODO
+                       beta = pointerTo(0.0f);
+                       long start = System.nanoTime();
+                       int status = cudnnConvolutionForward(cudnnHandle, alpha, 
+                                       srcTensorDesc, imagePointer, 
+                                       filterDesc, filterPointer,
+                                       convDesc, algo, workSpace, sizeInBytes, beta,
+                                       dstTensorDesc, dstPointer);
+                       Statistics.cudaConvFwdTime.addAndGet(System.nanoTime()-start);
+                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       
+                       if(alpha != null)
+                               cudaFree(alpha);
+                       if(beta != null)
+                               cudaFree(beta);
+                       
+                       if(srcTensorDesc != null)
+                               cudnnDestroyTensorDescriptor(srcTensorDesc);
+                       if(dstTensorDesc != null)
+                               cudnnDestroyTensorDescriptor(dstTensorDesc);
+                       if(filterDesc != null)
+                               cudnnDestroyFilterDescriptor(filterDesc);
+                       if(convDesc != null)
+                               cudnnDestroyConvolutionDescriptor(convDesc);
+                       if(workSpace != null && sizeInBytes != 0)
+                               cudaFree(workSpace);
+               }
+       }
+       
+       private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding [], int strides []) {
+               cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
+               cudnnCreateConvolutionDescriptor(convDesc);
+               int upscale[] = { 1, 1 };
+               cudnnSetConvolutionNdDescriptor(convDesc, 2, padding, strides, upscale, 
+                               CUDNN_CROSS_CORRELATION, CUDNN_DATA_DOUBLE);
+               return convDesc;
+       }
+       
+       private static Pointer pointerTo(double value) {
+        return Pointer.to(new double[] { value });
+    }
+       
+       private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
+               cudnnTensorDescriptor ret = new cudnnTensorDescriptor();
+               cudnnCreateTensorDescriptor(ret);
+               cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+               return ret;
+       }
+       
+       private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
+               cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
+               cudnnCreateFilterDescriptor(filterDesc);
+               int filterDim[] = { K, C, R, S };
+               cudnnSetFilterNdDescriptor(filterDesc, CUDNN_DATA_DOUBLE, 4, filterDim);
+               return filterDesc;
+       }
+
+
+       public static void conv2d_backward_filter(MatrixObject image, MatrixObject dout,
+                       MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+                       int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+                       int Q) throws DMLRuntimeException {
+               Pointer alpha = null;
+               Pointer beta = null;
+               cudnnTensorDescriptor xTensorDesc = null;
+               cudnnTensorDescriptor doutTensorDesc = null;
+               cudnnFilterDescriptor dwDesc = null;
+               cudnnConvolutionDescriptor convDesc = null;
+               
+               Pointer workSpace = null;
+               long sizeInBytes = 0;
+               try {
+                       // Allocate descriptors
+                       xTensorDesc = allocateTensorDescriptor(N, C, H, W);
+                       doutTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+                       dwDesc = allocateFilterDescriptor(K, C, R, S);
+                       
+                       // Allocate data
+                       Pointer imagePointer = ((JCudaObject)image._gpuHandle).jcudaPointer; 
+                       Pointer doutPointer = ((JCudaObject)dout._gpuHandle).jcudaPointer; 
+                       Pointer dwPointer = ((JCudaObject)outputBlock._gpuHandle).jcudaPointer; 
+                       
+                       alpha = pointerTo(1.0); // TODO
+                       beta = pointerTo(0.0f);
+                       
+                       int padding [] = { pad_h, pad_w }; 
+                       int strides [] = { stride_h, stride_w };
+                       convDesc = allocateConvolutionDescriptor(padding, strides);
+                       long sizeInBytesArray[] = { 0 };
+                       
+                       // TODO: Select the best algorithm depending on the data and supported CUDA
+                       int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+                       workSpace = new Pointer();
+                       cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
+                                       xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
+                       
+                       int status = cudnnConvolutionBackwardFilter(cudnnHandle, alpha, xTensorDesc, imagePointer, 
+                                       doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, beta, dwDesc, dwPointer);
+                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       if(alpha != null)
+                               cudaFree(alpha);
+                       if(beta != null)
+                               cudaFree(beta);
+                       if(xTensorDesc != null)
+                               cudnnDestroyTensorDescriptor(xTensorDesc);
+                       if(doutTensorDesc != null)
+                               cudnnDestroyTensorDescriptor(doutTensorDesc);
+                       if(dwDesc != null)
+                               cudnnDestroyFilterDescriptor(dwDesc);
+                       
+                       if(convDesc != null)
+                               cudnnDestroyConvolutionDescriptor(convDesc);
+                       
+                       if(workSpace != null && sizeInBytes != 0)
+                               cudaFree(workSpace);
+               }
+               
+       }
+
+       public static void matmult(MatrixObject left1, MatrixObject right1, MatrixObject output, 
+                       boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
+               if(isInSparseFormat(left1) || isInSparseFormat(right1)) {
+                       throw new DMLRuntimeException("Sparse GPU matrix multiplication is not implemented");
+               }
+               
+               // Since CuBLAS expects inputs in column-major format,
+               // reverse the order of matrix-multiplication and take care of dimension mismatch.
+               MatrixObject left = right1; 
+               MatrixObject right = left1;
+               boolean isLeftTransposed = isRightTransposed1; 
+               boolean isRightTransposed = isLeftTransposed1; 
+               
+               char transa = isLeftTransposed ? 'T' : 'N';
+               char transb = isRightTransposed ? 'T' : 'N';
+               // Note: the dimensions are swapped
+               int m = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
+               int n = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
+               int k = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
+               int k1 = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
+               if(k != k1) 
+                       throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
+               
+               if(m == -1 || n == -1 || k == -1)
+                       throw new DMLRuntimeException("Incorrect dimensions");
+               
+               double alpha = 1;
+               double beta = 0;
+               
+               int lda = isLeftTransposed ?  k : m;
+               int ldb = isRightTransposed ? n : k;
+               int ldc = m;
+               
+               if(!left.getGPUObject().isAllocated || !right.getGPUObject().isAllocated)
+                       throw new DMLRuntimeException("One of the inputs is not allocated:" + left.getGPUObject().isAllocated + " " + right.getGPUObject().isAllocated);
+               if(!output.getGPUObject().isAllocated)
+                       throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated);
+               
+               Pointer A = ((JCudaObject)left.getGPUObject()).jcudaPointer;
+               Pointer B = ((JCudaObject)right.getGPUObject()).jcudaPointer;
+               Pointer C = ((JCudaObject)output.getGPUObject()).jcudaPointer;
+               
+               long start = System.nanoTime();
+               JCublas.cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+               Statistics.cudaMultTime.addAndGet(System.nanoTime()-start);
+       }
+       
+//     private void transpose(Pointer A, Pointer ret, int numRows, int numCols) {
+//             Pointer alpha = null; 
+//             Pointer beta = null;
+//             try {
+//                     alpha = pointerTo(1.0);
+//                     beta = pointerTo(0.0);
+//                     JCublas2.cublasDgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, numCols, numRows, 
+//                                     alpha, A, numRows, beta, A, numCols, ret, numCols);
+//             }
+//             finally {
+//                     if(alpha != null)
+//                             cudaFree(alpha);
+//                     if(beta != null)
+//                             cudaFree(beta);
+//             }
+//     }
+
+       public static void conv2d_backward_data(MatrixObject filter, MatrixObject dout,
+                       MatrixObject output, int N, int C, int H, int W, int K, int R,
+                       int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+                       int Q) throws DMLRuntimeException {
+               Pointer alpha = null;
+               Pointer beta = null;
+               cudnnTensorDescriptor dyDesc = null;
+               cudnnTensorDescriptor dxDesc = null;
+               cudnnFilterDescriptor wDesc = null;
+               cudnnConvolutionDescriptor convDesc = null;
+               
+               Pointer workSpace = null;
+               long sizeInBytes = 0;
+               try {
+                       // Allocate descriptors
+                       wDesc = allocateFilterDescriptor(K, C, R, S);
+                       dyDesc = allocateTensorDescriptor(N, K, P, Q);
+                       dxDesc = allocateTensorDescriptor(N, C, H, W);
+                       
+                       // Allocate data
+                       Pointer w = ((JCudaObject)filter._gpuHandle).jcudaPointer; 
+                       Pointer dy = ((JCudaObject)dout._gpuHandle).jcudaPointer; 
+                       Pointer dx = ((JCudaObject)output._gpuHandle).jcudaPointer; 
+                       
+                       alpha = pointerTo(1.0); // TODO
+                       beta = pointerTo(0.0f);
+                       
+                       int padding [] = { pad_h, pad_w }; 
+                       int strides [] = { stride_h, stride_w };
+                       convDesc = allocateConvolutionDescriptor(padding, strides);
+                       long sizeInBytesArray[] = { 0 };
+                       
+                       // TODO: Select the best algorithm depending on the data and supported CUDA
+                       int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+                       workSpace = new Pointer();
+                       cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
+                                       wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
+                       
+                       int status = cudnnConvolutionBackwardData(cudnnHandle, alpha, wDesc, w, 
+                                       dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, beta, dxDesc, dx);
+                       if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+                               throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+                       }
+               }
+               finally {
+                       if(alpha != null)
+                               cudaFree(alpha);
+                       if(beta != null)
+                               cudaFree(beta);
+                       if(dyDesc != null)
+                               cudnnDestroyTensorDescriptor(dyDesc);
+                       if(dxDesc != null)
+                               cudnnDestroyTensorDescriptor(dxDesc);
+                       if(wDesc != null)
+                               cudnnDestroyFilterDescriptor(wDesc);
+                       
+                       if(convDesc != null)
+                               cudnnDestroyConvolutionDescriptor(convDesc);
+                       
+                       if(workSpace != null && sizeInBytes != 0)
+                               cudaFree(workSpace);
+               }
+       }
+       
+       public static boolean isInSparseFormat(MatrixObject mo) {
+               if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated)
+                       return mo.getGPUObject().isInSparseFormat;
+               else if(mo.getMatrixBlock() != null && mo.getMatrixBlock().getDenseBlock() != null)
+                       return false;
+               return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
+       }
+}
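
A note on the matmult implementation above: cuBLAS assumes column-major storage, while SystemML's dense blocks are row-major. Reinterpreting a row-major matrix as column-major yields its transpose, so rather than transposing the inputs the code swaps the operands and computes B^T * A^T = (A * B)^T, which lands in memory exactly as the row-major product A * B. Below is a minimal CPU sketch of the same bookkeeping; the class and the naive GEMM helper are hypothetical, not part of this patch.

    // Sketch only: verifies the operand-swap trick used by matmult above.
    public class ColumnMajorTrick {
        // Naive column-major GEMM: C[m x n] = A[m x k] * B[k x n], leading dimensions = rows.
        static void gemmColMajor(int m, int n, int k, double[] A, double[] B, double[] C) {
            for (int j = 0; j < n; j++)
                for (int i = 0; i < m; i++) {
                    double s = 0;
                    for (int p = 0; p < k; p++)
                        s += A[p * m + i] * B[j * k + p];
                    C[j * m + i] = s;
                }
        }

        public static void main(String[] args) {
            double[] A = {1, 2, 3, 4, 5, 6};    // row-major 2x3
            double[] B = {7, 8, 9, 10, 11, 12}; // row-major 3x2
            double[] C = new double[4];
            // Swap operands and dimensions, as matmult does before cublasDgemm:
            // column-major B^T(2x3) * A^T(3x2) = (A*B)^T, stored as row-major A*B.
            gemmColMajor(2, 2, 3, B, A, C);
            System.out.println(java.util.Arrays.toString(C)); // [58.0, 64.0, 139.0, 154.0]
        }
    }

The swapped m, n, k and the lda/ldb/ldc choices in matmult encode the same identity for cublasDgemm.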

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index 80b20cd..b2da5e2 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -19,6 +19,20 @@
 
 package org.apache.sysml.runtime.util;
 
+import java.util.ArrayList;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.AggBinaryOp;
+import org.apache.sysml.hops.ConvolutionOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.HopsException;
+import org.apache.sysml.hops.ReorgOp;
+import org.apache.sysml.hops.Hop.ConvOp;
+import org.apache.sysml.hops.Hop.ReOrgOp;
+import org.apache.sysml.lops.Lop;
+import org.apache.sysml.lops.LopsException;
+import org.apache.sysml.lops.LopProperties.ExecType;
+
 
 public class ConvolutionUtils {
        
@@ -40,4 +54,117 @@ public class ConvolutionUtils {
                return ret;
        }
        
+       private static boolean isMatMult(Hop hop) {
+               if(hop != null && hop instanceof AggBinaryOp) {
+                       return true;
+               }
+               return false;
+       }
+       private static boolean isTranspose(Hop hop) {
+               if(hop != null && hop instanceof ReorgOp && ((ReorgOp)hop).getOp() == ReOrgOp.TRANSPOSE) {
+                       return true;
+               }
+               return false;
+       }
+       private static boolean isConvolutionOp(Hop hop, Hop.ConvOp op) {
+               if(hop != null && hop instanceof ConvolutionOp && ((ConvolutionOp) hop).getOp() == op) {
+                       return true;
+               }
+               return false;
+       }
+       
+       public static Lop constructConvolutionBackwardFilterLops(Hop currentHop) throws HopsException, LopsException {
+               ExecType et = ExecType.CP;
+               if(DMLScript.USE_ACCELERATOR)
+                       et = ExecType.GPU; // TODO: Add memory estimate checks
+               else
+                       return null;
+               
+               if(currentHop != null && isTranspose(currentHop)) {
+                       Hop matMult = currentHop.getInput().get(0);
+                       if(matMult != null && isMatMult(matMult)) {
+                               Hop x_col = matMult.getInput().get(0);
+                               Hop right = matMult.getInput().get(1);
+                               if(isConvolutionOp(x_col, ConvOp.IM2COL) && isConvolutionOp(right, ConvOp.ROTATE180)) {
+                                       Hop image = x_col.getInput().get(0);
+                                       Hop dout = right.getInput().get(0);
+                                       ArrayList<Hop> inputs = new ArrayList<Hop>();
+                                       inputs.add(image);
+                                       inputs.add(dout);
+                                       for(int i = 1; i < x_col.getInput().size(); i++) {
+                                               inputs.add(x_col.getInput().get(i));
+                                       }
+                                       ConvolutionOp fusedHop = new ConvolutionOp("tmp_directconv2dBackwardFilter" + image.getName(), image.getDataType(), image.getValueType(), ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, inputs);
+                                       setPositions(currentHop, fusedHop);
+                                       return fusedHop.constructConvolutionLops(et, inputs);
+                               }
+                       }
+               }
+               return null;
+       }
+       
+       public static Lop constructConvolutionLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
+               if(DMLScript.USE_ACCELERATOR)
+                       et = ExecType.GPU; // TODO: Add memory estimate checks
+               else
+                       return null;
+               
+               if(currentHop != null && isConvolutionOp(currentHop, ConvOp.RESHAPE_COL)) {
+                       Hop matMult = currentHop.getInput().get(0);
+                       if(matMult != null && isMatMult(matMult)) {
+                               Hop filter = matMult.getInput().get(0);
+                               Hop x_col = matMult.getInput().get(1);
+                               if(isConvolutionOp(x_col, ConvOp.IM2COL)) {
+                                       Hop image = x_col.getInput().get(0);
+                                       ArrayList<Hop> inputs = new ArrayList<Hop>();
+                                       inputs.add(image);
+                                       inputs.add(filter);
+                                       for(int i = 1; i < x_col.getInput().size(); i++) {
+                                               inputs.add(x_col.getInput().get(i));
+                                       }
+                                       ConvolutionOp fusedHop = new ConvolutionOp("tmp_directconv2d" + image.getName(), image.getDataType(), image.getValueType(), ConvOp.DIRECT_CONV2D, inputs);
+                                       setPositions(currentHop, fusedHop);
+                                       return fusedHop.constructConvolutionLops(et, inputs);
+                               }
+                       }
+               }
+               
+               return null;
+       }
+       
+       public static Lop constructConvolutionBackwardDataLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
+               if(DMLScript.USE_ACCELERATOR)
+                       et = ExecType.GPU; // TODO: Add memory estimate checks
+               else
+                       return null;
+               
+               if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
+                       Hop temp = currentHop.getInput().get(0);
+                       if(temp != null && isTranspose(temp)) {
+                               Hop matMult = temp.getInput().get(0);
+                               if(matMult != null && isMatMult(matMult)) {
+                                       Hop rotate180 = matMult.getInput().get(0);
+                                       Hop filter = matMult.getInput().get(1);
+                                       if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
+                                               ArrayList<Hop> inputs = new ArrayList<Hop>();
+                                               inputs.add(filter);
+                                               inputs.add(rotate180.getInput().get(0));
+                                               for(int i = 1; i < rotate180.getInput().size(); i++) {
+                                                       inputs.add(rotate180.getInput().get(i));
+                                               }
+                                               ConvolutionOp fusedHop = new ConvolutionOp("tmp_directconv2dBackwardData" + filter.getName(), filter.getDataType(), filter.getValueType(), ConvOp.DIRECT_CONV2D_BACKWARD_DATA, inputs);
+                                               setPositions(currentHop, fusedHop);
+                                               return fusedHop.constructConvolutionLops(et, inputs);
+                                       }
+                               }
+                       }
+               }
+               
+               return null;
+       }
+       
+       private static void setPositions(Hop currentHop, Hop fusedHop) {
+               fusedHop.setAllPositions(currentHop.getBeginLine(), currentHop.getBeginColumn(), currentHop.getEndLine(), currentHop.getEndColumn());
+       }
+       
 }
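
The three constructConvolution*Lops methods above share one idea: walk a small, fixed HOP subtree and, on a match, splice in a single fused ConvolutionOp so the GPU backend sees one direct convolution call instead of the im2col/matmult/reshape chain. A simplified, self-contained sketch of the forward pattern follows; the Node class is a hypothetical stand-in for Hop, and unlike the real code it does not forward im2col's trailing inputs, which carry the convolution parameters.

    import java.util.Arrays;
    import java.util.List;

    // Sketch only: mirrors the pattern match in constructConvolutionLops.
    class Node {
        final String op;         // e.g. "reshape_col", "matmult", "im2col", "leaf"
        final List<Node> inputs;
        Node(String op, Node... inputs) { this.op = op; this.inputs = Arrays.asList(inputs); }
    }

    public class FusionSketch {
        // Matches reshape_col(matmult(filter, im2col(image))) and returns a
        // fused direct_conv2d(image, filter) node, else null.
        static Node fuseConv2d(Node hop) {
            if (!hop.op.equals("reshape_col") || hop.inputs.isEmpty()) return null;
            Node matMult = hop.inputs.get(0);
            if (!matMult.op.equals("matmult") || matMult.inputs.size() < 2) return null;
            Node filter = matMult.inputs.get(0);
            Node xCol = matMult.inputs.get(1);
            if (!xCol.op.equals("im2col")) return null;
            return new Node("direct_conv2d", xCol.inputs.get(0), filter);
        }

        public static void main(String[] args) {
            Node image = new Node("leaf"), filter = new Node("leaf");
            Node hop = new Node("reshape_col", new Node("matmult", filter, new Node("im2col", image)));
            System.out.println(fuseConv2d(hop).op); // direct_conv2d
        }
    }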

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index edb3493..b8bf599 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -60,6 +60,8 @@ public class Statistics
        // number of compiled/executed SP instructions
        private static int iNoOfExecutedSPInst = 0;
        private static int iNoOfCompiledSPInst = 0;
+       
+       private static int iNoOfExecutedGPUInst = 0;
 
        //JVM stats
        private static long jitCompileTime = 0; //in milli sec
@@ -105,6 +107,21 @@ public class Statistics
        private static AtomicLong denseBlockAllocationTime = new AtomicLong(0);
        private static AtomicLong sparseBlockAllocationTime = new AtomicLong(0);
        
+       public static long cudaInitTime = 0;
+       public static long cudaLibrariesInitTime = 0;
+       public static AtomicLong cudaAllocTime = new AtomicLong(0);
+       public static AtomicLong cudaDeAllocTime = new AtomicLong(0);
+       public static AtomicLong cudaToDevTime = new AtomicLong(0);
+       public static AtomicLong cudaFromDevTime = new AtomicLong(0);
+       public static AtomicLong cudaAllocCount = new AtomicLong(0);
+       public static AtomicLong cudaDeAllocCount = new AtomicLong(0);
+       public static AtomicLong cudaToDevCount = new AtomicLong(0);
+       public static AtomicLong cudaFromDevCount = new AtomicLong(0);
+       // Potential CUDA heavy hitter
+       public static AtomicLong cudaMultTime = new AtomicLong(0);
+       public static AtomicLong cudaConvFwdTime = new AtomicLong(0);
+       public static AtomicLong gpuSparseMultCount = new AtomicLong(0);
+       
        public static void incrementAllocationTime(long allocationTime, boolean isSparse) {
                if(isSparse)
                        sparseBlockAllocationTime.addAndGet(allocationTime);
@@ -139,6 +156,19 @@ public class Statistics
        public static synchronized void incrementNoOfCompiledMRJobs() {
                iNoOfCompiledMRJobs ++;
        }
+       
+       
+       public static synchronized void setNoOfExecutedGPUInst(int numJobs) {
+               iNoOfExecutedGPUInst = numJobs;
+       }
+       
+       public static synchronized void incrementNoOfExecutedGPUInst() {
+               iNoOfExecutedGPUInst ++;
+       }
+       
+       public static synchronized int getNoOfExecutedGPUInst() {
+               return iNoOfExecutedGPUInst;
+       }
 
        public static synchronized void setNoOfExecutedSPInst(int numJobs) {
                iNoOfExecutedSPInst = numJobs;
@@ -226,6 +256,9 @@ public class Statistics
                        setNoOfExecutedMRJobs(count);
                        setNoOfExecutedSPInst(0);
                }
+               
+               if( DMLScript.USE_ACCELERATOR )
+                       setNoOfExecutedGPUInst(0);
        }
        
        public static synchronized void incrementJITCompileTime( long time ) {
@@ -597,6 +630,28 @@ public class Statistics
                        sb.append("Number of executed MR Jobs:\t" + 
getNoOfExecutedMRJobs() + ".\n");   
                }
                
+               if( DMLScript.USE_ACCELERATOR && DMLScript.STATISTICS ) {
+                       sb.append("CUDA/CuLibraries init time:\t" + 
String.format("%.3f", cudaInitTime*1e-9) + "/"
+                                       + String.format("%.3f", 
cudaLibrariesInitTime*1e-9) + " sec.\n");
+                       sb.append("Number of executed GPU inst:\t" + 
getNoOfExecutedGPUInst() + ".\n");
+                       sb.append("GPU mem tx time 
(alloc/dealloc/toDev/fromDev):\t" 
+                                       + String.format("%.3f", 
cudaAllocTime.get()*1e-9) + "/"
+                                       + String.format("%.3f", 
cudaDeAllocTime.get()*1e-9) + "/"
+                                       + String.format("%.3f", 
cudaToDevTime.get()*1e-9) + "/"
+                                       + String.format("%.3f", 
cudaFromDevTime.get()*1e-9)  + " sec.\n");
+                       sb.append("GPU mem tx count 
(alloc/dealloc/toDev/fromDev):\t" 
+                                       + cudaAllocCount.get() + "/"
+                                       + cudaDeAllocCount.get() + "/"
+                                       + cudaToDevCount.get() + "/"
+                                       + cudaFromDevCount.get()  + ".\n");
+                       sb.append("CUDA op time (mult/conv):\t"
+                                       + String.format("%.3f", 
cudaMultTime.get()*1e-9) + "/"
+                                       + String.format("%.3f", 
cudaConvFwdTime.get()*1e-9)  + " sec.\n");
+                       if(gpuSparseMultCount.get() > 0)
+                               sb.append("CUDA op count (sparse mult):\t"
+                                               + gpuSparseMultCount.get() + 
".\n");
+               }
+               
                //show extended caching/compilation statistics
                if( DMLScript.STATISTICS ) 
                {
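
The new counters above follow a single idiom: the call site brackets each CUDA operation with System.nanoTime() and accumulates the elapsed nanoseconds into an AtomicLong (as LibMatrixCUDA.matmult does for cudaMultTime), and the report scales by 1e-9 to print seconds. A minimal sketch, with a sleep standing in for the GPU call:

    import java.util.concurrent.atomic.AtomicLong;

    // Sketch only: the timing/reporting idiom used by the GPU statistics.
    public class GpuTimingSketch {
        static final AtomicLong cudaMultTime = new AtomicLong(0);

        public static void main(String[] args) throws InterruptedException {
            long start = System.nanoTime();
            Thread.sleep(5); // stand-in for a cublasDgemm call
            cudaMultTime.addAndGet(System.nanoTime() - start);
            System.out.println(String.format("CUDA op time (mult):\t%.3f sec.", cudaMultTime.get() * 1e-9));
        }
    }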

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
index eb87197..066e4e5 100644
--- a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
@@ -86,6 +86,9 @@ public abstract class AutomatedTestBase
        public static final boolean EXCEPTION_EXPECTED = true;
        public static final boolean EXCEPTION_NOT_EXPECTED = false;
        
+       // By default, TEST_GPU is false so that developers without an Nvidia GPU can run the integration test suite.
+       public static final boolean TEST_GPU = false;
+       
        protected ScriptType scriptType;
        
        // *** HACK ALERT *** HACK ALERT *** HACK ALERT ***
@@ -105,8 +108,18 @@ public abstract class AutomatedTestBase
 
                        System.setProperty("hadoop.home.dir", cwd + 
File.separator
                                        + 
"\\src\\test\\config\\hadoop_bin_windows");
-                       System.setProperty("java.library.path", cwd + 
File.separator
+                       
+                       if(TEST_GPU) {
+                               String CUDA_LIBRARY_PATH = System.getenv("CUDA_PATH") + File.separator + "bin"; 
+                               System.setProperty("java.library.path", cwd + File.separator
+                                               + "\\src\\test\\config\\hadoop_bin_windows\\bin" + File.pathSeparator
+                                               + "/lib" + File.pathSeparator
+                                               + CUDA_LIBRARY_PATH);
+                       }
+                       else {
+                               System.setProperty("java.library.path", cwd + File.separator
                                        + "\\src\\test\\config\\hadoop_bin_windows\\bin");
+                       }
                        
 
            // Need to muck around with the classloader to get it to use the new
@@ -1150,6 +1163,8 @@ public abstract class AutomatedTestBase
                //use optional config file since default under SystemML/DML
                args.add("-config="+ getCurConfigFile().getPath());
                
+               if(TEST_GPU)
+                       args.add("-gpu");
                
                // program-specific parameters
                if ( newWay ) {
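
To run the suite against a GPU, a developer flips TEST_GPU to true; the native JCuda libraries are then resolved via java.library.path, which is why the setup above appends CUDA_PATH/bin. A small sketch of that path assembly, assuming the CUDA_PATH environment variable points at the CUDA toolkit root (hypothetical class, mirroring the logic above):

    import java.io.File;

    // Sketch only: how the GPU library path is derived from the environment.
    public class GpuLibPathSketch {
        public static void main(String[] args) {
            String cudaPath = System.getenv("CUDA_PATH"); // e.g. the CUDA toolkit install dir
            if (cudaPath == null) {
                System.out.println("CUDA_PATH not set; keep TEST_GPU = false");
                return;
            }
            String cudaLibraryPath = cudaPath + File.separator + "bin";
            System.out.println("java.library.path should include: " + cudaLibraryPath);
        }
    }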

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
index 9276280..60e97e0 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
@@ -77,6 +77,33 @@ public class MatrixMultiplicationTest extends AutomatedTestBase
 
                compareResults(0.00000000001);
        }
+       
+       @Test
+       public void testSparseMatrixMultiplication() {
+               int m = 40;
+               int n = 10;
+               int k = 30;
+
+               TestConfiguration config = availableTestConfigurations.get("MatrixMultiplicationTest");
+               config.addVariable("m", m);
+               config.addVariable("n1", n);
+               config.addVariable("n2", n);
+               config.addVariable("k", k);
+
+               loadTestConfiguration(config);
+
+               double[][] a = getRandomMatrix(m, n, -1, 1, 0.1, -1);
+               double[][] b = getRandomMatrix(n, k, -1, 1, 0.1, -1);
+               double[][] c = TestUtils.performMatrixMultiplication(a, b);
+
+               writeInputMatrix("a", a);
+               writeInputMatrix("b", b);
+               writeExpectedMatrix("c", c);
+
+               runTest();
+
+               compareResults(0.00000000001);
+       }
 
        @Test
        public void testWrongDimensions() {
