Repository: incubator-systemml
Updated Branches:
  refs/heads/master 8c37e2e1e -> f306b0b1e
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
new file mode 100644
index 0000000..b293ece
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.matrix.data;
+
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
+import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
+import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnDestroyTensorDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
+import static jcuda.jcudnn.JCudnn.cudnnSetConvolutionNdDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetFilterNdDescriptor;
+import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
+import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
+import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
+import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
+import static jcuda.runtime.JCuda.cudaFree;
+import static jcuda.runtime.JCuda.cudaMalloc;
+
+import jcuda.Pointer;
+import jcuda.jcublas.JCublas;
+import jcuda.jcublas.cublasHandle;
+import jcuda.jcudnn.cudnnConvolutionDescriptor;
+import jcuda.jcudnn.cudnnFilterDescriptor;
+import jcuda.jcudnn.cudnnHandle;
+import jcuda.jcudnn.cudnnTensorDescriptor;
+
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
+import org.apache.sysml.runtime.controlprogram.context.JCudaObject;
+import org.apache.sysml.utils.Statistics;
+
+public class LibMatrixCUDA {
+
+	public static cudnnHandle cudnnHandle;
+	public static cublasHandle cublasHandle;
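+
+	// Dimension convention used by the methods below (cuDNN NCHW layout):
+	// N = batch size, C = input channels, H/W = input height/width,
+	// K = output channels (number of filters), R/S = filter height/width,
+	// P/Q = output height/width. Callers are assumed to pass consistent
+	// values, i.e. P = (H + 2*pad_h - R) / stride_h + 1 and
+	// Q = (W + 2*pad_w - S) / stride_w + 1.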
+
+	public static void conv2d(MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
+			int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
+			throws DMLRuntimeException {
+		cudnnTensorDescriptor srcTensorDesc = null;
+		cudnnTensorDescriptor dstTensorDesc = null;
+		cudnnFilterDescriptor filterDesc = null;
+		cudnnConvolutionDescriptor convDesc = null;
+		Pointer workSpace = null;
+		long sizeInBytes = 0;
+		try {
+			// Allocate descriptors
+			srcTensorDesc = allocateTensorDescriptor(N, C, H, W);
+			dstTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+			filterDesc = allocateFilterDescriptor(K, C, R, S);
+
+			// Obtain the GPU-resident input/output buffers
+			// (Pointer) gpuCtx.prepare(image, true, true);
+			// (Pointer) gpuCtx.prepare(filter, true, true);
+			Pointer imagePointer = ((JCudaObject)image._gpuHandle).jcudaPointer;
+			Pointer filterPointer = ((JCudaObject)filter._gpuHandle).jcudaPointer;
+			Pointer dstPointer = ((JCudaObject)outputBlock._gpuHandle).jcudaPointer;
+
+			int padding[] = { pad_h, pad_w };
+			int strides[] = { stride_h, stride_w };
+			convDesc = allocateConvolutionDescriptor(padding, strides);
+
+			// TODO: Select the best algorithm depending on the data and supported CUDA
+			int algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
+
+			// Query and allocate the workspace required by the selected algorithm
+			long sizeInBytesArray[] = { 0 };
+			workSpace = new Pointer();
+			cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
+					srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
+					algo, sizeInBytesArray);
+			sizeInBytes = sizeInBytesArray[0];
+			if(sizeInBytes != 0)
+				cudaMalloc(workSpace, sizeInBytes);
+
+			// alpha and beta are host-side scalars; they must not be cudaFree'd
+			Pointer alpha = pointerTo(1.0); // TODO
+			Pointer beta = pointerTo(0.0);
+			long start = System.nanoTime();
+			int status = cudnnConvolutionForward(cudnnHandle, alpha,
+					srcTensorDesc, imagePointer,
+					filterDesc, filterPointer,
+					convDesc, algo, workSpace, sizeInBytes, beta,
+					dstTensorDesc, dstPointer);
+			Statistics.cudaConvFwdTime.addAndGet(System.nanoTime()-start);
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		}
+		finally {
+			if(srcTensorDesc != null)
+				cudnnDestroyTensorDescriptor(srcTensorDesc);
+			if(dstTensorDesc != null)
+				cudnnDestroyTensorDescriptor(dstTensorDesc);
+			if(filterDesc != null)
+				cudnnDestroyFilterDescriptor(filterDesc);
+			if(convDesc != null)
+				cudnnDestroyConvolutionDescriptor(convDesc);
+			if(workSpace != null && sizeInBytes != 0)
+				cudaFree(workSpace);
+		}
+	}
+
+	private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding[], int strides[]) {
+		cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
+		cudnnCreateConvolutionDescriptor(convDesc);
+		int upscale[] = { 1, 1 };
+		cudnnSetConvolutionNdDescriptor(convDesc, 2, padding, strides, upscale,
+				CUDNN_CROSS_CORRELATION, CUDNN_DATA_DOUBLE);
+		return convDesc;
+	}
+
+	private static Pointer pointerTo(double value) {
+		return Pointer.to(new double[] { value });
+	}
+
+	private static cudnnTensorDescriptor allocateTensorDescriptor(int N, int C, int H, int W) {
+		cudnnTensorDescriptor ret = new cudnnTensorDescriptor();
+		cudnnCreateTensorDescriptor(ret);
+		cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
+		return ret;
+	}
+
+	private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
+		cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
+		cudnnCreateFilterDescriptor(filterDesc);
+		int filterDim[] = { K, C, R, S };
+		cudnnSetFilterNdDescriptor(filterDesc, CUDNN_DATA_DOUBLE, 4, filterDim);
+		return filterDesc;
+	}
+
+	public static void conv2d_backward_filter(MatrixObject image, MatrixObject dout,
+			MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		cudnnTensorDescriptor xTensorDesc = null;
+		cudnnTensorDescriptor doutTensorDesc = null;
+		cudnnFilterDescriptor dwDesc = null;
+		cudnnConvolutionDescriptor convDesc = null;
+		Pointer workSpace = null;
+		long sizeInBytes = 0;
+		try {
+			// Allocate descriptors
+			xTensorDesc = allocateTensorDescriptor(N, C, H, W);
+			doutTensorDesc = allocateTensorDescriptor(N, K, P, Q);
+			dwDesc = allocateFilterDescriptor(K, C, R, S);
+
+			// Obtain the GPU-resident input/output buffers
+			Pointer imagePointer = ((JCudaObject)image._gpuHandle).jcudaPointer;
+			Pointer doutPointer = ((JCudaObject)dout._gpuHandle).jcudaPointer;
+			Pointer dwPointer = ((JCudaObject)outputBlock._gpuHandle).jcudaPointer;
+
+			// alpha and beta are host-side scalars; they must not be cudaFree'd
+			Pointer alpha = pointerTo(1.0); // TODO
+			Pointer beta = pointerTo(0.0);
+
+			int padding[] = { pad_h, pad_w };
+			int strides[] = { stride_h, stride_w };
+			convDesc = allocateConvolutionDescriptor(padding, strides);
+			long sizeInBytesArray[] = { 0 };
+
+			// TODO: Select the best algorithm depending on the data and supported CUDA
+			int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
+			workSpace = new Pointer();
+			cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
+					xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
+			sizeInBytes = sizeInBytesArray[0];
+			if(sizeInBytes != 0)
+				cudaMalloc(workSpace, sizeInBytes);
+
+			int status = cudnnConvolutionBackwardFilter(cudnnHandle, alpha, xTensorDesc, imagePointer,
+					doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, beta, dwDesc, dwPointer);
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		}
+		finally {
+			if(xTensorDesc != null)
+				cudnnDestroyTensorDescriptor(xTensorDesc);
+			if(doutTensorDesc != null)
+				cudnnDestroyTensorDescriptor(doutTensorDesc);
+			if(dwDesc != null)
+				cudnnDestroyFilterDescriptor(dwDesc);
+			if(convDesc != null)
+				cudnnDestroyConvolutionDescriptor(convDesc);
+			if(workSpace != null && sizeInBytes != 0)
+				cudaFree(workSpace);
+		}
+	}
+
+	public static void matmult(MatrixObject left1, MatrixObject right1, MatrixObject output,
+			boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
+		if(isInSparseFormat(left1) || isInSparseFormat(right1)) {
+			throw new DMLRuntimeException("Sparse GPU matrix multiplication is not implemented");
+		}
+
+		// Since cuBLAS expects inputs in column-major format,
+		// reverse the order of the matrix multiplication and swap the dimensions accordingly.
+		MatrixObject left = right1;
+		MatrixObject right = left1;
+		boolean isLeftTransposed = isRightTransposed1;
+		boolean isRightTransposed = isLeftTransposed1;
+
+		char transa = isLeftTransposed ? 'T' : 'N';
+		char transb = isRightTransposed ? 'T' : 'N';
+		// Note: the dimensions are swapped
+		int m = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
+		int n = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
+		int k = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
+		int k1 = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
+		if(k != k1)
+			throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
+
+		if(m == -1 || n == -1 || k == -1)
+			throw new DMLRuntimeException("Incorrect dimensions");
+
+		double alpha = 1;
+		double beta = 0;
+
+		int lda = isLeftTransposed ? k : m;
+		int ldb = isRightTransposed ? n : k;
+		int ldc = m;
+
+		if(!left.getGPUObject().isAllocated || !right.getGPUObject().isAllocated)
+			throw new DMLRuntimeException("One of the inputs is not allocated: " + left.getGPUObject().isAllocated + " " + right.getGPUObject().isAllocated);
+		if(!output.getGPUObject().isAllocated)
+			throw new DMLRuntimeException("Output is not allocated: " + output.getGPUObject().isAllocated);
+
+		Pointer A = ((JCudaObject)left.getGPUObject()).jcudaPointer;
+		Pointer B = ((JCudaObject)right.getGPUObject()).jcudaPointer;
+		Pointer C = ((JCudaObject)output.getGPUObject()).jcudaPointer;
+
+		long start = System.nanoTime();
+		JCublas.cublasDgemm(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+		Statistics.cudaMultTime.addAndGet(System.nanoTime()-start);
+	}
+
+//	private void transpose(Pointer A, Pointer ret, int numRows, int numCols) {
+//		Pointer alpha = null;
+//		Pointer beta = null;
+//		try {
+//			alpha = pointerTo(1.0);
+//			beta = pointerTo(0.0);
+//			JCublas2.cublasDgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N, numCols, numRows,
+//					alpha, A, numRows, beta, A, numCols, ret, numCols);
+//		}
+//		finally {
+//			if(alpha != null)
+//				cudaFree(alpha);
+//			if(beta != null)
+//				cudaFree(beta);
+//		}
+//	}
+
+	public static void conv2d_backward_data(MatrixObject filter, MatrixObject dout,
+			MatrixObject output, int N, int C, int H, int W, int K, int R,
+			int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
+			int Q) throws DMLRuntimeException {
+		cudnnTensorDescriptor dyDesc = null;
+		cudnnTensorDescriptor dxDesc = null;
+		cudnnFilterDescriptor wDesc = null;
+		cudnnConvolutionDescriptor convDesc = null;
+		Pointer workSpace = null;
+		long sizeInBytes = 0;
+		try {
+			// Allocate descriptors
+			wDesc = allocateFilterDescriptor(K, C, R, S);
+			dyDesc = allocateTensorDescriptor(N, K, P, Q);
+			dxDesc = allocateTensorDescriptor(N, C, H, W);
+
+			// Obtain the GPU-resident input/output buffers
+			Pointer w = ((JCudaObject)filter._gpuHandle).jcudaPointer;
+			Pointer dy = ((JCudaObject)dout._gpuHandle).jcudaPointer;
+			Pointer dx = ((JCudaObject)output._gpuHandle).jcudaPointer;
+
+			// alpha and beta are host-side scalars; they must not be cudaFree'd
+			Pointer alpha = pointerTo(1.0); // TODO
+			Pointer beta = pointerTo(0.0);
+
+			int padding[] = { pad_h, pad_w };
+			int strides[] = { stride_h, stride_w };
+			convDesc = allocateConvolutionDescriptor(padding, strides);
+			long sizeInBytesArray[] = { 0 };
+
+			// TODO: Select the best algorithm depending on the data and supported CUDA
+			int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
+			workSpace = new Pointer();
+			cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
+					wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
+			sizeInBytes = sizeInBytesArray[0];
+			if(sizeInBytes != 0)
+				cudaMalloc(workSpace, sizeInBytes);
+
+			int status = cudnnConvolutionBackwardData(cudnnHandle, alpha, wDesc, w,
+					dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, beta, dxDesc, dx);
+			if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
+				throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
+			}
+		}
+		finally {
+			if(dyDesc != null)
+				cudnnDestroyTensorDescriptor(dyDesc);
+			if(dxDesc != null)
+				cudnnDestroyTensorDescriptor(dxDesc);
+			if(wDesc != null)
+				cudnnDestroyFilterDescriptor(wDesc);
+			if(convDesc != null)
+				cudnnDestroyConvolutionDescriptor(convDesc);
+			if(workSpace != null && sizeInBytes != 0)
+				cudaFree(workSpace);
+		}
+	}
+
+	public static boolean isInSparseFormat(MatrixObject mo) {
+		if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated)
+			return mo.getGPUObject().isInSparseFormat;
+		else if(mo.getMatrixBlock() != null && mo.getMatrixBlock().getDenseBlock() != null)
+			return false;
+		return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
+	}
+}
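Why matmult above swaps its operands: cuBLAS assumes column-major storage, while SystemML's dense blocks are row-major, and a row-major matrix reinterpreted as column-major is exactly its transpose. Computing C^T = B^T * A^T in column-major therefore leaves C in row-major order with no explicit transpose. A minimal CPU-only sketch of that reindexing (illustrative code, not part of the commit):

public class RowMajorGemmDemo {
	// Naive column-major GEMM: C = A * B, where element (i,j) of an m x n
	// column-major matrix with leading dimension ld is stored at data[i + j*ld].
	static void dgemmColMajor(int m, int n, int k, double[] A, int lda,
			double[] B, int ldb, double[] C, int ldc) {
		for (int j = 0; j < n; j++)
			for (int i = 0; i < m; i++) {
				double sum = 0;
				for (int p = 0; p < k; p++)
					sum += A[i + p * lda] * B[p + j * ldb];
				C[i + j * ldc] = sum;
			}
	}

	public static void main(String[] args) {
		// Row-major A (2x3) and B (3x2); we want row-major C = A * B (2x2).
		double[] A = { 1, 2, 3,  4, 5, 6 };
		double[] B = { 7, 8,  9, 10,  11, 12 };
		double[] C = new double[4];
		// Reinterpreted as column-major, A's array holds A^T (3x2) and B's holds B^T (2x3).
		// Computing C^T = B^T * A^T (2x2, column-major) stores C row-major:
		dgemmColMajor(2, 2, 3, B, 2, A, 3, C, 2);
		System.out.println(java.util.Arrays.toString(C)); // [58.0, 64.0, 139.0, 154.0]
	}
}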
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
index 80b20cd..b2da5e2 100644
--- a/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
+++ b/src/main/java/org/apache/sysml/runtime/util/ConvolutionUtils.java
@@ -19,6 +19,20 @@
 
 package org.apache.sysml.runtime.util;
 
+import java.util.ArrayList;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.hops.AggBinaryOp;
+import org.apache.sysml.hops.ConvolutionOp;
+import org.apache.sysml.hops.Hop;
+import org.apache.sysml.hops.HopsException;
+import org.apache.sysml.hops.ReorgOp;
+import org.apache.sysml.hops.Hop.ConvOp;
+import org.apache.sysml.hops.Hop.ReOrgOp;
+import org.apache.sysml.lops.Lop;
+import org.apache.sysml.lops.LopsException;
+import org.apache.sysml.lops.LopProperties.ExecType;
+
 public class ConvolutionUtils {
 
@@ -40,4 +54,117 @@ public class ConvolutionUtils {
 		return ret;
 	}
 
+	private static boolean isMatMult(Hop hop) {
+		return hop instanceof AggBinaryOp;
+	}
+
+	private static boolean isTranspose(Hop hop) {
+		return hop instanceof ReorgOp && ((ReorgOp)hop).getOp() == ReOrgOp.TRANSPOSE;
+	}
+
+	private static boolean isConvolutionOp(Hop hop, Hop.ConvOp op) {
+		return hop instanceof ConvolutionOp && ((ConvolutionOp)hop).getOp() == op;
+	}
+
+	public static Lop constructConvolutionBackwardFilterLops(Hop currentHop) throws HopsException, LopsException {
+		ExecType et = ExecType.CP;
+		if(DMLScript.USE_ACCELERATOR)
+			et = ExecType.GPU; // TODO: Add memory estimate checks
+		else
+			return null;
+
+		if(currentHop != null && isTranspose(currentHop)) {
+			Hop matMult = currentHop.getInput().get(0);
+			if(matMult != null && isMatMult(matMult)) {
+				Hop x_col = matMult.getInput().get(0);
+				Hop right = matMult.getInput().get(1);
+				if(isConvolutionOp(x_col, ConvOp.IM2COL) && isConvolutionOp(right, ConvOp.ROTATE180)) {
+					Hop image = x_col.getInput().get(0);
+					Hop dout = right.getInput().get(0);
+					ArrayList<Hop> inputs = new ArrayList<Hop>();
+					inputs.add(image);
+					inputs.add(dout);
+					for(int i = 1; i < x_col.getInput().size(); i++) {
+						inputs.add(x_col.getInput().get(i));
+					}
+					ConvolutionOp fusedHop = new ConvolutionOp("tmp_directconv2dBackwardFilter" + image.getName(), image.getDataType(), image.getValueType(), ConvOp.DIRECT_CONV2D_BACKWARD_FILTER, inputs);
+					setPositions(currentHop, fusedHop);
+					return fusedHop.constructConvolutionLops(et, inputs);
+				}
+			}
+		}
+		return null;
+	}
+
+	public static Lop constructConvolutionLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
+		if(DMLScript.USE_ACCELERATOR)
+			et = ExecType.GPU; // TODO: Add memory estimate checks
+		else
+			return null;
+
+		if(currentHop != null && isConvolutionOp(currentHop, ConvOp.RESHAPE_COL)) {
+			Hop matMult = currentHop.getInput().get(0);
+			if(matMult != null && isMatMult(matMult)) {
+				Hop filter = matMult.getInput().get(0);
+				Hop x_col = matMult.getInput().get(1);
+				if(isConvolutionOp(x_col, ConvOp.IM2COL)) {
+					Hop image = x_col.getInput().get(0);
+					ArrayList<Hop> inputs = new ArrayList<Hop>();
+					inputs.add(image);
+					inputs.add(filter);
+					for(int i = 1; i < x_col.getInput().size(); i++) {
+						inputs.add(x_col.getInput().get(i));
+					}
+					ConvolutionOp fusedHop = new ConvolutionOp("tmp_directconv2d" + image.getName(), image.getDataType(), image.getValueType(), ConvOp.DIRECT_CONV2D, inputs);
+					setPositions(currentHop, fusedHop);
+					return fusedHop.constructConvolutionLops(et, inputs);
+				}
+			}
+		}
+
+		return null;
+	}
+
+	public static Lop constructConvolutionBackwardDataLops(Hop currentHop, ExecType et) throws HopsException, LopsException {
+		if(DMLScript.USE_ACCELERATOR)
+			et = ExecType.GPU; // TODO: Add memory estimate checks
+		else
+			return null;
+
+		if(currentHop != null && isConvolutionOp(currentHop, ConvOp.COL2IM)) {
+			Hop temp = currentHop.getInput().get(0);
+			if(temp != null && isTranspose(temp)) {
+				Hop matMult = temp.getInput().get(0);
+				if(matMult != null && isMatMult(matMult)) {
+					Hop rotate180 = matMult.getInput().get(0);
+					Hop filter = matMult.getInput().get(1);
+					if(isConvolutionOp(rotate180, ConvOp.ROTATE180)) {
+						ArrayList<Hop> inputs = new ArrayList<Hop>();
+						inputs.add(filter);
+						inputs.add(rotate180.getInput().get(0));
+						for(int i = 1; i < rotate180.getInput().size(); i++) {
+							inputs.add(rotate180.getInput().get(i));
+						}
+						ConvolutionOp fusedHop = new ConvolutionOp("tmp_directconv2dBackwardData" + filter.getName(), filter.getDataType(), filter.getValueType(), ConvOp.DIRECT_CONV2D_BACKWARD_DATA, inputs);
+						setPositions(currentHop, fusedHop);
+						return fusedHop.constructConvolutionLops(et, inputs);
+					}
+				}
+			}
+		}
+
+		return null;
+	}
+
+	private static void setPositions(Hop currentHop, Hop fusedHop) {
+		fusedHop.setAllPositions(currentHop.getBeginLine(), currentHop.getBeginColumn(), currentHop.getEndLine(), currentHop.getEndColumn());
+	}
 }
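For intuition, each construct*Lops method above walks a HOP subtree and collapses a multi-operator pattern, e.g. reshape_col(filter %*% im2col(image)), into one fused ConvolutionOp. A toy, self-contained sketch of that pattern match, using a stand-in Node class instead of the real Hop API:

import java.util.Arrays;
import java.util.List;

public class FusionDemo {
	static class Node {
		final String op; final List<Node> inputs;
		Node(String op, Node... inputs) { this.op = op; this.inputs = Arrays.asList(inputs); }
	}

	// Matches reshape_col(filter %*% im2col(image)) and "fuses" it into a
	// single direct_conv2d(image, filter) node, mirroring constructConvolutionLops.
	static Node fuseConv2d(Node hop) {
		if (!hop.op.equals("reshape_col")) return null;
		Node matMult = hop.inputs.get(0);
		if (!matMult.op.equals("matmult")) return null;
		Node filter = matMult.inputs.get(0);
		Node xCol = matMult.inputs.get(1);
		if (!xCol.op.equals("im2col")) return null;
		Node image = xCol.inputs.get(0);
		return new Node("direct_conv2d", image, filter);
	}

	public static void main(String[] args) {
		Node pattern = new Node("reshape_col",
				new Node("matmult", new Node("filter"), new Node("im2col", new Node("image"))));
		System.out.println(fuseConv2d(pattern).op); // direct_conv2d
	}
}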
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java
index edb3493..b8bf599 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -60,6 +60,8 @@ public class Statistics
 	// number of compiled/executed SP instructions
 	private static int iNoOfExecutedSPInst = 0;
 	private static int iNoOfCompiledSPInst = 0;
+
+	private static int iNoOfExecutedGPUInst = 0;
 
 	//JVM stats
 	private static long jitCompileTime = 0; //in milli sec
@@ -105,6 +107,21 @@ public class Statistics
 	private static AtomicLong denseBlockAllocationTime = new AtomicLong(0);
 	private static AtomicLong sparseBlockAllocationTime = new AtomicLong(0);
 
+	public static long cudaInitTime = 0;
+	public static long cudaLibrariesInitTime = 0;
+	public static AtomicLong cudaAllocTime = new AtomicLong(0);
+	public static AtomicLong cudaDeAllocTime = new AtomicLong(0);
+	public static AtomicLong cudaToDevTime = new AtomicLong(0);
+	public static AtomicLong cudaFromDevTime = new AtomicLong(0);
+	public static AtomicLong cudaAllocCount = new AtomicLong(0);
+	public static AtomicLong cudaDeAllocCount = new AtomicLong(0);
+	public static AtomicLong cudaToDevCount = new AtomicLong(0);
+	public static AtomicLong cudaFromDevCount = new AtomicLong(0);
+	// Potential CUDA heavy hitters
+	public static AtomicLong cudaMultTime = new AtomicLong(0);
+	public static AtomicLong cudaConvFwdTime = new AtomicLong(0);
+	public static AtomicLong gpuSparseMultCount = new AtomicLong(0);
+
 	public static void incrementAllocationTime(long allocationTime, boolean isSparse) {
 		if(isSparse)
 			sparseBlockAllocationTime.addAndGet(allocationTime);
@@ -139,6 +156,19 @@ public class Statistics
 	public static synchronized void incrementNoOfCompiledMRJobs() {
 		iNoOfCompiledMRJobs ++;
 	}
+
+	public static synchronized void setNoOfExecutedGPUInst(int numJobs) {
+		iNoOfExecutedGPUInst = numJobs;
+	}
+
+	public static synchronized void incrementNoOfExecutedGPUInst() {
+		iNoOfExecutedGPUInst ++;
+	}
+
+	public static synchronized int getNoOfExecutedGPUInst() {
+		return iNoOfExecutedGPUInst;
+	}
 
 	public static synchronized void setNoOfExecutedSPInst(int numJobs) {
 		iNoOfExecutedSPInst = numJobs;
@@ -226,6 +256,9 @@ public class Statistics
 			setNoOfExecutedMRJobs(count);
 			setNoOfExecutedSPInst(0);
 		}
+
+		if( DMLScript.USE_ACCELERATOR )
+			setNoOfExecutedGPUInst(0);
 	}
 
 	public static synchronized void incrementJITCompileTime( long time ) {
@@ -597,6 +630,28 @@ public class Statistics
 			sb.append("Number of executed MR Jobs:\t" + getNoOfExecutedMRJobs() + ".\n");
 		}
 
+		if( DMLScript.USE_ACCELERATOR && DMLScript.STATISTICS ) {
+			sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
+					+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
+			sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
+			sb.append("GPU mem tx time (alloc/dealloc/toDev/fromDev):\t"
+					+ String.format("%.3f", cudaAllocTime.get()*1e-9) + "/"
+					+ String.format("%.3f", cudaDeAllocTime.get()*1e-9) + "/"
+					+ String.format("%.3f", cudaToDevTime.get()*1e-9) + "/"
+					+ String.format("%.3f", cudaFromDevTime.get()*1e-9) + " sec.\n");
+			sb.append("GPU mem tx count (alloc/dealloc/toDev/fromDev):\t"
+					+ cudaAllocCount.get() + "/"
+					+ cudaDeAllocCount.get() + "/"
+					+ cudaToDevCount.get() + "/"
+					+ cudaFromDevCount.get() + ".\n");
+			sb.append("CUDA op time (mult/conv):\t"
+					+ String.format("%.3f", cudaMultTime.get()*1e-9) + "/"
+					+ String.format("%.3f", cudaConvFwdTime.get()*1e-9) + " sec.\n");
+			if(gpuSparseMultCount.get() > 0)
+				sb.append("CUDA op count (sparse mult):\t" + gpuSparseMultCount.get() + ".\n");
+		}
+
 		//show extended caching/compilation statistics
 		if( DMLScript.STATISTICS )
 		{
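The cuda*Time counters above accumulate nanoseconds at each call site (see Statistics.cudaMultTime.addAndGet in LibMatrixCUDA.matmult) and are converted to seconds only when printed, via * 1e-9. The pattern in a standalone sketch:

import java.util.concurrent.atomic.AtomicLong;

public class TimerDemo {
	static final AtomicLong cudaMultTime = new AtomicLong(0);

	public static void main(String[] args) throws InterruptedException {
		long start = System.nanoTime();
		Thread.sleep(50); // stands in for the cuBLAS call being timed
		cudaMultTime.addAndGet(System.nanoTime() - start);
		// Nanoseconds to seconds, formatted exactly as in Statistics above:
		System.out.println(String.format("CUDA op time (mult):\t%.3f sec.", cudaMultTime.get() * 1e-9));
	}
}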
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
index eb87197..066e4e5 100644
--- a/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
+++ b/src/test/java/org/apache/sysml/test/integration/AutomatedTestBase.java
@@ -86,6 +86,9 @@ public abstract class AutomatedTestBase
 	public static final boolean EXCEPTION_EXPECTED = true;
 	public static final boolean EXCEPTION_NOT_EXPECTED = false;
 
+	// TEST_GPU is false by default so that developers without an Nvidia GPU can run the integration test suite
+	public static final boolean TEST_GPU = false;
+
 	protected ScriptType scriptType;
 
 	// *** HACK ALERT *** HACK ALERT *** HACK ALERT ***
@@ -105,8 +108,18 @@ public abstract class AutomatedTestBase
 			System.setProperty("hadoop.home.dir", cwd + File.separator
 				+ "\\src\\test\\config\\hadoop_bin_windows");
-			System.setProperty("java.library.path", cwd + File.separator
-				+ "\\src\\test\\config\\hadoop_bin_windows\\bin");
+			if(TEST_GPU) {
+				String CUDA_LIBRARY_PATH = System.getenv("CUDA_PATH") + File.separator + "bin";
+				System.setProperty("java.library.path", cwd + File.separator
+					+ "\\src\\test\\config\\hadoop_bin_windows\\bin" + File.pathSeparator
+					+ "/lib" + File.pathSeparator + CUDA_LIBRARY_PATH);
+			}
+			else {
+				System.setProperty("java.library.path", cwd + File.separator
+					+ "\\src\\test\\config\\hadoop_bin_windows\\bin");
+			}
 
 			// Need to muck around with the classloader to get it to use the new
@@ -1150,6 +1163,8 @@ public abstract class AutomatedTestBase
 			//use optional config file since default under SystemML/DML
 			args.add("-config="+ getCurConfigFile().getPath());
 
+			if(TEST_GPU)
+				args.add("-gpu");
 
 			// program-specific parameters
 			if ( newWay ) {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/f306b0b1/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
index 9276280..60e97e0 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/binary/matrix/MatrixMultiplicationTest.java
@@ -77,6 +77,33 @@ public class MatrixMultiplicationTest extends AutomatedTestBase
 		compareResults(0.00000000001);
 	}
 
+	@Test
+	public void testSparseMatrixMultiplication() {
+		int m = 40;
+		int n = 10;
+		int k = 30;
+
+		TestConfiguration config = availableTestConfigurations.get("MatrixMultiplicationTest");
+		config.addVariable("m", m);
+		config.addVariable("n1", n);
+		config.addVariable("n2", n);
+		config.addVariable("k", k);
+
+		loadTestConfiguration(config);
+
+		double[][] a = getRandomMatrix(m, n, -1, 1, 0.1, -1);
+		double[][] b = getRandomMatrix(n, k, -1, 1, 0.1, -1);
+		double[][] c = TestUtils.performMatrixMultiplication(a, b);
+
+		writeInputMatrix("a", a);
+		writeInputMatrix("b", b);
+		writeExpectedMatrix("c", c);
+
+		runTest();
+
+		compareResults(0.00000000001);
+	}
 
 	@Test
 	public void testWrongDimensions() {
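The expected output c in testSparseMatrixMultiplication is computed on the CPU by TestUtils.performMatrixMultiplication; behaviorally that is the textbook triple loop, roughly as in this independent sketch (not the actual TestUtils source):

public class ReferenceMatMult {
	// Naive O(m*n*k) reference multiplication of a (m x n) by b (n x k),
	// against which the GPU result is compared within the given tolerance.
	static double[][] multiply(double[][] a, double[][] b) {
		int m = a.length, n = b.length, k = b[0].length;
		double[][] c = new double[m][k];
		for (int i = 0; i < m; i++)
			for (int j = 0; j < k; j++)
				for (int p = 0; p < n; p++)
					c[i][j] += a[i][p] * b[p][j];
		return c;
	}

	public static void main(String[] args) {
		double[][] a = { { 1, 2 }, { 3, 4 } };
		double[][] b = { { 5, 6 }, { 7, 8 } };
		System.out.println(java.util.Arrays.deepToString(multiply(a, b))); // [[19.0, 22.0], [43.0, 50.0]]
	}
}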
