http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/4f9dcf9a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 51a0f6b..1511afc 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -44,6 +44,7 @@ import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor; import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor; import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor; import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor; +import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU; import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION; import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE; import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN; @@ -55,23 +56,61 @@ import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE; import static jcuda.runtime.JCuda.cudaDeviceSynchronize; import static jcuda.runtime.JCuda.cudaMemcpy; +import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost; import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice; -import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice; -import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU; import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.allocate; import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.cudaFreeHelper; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; -import org.apache.sysml.runtime.functionobjects.*; +import org.apache.sysml.runtime.functionobjects.And; +import org.apache.sysml.runtime.functionobjects.Builtin; +import org.apache.sysml.runtime.functionobjects.CM; +import org.apache.sysml.runtime.functionobjects.Divide; +import org.apache.sysml.runtime.functionobjects.Equals; +import org.apache.sysml.runtime.functionobjects.GreaterThan; +import org.apache.sysml.runtime.functionobjects.GreaterThanEquals; +import org.apache.sysml.runtime.functionobjects.IndexFunction; +import org.apache.sysml.runtime.functionobjects.KahanPlus; +import org.apache.sysml.runtime.functionobjects.KahanPlusSq; +import org.apache.sysml.runtime.functionobjects.LessThan; +import org.apache.sysml.runtime.functionobjects.LessThanEquals; +import org.apache.sysml.runtime.functionobjects.Mean; +import org.apache.sysml.runtime.functionobjects.Minus; +import org.apache.sysml.runtime.functionobjects.Multiply; +import org.apache.sysml.runtime.functionobjects.Multiply2; +import org.apache.sysml.runtime.functionobjects.NotEquals; +import org.apache.sysml.runtime.functionobjects.Or; +import org.apache.sysml.runtime.functionobjects.Plus; +import org.apache.sysml.runtime.functionobjects.Power; +import org.apache.sysml.runtime.functionobjects.Power2; +import org.apache.sysml.runtime.functionobjects.ReduceAll; +import 
org.apache.sysml.runtime.functionobjects.ReduceCol; +import org.apache.sysml.runtime.functionobjects.ReduceDiag; +import org.apache.sysml.runtime.functionobjects.ReduceRow; +import org.apache.sysml.runtime.functionobjects.ValueFunction; import org.apache.sysml.runtime.instructions.cp.DoubleObject; -import org.apache.sysml.runtime.instructions.gpu.context.*; +import org.apache.sysml.runtime.instructions.gpu.GPUInstruction; +import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; +import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext; +import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels; +import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject; import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer; -import org.apache.sysml.runtime.matrix.operators.*; +import org.apache.sysml.runtime.matrix.operators.AggregateOperator; +import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator; +import org.apache.sysml.runtime.matrix.operators.BinaryOperator; +import org.apache.sysml.runtime.matrix.operators.CMOperator; +import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator; +import org.apache.sysml.runtime.matrix.operators.RightScalarOperator; +import org.apache.sysml.runtime.matrix.operators.ScalarOperator; +import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; + import jcuda.Pointer; import jcuda.Sizeof; import jcuda.jcublas.JCublas2; @@ -146,6 +185,16 @@ public class LibMatrixCUDA { return _WARP_SIZE; } + public static boolean isInSparseFormat(MatrixObject mo) { + if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated()) + return mo.getGPUObject().isInSparseFormat(); + return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz()); + } + + + //********************************************************************/ + //***************** DEEP LEARNING Operators **************************/ + //********************************************************************/ public static cudnnHandle cudnnHandle; @@ -157,14 +206,15 @@ public class LibMatrixCUDA { private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; - public static void conv2d(MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, + public static void conv2d(String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { + if(isInSparseFormat(image)) { - ((JCudaObject)image.getGPUObject()).sparseToDense(); + ((JCudaObject)image.getGPUObject()).sparseToDense(instName); } if(isInSparseFormat(filter)) { - ((JCudaObject)filter.getGPUObject()).sparseToDense(); + ((JCudaObject)filter.getGPUObject()).sparseToDense(instName); } cudnnTensorDescriptor srcTensorDesc = null; @@ -176,11 +226,14 @@ public class LibMatrixCUDA { Pointer alpha = null; Pointer beta = null; try { + long t1=0, t2=0; // Allocate descriptors + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); srcTensorDesc = allocateTensorDescriptor(N, C, H, W); dstTensorDesc = allocateTensorDescriptor(N, K, P, Q); filterDesc = allocateFilterDescriptor(K, C, R, S); + Pointer imagePointer = ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; Pointer filterPointer = 
((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr; Pointer dstPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; @@ -219,24 +272,28 @@ public class LibMatrixCUDA { else { throw new DMLRuntimeException("Unsupported preference criteria for convolution"); } + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); alpha = pointerTo(1.0); beta = pointerTo(0.0f); + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); int status = cudnnConvolutionForward(cudnnHandle, alpha, srcTensorDesc, imagePointer, filterDesc, filterPointer, convDesc, algo, workSpace, sizeInBytes, beta, dstTensorDesc, dstPointer); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not execute cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } } finally { - + long t3=0; + if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); if(alpha != null) - cudaFreeHelper(alpha); + cudaFreeHelper(instName, alpha); if(beta != null) - cudaFreeHelper(beta); + cudaFreeHelper(instName, beta); if(srcTensorDesc != null) cudnnDestroyTensorDescriptor(srcTensorDesc); @@ -247,7 +304,8 @@ public class LibMatrixCUDA { if(convDesc != null) cudnnDestroyConvolutionDescriptor(convDesc); if(workSpace != null && sizeInBytes != 0) - cudaFreeHelper(workSpace); + cudaFreeHelper(instName, workSpace); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } }
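The change this hunk repeats for every GPU entry point is the same instrumentation idiom: read System.nanoTime() behind the GPUStatistics.DISPLAY_STATISTICS guard, run the GPU call, then charge the elapsed time to the invoking instruction under a named miscellaneous timer. A minimal sketch of the idiom, using only names that appear in this diff (the surrounding variables are assumed to be in scope as above):

```java
// Time a GPU call only when statistics are enabled; the steady-state
// cost when they are off is a single branch per call site.
long t = 0;
if (GPUStatistics.DISPLAY_STATISTICS) t = System.nanoTime();
int status = cudnnConvolutionForward(cudnnHandle, alpha, srcTensorDesc, imagePointer,
        filterDesc, filterPointer, convDesc, algo, workSpace, sizeInBytes,
        beta, dstTensorDesc, dstPointer);
if (GPUStatistics.DISPLAY_STATISTICS)
    GPUStatistics.maintainCPMiscTimes(instName,
            GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t);
```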
@@ -295,27 +353,32 @@ public class LibMatrixCUDA { /** * This method computes the backpropagation errors for previous layer of relu operation - * + * @param instName the invoking instruction's name for recording {@link Statistics}. * @param input input image * @param dout next layer error propagation * @param outputBlock output * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void reluBackward(MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException { + public static void reluBackward(String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException { if(isInSparseFormat(input)) { - ((JCudaObject)input.getGPUObject()).sparseToDense(); + ((JCudaObject)input.getGPUObject()).sparseToDense(instName); } if(isInSparseFormat(dout)) { - ((JCudaObject)dout.getGPUObject()).sparseToDense(); + ((JCudaObject)dout.getGPUObject()).sparseToDense(instName); } long rows = input.getNumRows(); long cols = input.getNumColumns(); Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; Pointer doutPointer = ((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; - kernels.launchKernel("reluBackward", + + long t1=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + kernels.launchKernel("relu_backward", ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), imagePointer, doutPointer, outputPointer, (int)rows, (int)cols); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); + } /** @@ -323,18 +386,18 @@ public class LibMatrixCUDA { * ones = matrix(1, rows=1, cols=Hout*Wout) * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout) * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function - * + * @param instName the invoking instruction's name for recording {@link Statistics}. * @param input input image * @param bias bias * @param outputBlock output * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void biasAdd(MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { + public static void biasAdd(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { if(isInSparseFormat(input)) { - ((JCudaObject)input.getGPUObject()).sparseToDense(); + ((JCudaObject)input.getGPUObject()).sparseToDense(instName); } if(isInSparseFormat(bias)) { - ((JCudaObject)bias.getGPUObject()).sparseToDense(); + ((JCudaObject)bias.getGPUObject()).sparseToDense(instName); } long rows = input.getNumRows(); long cols = input.getNumColumns(); @@ -346,15 +409,18 @@ Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr; Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr; Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; - kernels.launchKernel("biasAdd", + long t1 = 0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + kernels.launchKernel("bias_add", ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols), imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); }
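The DML identity in biasAdd's javadoc (output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)) reduces to a per-element rule: every block of PQ = Hout*Wout consecutive columns shares one bias value. A CPU reference sketch of what the bias_add kernel computes; the method name and the row-major double[] layout are assumptions for illustration, not part of the commit:

```java
// Reference semantics of bias_add: column c of an N x (F*PQ) input
// belongs to filter c / PQ, whose bias is added to every element.
static void biasAddReference(double[] input, double[] bias, double[] output,
        int rows, int cols, int PQ) {
    for (int r = 0; r < rows; r++)
        for (int c = 0; c < cols; c++)
            output[r * cols + c] = input[r * cols + c] + bias[c / PQ];
}
```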
/** * This method computes the backpropagation errors for filter of convolution operation - * + * @param instName the invoking instruction's name for recording {@link Statistics}. * @param image input image * @param dout errors from next layer * @param outputBlock output errors @@ -373,16 +439,17 @@ * @param Q output activation width * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void conv2dBackwardFilter(MatrixObject image, MatrixObject dout, + public static void conv2dBackwardFilter(String instName, MatrixObject image, MatrixObject dout, MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { if(isInSparseFormat(image)) { - ((JCudaObject)image.getGPUObject()).sparseToDense(); + ((JCudaObject)image.getGPUObject()).sparseToDense(instName); } if(isInSparseFormat(dout)) { - ((JCudaObject)dout.getGPUObject()).sparseToDense(); + ((JCudaObject)dout.getGPUObject()).sparseToDense(instName); } + Pointer alpha = null; Pointer beta = null; cudnnTensorDescriptor xTensorDesc = null; @@ -393,6 +460,9 @@ Pointer workSpace = null; long sizeInBytes = 0; try { + + long t1=0, t2=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); // Allocate descriptors xTensorDesc = allocateTensorDescriptor(N, C, H, W); doutTensorDesc = allocateTensorDescriptor(N, K, P, Q); @@ -413,21 +483,31 @@ // TODO: Select the best algorithm depending on the data and supported CUDA int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; + workSpace = new Pointer(); cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle, xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); int status = cudnnConvolutionBackwardFilter(cudnnHandle, alpha, xTensorDesc, imagePointer, doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, beta, dwDesc, dwPointer); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2); + if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } } finally { + long t3=0; + if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); if(alpha != null) - cudaFreeHelper(alpha); + cudaFreeHelper(instName, alpha); if(beta != null) - cudaFreeHelper(beta); + cudaFreeHelper(instName, beta); + if(workSpace != null && sizeInBytes != 0) + cudaFreeHelper(instName, workSpace); + if(xTensorDesc != null) cudnnDestroyTensorDescriptor(xTensorDesc); if(doutTensorDesc != null) @@ -437,143 +517,468 @@ public class LibMatrixCUDA { if(convDesc != null) cudnnDestroyConvolutionDescriptor(convDesc); - - if(workSpace != null && sizeInBytes != 0) - cudaFreeHelper(workSpace); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } } private static long numDoublesIn2GB = 125000000; - public static void relu(ExecutionContext ec, MatrixObject in, String outputName) throws DMLRuntimeException { - if(isInSparseFormat(in)) { - // TODO: FIXME: Implement sparse relu kernel - ((JCudaObject)in.getGPUObject()).sparseToDense(); + /** + * This method computes the backpropagation errors 
for previous layer of convolution operation + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param filter filter used in conv2d + * @param dout errors from next layer + * @param output output errors + * @param N number of images + * @param C number of channels + * @param H height + * @param W width + * @param K number of filters + * @param R filter height + * @param S filter width + * @param pad_h pad height + * @param pad_w pad width + * @param stride_h stride height + * @param stride_w stride width + * @param P output activation height + * @param Q output activation width + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void conv2dBackwardData(String instName, MatrixObject filter, MatrixObject dout, + MatrixObject output, int N, int C, int H, int W, int K, int R, + int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, + int Q) throws DMLRuntimeException { + if(isInSparseFormat(dout)) { + ((JCudaObject)dout.getGPUObject()).sparseToDense(instName); + } + if(isInSparseFormat(filter)) { + ((JCudaObject)filter.getGPUObject()).sparseToDense(instName); } - - cudnnTensorDescriptor srcTensorDesc = null; - cudnnTensorDescriptor dstTensorDesc = null; Pointer alpha = null; Pointer beta = null; + cudnnTensorDescriptor dyDesc = null; + cudnnTensorDescriptor dxDesc = null; + cudnnFilterDescriptor wDesc = null; + cudnnConvolutionDescriptor convDesc = null; + Pointer workSpace = null; + long sizeInBytes = 0; try { - alpha = pointerTo(1.0f); + long t1=0, t2=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + // Allocate descriptors + wDesc = allocateFilterDescriptor(K, C, R, S); + dyDesc = allocateTensorDescriptor(N, K, P, Q); + dxDesc = allocateTensorDescriptor(N, C, H, W); + + // Allocate data + Pointer w = ((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr; + Pointer dy = ((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; + Pointer dx = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; + + alpha = pointerTo(1.0); // TODO beta = pointerTo(0.0f); - long N = in.getNumRows(); - long H = in.getNumColumns(); - long W = 1; - Pointer srcData = ((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr; - MatrixObject output = ec.getMatrixObject(outputName); - ec.getDenseMatrixOutputForGPUInstruction(outputName); // Allocated the dense output matrix - Pointer dstData = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; + int padding [] = { pad_h, pad_w }; + int strides [] = { stride_h, stride_w }; + convDesc = allocateConvolutionDescriptor(padding, strides); + long sizeInBytesArray[] = { 0 }; - if(N*H*W >= numDoublesIn2GB) { - // Invokes relu(double* A, double* ret, int rlen, int clen) - kernels.launchKernel("relu", - ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)), - srcData, dstData, (int)N, (int) H*W); - } - else { - // Allocate descriptors - srcTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W); - dstTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W); - cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor(); - cudnnCreateActivationDescriptor(activationDescriptor); - double dummy = -1; - cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy); - cudnnActivationForward(cudnnHandle, activationDescriptor, - alpha, srcTensorDesc, srcData, - beta, dstTensorDesc, dstData); + // TODO: Select the best algorithm depending on the data and supported CUDA + int algo = 
jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; + workSpace = new Pointer(); + cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle, + wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + int status = cudnnConvolutionBackwardData(cudnnHandle, alpha, wDesc, w, + dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, beta, dxDesc, dx); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2); + + if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { + throw new DMLRuntimeException("Could not execute cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); + } } finally { + long t3=0; + if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); if(alpha != null) - cudaFreeHelper(alpha); + cudaFreeHelper(instName, alpha); if(beta != null) - cudaFreeHelper(beta); + cudaFreeHelper(instName, beta); + if(workSpace != null && sizeInBytes != 0) + cudaFreeHelper(instName, workSpace); - if(srcTensorDesc != null) - cudnnDestroyTensorDescriptor(srcTensorDesc); - if(dstTensorDesc != null) - cudnnDestroyTensorDescriptor(dstTensorDesc); + if(dyDesc != null) + cudnnDestroyTensorDescriptor(dyDesc); + if(dxDesc != null) + cudnnDestroyTensorDescriptor(dxDesc); + if(wDesc != null) + cudnnDestroyFilterDescriptor(wDesc); + if(convDesc != null) + cudnnDestroyConvolutionDescriptor(convDesc); + + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); } }
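One subtlety shared by the backward-data path above and the backward-filter path earlier: the workspace size is queried into sizeInBytesArray, but sizeInBytes is never updated from it, so the convolution always runs with a zero-byte workspace. That is harmless for the ALGO_0 variants used here, which typically report no workspace requirement, but an algorithm that does need one would require the queried size to be honoured. A hedged sketch of that variant, reusing the allocate/cudaFreeHelper helpers imported at the top of this file (illustrative only, not part of the commit):

```java
// Allocate the workspace cuDNN asks for before running a
// workspace-hungry algorithm; ALGO_0 reports 0 bytes, which is why
// the code above can pass an unallocated Pointer and sizeInBytes == 0.
long[] sizeInBytesArray = { 0 };
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
        wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
long sizeInBytes = sizeInBytesArray[0];
Pointer workSpace = (sizeInBytes > 0) ? allocate(sizeInBytes) : new Pointer();
try {
    cudnnConvolutionBackwardData(cudnnHandle, alpha, wDesc, w,
            dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, beta, dxDesc, dx);
} finally {
    if (sizeInBytes > 0) cudaFreeHelper(instName, workSpace);
}
```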
/** - * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...) - * - * @param ec execution context - * @param left input matrix, as in a tsmm expression like A %*% A' or A' %*% A, we just need to check whether the left one is transposed or not, I named it 'left' - * @param outputName output matrix name - * @param isLeftTransposed if true, left transposed + * Performs maxpooling on GPU by exploiting cudnnPoolingForward(...) + * @param instName the invoking instruction's name for recording {@link Statistics}. + * @param image image as matrix object + * @param outputBlock output matrix + * @param N batch size + * @param C number of channels + * @param H height of image + * @param W width of image + * @param K number of filters + * @param R height of filter + * @param S width of filter + * @param pad_h vertical padding + * @param pad_w horizontal padding + * @param stride_h vertical stride + * @param stride_w horizontal stride + * @param P (H + 2*pad_h - R)/stride_h + 1 + * @param Q (W + 2*pad_w - S)/stride_w + 1 * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void matmultTSMM(ExecutionContext ec, MatrixObject left, String outputName, - boolean isLeftTransposed) throws DMLRuntimeException { - if(isInSparseFormat(left)) { - // For sparse TSMM, invoke matmult (TODO: possible performance improvement) - matmult(ec, left, left, outputName, isLeftTransposed, !isLeftTransposed); - return; + public static void maxpooling(String instName, MatrixObject image, + MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, + int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, + int Q) throws DMLRuntimeException { + if(isInSparseFormat(image)) { + ((JCudaObject)image.getGPUObject()).sparseToDense(instName); } + Pointer alpha = null; + Pointer beta = null; + cudnnTensorDescriptor xDesc = null; + cudnnTensorDescriptor yDesc = null; + cudnnPoolingDescriptor poolingDesc = null; - // For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix - MatrixObject output = ec.getMatrixObject(outputName); - ec.getDenseMatrixOutputForGPUInstruction(outputName); // Allocated the dense output matrix - - // Since CuBLAS expects inputs in column-major format, - // reverse the order of matrix-multiplication and take care of dimension mismatch. - int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; - // Note: the dimensions are swapped - int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows()); - int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns()); + try { + long t1=0,t2=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + // Allocate descriptors + yDesc = allocateTensorDescriptor(N, C, P, Q); + xDesc = allocateTensorDescriptor(N, C, H, W); + poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w); - if(m == -1) - throw new DMLRuntimeException("Incorrect dimensions"); + // Allocate data + Pointer x = ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; + Pointer y = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; - double[] alpha = {1.0d}; - double[] beta = {0.0d}; + alpha = pointerTo(1.0); + beta = pointerTo(0.0f); - int lda = (int) (isLeftTransposed ? 
m : k); - int ldc = m; + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); - if(!left.getGPUObject().isAllocated()) - throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject().isAllocated()); - if(!output.getGPUObject().isAllocated()) - throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated()); + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + int status = cudnnPoolingForward(cudnnHandle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); - Pointer A = ((JCudaObject)left.getGPUObject()).jcudaDenseMatrixPtr; - Pointer C = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; + if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { + throw new DMLRuntimeException("Could not execute cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); + } } + finally { + long t3=0; + if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); + if(alpha != null) + cudaFreeHelper(instName, alpha); + if(beta != null) + cudaFreeHelper(instName, beta); - JCublas2.cublasDsyrk(cublasHandle, cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, Pointer.to(alpha), A, lda, Pointer.to(beta), C, ldc); - copyUpperToLowerTriangle(output); + if(yDesc != null) + cudnnDestroyTensorDescriptor(yDesc); + if(xDesc != null) + cudnnDestroyTensorDescriptor(xDesc); + if(poolingDesc != null) + cudnnDestroyPoolingDescriptor(poolingDesc); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3); + } }
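The P and Q entries in these javadocs are the standard pooling output dimensions. Written out once as code (a sketch; integer division floors, matching cuDNN's convention):

```java
// Output size of one spatial dimension after pooling (or convolution):
// P = (H + 2*pad_h - R) / stride_h + 1, and Q likewise for the width.
static int outDim(int in, int window, int pad, int stride) {
    return (in + 2 * pad - window) / stride + 1;
}
// Example: H = 28, R = 3, pad_h = 1, stride_h = 2 gives P = (28 + 2 - 3)/2 + 1 = 14.
```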
/** - * Used for all version of TSMM where the result is known to be symmetric. - * Hence, we compute only the upper triangular matrix and copy this partial - * result down to lower triangular matrix once. - * - * @param ret upper triangular matrix + * Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...) + * This method computes the backpropagation errors for previous layer of maxpooling operation + * @param instName the invoking instruction's name for recording {@link Statistics}. + * @param image image as matrix object + * @param dout delta matrix, output of previous layer + * @param outputBlock output matrix + * @param N batch size + * @param C number of channels + * @param H height of image + * @param W width of image + * @param K number of filters + * @param R height of filter + * @param S width of filter + * @param pad_h vertical padding + * @param pad_w horizontal padding + * @param stride_h vertical stride + * @param stride_w horizontal stride + * @param P (H + 2*pad_h - R)/stride_h + 1 + * @param Q (W + 2*pad_w - S)/stride_w + 1 * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private static void copyUpperToLowerTriangle(MatrixObject ret) throws DMLRuntimeException { - if(isInSparseFormat(ret)) { - throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented"); + public static void maxpoolingBackward(String instName, MatrixObject image, MatrixObject dout, + MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, + int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, + int Q) throws DMLRuntimeException { + if(isInSparseFormat(image)) { + ((JCudaObject)image.getGPUObject()).sparseToDense(instName); } - if(ret.getNumRows() != ret.getNumColumns()) { - throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle"); + if(isInSparseFormat(dout)) { + ((JCudaObject)dout.getGPUObject()).sparseToDense(instName); } - int dim = (int) ret.getNumRows(); - kernels.launchKernel("copyUpperToLowerTriangleDense", - ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), - ((JCudaObject)ret.getGPUObject()).jcudaDenseMatrixPtr, dim, dim*dim); - } - + Pointer alpha = null; + Pointer beta = null; + Pointer y = null; + cudnnTensorDescriptor xDesc = null; + cudnnTensorDescriptor yDesc = null; + cudnnTensorDescriptor dyDesc = null; + cudnnTensorDescriptor dxDesc = null; + cudnnPoolingDescriptor poolingDesc = null; - //********************************************************************/ - //***************** MATRIX MULTIPLY Functions ************************/ + try { + long t1=0, t2=0, t3=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + // Allocate descriptors + xDesc = allocateTensorDescriptor(N, C, H, W); + yDesc = allocateTensorDescriptor(N, C, P, Q); + dxDesc = allocateTensorDescriptor(N, C, H, W); + dyDesc = allocateTensorDescriptor(N, C, P, Q); + + poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w); + + // Calling PoolForward first, y is one of the inputs for poolBackward + // TODO: Remove calling poolForward after necessary changes at language level for poolBackward + long numBytes = N*C*P*Q*Sizeof.DOUBLE; + y = allocate(numBytes); + + // Allocate data + Pointer x = ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr; + Pointer dx = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr; + Pointer dy = ((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; + + alpha = pointerTo(1.0); + beta = pointerTo(0.0f); + + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); + + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); + int status = cudnnPoolingForward(cudnnHandle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2); + + if(status != 
jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { + throw new DMLRuntimeException("Could not execute cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); + } + + if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); + status = cudnnPoolingBackward(cudnnHandle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3); + + if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { + throw new DMLRuntimeException("Could not execute cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); + } } + finally { + long t4=0; + if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime(); + if(alpha != null) + cudaFreeHelper(instName, alpha); + if(beta != null) + cudaFreeHelper(instName, beta); + if(y != null) + cudaFreeHelper(instName, y); + + if(yDesc != null) + cudnnDestroyTensorDescriptor(yDesc); + if(xDesc != null) + cudnnDestroyTensorDescriptor(xDesc); + if(dyDesc != null) + cudnnDestroyTensorDescriptor(dyDesc); + if(dxDesc != null) + cudnnDestroyTensorDescriptor(dxDesc); + if(poolingDesc != null) + cudnnDestroyPoolingDescriptor(poolingDesc); + + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4); + } } + + /** + * Performs the relu operation on the GPU. + * @param ec currently active {@link ExecutionContext} + * @param instName the invoking instruction's name for recording {@link Statistics}. + * @param in input matrix + * @param outputName name of the output matrix + * @throws DMLRuntimeException if an error occurs + */ + public static void relu(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException { + if(isInSparseFormat(in)) { + // TODO: FIXME: Implement sparse relu kernel + ((JCudaObject)in.getGPUObject()).sparseToDense(instName); + } + + cudnnTensorDescriptor srcTensorDesc = null; + cudnnTensorDescriptor dstTensorDesc = null; + Pointer alpha = null; + Pointer beta = null; + + try { + alpha = pointerTo(1.0f); + beta = pointerTo(0.0f); + long N = in.getNumRows(); + long H = in.getNumColumns(); + long W = 1; + Pointer srcData = ((JCudaObject)in.getGPUObject()).jcudaDenseMatrixPtr; + + MatrixObject output = ec.getMatrixObject(outputName); + getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocate the dense output matrix + Pointer dstData = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; + + long t0=0; + if(N*H*W >= numDoublesIn2GB) { + // Invokes relu(double* A, double* ret, int rlen, int clen) + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + kernels.launchKernel("relu", + ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int) (H*W)), + srcData, dstData, (int)N, (int) H*W); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0); + } + else { + // Allocate descriptors + srcTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W); + dstTensorDesc = allocateTensorDescriptor((int)N, 1, (int)H, (int)W); + cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor(); + cudnnCreateActivationDescriptor(activationDescriptor); + double dummy = -1; + cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 
dummy); + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + cudnnActivationForward(cudnnHandle, activationDescriptor, + alpha, srcTensorDesc, srcData, + beta, dstTensorDesc, dstData); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0); + } + } + finally { + long t1=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + if(alpha != null) + cudaFreeHelper(instName, alpha); + if(beta != null) + cudaFreeHelper(instName, beta); + + if(srcTensorDesc != null) + cudnnDestroyTensorDescriptor(srcTensorDesc); + if(dstTensorDesc != null) + cudnnDestroyTensorDescriptor(dstTensorDesc); + + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1); + } + } + + + + //********************************************************************/ + //************* End of DEEP LEARNING Operators ***********************/ + //********************************************************************/ + + + + //********************************************************************/ + //********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/ + //********************************************************************/ + + /** + * Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...) + * + * @param ec execution context + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param left input matrix, as in a tsmm expression like A %*% A' or A' %*% A, we just need to check whether the left one is transposed or not, I named it 'left' + * @param outputName output matrix name + * @param isLeftTransposed if true, left transposed + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void matmultTSMM(ExecutionContext ec, String instName, MatrixObject left, String outputName, + boolean isLeftTransposed) throws DMLRuntimeException { + if(isInSparseFormat(left)) { + // For sparse TSMM, invoke matmult (TODO: possible performance improvement) + matmult(ec, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed); + return; + } + + // For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix + MatrixObject output = ec.getMatrixObject(outputName); + getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix + + // Since CuBLAS expects inputs in column-major format, + // reverse the order of matrix-multiplication and take care of dimension mismatch. + int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; + // Note: the dimensions are swapped + int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows()); + int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns()); + + if(m == -1) + throw new DMLRuntimeException("Incorrect dimensions"); + + double[] alpha = {1.0d}; + double[] beta = {0.0d}; + + int lda = (int) (isLeftTransposed ? 
m : k); + int ldc = m; + + if(!left.getGPUObject().isAllocated()) + throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject().isAllocated()); + if(!output.getGPUObject().isAllocated()) + throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated()); + + Pointer A = ((JCudaObject)left.getGPUObject()).jcudaDenseMatrixPtr; + Pointer C = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; + + long t0=0, t1=0; + + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + JCublas2.cublasDsyrk(cublasHandle, cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, Pointer.to(alpha), A, lda, Pointer.to(beta), C, ldc); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0); + + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + copyUpperToLowerTriangle(output); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1); + } + + /** + * Used for all version of TSMM where the result is known to be symmetric. + * Hence, we compute only the upper triangular matrix and copy this partial + * result down to lower triangular matrix once. + * + * @param ret upper triangular matrix + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + private static void copyUpperToLowerTriangle(MatrixObject ret) throws DMLRuntimeException { + if(isInSparseFormat(ret)) { + throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented"); + } + if(ret.getNumRows() != ret.getNumColumns()) { + throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle"); + } + int dim = (int) ret.getNumRows(); + kernels.launchKernel("copy_u2l_dense", + ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), + ((JCudaObject)ret.getGPUObject()).jcudaDenseMatrixPtr, dim, dim*dim); + } + + + + //********************************************************************/ + //******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/ + //********************************************************************/ + + //********************************************************************/ + //***************** MATRIX MULTIPLY Functions ************************/ //********************************************************************/ /** @@ -581,16 +986,17 @@ public class LibMatrixCUDA { * Examines sparsity and shapes and routes call to appropriate method * from cuBLAS or cuSparse * C = op(A) x op(B) - * @param ec Current {@link ExecutionContext} instance - * @param left1 Matrix A - * @param right1 Matrix B - * @param outputName Name of the output matrix C (in code generated after LOP layer) + * @param ec Current {@link ExecutionContext} instance + * @param instName name of the invoking instruction to record{@link Statistics}. 
+ * @param left1 Matrix A + * @param right1 Matrix B + * @param outputName Name of the output matrix C (in code generated after LOP layer) * @param isLeftTransposed1 op for A, transposed or not * @param isRightTransposed1 op for B, tranposed or not * @return output of matrix multiply * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static MatrixObject matmult(ExecutionContext ec, MatrixObject left1, MatrixObject right1, String outputName, + public static MatrixObject matmult(ExecutionContext ec, String instName, MatrixObject left1, MatrixObject right1, String outputName, boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException { if(!left1.getGPUObject().isAllocated() || !right1.getGPUObject().isAllocated()) @@ -603,17 +1009,17 @@ public class LibMatrixCUDA { if (bothDense) { // Dense C = Dense A * Dense B // For both dense, do cuBLAS - ec.getDenseMatrixOutputForGPUInstruction(outputName); // Allocated the dense output matrix - denseDenseMatmult(output, left1, right1, isLeftTransposed1, isRightTransposed1); + getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix + denseDenseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1); } else if (bothSparse){ // Sparse C = Sparse A * Sparse B ec.allocateGPUMatrixObject(outputName); - bothSparseMatmult(output, left1, right1, isLeftTransposed1, isRightTransposed1); + bothSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1); } else { // Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B // Convert the dense to sparse and use the cusparseDcsrgemm routine ec.allocateGPUMatrixObject(outputName); - eitherSparseMatmult(output, left1, right1, isLeftTransposed1, isRightTransposed1); + eitherSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1); } return output; @@ -622,6 +1028,7 @@ public class LibMatrixCUDA { /** * One of the matrices is sparse, the other dense * C = op(A) x op(B) + * @param instName the invoking instruction's name for record {@link Statistics}. * @param output allocated output object for C on host to which GPU output will be attached * @param left Matrix A on host * @param right Matrix B on host @@ -629,7 +1036,7 @@ public class LibMatrixCUDA { * @param isRightTransposed op for B, transposed or not * @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void eitherSparseMatmult(MatrixObject output, MatrixObject left, MatrixObject right, + protected static void eitherSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { int transA = isLeftTransposed ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; @@ -648,10 +1055,10 @@ public class LibMatrixCUDA { if (left.getGPUObject().isInSparseFormat()) { // Left sparse, right dense - sparseDenseMatmult(output, left, right, isLeftTransposed, isRightTransposed, transA, transB, m, n, k); + sparseDenseMatmult(instName, output, left, right, isLeftTransposed, isRightTransposed, transA, transB, m, n, k); } else { // Left dense, right sparse - denseSparseMatmult(output, right, left, isLeftTransposed, isRightTransposed, transA, transB, m, n, k); + denseSparseMatmult(instName, output, right, left, isLeftTransposed, isRightTransposed, transA, transB, m, n, k); } } @@ -659,6 +1066,7 @@ public class LibMatrixCUDA { * C = op(A) * op(B) where A is dense and B is sparse * If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked * otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked. + * @param instName the invoking instruction's name for record {@link Statistics}. * @param output ? * @param right ? * @param left ? @@ -671,7 +1079,7 @@ public class LibMatrixCUDA { * @param k ? * @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void denseSparseMatmult(MatrixObject output, MatrixObject right, MatrixObject left, + protected static void denseSparseMatmult(String instName, MatrixObject output, MatrixObject right, MatrixObject left, boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k) throws DMLRuntimeException { // right sparse, left dense @@ -680,30 +1088,51 @@ public class LibMatrixCUDA { if (B.isUltraSparse(k, n)){ LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Sparse-Sparse)"); // Convert left to CSR and do cuSparse matmul - long t0 = System.nanoTime(); int rowsA = (int)left.getNumRows(); int colsA = (int)left.getNumColumns(); + + + long t1=0, t2=0; + long t0 = System.nanoTime(); Pointer AT = JCudaObject.transpose(ADense, rowsA, colsA, colsA, rowsA); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0); + + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); CSRPointer A = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsA, colsA, AT); - Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); - Statistics.cudaSparseConversionCount.addAndGet(1); - sparseSparseMatmult(output, transA, transB, m, n, k, A, B); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1); + + GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0); + GPUStatistics.cudaDenseToSparseCount.getAndAdd(1); + sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B); + + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); A.deallocate(); cudaFreeHelper(AT); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); + } else { LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Dense-Dense)"); // Convert right to dense and do a cuBlas matmul // BDenseTransposed is a column major matrix // Note the arguments to denseDenseMatmult to accommodate for this. 
+ long t0=0, t1=0, t2=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)right.getNumRows(), (int)right.getNumColumns()); - output.getGPUObject().acquireDeviceModifyDense(); // To allocate the dense matrix + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); + GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0); + GPUStatistics.cudaSparseToDenseCount.getAndAdd(1); + + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + boolean allocated = output.getGPUObject().acquireDeviceModifyDense(); // To allocate the dense matrix + if (allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1); Pointer C = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; - denseDenseMatmult(C, + denseDenseMatmult(instName, C, (int) left.getNumRows(), (int) left.getNumColumns(), (int) right.getNumColumns(), (int) right.getNumRows(), isLeftTransposed, !isRightTransposed, ADense, BDenseTransposed); - cudaFreeHelper(BDenseTransposed); + + cudaFreeHelper(instName, BDenseTransposed); } } @@ -711,6 +1140,7 @@ public class LibMatrixCUDA { * * C = op(A) * op(B) where A is sparse and B is dense * If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked * otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked. + * @param instName the invoking instruction's name for recording {@link Statistics}. * @param output ? * @param left ? * @param right ? * @param isLeftTransposed ? * @param isRightTransposed ? * @param transA ? * @param transB ? * @param m ? * @param n ? * @param k ? 
* @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void sparseDenseMatmult(MatrixObject output, MatrixObject left, MatrixObject right, + protected static void sparseDenseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k) throws DMLRuntimeException { CSRPointer A = ((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr; @@ -732,37 +1162,59 @@ public class LibMatrixCUDA { if (n == 1){ // Sparse Matrix - Dense Vector multiply LOG.debug(" GPU Sparse Matrix - Dense Vector Mutliply"); - sparseMatrixDenseVectorMult(output, A, BDense, transA, (int)left.getNumRows(), (int)left.getNumColumns()); + sparseMatrixDenseVectorMult(instName, output, A, BDense, transA, (int)left.getNumRows(), (int)left.getNumColumns()); } else { + + long t0=0, t1=0, t2=0; // Sparse Matrix Dense Matrix multiply if (A.isUltraSparse(m, k)){ LOG.debug(" GPU Sparse-Dense Matrix Multiplication (Converted to Sparse-Sparse)"); // Convert right to CSR and do cuSparse matmul - long t0 = System.nanoTime(); int rowsB = (int)right.getNumRows(); int colsB = (int)right.getNumColumns(); + + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer BT = JCudaObject.transpose(BDense, rowsB, colsB, colsB, rowsB); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0); + + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); CSRPointer B = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsB, colsB, BT); - Statistics.cudaSparseConversionTime.addAndGet(System.nanoTime() - t0); - Statistics.cudaSparseConversionCount.addAndGet(1); - sparseSparseMatmult(output, transA, transB, m, n, k, A, B); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1); + + GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0); + GPUStatistics.cudaDenseToSparseCount.getAndAdd(1); + + sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B); + + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); B.deallocate(); cudaFreeHelper(BT); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); + } else { LOG.debug(" GPU Sparse-Dense Matrix Multiplication (Converted to Dense-Dense)"); // Convert left to dense and do a cuBlas matmul // ADenseTransposed is a column major matrix // Note the arguments to denseDenseMatmult to accommodate for this. 
+ if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)left.getNumRows(), (int)left.getNumColumns()); - output.getGPUObject().acquireDeviceModifyDense(); // To allocate the dense matrix + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); + GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0); + GPUStatistics.cudaSparseToDenseCount.getAndAdd(1); + + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + boolean allocated = output.getGPUObject().acquireDeviceModifyDense(); // To allocate the dense matrix + if (allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1); + Pointer C = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; - denseDenseMatmult(C, + denseDenseMatmult(instName, C, (int) left.getNumColumns(), (int) left.getNumRows(), (int) right.getNumRows(), (int) right.getNumColumns(), !isLeftTransposed, isRightTransposed, ADenseTransposed, BDense); - cudaFreeHelper(ADenseTransposed); + + cudaFreeHelper(instName, ADenseTransposed); } } } @@ -770,6 +1222,7 @@ public class LibMatrixCUDA { /** * C = op(A) x B * A is a sparse matrix, B is a dense vector + * @param instName the invoking instruction's name for recording {@link Statistics}. * @param output allocated output on the host, to which the GPU output C will be attached * @param A sparse matrix A on the GPU * @param B_dense dense matrix/vector B on the GPU * @param transA op for A, transposed or not * @param m number of rows in A (not op(A)) * @param k number of cols in A or number of rows in B (not op(A) or op(B)) * @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void sparseMatrixDenseVectorMult(MatrixObject output, CSRPointer A, Pointer B_dense, int transA, + protected static void sparseMatrixDenseVectorMult(String instName, MatrixObject output, CSRPointer A, Pointer B_dense, int transA, int m, int k) throws DMLRuntimeException { long size = m * Sizeof.DOUBLE; if (transA == CUSPARSE_OPERATION_TRANSPOSE){ size = k * Sizeof.DOUBLE; } - Pointer C_dense = JCudaObject.allocate((int)size); + Pointer C_dense = JCudaObject.allocate(instName, (int)size); + double[] alpha = { 1 }; double[] beta = { 0 }; + + long t1=0; + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); cusparseDcsrmv(cusparseHandle, transA, m, k, (int)A.nnz, Pointer.to(alpha), A.descr, A.val, A.rowPtr, A.colInd, B_dense, Pointer.to(beta), C_dense); cudaDeviceSynchronize(); // Since cusparseDcsrmv is asynchronously executed + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1); + ((JCudaObject)(output.getGPUObject())).setDenseMatrixCudaPointer(C_dense); output.getGPUObject().setDeviceModify(size); } @@ -796,14 +1255,15 @@ public class LibMatrixCUDA { /** * Sparse C = Sparse op(A) * Sparse op(B) * Reroutes call to sparse matrix-vector mult if needed + * @param instName the invoking instruction's name for recording {@link Statistics}. * @param output ? * @param left ? * @param right ? * @param isLeftTransposed ? * @param isRightTransposed ? 
* @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void bothSparseMatmult(MatrixObject output, MatrixObject left, MatrixObject right, + protected static void bothSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right, boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; @@ -825,15 +1286,16 @@ public class LibMatrixCUDA { // TODO if (m == 1) { // Vector-matrix multiplication if (!isRightTransposed && right.getNumColumns() == 1){ // Matrix-Vector multiplication - sparseMatrixVectorMult(output, transA, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B); + sparseMatrixVectorMult(instName, output, transA, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B); } else { // Matrix-Matrix multiplication - sparseSparseMatmult(output, transA, transB, m, n, k, A, B); + sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B); } } /** * Does a sparse matrix-vector multiply. * C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1. + * @param instName the invoking instruction's name for record {@link Statistics}. * @param output allocated output object C to which the GPU output matrix will be attached * @param transA if A is to be transposed or not (the op in op(A)) * @param m number of rows in A (not op(A)) @@ -843,16 +1305,20 @@ public class LibMatrixCUDA { * @param B right sparse vector on GPU * @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void sparseMatrixVectorMult(MatrixObject output, int transA, int m, int n, int k, + protected static void sparseMatrixVectorMult(String instName, MatrixObject output, int transA, int m, int n, int k, CSRPointer A, CSRPointer B) throws DMLRuntimeException { LOG.debug(" GPU Sparse Matrix Sparse Vector Multiply (Converted to Sparse Matrix Dense Vector Multiply)"); + long t0=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer BDenseVector = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, k, 1); - sparseMatrixDenseVectorMult(output, A, BDenseVector, transA, m, k); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); + sparseMatrixDenseVectorMult(instName, output, A, BDenseVector, transA, m, k); } /** * Does a sparse-sparse Matrix multiply * C = op(A) x op(B), A, B are sparse matrices + * @param instName the invoking instruction's name for record {@link Statistics}. 
* @param output allocated output object on host to which the GPU output matrix will be attached * @param transA op for A - to be transposed or not * @param transB op for B @@ -863,25 +1329,32 @@ public class LibMatrixCUDA { * @param B right sparse matrix on GPU * @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void sparseSparseMatmult(MatrixObject output, int transA, int transB, int m, int n, int k, + protected static void sparseSparseMatmult(String instName, MatrixObject output, int transA, int transB, int m, int n, int k, CSRPointer A, CSRPointer B) throws DMLRuntimeException { LOG.debug(" GPU Sparse-Sparse Matrix Multiply "); + long t0=0, t1=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); CSRPointer C = CSRPointer.allocateForMatrixMultiply(cusparseHandle, A, transA, B, transB, m, n, k); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0); + ((JCudaObject)output.getGPUObject()).setSparseMatrixCudaPointer(C); long sizeOfC = CSRPointer.estimateSize(C.nnz, output.getNumRows()); output.getGPUObject().setDeviceModify(sizeOfC); + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); cusparseDcsrgemm(cusparseHandle, transA, transB, m, n, k, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd, C.descr, C.val, C.rowPtr, C.colInd); cudaDeviceSynchronize(); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1); } /** * Dense-dense matrix multiply * C = op(A) * op(B), A and B are dense matrices + * @param instName name of the invoking instruction, for recording {@link Statistics}. * @param output output object C on host with GPU data allocated * @param left1 left matrix A on host (in row-major order) * @param right1 right matrix B on host (in row-major order) @@ -889,7 +1362,7 @@ public class LibMatrixCUDA { * @param isRightTransposed1 op for B, transposed or not * @throws DMLRuntimeException if DMLRuntimeException occurs */ - protected static void denseDenseMatmult(MatrixObject output, MatrixObject left1, MatrixObject right1, + protected static void denseDenseMatmult(String instName, MatrixObject output, MatrixObject left1, MatrixObject right1, boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException { Pointer leftPtr = ((JCudaObject)left1.getGPUObject()).jcudaDenseMatrixPtr; @@ -900,7 +1373,7 @@ public class LibMatrixCUDA { int rightRows = (int) right1.getNumRows(); int rightCols = (int) right1.getNumColumns(); Pointer C = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; - denseDenseMatmult(C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed1, isRightTransposed1, + denseDenseMatmult(instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed1, isRightTransposed1, leftPtr, rightPtr); } @@ -912,6 +1385,7 @@ public class LibMatrixCUDA { * We do t(B) %*% t(A) to get t(C); * If we were to calculate t(t(C)), we would get the resultant matrix C, but this would be in column-major format. * What we really want is t(C). This we already have as the result of t(B) %*% t(A). + * @param instName name of the invoking instruction, for recording {@link Statistics}.
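The row-major/column-major argument in the comment above deserves a concrete illustration: a row-major m-by-n buffer, handed to column-major cuBLAS, is read as the n-by-m transpose, which is why requesting t(B) %*% t(A) yields t(C) in column-major, i.e. exactly C in SystemML's row-major layout, with no explicit transposes. An illustrative CPU sketch of the index identity (a hypothetical 2x3 example, zero-based offsets):

public class RowColMajorSketch {
    public static void main(String[] args) {
        int m = 2, n = 3;
        double[] a = {1, 2, 3, 4, 5, 6}; // 2x3 row-major: a[i*n + j] = A(i,j)
        // Reading the same buffer column-major as an n x m matrix M gives M(p,q) = a[q*n + p],
        // which equals A(q,p): the buffer "is" t(A) from cuBLAS's point of view.
        for (int p = 0; p < n; p++) {
            StringBuilder row = new StringBuilder();
            for (int q = 0; q < m; q++)
                row.append(a[q * n + p]).append(' ');
            System.out.println(row); // prints "1 4", "2 5", "3 6", i.e. t(A)
        }
    }
}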
* @param output output allocated on GPU in column-major format * @param leftRows1 number of rows in A * @param leftCols1 number of cols in A @@ -923,7 +1397,7 @@ public class LibMatrixCUDA { * @param rightPtr B allocated on the GPU in row-major format * @throws DMLRuntimeException if DMLRuntimeException occurs */ - public static void denseDenseMatmult(Pointer output, int leftRows1, int leftCols1, int rightRows1, + public static void denseDenseMatmult(String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException { @@ -961,6 +1435,8 @@ public class LibMatrixCUDA { int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N; int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N; + long t0=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer C = output; if (m == 1 && n == 1){ // Vector product @@ -972,18 +1448,22 @@ // The result is copied from the host back to the device so that the rest of // infrastructure can treat it uniformly. cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0); } else if (m == 1) { // Vector-matrix multiply LOG.debug(" GPU Dense Vector-Matrix Multiply"); transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T; JCublas2.cublasDgemv(cublasHandle, transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0); } else if (n == 1){ // Matrix-vector multiply LOG.debug(" GPU Dense Matrix-Vector Multiply"); JCublas2.cublasDgemv(cublasHandle, transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0); } else { LOG.debug(" GPU Dense-Dense Matrix Multiply "); JCublas2.cublasDgemm(cublasHandle, transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0); } } @@ -992,7 +1472,6 @@ public class LibMatrixCUDA { //********************************************************************/ - //********************************************************************/ //**************** UNARY AGGREGATE Functions ************************/ //********************************************************************/ @@ -1002,12 +1481,13 @@ public class LibMatrixCUDA { * Entry point to perform Unary aggregate operations on the GPU. * The execution context object is used to allocate memory for the GPU. * @param ec Instance of {@link ExecutionContext}, from which the output variable will be allocated + * @param instName name of the invoking instruction, for recording {@link Statistics}. * @param in1 input matrix * @param output output matrix/scalar name * @param op Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation.
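Since the REDUCTION_* constants and kernel names below can be confusing (the code itself notes the names are a bit misleading), a CPU reference of the two directional sums may help: the reduce_row_* kernels produce one value per row (rlen outputs), the reduce_col_* kernels one value per column (clen outputs). Illustrative sketch only; the CUDA kernels are the real implementation:

public class ReduceDirectionSketch {
    // One value per row, mirroring the reduce_row_* kernels launched via reduceRow(...).
    static double[] rowSums(double[] in, int rlen, int clen) {
        double[] out = new double[rlen];
        for (int i = 0; i < rlen; i++)
            for (int j = 0; j < clen; j++)
                out[i] += in[i * clen + j];
        return out;
    }

    // One value per column, mirroring the reduce_col_* kernels launched via reduceCol(...).
    static double[] colSums(double[] in, int rlen, int clen) {
        double[] out = new double[clen];
        for (int i = 0; i < rlen; i++)
            for (int j = 0; j < clen; j++)
                out[j] += in[i * clen + j];
        return out;
    }

    public static void main(String[] args) {
        double[] in = {1, 2, 3, 4, 5, 6}; // 2x3, row-major
        System.out.println(java.util.Arrays.toString(rowSums(in, 2, 3))); // [6.0, 15.0]
        System.out.println(java.util.Arrays.toString(colSums(in, 2, 3))); // [5.0, 7.0, 9.0]
    }
}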
* @throws DMLRuntimeException if {@link DMLRuntimeException} occurs */ - public static void unaryAggregate(ExecutionContext ec, MatrixObject in1, String output, AggregateUnaryOperator op) + public static void unaryAggregate(ExecutionContext ec, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) throws DMLRuntimeException { final int REDUCTION_ALL = 1; @@ -1087,7 +1567,7 @@ public class LibMatrixCUDA { if (isSparse){ // The strategy for the time being is to convert sparse to dense // until a sparse specific kernel is written. - ((JCudaObject)in1.getGPUObject()).sparseToDense(); + ((JCudaObject)in1.getGPUObject()).sparseToDense(instName); // long nnz = in1.getNnz(); // assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU"; // MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz); @@ -1098,28 +1578,29 @@ public class LibMatrixCUDA { Pointer out = null; if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) { // Matrix output - MatrixObject out1 = ec.getDenseMatrixOutputForGPUInstruction(output); + MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output); out = ((JCudaObject) out1.getGPUObject()).jcudaDenseMatrixPtr; } Pointer in = ((JCudaObject)in1.getGPUObject()).jcudaDenseMatrixPtr; int size = rlen * clen; + long t0=0; // For scalars, set the scalar output in the Execution Context object switch (opIndex){ case OP_PLUS: { switch(reductionDirection) { case REDUCTION_ALL : { - double result = reduceAll("reduce_sum", in, size); + double result = reduceAll(instName, "reduce_sum", in, size); ec.setScalarOutput(output, new DoubleObject(result)); break; } case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column) - reduceRow("reduce_row_sum", in, out, rlen, clen); + reduceRow(instName, "reduce_row_sum", in, out, rlen, clen); break; } case REDUCTION_ROW : { - reduceCol("reduce_col_sum", in, out, rlen, clen); + reduceCol(instName, "reduce_col_sum", in, out, rlen, clen); break; } case REDUCTION_DIAG : @@ -1129,43 +1610,45 @@ public class LibMatrixCUDA { } case OP_PLUS_SQ : { // Calculate the squares in a temporary object tmp - Pointer tmp = JCudaObject.allocate(size * Sizeof.DOUBLE); - squareMatrix(in, tmp, rlen, clen); + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + Pointer tmp = JCudaObject.allocate(instName, size * Sizeof.DOUBLE); + + squareMatrix(instName, in, tmp, rlen, clen); // Then do the sum on the temporary object and free it switch(reductionDirection) { case REDUCTION_ALL : { - double result = reduceAll("reduce_sum", tmp, size); + double result = reduceAll(instName, "reduce_sum", tmp, size); ec.setScalarOutput(output, new DoubleObject(result)); break; } case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column) - reduceRow("reduce_row_sum", tmp, out, rlen, clen); + reduceRow(instName, "reduce_row_sum", tmp, out, rlen, clen); break; } case REDUCTION_ROW : { - reduceCol("reduce_col_sum", tmp, out, rlen, clen); + reduceCol(instName, "reduce_col_sum", tmp, out, rlen, clen); break; } default: throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared"); } - cudaFreeHelper(tmp); + cudaFreeHelper(instName, tmp); break; } case OP_MEAN:{ switch(reductionDirection) { case REDUCTION_ALL: { - double result = reduceAll("reduce_sum", in, size); + double result = 
reduceAll(instName, "reduce_sum", in, size); double mean = result / size; ec.setScalarOutput(output, new DoubleObject(mean)); break; } case REDUCTION_COL: { - reduceRow("reduce_row_mean", in, out, rlen, clen); + reduceRow(instName, "reduce_row_mean", in, out, rlen, clen); break; } case REDUCTION_ROW: { - reduceCol("reduce_col_mean", in, out, rlen, clen); + reduceCol(instName, "reduce_col_mean", in, out, rlen, clen); break; } default: @@ -1176,7 +1659,7 @@ public class LibMatrixCUDA { case OP_MULTIPLY : { switch (reductionDirection) { case REDUCTION_ALL: { - double result = reduceAll("reduce_prod", in, size); + double result = reduceAll(instName, "reduce_prod", in, size); ec.setScalarOutput(output, new DoubleObject(result)); break; } @@ -1188,16 +1671,16 @@ public class LibMatrixCUDA { case OP_MAX :{ switch(reductionDirection) { case REDUCTION_ALL: { - double result = reduceAll("reduce_max", in, size); + double result = reduceAll(instName, "reduce_max", in, size); ec.setScalarOutput(output, new DoubleObject(result)); break; } case REDUCTION_COL: { - reduceRow("reduce_row_max", in, out, rlen, clen); + reduceRow(instName, "reduce_row_max", in, out, rlen, clen); break; } case REDUCTION_ROW: { - reduceCol("reduce_col_max", in, out, rlen, clen); + reduceCol(instName, "reduce_col_max", in, out, rlen, clen); break; } default: @@ -1208,16 +1691,16 @@ public class LibMatrixCUDA { case OP_MIN :{ switch(reductionDirection) { case REDUCTION_ALL: { - double result = reduceAll("reduce_min", in, size); + double result = reduceAll(instName, "reduce_min", in, size); ec.setScalarOutput(output, new DoubleObject(result)); break; } case REDUCTION_COL: { - reduceRow("reduce_row_min", in, out, rlen, clen); + reduceRow(instName, "reduce_row_min", in, out, rlen, clen); break; } case REDUCTION_ROW: { - reduceCol("reduce_col_min", in, out, rlen, clen); + reduceCol(instName, "reduce_col_min", in, out, rlen, clen); break; } default: @@ -1227,66 +1710,69 @@ public class LibMatrixCUDA { } case OP_VARIANCE : { // Temporary GPU arrays for intermediate results - Pointer tmp = JCudaObject.allocate(size * Sizeof.DOUBLE); - Pointer tmp2 = JCudaObject.allocate(size * Sizeof.DOUBLE); + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + Pointer tmp = JCudaObject.allocate(instName, size * Sizeof.DOUBLE); + Pointer tmp2 = JCudaObject.allocate(instName, size * Sizeof.DOUBLE); switch(reductionDirection) { case REDUCTION_ALL: { - double result = reduceAll("reduce_sum", in, size); + double result = reduceAll(instName, "reduce_sum", in, size); double mean = result / size; // Subtract mean from every element in the matrix ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean); - matrixScalarOp(in, mean, rlen, clen, tmp, minusOp); + matrixScalarOp(instName, in, mean, rlen, clen, tmp, minusOp); - squareMatrix(tmp, tmp2, rlen, clen); + squareMatrix(instName, tmp, tmp2, rlen, clen); - double result2 = reduceAll("reduce_sum", tmp2, size); + double result2 = reduceAll(instName, "reduce_sum", tmp2, size); double variance = result2 / (size - 1); ec.setScalarOutput(output, new DoubleObject(variance)); break; } case REDUCTION_COL: { - reduceRow("reduce_row_mean", in, out, rlen, clen); + reduceRow(instName, "reduce_row_mean", in, out, rlen, clen); // Subtract the row-wise mean from every element in the matrix BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); - matrixMatrixOp(in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp); + matrixMatrixOp(instName, in, out, rlen, clen,
VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp); - squareMatrix(tmp, tmp2, rlen, clen); + squareMatrix(instName, tmp, tmp2, rlen, clen); - Pointer tmpRow = JCudaObject.allocate(rlen * Sizeof.DOUBLE); - reduceRow("reduce_row_sum", tmp2, tmpRow, rlen, clen); + Pointer tmpRow = JCudaObject.allocate(instName, rlen * Sizeof.DOUBLE); + reduceRow(instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen); ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); - matrixScalarOp(tmpRow, clen - 1, rlen, clen, out, divideOp); + matrixScalarOp(instName, tmpRow, clen - 1, rlen, clen, out, divideOp); + + cudaFreeHelper(instName, tmpRow); - cudaFreeHelper(tmpRow); break; } case REDUCTION_ROW: { - reduceCol("reduce_col_mean", in, out, rlen, clen); + reduceCol(instName, "reduce_col_mean", in, out, rlen, clen); // Subtract the column-wise mean from every element in the matrix BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); - matrixMatrixOp(in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp); + matrixMatrixOp(instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp); - squareMatrix(tmp, tmp2, rlen, clen); + squareMatrix(instName, tmp, tmp2, rlen, clen); - Pointer tmpCol = JCudaObject.allocate(clen * Sizeof.DOUBLE); - reduceCol("reduce_col_sum", tmp2, tmpCol, rlen, clen); + Pointer tmpCol = JCudaObject.allocate(instName, clen * Sizeof.DOUBLE); + reduceCol(instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen); ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); - matrixScalarOp(tmpCol, rlen - 1, rlen, clen, out, divideOp); + matrixScalarOp(instName, tmpCol, rlen - 1, rlen, clen, out, divideOp); + + cudaFreeHelper(instName, tmpCol); - cudaFreeHelper(tmpCol); break; } default: throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance"); } - cudaFreeHelper(tmp); - cudaFreeHelper(tmp2); + cudaFreeHelper(instName, tmp); + cudaFreeHelper(instName, tmp2); break; } case OP_MAXINDEX : { @@ -1313,15 +1799,16 @@ public class LibMatrixCUDA { /** * Helper method to square a matrix in GPU memory + * @param instName name of the invoking instruction, for recording {@link Statistics}.
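The OP_VARIANCE branches above compose variance entirely from primitives defined elsewhere in this class: a mean reduction, a Minus scalar/matrix op, squareMatrix (a Power-2 scalar op), a sum reduction, and a divide by n-1. A CPU reference of the full-reduction case, as an illustrative restatement of that recipe:

public class VarianceSketch {
    // Mirrors the REDUCTION_ALL path: mean, subtract, square, sum, divide by (size - 1).
    static double variance(double[] in) {
        int size = in.length;
        double sum = 0;
        for (double v : in) sum += v; // reduceAll("reduce_sum", in, size)
        double mean = sum / size;
        double sq = 0;
        for (double v : in) { // matrixScalarOp(Minus), then squareMatrix, then reduce_sum
            double d = v - mean;
            sq += d * d;
        }
        return sq / (size - 1); // sample variance, matching the GPU code
    }

    public static void main(String[] args) {
        System.out.println(variance(new double[] {1, 2, 3, 4})); // 1.666...
    }
}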
* @param in input matrix * @param out output matrix * @param rlen row length * @param clen column length * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private static void squareMatrix(Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException { + private static void squareMatrix(String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException { ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2); - matrixScalarOp(in, 2, rlen, clen, out, power2op); + matrixScalarOp(instName, in, 2, rlen, clen, out, power2op); } /** @@ -1332,26 +1819,36 @@ public class LibMatrixCUDA { * @return the reduced value * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private static double reduceAll(String kernelFunction, Pointer in, int n) throws DMLRuntimeException { + private static double reduceAll(String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException { int[] tmp = getKernelParamsForReduceAll(n); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; - Pointer tempOut = JCudaObject.allocate(n * Sizeof.DOUBLE); - kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), - in, tempOut, n); + Pointer tempOut = JCudaObject.allocate(instName, n * Sizeof.DOUBLE); + + long t1=0, t2=0, t3=0; + + if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); + kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n); cudaDeviceSynchronize(); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1); + int s = blocks; while (s > 1) { tmp = getKernelParamsForReduceAll(s); blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2]; + if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), tempOut, tempOut, s); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2); s = (s + (threads*2-1)) / (threads*2); } double[] result = {-1}; + + if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost); - cudaFreeHelper(tempOut); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3); + cudaFreeHelper(instName, tempOut); return result[0]; } @@ -1365,12 +1862,17 @@ public class LibMatrixCUDA { * @param cols number of columns in input matrix * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private static void reduceRow(String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { + private static void reduceRow(String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { int[] tmp = getKernelParamsForReduceByRow(rows, cols); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; + + long t0=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); cudaDeviceSynchronize(); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0); + } /** @@ -1383,383 +1885,138 @@ public class LibMatrixCUDA
{ * @param cols number of columns in input matrix * @throws DMLRuntimeException if DMLRuntimeException occurs */ - private static void reduceCol(String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { + private static void reduceCol(String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { int[] tmp = getKernelParamsForReduceByCol(rows, cols); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; + + long t0=0; + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, out, rows, cols); cudaDeviceSynchronize(); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0); } - /** - * Get threads, blocks and shared memory for a reduce all operation - * @param n size of input array - * @return integer array containing {blocks, threads, shared memory} - */ - private static int[] getKernelParamsForReduceAll(int n) throws DMLRuntimeException{ - final int MAX_THREADS = getMaxThreads(); - final int MAX_BLOCKS = getMaxBlocks(); - final int WARP_SIZE = getWarpSize(); - int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : MAX_THREADS; - - int blocks = (n + (threads * 2 - 1)) / (threads * 2); - blocks = Math.min(MAX_BLOCKS, blocks); - - int sharedMemSize = threads * Sizeof.DOUBLE; - if (threads <= WARP_SIZE){ - sharedMemSize *= 2; - } - return new int[] {blocks, threads, sharedMemSize}; - } - - /** - * Get threads, blocks and shared memory for a reduce by row operation - * @param rows number of rows in input matrix - * @param cols number of columns in input matrix - * @return integer array containing {blocks, threads, shared memory} - */ - private static int[] getKernelParamsForReduceByRow(int rows, int cols) throws DMLRuntimeException { - final int WARP_SIZE = getWarpSize(); - final int MAX_THREADS = getMaxThreads(); - int threads = (cols < MAX_THREADS *2) ? 
nextPow2((cols + 1)/ 2) : MAX_THREADS; - int blocks = rows; - int sharedMemSize = threads * Sizeof.DOUBLE; - if (threads <= WARP_SIZE){ - sharedMemSize *=2; - } - return new int[] {blocks, threads, sharedMemSize}; - } - - private static int[] getKernelParamsForReduceByCol(int rows, int cols) throws DMLRuntimeException { - final int MAX_THREADS = getMaxThreads(); - final int MAX_BLOCKS = getMaxBlocks(); - final int WARP_SIZE = getWarpSize(); - int threads = Math.min(cols, MAX_THREADS); - int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS); - if (cols % MAX_THREADS != 0) blocks++; - int sharedMemSize = threads * Sizeof.DOUBLE; - if (threads <= WARP_SIZE){ - sharedMemSize *=2; - } - return new int[] {blocks, threads, sharedMemSize}; - } - - - private static int nextPow2(int x) - { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return ++x; - } - - //********************************************************************/ - //**************** END OF UNARY AGGREGATE Functions *****************/ - //********************************************************************/ - - - /** - * This method computes the backpropogation errors for previous layer of convolution operation - * - * @param filter filter used in conv2d - * @param dout errors from next layer - * @param output output errors - * @param N number of images - * @param C number of channels - * @param H height - * @param W width - * @param K number of filters - * @param R filter height - * @param S filter width - * @param pad_h pad height - * @param pad_w pad width - * @param stride_h stride height - * @param stride_w stride width - * @param P output activation height - * @param Q output activation width - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static void conv2dBackwardData(MatrixObject filter, MatrixObject dout, - MatrixObject output, int N, int C, int H, int W, int K, int R, - int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, - int Q) throws DMLRuntimeException { - if(isInSparseFormat(dout)) { - ((JCudaObject)dout.getGPUObject()).sparseToDense(); - } - if(isInSparseFormat(filter)) { - ((JCudaObject)filter.getGPUObject()).sparseToDense(); - } - Pointer alpha = null; - Pointer beta = null; - cudnnTensorDescriptor dyDesc = null; - cudnnTensorDescriptor dxDesc = null; - cudnnFilterDescriptor wDesc = null; - cudnnConvolutionDescriptor convDesc = null; - - Pointer workSpace = null; - long sizeInBytes = 0; - try { - // Allocate descriptors - wDesc = allocateFilterDescriptor(K, C, R, S); - dyDesc = allocateTensorDescriptor(N, K, P, Q); - dxDesc = allocateTensorDescriptor(N, C, H, W); - - // Allocate data - Pointer w = ((JCudaObject)filter.getGPUObject()).jcudaDenseMatrixPtr; - Pointer dy = ((JCudaObject)dout.getGPUObject()).jcudaDenseMatrixPtr; - Pointer dx = ((JCudaObject)output.getGPUObject()).jcudaDenseMatrixPtr; - - alpha = pointerTo(1.0); // TODO - beta = pointerTo(0.0f); - - int padding [] = { pad_h, pad_w }; - int strides [] = { stride_h, stride_w }; - convDesc = allocateConvolutionDescriptor(padding, strides); - long sizeInBytesArray[] = { 0 }; - - // TODO: Select the best algorithm depending on the data and supported CUDA - int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; - workSpace = new Pointer(); - cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle, - wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray); - - int status = cudnnConvolutionBackwardData(cudnnHandle, alpha, wDesc, w, - dyDesc, dy, convDesc, 
algo, workSpace, sizeInBytes, beta, dxDesc, dx); - if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { - throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); - } - } - finally { - if(alpha != null) - cudaFreeHelper(alpha); - if(beta != null) - cudaFreeHelper(beta); - if(dyDesc != null) - cudnnDestroyTensorDescriptor(dyDesc); - if(dxDesc != null) - cudnnDestroyTensorDescriptor(dxDesc); - if(wDesc != null) - cudnnDestroyFilterDescriptor(wDesc); - - if(convDesc != null) - cudnnDestroyConvolutionDescriptor(convDesc); - - if(workSpace != null && sizeInBytes != 0) - cudaFreeHelper(workSpace); - } - } - - /** - * performs maxpooling on GPU by exploiting cudnnPoolingForward(...) - * @param image image as matrix object - * @param outputBlock output matrix - * @param N batch size - * @param C number of channels - * @param H height of image - * @param W width of image - * @param K number of filters - * @param R height of filter - * @param S width of filter - * @param pad_h vertical padding - * @param pad_w horizontal padding - * @param stride_h horizontal stride - * @param stride_w vertical stride - * @param P (H - R + 1 + 2*pad_h)/stride_h - * @param Q (W - S + 1 + 2*pad_w)/stride_w - * @throws DMLRuntimeException if DMLRuntimeException occurs - */ - public static void maxpooling(MatrixObject image, - MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, - int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, - int Q) throws DMLRuntimeException { - if(isInSparseFormat(image)) { - ((JCuda
<TRUNCATED>
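One closing aside on the reduce-launch helpers shown (as deletions) in the hunk above: the launch math follows the classic two-elements-per-thread reduction. Threads per block is nextPow2((n+1)/2) capped at the device maximum, blocks is ceil(n / (threads*2)), and each kernel pass leaves one partial result per block, so the host loops with s = ceil(s / (threads*2)) until one value remains. A standalone sketch of that arithmetic, assuming illustrative device limits of 1024 threads and 65535 blocks:

public class ReduceLaunchSketch {
    static final int MAX_THREADS = 1024, MAX_BLOCKS = 65535; // assumed device limits

    // Same bit-smearing trick as the nextPow2 helper in the diff.
    static int nextPow2(int x) {
        --x;
        x |= x >> 1; x |= x >> 2; x |= x >> 4;
        x |= x >> 8; x |= x >> 16;
        return ++x;
    }

    // {blocks, threads} for a reduce-all over n elements, two elements per thread.
    static int[] paramsForReduceAll(int n) {
        int threads = (n < MAX_THREADS * 2) ? nextPow2((n + 1) / 2) : MAX_THREADS;
        int blocks = Math.min(MAX_BLOCKS, (n + threads * 2 - 1) / (threads * 2));
        return new int[] {blocks, threads};
    }

    public static void main(String[] args) {
        int s = 1_000_000; // element count, as in reduceAll
        while (s > 1) { // each pass leaves one partial sum per block
            int[] p = paramsForReduceAll(s);
            System.out.println(s + " elements -> blocks=" + p[0] + ", threads=" + p[1]);
            s = (s + (p[1] * 2 - 1)) / (p[1] * 2);
        }
    }
}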
