http://git-wip-us.apache.org/repos/asf/systemml/blob/628ffad1/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 62c0e0d..92a5546 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -112,6 +112,7 @@ import org.apache.sysml.runtime.matrix.operators.CMOperator; import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator; import org.apache.sysml.runtime.matrix.operators.RightScalarOperator; import org.apache.sysml.runtime.matrix.operators.ScalarOperator; +import org.apache.sysml.runtime.util.IndexRange; import org.apache.sysml.utils.GPUStatistics; import org.apache.sysml.utils.Statistics; @@ -148,7 +149,7 @@ public class LibMatrixCUDA { private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName()); - // Assume Compute Capability 3.0 + // Assume Compute Capability 3.0 // MAX BLOCKS is 2^31 - 1 For compute capability > 3.0 // MAX_THREADS is 1024 For compute capability > 3.0 private static int _MAX_THREADS = -1; @@ -163,7 +164,7 @@ public class LibMatrixCUDA { static GPUContext gCtx throws DMLRuntimeException { return GPUContext.gCtx; } - */ + */ /** * Utility function to get maximum number of threads supported by the active CUDA device. @@ -336,7 +337,6 @@ public class LibMatrixCUDA { * @return a sparse matrix pointer * @throws DMLRuntimeException if error occurs */ - @SuppressWarnings("unused") private static CSRPointer getSparsePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException { if(!isInSparseFormat(gCtx, input)) { input.getGPUObject(gCtx).denseToSparse(); @@ -405,7 +405,7 @@ public class LibMatrixCUDA { biasAdd(instName, tmp, biasPointer, outputPointer, rows, cols, (int)k1); cudaFreeHelper(tmp); - */ + */ LOG.trace("GPU : conv2dBiasAdd" + ", GPUContext=" + gCtx); conv2d(gCtx, instName, image, filter, output, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); //cudaDeviceSynchronize; @@ -413,7 +413,7 @@ public class LibMatrixCUDA { } public static void conv2d(GPUContext gCtx, String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W, - int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) + int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { Pointer imagePointer = getDensePointer(gCtx, image, true, instName); Pointer filterPointer = getDensePointer(gCtx, filter, true, instName); @@ -448,10 +448,10 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if error */ public static void conv2d(GPUContext gCtx, String instName, Pointer image, Pointer filter, Pointer output, int N, - int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) + int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx); - cudnnFilterDescriptor filterDesc = null; + LOG.trace("GPU : conv2d" + ", GPUContext=" + gCtx); + cudnnFilterDescriptor filterDesc = null; cudnnConvolutionDescriptor convDesc = null; Pointer workSpace = null; long sizeInBytes = 0; @@ -480,7 +480,7 
@@ public class LibMatrixCUDA { // Also ensure that GPU has enough memory to allocate memory long sizeInBytesArray[] = {0}; jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, - CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos); + CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos); cudnnGetConvolutionForwardWorkspaceSize(getCudnnHandle(gCtx), srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algos[0], sizeInBytesArray); if (sizeInBytesArray[0] != 0) workSpace = gCtx.allocate(sizeInBytesArray[0]); @@ -494,10 +494,10 @@ public class LibMatrixCUDA { GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); int status = cudnnConvolutionForward(getCudnnHandle(gCtx), one(), - srcTensorDesc, image, - filterDesc, filter, - convDesc, algo, workSpace, sizeInBytes, zero(), - dstTensorDesc, output); + srcTensorDesc, image, + filterDesc, filter, + convDesc, algo, workSpace, sizeInBytes, zero(), + dstTensorDesc, output); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2); if (status != cudnnStatus.CUDNN_STATUS_SUCCESS) { @@ -564,8 +564,8 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void reluBackward(GPUContext gCtx, String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException { - LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx); - long rows = input.getNumRows(); + LOG.trace("GPU : reluBackward" + ", GPUContext=" + gCtx); + long rows = input.getNumRows(); long cols = input.getNumColumns(); Pointer imagePointer = getDensePointer(gCtx, input, instName); Pointer doutPointer = getDensePointer(gCtx, dout, instName); @@ -574,8 +574,8 @@ public class LibMatrixCUDA { long t1=0; if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("relu_backward", - ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), - imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols)); + ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), + imagePointer, doutPointer, outputPointer, toInt(rows), toInt(cols)); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1); } @@ -593,8 +593,8 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void biasMultiply(GPUContext gCtx, String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException { - LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx); - if(isInSparseFormat(gCtx, input)) { + LOG.trace("GPU : biasMultiply" + ", GPUContext=" + gCtx); + if(isInSparseFormat(gCtx, input)) { input.getGPUObject(gCtx).sparseToDense(instName); } if(isInSparseFormat(gCtx, bias)) { @@ -613,8 +613,8 @@ public class LibMatrixCUDA { long t1 = 0; if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_multiply", - ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), - imagePointer, biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ)); + ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(rows), toInt(cols)), + imagePointer, 
biasPointer, outputPointer, toInt(rows), toInt(cols), toInt(PQ)); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); } @@ -660,13 +660,13 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException */ private static void biasAdd(GPUContext gCtx, String instName, Pointer image, Pointer bias, Pointer output, int rows, int cols, int k) throws DMLRuntimeException { - LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx); - int PQ = cols / k; + LOG.trace("GPU : biasAdd" + ", GPUContext=" + gCtx); + int PQ = cols / k; long t1 = 0; if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("bias_add", - ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), - image, bias, output, rows, cols, PQ); + ExecutionConfig.getConfigForSimpleMatrixOperations(rows, cols), + image, bias, output, rows, cols, PQ); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); } @@ -701,8 +701,8 @@ public class LibMatrixCUDA { public static void batchNormalizationForwardInference(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, double epsilon) throws DMLRuntimeException { - LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx); - int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; + LOG.trace("GPU : batchNormalizationForwardInference" + ", GPUContext=" + gCtx); + int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; int N = toInt(image.getNumRows()); int C = toInt(scale.getNumColumns()); @@ -724,8 +724,8 @@ public class LibMatrixCUDA { checkStatus(cudnnBatchNormalizationForwardInference(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, - scaleTensorDesc, scalePtr, biasPtr, - runningMeanPtr, runningVarPtr, epsilon)); + scaleTensorDesc, scalePtr, biasPtr, + runningMeanPtr, runningVarPtr, epsilon)); } /** @@ -747,8 +747,8 @@ public class LibMatrixCUDA { public static void batchNormalizationForwardTraining(GPUContext gCtx, String instName, MatrixObject image, MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException { - LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx); - int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; + LOG.trace("GPU : batchNormalizationForwardTraining" + ", GPUContext=" + gCtx); + int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; int N = toInt(image.getNumRows()); int C = toInt(scale.getNumColumns()); @@ -777,8 +777,8 @@ public class LibMatrixCUDA { // ignoring resultSaveMean and resultSaveVariance as it requires state management checkStatus(cudnnBatchNormalizationForwardTraining(getCudnnHandle(gCtx), mode, one(), zero(), nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr, - scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor, - retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer())); + scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor, + retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer())); } /** @@ -852,8 +852,8 @@ public class LibMatrixCUDA { public static void 
batchNormalizationBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias, double epsilon) throws DMLRuntimeException { - LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx); - int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; + LOG.trace("GPU : batchNormalizationBackward" + ", GPUContext=" + gCtx); + int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL; int N = toInt(image.getNumRows()); int C = toInt(scale.getNumColumns()); @@ -902,11 +902,11 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void conv2dBackwardFilter(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, - MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, - int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, - int Q) throws DMLRuntimeException { - LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx); - cudnnFilterDescriptor dwDesc = null; + MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, + int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, + int Q) throws DMLRuntimeException { + LOG.trace("GPU : conv2dBackwardFilter" + ", GPUContext=" + gCtx); + cudnnFilterDescriptor dwDesc = null; cudnnConvolutionDescriptor convDesc = null; Pointer workSpace = null; @@ -934,13 +934,13 @@ public class LibMatrixCUDA { workSpace = new Pointer(); cudnnGetConvolutionBackwardFilterWorkspaceSize(getCudnnHandle(gCtx), - xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray); + xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); int status = cudnnConvolutionBackwardFilter(getCudnnHandle(gCtx), one(), xTensorDesc, imagePointer, - doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer); + doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2); @@ -948,7 +948,7 @@ public class LibMatrixCUDA { throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status)); } } catch (CudaException e) { - throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e); + throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e); } finally { long t3=0; if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime(); @@ -989,11 +989,11 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void conv2dBackwardData(GPUContext gCtx, String instName, MatrixObject filter, MatrixObject dout, - MatrixObject output, int N, int C, int H, int W, int K, int R, - int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, - int Q) throws DMLRuntimeException { - LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx); - cudnnFilterDescriptor wDesc = null; + MatrixObject output, int N, int C, int H, int W, int K, int R, + int S, int pad_h, int 
pad_w, int stride_h, int stride_w, int P, + int Q) throws DMLRuntimeException { + LOG.trace("GPU : conv2dBackwardData" + ", GPUContext=" + gCtx); + cudnnFilterDescriptor wDesc = null; cudnnConvolutionDescriptor convDesc = null; Pointer workSpace = null; @@ -1020,12 +1020,12 @@ public class LibMatrixCUDA { int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; workSpace = new Pointer(); cudnnGetConvolutionBackwardDataWorkspaceSize(getCudnnHandle(gCtx), - wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray); + wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1); if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); int status = cudnnConvolutionBackwardData(getCudnnHandle(gCtx), one(), wDesc, w, - dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx); + dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2); if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) { @@ -1074,7 +1074,7 @@ public class LibMatrixCUDA { MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - Pointer x = getDensePointer(gCtx, image, true, instName); + Pointer x = getDensePointer(gCtx, image, true, instName); cudnnTensorDescriptor xDesc = allocateTensorDescriptor(gCtx, image, N, C, H, W); performMaxpooling(gCtx, instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q); } @@ -1083,8 +1083,8 @@ public class LibMatrixCUDA { MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q) throws DMLRuntimeException { - LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx); - Pointer y = getDensePointer(gCtx, outputBlock, true, instName); + LOG.trace("GPU : performMaxpooling" + ", GPUContext=" + gCtx); + Pointer y = getDensePointer(gCtx, outputBlock, true, instName); cudnnPoolingDescriptor poolingDesc = null; try { @@ -1138,11 +1138,11 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void maxpoolingBackward(GPUContext gCtx, String instName, MatrixObject image, MatrixObject dout, - MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, - int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, - int Q) throws DMLRuntimeException { - LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx); - Pointer y = null; + MatrixObject outputBlock, int N, int C, int H, int W, int K, int R, + int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, + int Q) throws DMLRuntimeException { + LOG.trace("GPU : maxpoolingBackward" + ", GPUContext=" + gCtx); + Pointer y = null; cudnnPoolingDescriptor poolingDesc = null; try { @@ -1202,8 +1202,8 @@ public class LibMatrixCUDA { private static void performCuDNNReLU(GPUContext gCtx, String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException { long t0=0; try { - LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + gCtx); - cudnnTensorDescriptor dstTensorDesc = srcTensorDesc; + LOG.trace("GPU : performCuDNNReLU" + ", GPUContext=" + 
gCtx); + cudnnTensorDescriptor dstTensorDesc = srcTensorDesc; Pointer srcData = getDensePointer(gCtx, in, true, instName); cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor(); @@ -1212,8 +1212,8 @@ public class LibMatrixCUDA { cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); cudnnActivationForward(getCudnnHandle(gCtx), activationDescriptor, - one(), srcTensorDesc, srcData, - zero(), dstTensorDesc, dstData); + one(), srcTensorDesc, srcData, + zero(), dstTensorDesc, dstData); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0); } catch (CudaException e) { throw new DMLRuntimeException("Error in conv2d in GPUContext " + gCtx.toString() + " from Thread " + Thread.currentThread().toString(), e); @@ -1245,14 +1245,14 @@ public class LibMatrixCUDA { long t0=0; cudnnTensorDescriptor srcTensorDesc = in.getGPUObject(gCtx).getTensorDescriptor(); if(N*CHW >= numDoublesIn2GB || srcTensorDesc == null) { - LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx); - // Invokes relu(double* A, double* ret, int rlen, int clen) + LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx); + // Invokes relu(double* A, double* ret, int rlen, int clen) if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer dstData = getDensePointer(gCtx, output, instName); Pointer srcData = getDensePointer(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu getCudaKernels(gCtx).launchKernel("relu", - ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)), - srcData, dstData, toInt(N), toInt(CHW)); + ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)), + srcData, dstData, toInt(N), toInt(CHW)); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0); } else { @@ -1287,7 +1287,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void matmultTSMM(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, String outputName, - boolean isLeftTransposed) throws DMLRuntimeException { + boolean isLeftTransposed) throws DMLRuntimeException { LOG.trace("GPU : matmultTSMM" + ", GPUContext=" + gCtx); if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); @@ -1303,7 +1303,7 @@ public class LibMatrixCUDA { // Note: the dimensions are swapped int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()); int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns()); - + // For dense TSMM, exploit cublasDsyrk(...) 
and call custom kernel to flip the matrix MatrixObject output = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, m, m); // Allocated the dense output matrix @@ -1343,8 +1343,8 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void copyUpperToLowerTriangle(GPUContext gCtx, String instName, MatrixObject ret) throws DMLRuntimeException { - LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx); - if(isInSparseFormat(gCtx, ret)) { + LOG.trace("GPU : copyUpperToLowerTriangle" + ", GPUContext=" + gCtx); + if(isInSparseFormat(gCtx, ret)) { throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented"); } if(ret.getNumRows() != ret.getNumColumns()) { @@ -1352,8 +1352,8 @@ public class LibMatrixCUDA { } int dim = toInt(ret.getNumRows()); getCudaKernels(gCtx).launchKernel("copy_u2l_dense", - ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), - getDensePointer(gCtx, ret, instName), dim, dim*dim); + ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim), + getDensePointer(gCtx, ret, instName), dim, dim*dim); } @@ -1389,7 +1389,7 @@ public class LibMatrixCUDA { * @return output of matrix multiply */ public static MatrixObject matmult(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, String outputName, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { + boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); LOG.trace("GPU : matmult" + ", GPUContext=" + gCtx); @@ -1400,7 +1400,7 @@ public class LibMatrixCUDA { boolean bothSparse = left.getGPUObject(gCtx).isSparse() && right.getGPUObject(gCtx).isSparse(); MatrixObject output = ec.getMatrixObject(outputName); - + long outRLen = isLeftTransposed ? left.getNumColumns() : left.getNumRows(); long outCLen = isRightTransposed ? right.getNumRows() : right.getNumColumns(); @@ -1436,7 +1436,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void eitherSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { + boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ; int n = toInt(isRightTransposed ? 
right.getNumRows() : right.getNumColumns()); @@ -1476,15 +1476,15 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void denseSparseMatmult(GPUContext gCtx, String instName, MatrixObject left, MatrixObject right, MatrixObject output, - boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) + boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException { // right sparse, left dense CSRPointer B = right.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); Pointer ADense = getDensePointer(gCtx, left, instName); if (B.isUltraSparse(k, n)){ - LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx); + LOG.trace(" GPU : Convert d M %*% sp M --> sp M %*% sp M)" + ", GPUContext=" + gCtx); - // Convert left to CSR and do cuSparse matmul + // Convert left to CSR and do cuSparse matmul int rowsA = (int)left.getNumRows(); int colsA = (int)left.getNumColumns(); @@ -1497,8 +1497,8 @@ public class LibMatrixCUDA { CSRPointer A = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), AT, rowsA, colsA); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1); + if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1); sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k); if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); @@ -1507,7 +1507,7 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); } else { - LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx); + LOG.trace(" GPU : Convert d M %*% sp M --> d M %*% d M" + ", GPUContext=" + gCtx); // Convert right to dense and do a cuBlas matmul // BDenseTransposed is a column major matrix // Note the arguments to denseDenseMatmult to accommodate for this. 
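
The hunk above converts the sparse right operand to a column-major dense matrix (BDenseTransposed) and, as its comment notes, adjusts the arguments passed to denseDenseMatmult. That adjustment is the usual trick for computing a row-major product with a column-major GEMM: a row-major buffer reinterpreted as column-major is its own transpose, so C = A %*% B can be obtained by asking the column-major routine for B^T %*% A^T over the same buffers. A self-contained plain-Java sketch of that identity follows; the class, array contents and sizes are illustrative only and are not taken from the diff.

    public class ColumnMajorSwapDemo {
        // Naive column-major GEMM: C (m x n) = A (m x k) * B (k x n), all column-major.
        static void gemmColMajor(int m, int n, int k, double[] A, double[] B, double[] C) {
            for (int j = 0; j < n; j++)
                for (int i = 0; i < m; i++) {
                    double sum = 0;
                    for (int p = 0; p < k; p++)
                        sum += A[p * m + i] * B[j * k + p];
                    C[j * m + i] = sum;
                }
        }

        public static void main(String[] args) {
            int m = 2, k = 3, n = 2;
            double[] A = {1, 2, 3, 4, 5, 6};    // row-major 2x3
            double[] B = {7, 8, 9, 10, 11, 12}; // row-major 3x2
            double[] C = new double[m * n];     // will hold A*B in row-major order

            // Swap the operands and the row/column roles: the column-major GEMM is
            // asked for the (n x m) product B^T * A^T, whose column-major layout is
            // exactly the row-major layout of C = A * B.
            gemmColMajor(n, m, k, B, A, C);

            System.out.println(java.util.Arrays.toString(C)); // [58.0, 64.0, 139.0, 154.0]
        }
    }

cuBLAS itself is column-major, which is presumably why the dimensions and operands appear swapped at the JCuda dgemm call sites reached through denseDenseMatmult in this file.
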
@@ -1515,8 +1515,8 @@ public class LibMatrixCUDA { if (DMLScript.STATISTICS) t0 = System.nanoTime(); Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)right.getNumRows(), (int)right.getNumColumns()); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0); if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix @@ -1550,7 +1550,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void sparseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) + boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException { CSRPointer A = left.getGPUObject(gCtx).getJcudaSparseMatrixPtr(); Pointer BDense = getDensePointer(gCtx, right, instName); @@ -1577,8 +1577,8 @@ public class LibMatrixCUDA { CSRPointer B = GPUObject.columnMajorDenseToRowMajorSparse(gCtx, getCusparseHandle(gCtx), BT, rowsB, colsB); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1); + if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.add(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.add(1); sparseSparseMatmult(gCtx, instName, A, B, output, isLeftTransposed, isRightTransposed, m, n, k); @@ -1588,15 +1588,15 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2); } else { - LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx); + LOG.trace(" GPU : Convert sp M %*% d M --> d M %*% d M" + ", GPUContext=" + gCtx); // Convert left to dense and do a cuBlas matmul // ADenseTransposed is a column major matrix // Note the arguments to denseDenseMatmult to accommodate for this. 
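
The getAndAdd(...) -> add(...) replacements made throughout this diff (for example on the cudaSparseToDenseTime and cudaDenseToSparseCount updates in the neighbouring lines) are the call-site change one would expect if the GPUStatistics counters moved from AtomicLong to LongAdder; that is an inference from the call sites alone, not something stated in the commit message. A minimal side-by-side sketch of the two APIs, with purely illustrative field names rather than the real GPUStatistics fields:

    import java.util.concurrent.atomic.AtomicLong;
    import java.util.concurrent.atomic.LongAdder;

    public class CounterStyles {
        // Old style: one contended cell; getAndAdd also returns the previous value.
        static final AtomicLong sparseToDenseTimeOld = new AtomicLong();
        // New style: internally striped cells, cheaper under write contention;
        // add returns void and the total is only materialized by sum().
        static final LongAdder sparseToDenseTimeNew = new LongAdder();

        public static void main(String[] args) {
            long t0 = System.nanoTime();
            // ... timed work ...
            long elapsed = System.nanoTime() - t0;

            sparseToDenseTimeOld.getAndAdd(elapsed); // AtomicLong call site
            sparseToDenseTimeNew.add(elapsed);       // LongAdder call site

            System.out.println(sparseToDenseTimeOld.get() + " vs " + sparseToDenseTimeNew.sum());
        }
    }
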
if (DMLScript.STATISTICS) t0 = System.nanoTime(); Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), (int)left.getNumRows(), (int)left.getNumColumns()); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0); - if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.add(System.nanoTime() - t0); + if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.add(System.nanoTime() - t0); if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); boolean allocated = output.getGPUObject(gCtx).acquireDeviceModifyDense(); // To allocate the dense matrix @@ -1629,13 +1629,13 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void sparseMatrixDenseVectorMult(GPUContext gCtx, String instName, MatrixObject output, CSRPointer A, Pointer B_dense, boolean isATranposed, - int m, int k) throws DMLRuntimeException { - LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx); - int transA = CUSPARSE_OPERATION_NON_TRANSPOSE; + int m, int k) throws DMLRuntimeException { + LOG.trace("GPU : sp M %*% dense V" + ", GPUContext=" + gCtx); + int transA = CUSPARSE_OPERATION_NON_TRANSPOSE; long size = m * Sizeof.DOUBLE; if (isATranposed){ size = k * Sizeof.DOUBLE; - transA = CUSPARSE_OPERATION_TRANSPOSE; + transA = CUSPARSE_OPERATION_TRANSPOSE; } Pointer C_dense = gCtx.allocate(instName, (int)size); long t1=0; @@ -1662,8 +1662,8 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void bothSparseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { - int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ; + boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { + int m = toInt(isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ; int n = toInt(isRightTransposed ? right.getNumRows() : right.getNumColumns()); int k = toInt(isLeftTransposed ? left.getNumRows() : left.getNumColumns()); int k1 = toInt(isRightTransposed ? 
right.getNumColumns() : right.getNumRows()); @@ -1701,7 +1701,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void sparseMatrixVectorMult(GPUContext gCtx, String instName, MatrixObject output, boolean isATranposed, int m, int n, int k, - CSRPointer A, CSRPointer B) throws DMLRuntimeException { + CSRPointer A, CSRPointer B) throws DMLRuntimeException { long t0=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); Pointer BDenseVector = B.toColumnMajorDenseMatrix(getCusparseHandle(gCtx), getCublasHandle(gCtx), k, 1); @@ -1726,11 +1726,11 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void sparseSparseMatmult(GPUContext gCtx, String instName, CSRPointer A, CSRPointer B, MatrixObject output, - boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException { - LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx); + boolean isLeftTransposed, boolean isRightTransposed, int m, int n, int k) throws DMLRuntimeException { + LOG.trace("GPU : sp M %*% sp M" + ", GPUContext=" + gCtx); - int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; - int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; + int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; long t0=0, t1=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); @@ -1741,9 +1741,9 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); cusparseDcsrgemm(getCusparseHandle(gCtx), transA, transB, m, n, k, - A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, - B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd, - C.descr, C.val, C.rowPtr, C.colInd); + A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, + B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd, + C.descr, C.val, C.rowPtr, C.colInd); //cudaDeviceSynchronize; if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1); } @@ -1762,7 +1762,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void denseDenseMatmult(GPUContext gCtx, String instName, MatrixObject output, MatrixObject left, MatrixObject right, - boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { + boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException { Pointer leftPtr = getDensePointer(gCtx, left, instName); Pointer rightPtr = getDensePointer(gCtx, right, instName); @@ -1773,7 +1773,7 @@ public class LibMatrixCUDA { int rightCols = toInt(right.getNumColumns()); Pointer C = getDensePointer(gCtx, output, instName); denseDenseMatmult(gCtx, instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed, isRightTransposed, - leftPtr, rightPtr); + leftPtr, rightPtr); } /** @@ -1799,9 +1799,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void denseDenseMatmult(GPUContext gCtx, String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1, - int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr) + int rightCols1, boolean isLeftTransposed1, boolean 
isRightTransposed1, Pointer leftPtr, Pointer rightPtr) throws DMLRuntimeException { - LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx); + LOG.trace("GPU : d M %*% d M" + ", GPUContext=" + gCtx); Pointer A = rightPtr; Pointer B = leftPtr; @@ -1892,7 +1892,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if {@link DMLRuntimeException} occurs */ public static void unaryAggregate(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, String output, AggregateUnaryOperator op) - throws DMLRuntimeException { + throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); LOG.trace("GPU : unaryAggregate" + ", GPUContext=" + gCtx); @@ -1955,12 +1955,12 @@ public class LibMatrixCUDA { } else if (aggOp.increOp.fn instanceof Builtin) { Builtin b = (Builtin)aggOp.increOp.fn; switch(b.bFunc) { - case MAX: opIndex = OP_MAX; break; - case MIN: opIndex = OP_MIN; break; - case MAXINDEX: opIndex = OP_MAXINDEX; break; - case MININDEX: opIndex = OP_MININDEX;break; - default: - new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU"); + case MAX: opIndex = OP_MAX; break; + case MIN: opIndex = OP_MIN; break; + case MAXINDEX: opIndex = OP_MAXINDEX; break; + case MININDEX: opIndex = OP_MININDEX;break; + default: + new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU"); } } else { throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function"); @@ -1980,7 +1980,7 @@ public class LibMatrixCUDA { // throw new DMLRuntimeException("Internal Error - Not implemented"); } - + long outRLen = -1; long outCLen = -1; if (indexFn instanceof ReduceRow) { // COL{SUM, MAX...} @@ -2004,210 +2004,210 @@ public class LibMatrixCUDA { // For scalars, set the scalar output in the Execution Context object switch (opIndex){ - case OP_PLUS: { - switch(reductionDirection) { - case REDUCTION_ALL : { - double result = reduceAll(gCtx, instName, "reduce_sum", in, size); - ec.setScalarOutput(output, new DoubleObject(result)); - break; - } - case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column) - reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen); - break; - } - case REDUCTION_ROW : { - reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen); - break; - } - case REDUCTION_DIAG : - throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet"); - } + case OP_PLUS: { + switch(reductionDirection) { + case REDUCTION_ALL : { + double result = reduceAll(gCtx, instName, "reduce_sum", in, size); + ec.setScalarOutput(output, new DoubleObject(result)); break; } - case OP_PLUS_SQ : { - // Calculate the squares in a temporary object tmp - Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE); - - squareMatrix(gCtx, instName, in, tmp, rlen, clen); - // Then do the sum on the temporary object and free it - switch(reductionDirection) { - case REDUCTION_ALL : { - double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size); - ec.setScalarOutput(output, new DoubleObject(result)); - break; - } - case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column) - reduceRow(gCtx, instName, "reduce_row_sum", tmp, 
out, rlen, clen); - break; - } - case REDUCTION_ROW : { - reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen); - break; - } - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared"); - } - gCtx.cudaFreeHelper(instName, tmp); + case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column) + reduceRow(gCtx, instName, "reduce_row_sum", in, out, rlen, clen); break; } - case OP_MEAN:{ - switch(reductionDirection) { - case REDUCTION_ALL: { - double result = reduceAll(gCtx, instName, "reduce_sum", in, size); - double mean = result / size; - ec.setScalarOutput(output, new DoubleObject(mean)); - break; - } - case REDUCTION_COL: { - reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen); - break; - } - case REDUCTION_ROW: { - reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen); - break; - } - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean"); - } + case REDUCTION_ROW : { + reduceCol(gCtx, instName, "reduce_col_sum", in, out, rlen, clen); break; } - case OP_MULTIPLY : { - switch (reductionDirection) { - case REDUCTION_ALL: { - double result = reduceAll(gCtx, instName, "reduce_prod", in, size); - ec.setScalarOutput(output, new DoubleObject(result)); - break; - } - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication"); - } + case REDUCTION_DIAG : + throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet"); + } + break; + } + case OP_PLUS_SQ : { + // Calculate the squares in a temporary object tmp + Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE); + + squareMatrix(gCtx, instName, in, tmp, rlen, clen); + // Then do the sum on the temporary object and free it + switch(reductionDirection) { + case REDUCTION_ALL : { + double result = reduceAll(gCtx, instName, "reduce_sum", tmp, size); + ec.setScalarOutput(output, new DoubleObject(result)); break; } - case OP_MAX :{ - switch(reductionDirection) { - case REDUCTION_ALL: { - double result = reduceAll(gCtx, instName, "reduce_max", in, size); - ec.setScalarOutput(output, new DoubleObject(result)); - break; - } - case REDUCTION_COL: { - reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen); - break; - } - case REDUCTION_ROW: { - reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen); - break; - } - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max"); - } + case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column) + reduceRow(gCtx, instName, "reduce_row_sum", tmp, out, rlen, clen); break; } - case OP_MIN :{ - switch(reductionDirection) { - case REDUCTION_ALL: { - double result = reduceAll(gCtx, instName, "reduce_min", in, size); - ec.setScalarOutput(output, new DoubleObject(result)); - break; - } - case REDUCTION_COL: { - reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen); - break; - } - case REDUCTION_ROW: { - reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen); - break; - } - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min"); - } + case REDUCTION_ROW : { + reduceCol(gCtx, instName, "reduce_col_sum", tmp, out, rlen, clen); + break; + } + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction 
direction for summation squared"); + } + gCtx.cudaFreeHelper(instName, tmp); + break; + } + case OP_MEAN:{ + switch(reductionDirection) { + case REDUCTION_ALL: { + double result = reduceAll(gCtx, instName, "reduce_sum", in, size); + double mean = result / size; + ec.setScalarOutput(output, new DoubleObject(mean)); + break; + } + case REDUCTION_COL: { + reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen); + break; + } + case REDUCTION_ROW: { + reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen); + break; + } + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean"); + } + break; + } + case OP_MULTIPLY : { + switch (reductionDirection) { + case REDUCTION_ALL: { + double result = reduceAll(gCtx, instName, "reduce_prod", in, size); + ec.setScalarOutput(output, new DoubleObject(result)); + break; + } + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication"); + } + break; + } + case OP_MAX :{ + switch(reductionDirection) { + case REDUCTION_ALL: { + double result = reduceAll(gCtx, instName, "reduce_max", in, size); + ec.setScalarOutput(output, new DoubleObject(result)); + break; + } + case REDUCTION_COL: { + reduceRow(gCtx, instName, "reduce_row_max", in, out, rlen, clen); + break; + } + case REDUCTION_ROW: { + reduceCol(gCtx, instName, "reduce_col_max", in, out, rlen, clen); + break; + } + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max"); + } + break; + } + case OP_MIN :{ + switch(reductionDirection) { + case REDUCTION_ALL: { + double result = reduceAll(gCtx, instName, "reduce_min", in, size); + ec.setScalarOutput(output, new DoubleObject(result)); break; } - case OP_VARIANCE : { - // Temporary GPU array for - Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE); - Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE); + case REDUCTION_COL: { + reduceRow(gCtx, instName, "reduce_row_min", in, out, rlen, clen); + break; + } + case REDUCTION_ROW: { + reduceCol(gCtx, instName, "reduce_col_min", in, out, rlen, clen); + break; + } + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min"); + } + break; + } + case OP_VARIANCE : { + // Temporary GPU array for + Pointer tmp = gCtx.allocate(instName, size * Sizeof.DOUBLE); + Pointer tmp2 = gCtx.allocate(instName, size * Sizeof.DOUBLE); - switch(reductionDirection) { + switch(reductionDirection) { - case REDUCTION_ALL: { - double result = reduceAll(gCtx, instName, "reduce_sum", in, size); - double mean = result / size; + case REDUCTION_ALL: { + double result = reduceAll(gCtx, instName, "reduce_sum", in, size); + double mean = result / size; - // Subtract mean from every element in the matrix - ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean); - matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp); + // Subtract mean from every element in the matrix + ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean); + matrixScalarOp(gCtx, instName, in, mean, rlen, clen, tmp, minusOp); - squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); + squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); - double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size); - double variance = result2 / (size - 1); - ec.setScalarOutput(output, new DoubleObject(variance)); + double result2 = reduceAll(gCtx, instName, "reduce_sum", tmp2, size); + double variance = result2 / 
(size - 1); + ec.setScalarOutput(output, new DoubleObject(variance)); - break; - } - case REDUCTION_COL: { - reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen); - // Subtract the row-wise mean from every element in the matrix - BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); - matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp); + break; + } + case REDUCTION_COL: { + reduceRow(gCtx, instName, "reduce_row_mean", in, out, rlen, clen); + // Subtract the row-wise mean from every element in the matrix + BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); + matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp); - squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); + squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); - Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE); - reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen); + Pointer tmpRow = gCtx.allocate(instName, rlen * Sizeof.DOUBLE); + reduceRow(gCtx, instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen); - ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); - matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp); + ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1); + matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp); - gCtx.cudaFreeHelper(instName, tmpRow); + gCtx.cudaFreeHelper(instName, tmpRow); - break; - } - case REDUCTION_ROW: { - reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen); - // Subtract the columns-wise mean from every element in the matrix - BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); - matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp); + break; + } + case REDUCTION_ROW: { + reduceCol(gCtx, instName, "reduce_col_mean", in, out, rlen, clen); + // Subtract the columns-wise mean from every element in the matrix + BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject()); + matrixMatrixOp(gCtx, instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp); - squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); + squareMatrix(gCtx, instName, tmp, tmp2, rlen, clen); - Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE); - reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen); + Pointer tmpCol = gCtx.allocate(instName, clen * Sizeof.DOUBLE); + reduceCol(gCtx, instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen); - ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); - matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp); + ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1); + matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp); - gCtx.cudaFreeHelper(instName, tmpCol); + gCtx.cudaFreeHelper(instName, tmpCol); - break; - } - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance"); - } - gCtx.cudaFreeHelper(instName, tmp); - gCtx.cudaFreeHelper(instName, tmp2); break; } - case OP_MAXINDEX : { - switch(reductionDirection) { - case REDUCTION_COL: - throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU "); - default: - throw 
new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex"); - } - // break; + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance"); } - case OP_MININDEX : { - switch(reductionDirection) { - case REDUCTION_COL: - throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU "); - default: - throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex"); - } - // break; + gCtx.cudaFreeHelper(instName, tmp); + gCtx.cudaFreeHelper(instName, tmp2); + break; + } + case OP_MAXINDEX : { + switch(reductionDirection) { + case REDUCTION_COL: + throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex"); + } + // break; + } + case OP_MININDEX : { + switch(reductionDirection) { + case REDUCTION_COL: + throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU "); + default: + throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex"); } - default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!"); + // break; + } + default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!"); } } @@ -2222,7 +2222,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if error */ private static void squareMatrix(GPUContext gCtx, String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException { - ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2); + ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2); matrixScalarOp(gCtx, instName, in, 2, rlen, clen, out, power2op); } @@ -2236,9 +2236,9 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static double reduceAll(GPUContext gCtx, String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException { - LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx); + LOG.trace("GPU : reduceAll for " + kernelFunction + ", GPUContext=" + gCtx); - int[] tmp = getKernelParamsForReduceAll(gCtx, n); + int[] tmp = getKernelParamsForReduceAll(gCtx, n); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; Pointer tempOut = gCtx.allocate(instName, n * Sizeof.DOUBLE); @@ -2256,7 +2256,7 @@ public class LibMatrixCUDA { blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2]; if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), - tempOut, tempOut, s); + tempOut, tempOut, s); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2); s = (s + (threads*2-1)) / (threads*2); } @@ -2282,15 +2282,15 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void reduceRow(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { - LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx); + LOG.trace("GPU : reduceRow for " + kernelFunction + ", GPUContext=" + gCtx); - int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols); + 
int[] tmp = getKernelParamsForReduceByRow(gCtx, rows, cols); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; long t0=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), - in, out, rows, cols); + in, out, rows, cols); //cudaDeviceSynchronize; if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0); @@ -2308,15 +2308,15 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void reduceCol(GPUContext gCtx, String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException { - LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx); + LOG.trace("GPU : reduceCol for " + kernelFunction + ", GPUContext=" + gCtx); - int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols); + int[] tmp = getKernelParamsForReduceByCol(gCtx, rows, cols); int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2]; long t0=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), - in, out, rows, cols); + in, out, rows, cols); //cudaDeviceSynchronize; if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0); } @@ -2448,51 +2448,51 @@ public class LibMatrixCUDA { throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); double constant = op.getConstant(); LOG.trace("GPU : matrixScalarArithmetic, scalar: " + constant + ", GPUContext=" + gCtx); - + int outRLen = isInputTransposed ? (int) in.getNumColumns() : (int) in.getNumRows(); int outCLen = isInputTransposed ? (int) in.getNumRows() : (int) in.getNumColumns(); - + //boolean isCUDALibAvailable = (op.fn instanceof Multiply // || (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(gCtx, in); //if(!isCUDALibAvailable) { - if(constant == 0) { - if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) { - deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed); - } - else if(op.fn instanceof Multiply || op.fn instanceof And) { - setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen); - } - else if(op.fn instanceof Power) { - setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen); - } - // TODO: - // x/0.0 is either +Infinity or -Infinity according to Java. - // In the context of a matrix, different elements of the matrix - // could have different values. - // If the IEEE 754 standard defines otherwise, this logic needs - // to be re-enabled and the Java computation logic for divide by zero - // needs to be revisited - //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) { - // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName); - //} - //else if(op.fn instanceof Divide) { - // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN. 
- // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); - //} - else { - // TODO: Potential to optimize - matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op); - } + if(constant == 0) { + if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) { + deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed); } - else if(constant == 1.0 && op.fn instanceof Or) { - setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen); + else if(op.fn instanceof Multiply || op.fn instanceof And) { + setOutputToConstant(ec, gCtx, instName, 0.0, outputName, outRLen, outCLen); } - else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) { - deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed); + else if(op.fn instanceof Power) { + setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen); } + // TODO: + // x/0.0 is either +Infinity or -Infinity according to Java. + // In the context of a matrix, different elements of the matrix + // could have different values. + // If the IEEE 754 standard defines otherwise, this logic needs + // to be re-enabled and the Java computation logic for divide by zero + // needs to be revisited + //else if(op.fn instanceof Divide && isSparseAndEmpty(gCtx, in)) { + // setOutputToConstant(ec, gCtx, instName, Double.NaN, outputName); + //} + //else if(op.fn instanceof Divide) { + // //For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN. + // compareAndSet(ec, gCtx, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); + //} else { + // TODO: Potential to optimize matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op); } + } + else if(constant == 1.0 && op.fn instanceof Or) { + setOutputToConstant(ec, gCtx, instName, 1.0, outputName, outRLen, outCLen); + } + else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) { + deviceCopy(ec, gCtx, instName, in, outputName, isInputTransposed); + } + else { + matrixScalarOp(ec, gCtx, instName, in, outputName, isInputTransposed, op); + } // } //else { // double alpha = 0; @@ -2506,8 +2506,8 @@ public class LibMatrixCUDA { // throw new DMLRuntimeException("Unsupported op"); // } - // TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1) - // C = alpha* op( A ) + beta* op ( B ) + // TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1) + // C = alpha* op( A ) + beta* op ( B ) // dgeam(ec, gCtx, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0); //} } @@ -2563,15 +2563,15 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void matrixMatrixArithmetic(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, - String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException { + String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof 
Minus) && !isSparseAndEmpty(gCtx, in1) && !isSparseAndEmpty(gCtx, in2) && !isVector(in1) && !isVector(in2); if(!isCUDALibAvailable) { - matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op); + matrixMatrixOp(ec, gCtx, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op); } else { - double alpha; + double alpha; double beta; if(op.fn instanceof Plus) { alpha = 1.0; @@ -2602,7 +2602,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void matrixScalarOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, boolean isInputTransposed, - ScalarOperator op) throws DMLRuntimeException { + ScalarOperator op) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); if(isInputTransposed) @@ -2638,11 +2638,11 @@ public class LibMatrixCUDA { int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0; int size = rlenA * clenA; long t0=0; - if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - getCudaKernels(gCtx).launchKernel("matrix_scalar_op", - ExecutionConfig.getConfigForSimpleVectorOperations(size), - a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0); + if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); + getCudaKernels(gCtx).launchKernel("matrix_scalar_op", + ExecutionConfig.getConfigForSimpleVectorOperations(size), + a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0); } /** @@ -2660,7 +2660,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void matrixMatrixOp(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, - String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException { + String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); boolean isEmpty1 = isSparseAndEmpty(gCtx, in1); @@ -2671,7 +2671,7 @@ public class LibMatrixCUDA { int clenB = toInt(in2.getNumColumns()); int vecStatusA = getVectorStatus(rlenA, clenA).code(); int vecStatusB = getVectorStatus(rlenB, clenB).code(); - + if(isLeftTransposed || isRightTransposed) { throw new DMLRuntimeException("Unsupported operator: GPU transposed binary op " + isLeftTransposed + " " + isRightTransposed); } @@ -2742,8 +2742,8 @@ public class LibMatrixCUDA { long t0=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("matrix_matrix_cellwise_op", - ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), - a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn)); + ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen), + a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, 
getBinaryOp(op.fn)); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0); } @@ -2816,7 +2816,7 @@ public class LibMatrixCUDA { @SuppressWarnings("unused") private static void compareAndSet(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in, String outputName, double compareVal, double tolerance, - double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException { + double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); Pointer A = getDensePointer(gCtx, in, instName); // TODO: FIXME: Implement sparse kernel @@ -2825,14 +2825,14 @@ public class LibMatrixCUDA { int clen = toInt(out.getNumColumns()); getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, rlen, clen); // Allocated the dense output matrix Pointer ret = getDensePointer(gCtx, out, instName); - + // out.getMatrixCharacteristics().setNonZeros(rlen*clen); // compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double ifEqualsVal, double ifNotEqualsVal) long t0=0; if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); getCudaKernels(gCtx).launchKernel("compare_and_set", - ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen), - A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal); + ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen), + A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COMPARE_AND_SET_KERNEL, System.nanoTime() - t0); } @@ -2935,7 +2935,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ private static void dgeam(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName, - boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException { + boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); LOG.trace("GPU : dgeam" + ", GPUContext=" + gCtx); @@ -2944,7 +2944,7 @@ public class LibMatrixCUDA { Pointer betaPtr = pointerTo(beta); int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N; int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N; - + long outRLen = isLeftTransposed ? in1.getNumColumns() : in1.getNumRows(); long outCLen = isLeftTransposed ? in1.getNumRows() : in1.getNumColumns(); @@ -3086,6 +3086,70 @@ public class LibMatrixCUDA { //**************** Matrix Manipulation Functions *********************/ //********************************************************************/ + /** + * Method to perform rangeReIndex operation for a given lower and upper bounds in row and column dimensions. 
+ * + * @param ec current execution context + * @param gCtx current gpu context + * @param instName name of the instruction for maintaining statistics + * @param in1 input matrix object + * @param ixrange index range (0-based) + * @param outputName output matrix object + * @throws DMLRuntimeException if error occurs + */ + public static void sliceOperations(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, + IndexRange ixrange, String outputName) throws DMLRuntimeException { + if (ec.getGPUContext(0) != gCtx) + throw new DMLRuntimeException( + "GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); + LOG.trace("GPU : sliceOperations" + ", GPUContext=" + gCtx); + + int rl = (int) ixrange.rowStart; + int ru = (int) ixrange.rowEnd; + int cl = (int) ixrange.colStart; + int cu = (int) ixrange.colEnd; + if (rl < 0 || rl >= in1.getNumRows() || ru < rl || ru >= in1.getNumRows() || cl < 0 + || cu >= in1.getNumColumns() || cu < cl || cu >= in1.getNumColumns()) { + throw new DMLRuntimeException("Invalid values for matrix indexing: [" + (rl + 1) + ":" + (ru + 1) + "," + + (cl + 1) + ":" + (cu + 1) + "] " + "must be within matrix dimensions [" + in1.getNumRows() + "," + + in1.getNumColumns() + "]"); + } + + int len1 = toInt(in1.getNumColumns()); + int len2 = toInt(ec.getMatrixObject(outputName).getNumColumns()); + if(isInSparseFormat(gCtx, in1)) { + // Input in1 is in sparse format and output is in dense format + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1); + CSRPointer inPointer = getSparsePointer(gCtx, in1, instName); + Pointer outPointer = getDensePointer(gCtx, out, instName); + int size = ru - rl + 1; + long t0 = GPUStatistics.DISPLAY_STATISTICS ? System.nanoTime() : 0; + // Performs a slice operation where the input matrix is sparse and the output matrix is dense. + // This function avoids unnecessary sparse to dense conversion of the input matrix. + // We can generalize this later to output sparse matrix. + getCudaKernels(gCtx).launchKernel("slice_sparse_dense", ExecutionConfig.getConfigForSimpleVectorOperations(size), + inPointer.val, inPointer.rowPtr, inPointer.colInd, outPointer, rl, ru, cl, cu); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_SPARSE_DENSE_OP, System.nanoTime() - t0); + } + else { + // Input in1 is in dense format (see inPointer) + MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, ru - rl + 1, cu - cl + 1); + + Pointer inPointer = getDensePointer(gCtx, in1, instName); + Pointer outPointer = getDensePointer(gCtx, out, instName); + long t0 = GPUStatistics.DISPLAY_STATISTICS ? 
System.nanoTime() : 0; + if (len1 == len2) { + cudaMemcpy(outPointer, inPointer.withByteOffset(rl * len1 * Sizeof.DOUBLE), (ru - rl + 1) * len1 + * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); + } else { + for (int i = rl, ix1 = rl * len1 + cl, ix2 = 0; i <= ru; i++, ix1 += len1, ix2 += len2) { + cudaMemcpy(outPointer.withByteOffset(ix2 * Sizeof.DOUBLE), + inPointer.withByteOffset(ix1 * Sizeof.DOUBLE), len2 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); + } + } + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RIX_DENSE_OP, System.nanoTime() - t0); + } + } public static void cbind(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) @@ -3093,7 +3157,7 @@ public class LibMatrixCUDA { LOG.trace("GPU : cbind" + ", GPUContext=" + gCtx); long t1 = 0; - + long rowsA = toInt(in1.getNumRows()); long colsA = toInt(in1.getNumColumns()); long rowsB = toInt(in2.getNumRows()); @@ -3114,8 +3178,8 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx) - .launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, - rowsA, colsA, rowsB, colsB); + .launchKernel("cbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, + rowsA, colsA, rowsB, colsB); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CBIND_KERNEL, System.nanoTime() - t1); } @@ -3126,7 +3190,7 @@ public class LibMatrixCUDA { LOG.trace("GPU : rbind" + ", GPUContext=" + gCtx); long t1 = 0; - + int rowsA = toInt(in1.getNumRows()); int colsA = toInt(in1.getNumColumns()); int rowsB = toInt(in2.getNumRows()); @@ -3147,8 +3211,8 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); getCudaKernels(gCtx) - .launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, - rowsA, colsA, rowsB, colsB); + .launchKernel("rbind", ExecutionConfig.getConfigForSimpleMatrixOperations(maxRows, maxCols), A, B, C, + rowsA, colsA, rowsB, colsB); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RBIND_KERNEL, System.nanoTime() - t1); } @@ -3422,7 +3486,7 @@ public class LibMatrixCUDA { * @throws DMLRuntimeException if DMLRuntimeException occurs */ public static void axpy(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, - String outputName, double constant) throws DMLRuntimeException { + String outputName, double constant) throws DMLRuntimeException { if (ec.getGPUContext(0) != gCtx) throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); Pointer A = getDensePointer(gCtx, in1, instName); @@ -3433,9 +3497,9 @@ public class LibMatrixCUDA { long t1=0, t2=0; if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) { - LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx); + LOG.trace("GPU : cublasDaxpy" + ", GPUContext=" + gCtx); - // Matrix-Matrix daxpy + // Matrix-Matrix daxpy long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix Pointer alphaPtr = pointerTo(constant); // C <- A + alpha*B @@ -3451,9 +3515,9 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) 
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2); } else { - LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx); + LOG.trace("GPU : daxpy_matrix_vector" + ", GPUContext=" + gCtx); - // Matrix-Vector daxpy + // Matrix-Vector daxpy // Note: Vector-Matrix operation is not supported // daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB) if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime(); @@ -3466,51 +3530,51 @@ public class LibMatrixCUDA { } - /** - * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1) - * - * @param ec a valid {@link ExecutionContext} - * @param gCtx a valid {@link GPUContext} - * @param instName the invoking instruction's name for record {@link Statistics}. - * @param in1 input matrix A - * @param in2 input matrix B - * @param outputName name of the output matrix - * @throws DMLRuntimeException if an error occurs - */ - public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException { - if (ec.getGPUContext(0) != gCtx) - throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); + /** + * Implements the "solve" function for systemml Ax = B (A is of size m*n, B is of size m*1, x is of size n*1) + * + * @param ec a valid {@link ExecutionContext} + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param in1 input matrix A + * @param in2 input matrix B + * @param outputName name of the output matrix + * @throws DMLRuntimeException if an error occurs + */ + public static void solve(ExecutionContext ec, GPUContext gCtx, String instName, MatrixObject in1, MatrixObject in2, String outputName) throws DMLRuntimeException { + if (ec.getGPUContext(0) != gCtx) + throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function"); - // x = solve(A, b) + // x = solve(A, b) LOG.trace("GPU : solve" + ", GPUContext=" + gCtx); long t0 = -1; - if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // Both dense - GPUObject Aobj = in1.getGPUObject(gCtx); - GPUObject bobj = in2.getGPUObject(gCtx); - int m = toInt(in1.getNumRows()); - int n = toInt(in1.getNumColumns()); - if (in2.getNumRows() != m) - throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B"); - if (in2.getNumColumns() != 1) - throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should be 1"); - - - // Copy over matrices and - // convert dense matrices to row major - // Operation in cuSolver and cuBlas are for column major dense matrices - // and are destructive to the original input + if (!isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // Both dense + GPUObject Aobj = in1.getGPUObject(gCtx); + GPUObject bobj = in2.getGPUObject(gCtx); + int m = toInt(in1.getNumRows()); + int n = toInt(in1.getNumColumns()); + if (in2.getNumRows() != m) + throw new DMLRuntimeException("GPU : Incorrect input for solve(), rows in A should be the same as rows in B"); + if (in2.getNumColumns() != 1) + throw new DMLRuntimeException("GPU : Incorrect input for solve(), columns in B should 
be 1"); + + + // Copy over matrices and + // convert dense matrices to row major + // Operation in cuSolver and cuBlas are for column major dense matrices + // and are destructive to the original input if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - GPUObject ATobj = (GPUObject) Aobj.clone(); + GPUObject ATobj = (GPUObject) Aobj.clone(); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); ATobj.denseRowMajorToColumnMajor(); - if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); - Pointer A = ATobj.getJcudaDenseMatrixPtr(); + if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0); + Pointer A = ATobj.getJcudaDenseMatrixPtr(); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); - GPUObject bTobj = (GPUObject) bobj.clone(); + GPUObject bTobj = (GPUObject) bobj.clone(); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_OBJECT_CLONE, System.nanoTime() - t0); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); bTobj.denseRowMajorToColumnMajor(); @@ -3518,76 +3582,76 @@ public class LibMatrixCUDA { Pointer b = bTobj.getJcudaDenseMatrixPtr(); - // The following set of operations is done following the example in the cusolver documentation - // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1 + // The following set of operations is done following the example in the cusolver documentation + // http://docs.nvidia.com/cuda/cusolver/#ormqr-example1 - // step 3: query working space of geqrf and ormqr + // step 3: query working space of geqrf and ormqr if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); int[] lwork = {0}; - JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork); + JCusolverDn.cusolverDnDgeqrf_bufferSize(gCtx.getCusolverDnHandle(), m, n, A, m, lwork); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR_BUFFER, System.nanoTime() - t0); // step 4: compute QR factorization - Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE); - Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE); - Pointer devInfo = gCtx.allocate(Sizeof.INT); + Pointer work = gCtx.allocate(instName, lwork[0] * Sizeof.DOUBLE); + Pointer tau = gCtx.allocate(instName, m * Sizeof.DOUBLE); + Pointer devInfo = gCtx.allocate(Sizeof.INT); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); JCusolverDn.cusolverDnDgeqrf(gCtx.getCusolverDnHandle(), m, n, A, m, tau, work, lwork[0], devInfo); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_QR, System.nanoTime() - t0); int[] qrError = {-1}; - cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); - if (qrError[0] != 0) { - throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); - } + cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); + if (qrError[0] != 0) { + throw new DMLRuntimeException("GPU : Error in call to geqrf (QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); + } - // step 5: compute Q^T*B + 
// step 5: compute Q^T*B if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); JCusolverDn.cusolverDnDormqr(gCtx.getCusolverDnHandle(), cublasSideMode.CUBLAS_SIDE_LEFT, cublasOperation.CUBLAS_OP_T, m, 1, n, A, m, tau, b, m, work, lwork[0], devInfo); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ORMQR, System.nanoTime() - t0); cudaMemcpy(Pointer.to(qrError), devInfo, Sizeof.INT, cudaMemcpyDeviceToHost); - if (qrError[0] != 0) { - throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); - } + if (qrError[0] != 0) { + throw new DMLRuntimeException("GPU : Error in call to ormqr (to compuete Q^T*B after QR factorization) as part of solve, argument " + qrError[0] + " was wrong"); + } - // step 6: compute x = R \ Q^T*B + // step 6: compute x = R \ Q^T*B if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); JCublas2.cublasDtrsm(gCtx.getCublasHandle(), - cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, - n, 1, pointerTo(1.0), A, m, b, m); + cublasSideMode.CUBLAS_SIDE_LEFT, cublasFillMode.CUBLAS_FILL_MODE_UPPER, cublasOperation.CUBLAS_OP_N, cublasDiagType.CUBLAS_DIAG_NON_UNIT, + n, 1, pointerTo(1.0), A, m, b, m); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRSM, System.nanoTime() - t0); if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime(); bTobj.denseColumnMajorToRowMajor(); if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COLUMN_TO_ROW_MAJOR, System.nanoTime() - t0); - // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash - // There is an avoidable copy happening here + // TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash + // There is an avoidable copy happening here MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1); - cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); + cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice); - gCtx.cudaFreeHelper(instName, work); - gCtx.cudaFreeHelper(instName, tau); - ATobj.clearData(); - bTobj.clearData(); + gCtx.cudaFreeHelper(instName, work); + gCtx.cudaFreeHelper(instName, tau); + ATobj.clearData(); + bTobj.clearData(); - //debugPrintMatrix(b, n, 1); + //debugPrintMatrix(b, n, 1); - } else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // Both sparse - throw new DMLRuntimeException("GPU : solve on sparse inputs not supported"); - } else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // A is dense, b is sparse - // Pointer A = getDensePointer(gCtx, in1, instName); - // Pointer B = getDensePointer(gCtx, in2, instName); - throw new DMLRuntimeException("GPU : solve on sparse inputs not supported"); - } else if (isInSparseFormat(gCtx, in1) && !isInSparseFormat(gCtx, in2)) { // A is sparse, b is dense - throw new DMLRuntimeException("GPU : solve on sparse inputs not supported"); - } + } else if (isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // Both sparse + throw new 
DMLRuntimeException("GPU : solve on sparse inputs not supported"); + } else if (!isInSparseFormat(gCtx, in1) && isInSparseFormat(gCtx, in2)) { // A is dense, b is sparse + // Pointer A = getDensePointer(gCtx, in1, instName); + // Pointer B = getDensePointer(gCtx, in2, instName); + throw new DMLRuntimeException("GPU : solv
<TRUNCATED>
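
The dense branch of the sliceOperations method added above copies the 0-based inclusive range [rl:ru, cl:cu] out of a row-major input: when the output has the same number of columns as the input (all columns selected) it issues one contiguous cudaMemcpy of (ru - rl + 1) full rows starting at element offset rl * len1, otherwise it performs one device-to-device copy per row. The following is a minimal CPU-side sketch of the same offset arithmetic using plain double[] arrays; the class and method names are illustrative only and are not SystemML API.

public class DenseSliceSketch {
    // Extracts the 0-based inclusive block [rl:ru, cl:cu] from a row-major matrix
    // with inCols columns, mirroring the addressing of the GPU dense slice path.
    static double[] slice(double[] in, int inCols, int rl, int ru, int cl, int cu) {
        int outRows = ru - rl + 1;
        int outCols = cu - cl + 1;
        double[] out = new double[outRows * outCols];
        if (outCols == inCols) {
            // All columns selected: one contiguous copy of the selected rows,
            // analogous to the single cudaMemcpy in the len1 == len2 branch.
            System.arraycopy(in, rl * inCols, out, 0, outRows * inCols);
        } else {
            // Row-by-row copy: source row i starts at i * inCols + cl,
            // destination row (i - rl) starts at (i - rl) * outCols,
            // analogous to the per-row cudaMemcpy loop in the else branch.
            for (int i = rl, ix1 = rl * inCols + cl, ix2 = 0; i <= ru; i++, ix1 += inCols, ix2 += outCols) {
                System.arraycopy(in, ix1, out, ix2, outCols);
            }
        }
        return out;
    }

    public static void main(String[] args) {
        // 3x4 row-major matrix; slice rows 1..2, cols 1..2 (0-based, inclusive).
        double[] a = { 1, 2, 3, 4,  5, 6, 7, 8,  9, 10, 11, 12 };
        double[] s = slice(a, 4, 1, 2, 1, 2);
        System.out.println(java.util.Arrays.toString(s)); // [6.0, 7.0, 10.0, 11.0]
    }
}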

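The solve() hunk above follows the cuSOLVER ormqr example it cites: cusolverDnDgeqrf factors the column-major copy of A into QR, cusolverDnDormqr applies Q^T to b, and cublasDtrsm finishes with the upper-triangular solve x = R \ Q^T b. The sketch below illustrates only that final step (back substitution against an upper-triangular R) on the CPU, to make the last stage of the pipeline concrete; the names are illustrative, it assumes a row-major n x n R, and it is not SystemML or JCuda API.

public class BackSubstitutionSketch {
    // Solves R x = c for an upper-triangular R (row-major, n x n),
    // i.e. the x = R \ Q^T*b step that cublasDtrsm performs on the GPU.
    static double[] backSubstitute(double[] R, double[] c, int n) {
        double[] x = new double[n];
        for (int i = n - 1; i >= 0; i--) {
            double s = c[i];
            for (int j = i + 1; j < n; j++)
                s -= R[i * n + j] * x[j];
            x[i] = s / R[i * n + i];
        }
        return x;
    }

    public static void main(String[] args) {
        // R = [[2, 1], [0, 3]], c = [5, 6]  =>  x = [1.5, 2.0]
        // check: 2*1.5 + 1*2.0 = 5 and 3*2.0 = 6
        double[] R = { 2, 1, 0, 3 };
        double[] c = { 5, 6 };
        double[] x = backSubstitute(R, c, 2);
        System.out.println(x[0] + ", " + x[1]); // 1.5, 2.0
    }
}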