Repository: systemml
Updated Branches:
  refs/heads/master af9cc8a90 -> 4d3216678
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
index 82a76b6..cdb69f9 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java
@@ -211,7 +211,7 @@ public class LibMatrixCUDA {
 		return gCtx.getCublasHandle();
 	}
 
-	protected static JCudaKernels getCudaKernels(GPUContext gCtx) {
+	public static JCudaKernels getCudaKernels(GPUContext gCtx) throws DMLRuntimeException {
 		return gCtx.getKernels();
 	}
 
@@ -244,7 +244,7 @@ public class LibMatrixCUDA {
 	 */
 	public static Pointer one() {
 		if(_one == null || oldDataTypeSize != sizeOfDataType) {
-			_one = dataTypePointerTo(1.0);
+			_one = _dataTypePointerTo(1.0);
 			oldDataTypeSize = sizeOfDataType;
 		}
 		return _one;
@@ -255,7 +255,7 @@ public class LibMatrixCUDA {
 	 */
 	public static Pointer zero() {
 		if(_zero == null || oldDataTypeSize != sizeOfDataType) {
-			_zero = dataTypePointerTo(0.0);
+			_zero = _dataTypePointerTo(0.0);
 			oldDataTypeSize = sizeOfDataType;
 		}
 		return _zero;
@@ -268,11 +268,11 @@ public class LibMatrixCUDA {
 	 * @param instName the invoking instruction's name for record {@link Statistics}.
 	 * @return jcuda pointer
 	 */
-	protected static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) {
+	public static Pointer getDensePointer(GPUContext gCtx, MatrixObject input, String instName) throws DMLRuntimeException {
 		if(isInSparseFormat(gCtx, input)) {
 			input.getGPUObject(gCtx).sparseToDense(instName);
 		}
-		return input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+		return input.getGPUObject(gCtx).getDensePointer();
 	}
 
 	/**
@@ -289,7 +289,7 @@ public class LibMatrixCUDA {
 		return input.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 	}
 
-	protected static Pointer dataTypePointerTo(double value) {
+	private static Pointer _dataTypePointerTo(double value) {
 		if(sizeOfDataType == Sizeof.DOUBLE) {
 			return Pointer.to(new double[] { value });
 		}
@@ -301,6 +301,18 @@ public class LibMatrixCUDA {
 		}
 	}
 
+	protected static Pointer dataTypePointerTo(double value) {
+		if(value == 1) {
+			return one();
+		}
+		else if(value == 0) {
+			return zero();
+		}
+		else {
+			return _dataTypePointerTo(value);
+		}
+	}
+
 	/**
 	 * This method computes the backpropagation errors for previous layer of relu operation
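For context on the hunks above: cuBLAS and cuSPARSE entry points take the alpha/beta scalars by host Pointer, so one() and zero() memoize the two most common scalars and rebuild them only when the active data type flips between double and float; the protected dataTypePointerTo() now routes 1 and 0 to those caches and falls back to the private allocator for everything else. A minimal, non-SystemML sketch of the same pattern (assumes only JCuda's Pointer and Sizeof on the classpath):

    import jcuda.Pointer;
    import jcuda.Sizeof;

    class ScalarCache {
        static int sizeOfDataType = Sizeof.DOUBLE; // Sizeof.FLOAT in single precision
        private static int oldDataTypeSize = -1;
        private static Pointer one;

        // Cached host pointer to the scalar 1, rebuilt when the data type changes.
        static synchronized Pointer one() {
            if (one == null || oldDataTypeSize != sizeOfDataType) {
                one = (sizeOfDataType == Sizeof.DOUBLE)
                    ? Pointer.to(new double[] { 1.0 })
                    : Pointer.to(new float[] { 1.0f });
                oldDataTypeSize = sizeOfDataType;
            }
            return one;
        }
    }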
@@ -355,8 +367,7 @@ public class LibMatrixCUDA {
 		Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType);
 		reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols);
 		reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW));
-		gCtx.cudaFreeHelper(tmp);
-
+		gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
 	}
 
 	/**
@@ -387,9 +398,9 @@ public class LibMatrixCUDA {
 		if(bias.getNumColumns() != 1 || cols % K != 0) {
 			throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
 		}
-		Pointer imagePointer = input.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
-		Pointer biasPointer = bias.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
-		Pointer outputPointer = outputBlock.getGPUObject(gCtx).getJcudaDenseMatrixPtr();
+		Pointer imagePointer = input.getGPUObject(gCtx).getDensePointer();
+		Pointer biasPointer = bias.getGPUObject(gCtx).getDensePointer();
+		Pointer outputPointer = outputBlock.getGPUObject(gCtx).getDensePointer();
 		long t1 = 0;
 		if (DMLScript.FINEGRAINED_STATISTICS) t1 = System.nanoTime();
 		getCudaKernels(gCtx).launchKernel("bias_multiply",
@@ -729,7 +740,7 @@ public class LibMatrixCUDA {
 				default:
 					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
 				}
-				gCtx.cudaFreeHelper(instName, tmp);
+				gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
 				break;
 			}
 			case OP_MEAN:{
@@ -842,7 +853,7 @@ public class LibMatrixCUDA {
 					ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
 					matrixScalarOp(gCtx, instName, tmpRow, clen - 1, rlen, 1, out, divideOp);
-					gCtx.cudaFreeHelper(instName, tmpRow);
+					gCtx.cudaFreeHelper(instName, tmpRow, DMLScript.EAGER_CUDA_FREE);
 					break;
 				}
@@ -860,15 +871,15 @@ public class LibMatrixCUDA {
 					ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
 					matrixScalarOp(gCtx, instName, tmpCol, rlen - 1, 1, clen, out, divideOp);
-					gCtx.cudaFreeHelper(instName, tmpCol);
+					gCtx.cudaFreeHelper(instName, tmpCol, DMLScript.EAGER_CUDA_FREE);
 					break;
 				}
 				default:
 					throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
 				}
-				gCtx.cudaFreeHelper(instName, tmp);
-				gCtx.cudaFreeHelper(instName, tmp2);
+				gCtx.cudaFreeHelper(instName, tmp, DMLScript.EAGER_CUDA_FREE);
+				gCtx.cudaFreeHelper(instName, tmp2, DMLScript.EAGER_CUDA_FREE);
 				break;
 			}
 			case OP_MAXINDEX : {
@@ -923,7 +934,7 @@ public class LibMatrixCUDA {
 		int[] tmp = getKernelParamsForReduceAll(gCtx, n);
 		int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
 
-		Pointer tempOut = gCtx.allocate(instName, n * sizeOfDataType);
+		Pointer tempOut = gCtx.allocate(instName, n*sizeOfDataType);
 
 		long t1=0,t2=0;
@@ -944,7 +955,7 @@ public class LibMatrixCUDA {
 		}
 		double[] result = {-1f};
 		cudaSupportFunctions.deviceToHost(gCtx, tempOut, result, instName, false);
-		gCtx.cudaFreeHelper(instName, tempOut);
+		gCtx.cudaFreeHelper(instName, tempOut, DMLScript.EAGER_CUDA_FREE);
 
 		return result[0];
 	}
@@ -1699,6 +1710,36 @@ public class LibMatrixCUDA {
 			if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
 		}
 	}
+
+	/**
+	 * Computes C = t(A)
+	 * @param ec execution context
+	 * @param gCtx gpu context
+	 * @param instName name of the instruction
+	 * @param A pointer to the input matrix
+	 * @param C pointer to the output matrix
+	 * @param numRowsA number of rows of the input matrix
+	 * @param numColsA number of columns of the input matrix
+	 * @throws DMLRuntimeException if error
+	 */
+	public static void denseTranspose(ExecutionContext ec, GPUContext gCtx, String instName,
+			Pointer A, Pointer C, long numRowsA, long numColsA) throws DMLRuntimeException {
+		if (ec.getGPUContext(0) != gCtx)
+			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
+		if(LOG.isTraceEnabled()) {
+			LOG.trace("GPU : dense transpose" + ", GPUContext=" + gCtx);
+		}
+		long t0=0;
+		// Dense-Dense dgeam
+		int lda = toInt(numColsA);
+		int ldb = lda;
+		int m = toInt(numRowsA);
+		int n = lda;
+		int ldc = m;
+		if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
+		cudaSupportFunctions.cublasgeam(getCublasHandle(gCtx), CUBLAS_OP_T, CUBLAS_OP_T, m, n, one(), A, lda, zero(), A, ldb, C, ldc);
+		if (DMLScript.FINEGRAINED_STATISTICS)
+			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
+	}
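A note on the new denseTranspose: cublas geam computes C = alpha*op(A) + beta*op(B) in column-major layout. SystemML stores matrices row-major, and a row-major [m x n] matrix is exactly a column-major [n x m] matrix with lda = n, so requesting op = CUBLAS_OP_T with alpha = 1 and beta = 0 materializes t(A); since beta = 0 the second operand is never read, which is why A is passed twice. A minimal double-precision sketch against the JCublas2 binding (the method above goes through cudaSupportFunctions to stay precision-agnostic):

    import static jcuda.jcublas.JCublas2.cublasDgeam;
    import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
    import jcuda.Pointer;
    import jcuda.jcublas.cublasHandle;

    class GeamTranspose {
        // dC (n x m row-major, device) = t(dA) (m x n row-major, device)
        static void transpose(cublasHandle handle, Pointer dA, Pointer dC, int m, int n) {
            Pointer one  = Pointer.to(new double[] { 1.0 });
            Pointer zero = Pointer.to(new double[] { 0.0 });
            // Column-major view of row-major dA has leading dimension n; output has ldc = m.
            cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, one, dA, n, zero, dA, n, dC, m);
        }
    }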
 
 	//********************************************************************/
 
@@ -2389,7 +2430,7 @@ public class LibMatrixCUDA {
 		if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
 		ATobj.denseRowMajorToColumnMajor();
 		if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
-		Pointer A = ATobj.getJcudaDenseMatrixPtr();
+		Pointer A = ATobj.getDensePointer();
 
 		if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
 		GPUObject bTobj = (GPUObject) bobj.clone();
@@ -2399,7 +2440,7 @@ public class LibMatrixCUDA {
 		if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ROW_TO_COLUMN_MAJOR, System.nanoTime() - t0);
 
-		Pointer b = bTobj.getJcudaDenseMatrixPtr();
+		Pointer b = bTobj.getDensePointer();
 
 		// The following set of operations is done following the example in the cusolver documentation
 		// http://docs.nvidia.com/cuda/cusolver/#ormqr-example1
@@ -2447,12 +2488,12 @@ public class LibMatrixCUDA {
 		// TODO : Find a way to assign bTobj directly to the output and set the correct flags so as to not crash
 		// There is an avoidable copy happening here
 		MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in1.getNumColumns(), 1);
-		cudaMemcpy(out.getGPUObject(gCtx).getJcudaDenseMatrixPtr(), bTobj.getJcudaDenseMatrixPtr(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
+		cudaMemcpy(out.getGPUObject(gCtx).getDensePointer(), bTobj.getDensePointer(), n * 1 * sizeOfDataType, cudaMemcpyDeviceToDevice);
 
-		gCtx.cudaFreeHelper(instName, work);
-		gCtx.cudaFreeHelper(instName, tau);
-		ATobj.clearData();
-		bTobj.clearData();
+		gCtx.cudaFreeHelper(instName, work, DMLScript.EAGER_CUDA_FREE);
+		gCtx.cudaFreeHelper(instName, tau, DMLScript.EAGER_CUDA_FREE);
+		ATobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
+		bTobj.clearData(instName, DMLScript.EAGER_CUDA_FREE);
 
 		//debugPrintMatrix(b, n, 1);
 	}
@@ -2501,6 +2542,39 @@ public class LibMatrixCUDA {
 			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_SPARSE_OUTPUT, System.nanoTime() - t0);
 		return mb.getKey();
 	}
-
-
-}
+
+//	// Small 1-int pointers to avoid unnecessary allocation/deallocation
+//	private static Pointer _TMP_NNZ_ROW_PTR = null;
+//	private static Pointer _TMP_NNZ_PTR = null;
+	/**
+	 * Utility to compute number of non-zeroes on the GPU
+	 *
+	 * @param gCtx the associated GPUContext
+	 * @param densePtr device pointer to the dense matrix
+	 * @param length length of the dense pointer
+	 * @return the number of non-zeroes
+	 */
+	public static synchronized int computeNNZ(GPUContext gCtx, Pointer densePtr, int length) {
+		return (int) reduceAll(gCtx, null, "compute_nnz", densePtr, length);
+		// This is extremely slow
+//		cusparseMatDescr matDescr = CSRPointer.getDefaultCuSparseMatrixDescriptor();
+//		cusparseHandle cusparseHandle = gCtx.getCusparseHandle();
+//		if(_TMP_NNZ_ROW_PTR == null) {
+//			// As these are 4-byte pointers, using cudaMalloc directly so as not to include them in memory information.
+//			_TMP_NNZ_ROW_PTR = new Pointer();
+//			cudaMalloc(_TMP_NNZ_ROW_PTR, jcuda.Sizeof.INT);
+//			_TMP_NNZ_PTR = new Pointer();
+//			cudaMalloc(_TMP_NNZ_PTR, jcuda.Sizeof.INT);
+//			// _TMP_NNZ_ROW_PTR = gCtx.allocate(jcuda.Sizeof.INT);
+//			// _TMP_NNZ_PTR = gCtx.allocate(jcuda.Sizeof.INT);
+//		}
+//		// Output is in dense vector format, convert it to CSR
+//		LibMatrixCUDA.cudaSupportFunctions.cusparsennz(cusparseHandle, cusparseDirection.CUSPARSE_DIRECTION_ROW, 1, length, matDescr, densePtr, 1,
+//				_TMP_NNZ_ROW_PTR, _TMP_NNZ_PTR);
+//		int[] nnzC = { -1 };
+//		cudaMemcpy(Pointer.to(nnzC), _TMP_NNZ_PTR, jcuda.Sizeof.INT, cudaMemcpyDeviceToHost);
+//		return nnzC[0];
+	}
+
+
+}
\ No newline at end of file
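Most hunks in this file follow a single pattern: gCtx.cudaFreeHelper(ptr) and cudaFreeHelper(instName, ptr) become cudaFreeHelper(instName, ptr, DMLScript.EAGER_CUDA_FREE), so every call site now carries the instruction name for statistics and states explicitly whether the free is eager or lazy. A hedged sketch of what the flag selects between (the body here is illustrative, not the actual SystemML memory-manager implementation):

    import static jcuda.runtime.JCuda.cudaFree;
    import jcuda.Pointer;

    final class FreeHelperSketch {
        // Stand-in for GPUContext.cudaFreeHelper(String, Pointer, boolean).
        static void cudaFreeHelper(String instName, Pointer ptr, boolean eager) {
            if (eager) {
                cudaFree(ptr); // return the block to CUDA immediately
            } else {
                // defer: let the memory manager recycle the block lazily
            }
        }
    }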
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
index 26a4d2e..2bfb8f2 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java
@@ -215,7 +215,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				CSRPointer filterPointer = filter.getGPUObject(gCtx).getJcudaSparseMatrixPtr();
 				Pointer matmultOutputPointer = gCtx.allocate(instName, NKPQ*sizeOfDataType);
 				LibMatrixCuMatMult.sparseDenseMatMult(gCtx, instName, matmultOutputPointer, filterPointer, im2colPointer, K, CRS, CRS, NPQ, K, NPQ, false, false);
-				gCtx.cudaFreeHelper(instName, im2colPointer);
+				gCtx.cudaFreeHelper(instName, im2colPointer, DMLScript.EAGER_CUDA_FREE);
 
 				// Perform reorg_knpq a reorg operation of matmultOutputPointer matrix with dimensions [K, NPQ]
 				// and return a matrix dstPointer with dimensions [N, KPQ]
@@ -224,7 +224,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 						matmultOutputPointer, dstPointer, NKPQ, NPQ, KPQ, P*Q);
 				if (DMLScript.FINEGRAINED_STATISTICS)
 					GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_REORG_KNPQ_KERNEL, System.nanoTime() - t1);
-				gCtx.cudaFreeHelper(instName, matmultOutputPointer);
+				gCtx.cudaFreeHelper(instName, matmultOutputPointer, DMLScript.EAGER_CUDA_FREE);
 			}
 			else {
 				// Filter and output are accounted as dense in the memory estimation for conv2d
@@ -444,7 +444,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 				}
 
 				// Deallocate temporary array to hold one element of input
-				gCtx.cudaFreeHelper(tempdwPointer, true);
+				gCtx.cudaFreeHelper(instName, tempdwPointer, true);
 			}
 		}
 	}
@@ -772,7 +772,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			long t4=0;
 			if (DMLScript.FINEGRAINED_STATISTICS) t4 = System.nanoTime();
 			if(!isMaxPoolOutputProvided)
-				gCtx.cudaFreeHelper(instName, y);
+				gCtx.cudaFreeHelper(instName, y, DMLScript.EAGER_CUDA_FREE);
 			if (DMLScript.FINEGRAINED_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
 		}
 	}
@@ -818,17 +818,15 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			throw new DMLRuntimeException("GPU : Invalid internal state, the GPUContext set with the ExecutionContext is not the same used to run this LibMatrixCUDA function");
 		long N = in.getNumRows();
 		long CHW = in.getNumColumns();
-		MatrixObject output = ec.getMatrixObject(outputName);
-		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, in.getNumRows(), in.getNumColumns()); // Allocated the dense output matrix
+		Pointer dstData = getDenseOutputPointer(ec, gCtx, instName, outputName, in.getNumRows(), in.getNumColumns());
 		long t0=0;
 		if(N*CHW >= maxNumElementsOfCuDNNTensor) {
 			if(LOG.isTraceEnabled()) {
 				LOG.trace("GPU : relu custom kernel" + ", GPUContext=" + gCtx);
 			}
 			// Invokes relu(double* A, double* ret, int rlen, int clen)
-			if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
-			Pointer dstData = getDensePointerForCuDNN(gCtx, output, instName);
 			Pointer srcData = getDensePointerForCuDNN(gCtx, in, instName); // TODO: FIXME: Add sparse kernel support for relu
+			if (DMLScript.FINEGRAINED_STATISTICS) t0 = System.nanoTime();
 			getCudaKernels(gCtx).launchKernel("relu",
 					ExecutionConfig.getConfigForSimpleMatrixOperations(toInt(N), toInt(CHW)),
 					srcData, dstData, toInt(N), toInt(CHW));
@@ -838,11 +836,18 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 			cudnnTensorDescriptor tensorDescriptor = new cudnnTensorDescriptor();
 			cudnnCreateTensorDescriptor(tensorDescriptor);
 			cudnnSetTensor4dDescriptor(tensorDescriptor, CUDNN_TENSOR_NCHW, CUDNN_DATA_TYPE, toInt(N), 1, 1, toInt(CHW));
-			cudnnReLU(gCtx, instName, in, getDensePointerForCuDNN(gCtx, output, instName), tensorDescriptor);
+			cudnnReLU(gCtx, instName, in, dstData, tensorDescriptor);
 			cudnnDestroyTensorDescriptor(tensorDescriptor);
 		}
 	}
-
+
+	private static Pointer getDenseOutputPointer(ExecutionContext ec, GPUContext gCtx, String instName, String outputName,
+			long numRows, long numCols) throws DMLRuntimeException {
+		MatrixObject output = ec.getMatrixObject(outputName);
+		getDenseMatrixOutputForGPUInstruction(ec, instName, outputName, numRows, numCols); // Allocates the dense output matrix
+		return getDensePointerForCuDNN(gCtx, output, instName, toInt(numRows), toInt(numCols));
+	}
+
 	/**
 	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
 	 *
@@ -858,6 +863,33 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		}
 		return getDensePointer(gCtx, image, instName);
 	}
+
+	/**
+	 * Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
+	 *
+	 * @param gCtx a valid {@link GPUContext}
+	 * @param image input matrix object
+	 * @param instName name of the instruction
+	 * @param numRows expected number of rows
+	 * @param numCols expected number of columns
+	 * @return jcuda pointer
+	 * @throws DMLRuntimeException if error occurs while sparse to dense conversion
+	 */
+	public static Pointer getDensePointerForCuDNN(GPUContext gCtx, MatrixObject image, String instName, int numRows, int numCols) throws DMLRuntimeException {
+		long numElems = image.getNumRows()*image.getNumColumns();
+		if(image.getNumRows() != numRows || image.getNumColumns() != numCols) {
+			throw new DMLRuntimeException("Expected input of size:[" + numRows + ", " + numCols + "], but found [" + image.getNumRows() + ", " + image.getNumColumns() + "].");
+		}
+		else if(numElems > maxNumElementsOfCuDNNTensor) {
+			throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot exceed 2 giga-elements, but has " + numElems + " (i.e. [" + image.getNumRows() + " X " + image.getNumColumns() + "]). Hint: try reducing the mini-batch size.");
+		}
+		Pointer ptr = getDensePointer(gCtx, image, instName);
+		long sizeOfPtr = gCtx.getMemoryManager().getSizeAllocatedGPUPointer(ptr);
+		if(sizeOfPtr != numElems*sizeOfDataType) {
+			throw new DMLRuntimeException("Incorrect pointer: expected size:" + (numElems*sizeOfDataType) + ", but found " + sizeOfPtr);
+		}
+		return ptr;
+	}
 
 	/**
 	 * Convenience method for checking the status of CuDNN kernel.
@@ -868,4 +900,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA {
 		if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
 			throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
 	}
-}
+}
\ No newline at end of file
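The new getDensePointerForCuDNN overload front-loads three checks before any cuDNN call: the matrix dimensions match the caller's expectation, the element count stays under cuDNN's roughly 2^31-element tensor limit, and the allocated buffer size matches rows*cols*sizeof(type). A pure-Java sketch of the first two checks (the constant name here is illustrative, not SystemML's):

    public class CuDNNDimCheck {
        static final long MAX_CUDNN_TENSOR_ELEMS = Integer.MAX_VALUE; // ~2 giga-elements

        static void validate(long rows, long cols, int expRows, int expCols) {
            if (rows != expRows || cols != expCols)
                throw new IllegalArgumentException("Expected input of size:[" + expRows + ", "
                        + expCols + "], but found [" + rows + ", " + cols + "].");
            if (rows * cols > MAX_CUDNN_TENSOR_ELEMS)
                throw new IllegalArgumentException("CuDNN tensor of " + (rows * cols)
                        + " elements exceeds the limit. Hint: try reducing the mini-batch size.");
        }

        public static void main(String[] args) {
            validate(64, 3 * 224 * 224, 64, 3 * 224 * 224); // passes silently
        }
    }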
http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
index a50dbc3..432e79e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNConvolutionAlgorithm.java
@@ -97,7 +97,7 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseable {
 		cudnnDestroyConvolutionDescriptor(convDesc);
 		if(sizeInBytes != 0) {
 			try {
-				gCtx.cudaFreeHelper(instName, workSpace);
+				gCtx.cudaFreeHelper(instName, workSpace, DMLScript.EAGER_CUDA_FREE);
 			} catch (DMLRuntimeException e) {
 				throw new RuntimeException(e);
 			}
@@ -276,4 +276,4 @@ public class LibMatrixCuDNNConvolutionAlgorithm implements java.lang.AutoCloseable {
 		cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION, LibMatrixCUDA.CUDNN_DATA_TYPE);
 		return convDesc;
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
index f52da30..f3ce70d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNNInputRowFetcher.java
@@ -76,7 +76,7 @@ public class LibMatrixCuDNNInputRowFetcher extends LibMatrixCUDA implements java.lang.AutoCloseable {
 	@Override
 	public void close() {
 		try {
-			gCtx.cudaFreeHelper(outPointer, true);
+			gCtx.cudaFreeHelper(null, outPointer, true);
 		} catch (DMLRuntimeException e) {
 			throw new RuntimeException(e);
 		}

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
index f476dfe..60b2352 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuMatMult.java
@@ -285,7 +285,7 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 				toInt(outRLen), C, toInt(outCLen));
 		if (!DMLScript.EAGER_CUDA_FREE)
 			JCuda.cudaDeviceSynchronize();
-		gCtx.cudaFreeHelper(output, DMLScript.EAGER_CUDA_FREE);
+		gCtx.cudaFreeHelper(instName, output, DMLScript.EAGER_CUDA_FREE);
 		if (DMLScript.FINEGRAINED_STATISTICS)
 			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
@@ -466,4 +466,4 @@ public class LibMatrixCuMatMult extends LibMatrixCUDA {
 	private static int reverseCusparseOp(int trans) {
 		return trans == CUSPARSE_OPERATION_TRANSPOSE ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index 9fec026..5fd642e 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -315,7 +315,7 @@ public class LibMatrixNative
 		return ret2;
 	}
 
-	private static void fromFloatBuffer(FloatBuffer buff, double[] output) {
+	public static void fromFloatBuffer(FloatBuffer buff, double[] output) {
 		Arrays.parallelSetAll(output, i -> (double)buff.get(i) );
 	}
 }

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
index d5edf48..3bd101c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/SinglePrecisionCudaSupportFunctions.java
@@ -22,6 +22,11 @@ import static jcuda.runtime.JCuda.cudaMemcpy;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
 import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
 
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.FloatBuffer;
+import java.util.stream.IntStream;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.sysml.api.DMLScript;
@@ -163,7 +168,7 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
 	@Override
 	public void deviceToHost(GPUContext gCtx, Pointer src, double[] dest, String instName, boolean isEviction) {
-		long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null? System.nanoTime() : 0;
+		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
 		// We invoke transfer matrix from device to host in two cases:
 		// 1. During eviction of unlocked matrices
 		// 2. During acquireHostRead
@@ -177,40 +182,46 @@ public class SinglePrecisionCudaSupportFunctions implements CudaSupportFunctions
 			Pointer deviceDoubleData = gCtx.allocate(((long)dest.length)*Sizeof.DOUBLE);
 			LibMatrixCUDA.float2double(gCtx, src, deviceDoubleData, dest.length);
 			cudaMemcpy(Pointer.to(dest), deviceDoubleData, ((long)dest.length)*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
-			gCtx.cudaFreeHelper(deviceDoubleData);
+			gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
 		}
 		else {
 			LOG.debug("Potential OOM: Allocated additional space on host in deviceToHost");
-			float [] floatData = new float[dest.length];
+			FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*dest.length).order(ByteOrder.nativeOrder()).asFloatBuffer();
 			cudaMemcpy(Pointer.to(floatData), src, ((long)dest.length)*Sizeof.FLOAT, cudaMemcpyDeviceToHost);
-			for(int i = 0; i < dest.length; i++) {
-				dest[i] = floatData[i];
-			}
+			LibMatrixNative.fromFloatBuffer(floatData, dest);
+		}
+		if(DMLScript.STATISTICS) {
+			long totalTime = System.nanoTime() - t0;
+			GPUStatistics.cudaFloat2DoubleTime.add(totalTime);
+			GPUStatistics.cudaFloat2DoubleCount.add(1);
+			if(DMLScript.FINEGRAINED_STATISTICS && instName != null)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, totalTime);
 		}
-		if(DMLScript.FINEGRAINED_STATISTICS && instName != null)
-			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t1);
 	}
 
 	@Override
 	public void hostToDevice(GPUContext gCtx, double[] src, Pointer dest, String instName) {
 		LOG.debug("Potential OOM: Allocated additional space in hostToDevice");
 		// TODO: Perform conversion on GPU using double2float and float2double kernels
-		long t1 = DMLScript.FINEGRAINED_STATISTICS && instName != null? System.nanoTime() : 0;
+		long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
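The host-side fallback paths above replace a float[] staging array plus a per-element copy loop with a direct, native-ordered FloatBuffer that cudaMemcpy can target without an extra JVM-side copy, then widen in parallel via the now-public LibMatrixNative.fromFloatBuffer. A self-contained demonstration of that conversion (no GPU required):

    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.nio.FloatBuffer;
    import java.util.Arrays;

    public class FloatBufferConversionDemo {
        // Same body as LibMatrixNative.fromFloatBuffer: widen floats to doubles in parallel.
        static void fromFloatBuffer(FloatBuffer buff, double[] output) {
            Arrays.parallelSetAll(output, i -> (double) buff.get(i));
        }

        public static void main(String[] args) {
            // Direct + native byte order, as required for a cudaMemcpy destination.
            FloatBuffer src = ByteBuffer.allocateDirect(4 * Float.BYTES)
                    .order(ByteOrder.nativeOrder()).asFloatBuffer();
            for (int i = 0; i < 4; i++) src.put(i, i + 0.5f);
            double[] dst = new double[4];
            fromFloatBuffer(src, dst);
            System.out.println(Arrays.toString(dst)); // [0.5, 1.5, 2.5, 3.5]
        }
    }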
 		if(PERFORM_CONVERSION_ON_DEVICE) {
 			Pointer deviceDoubleData = gCtx.allocate(((long)src.length)*Sizeof.DOUBLE);
 			cudaMemcpy(deviceDoubleData, Pointer.to(src), ((long)src.length)*Sizeof.DOUBLE, cudaMemcpyHostToDevice);
 			LibMatrixCUDA.double2float(gCtx, deviceDoubleData, dest, src.length);
-			gCtx.cudaFreeHelper(deviceDoubleData);
+			gCtx.cudaFreeHelper(instName, deviceDoubleData, DMLScript.EAGER_CUDA_FREE);
 		}
 		else {
-			float [] floatData = new float[src.length];
-			for(int i = 0; i < src.length; i++) {
-				floatData[i] = (float) src[i];
-			}
+			FloatBuffer floatData = ByteBuffer.allocateDirect(Sizeof.FLOAT*src.length).order(ByteOrder.nativeOrder()).asFloatBuffer();
+			IntStream.range(0, src.length).parallel().forEach(i -> floatData.put(i, (float)src[i]));
 			cudaMemcpy(dest, Pointer.to(floatData), ((long)src.length)*Sizeof.FLOAT, cudaMemcpyHostToDevice);
 		}
-		if(DMLScript.FINEGRAINED_STATISTICS && instName != null)
-			GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, System.nanoTime() - t1);
+		if(DMLScript.STATISTICS) {
+			long totalTime = System.nanoTime() - t0;
+			GPUStatistics.cudaDouble2FloatTime.add(totalTime);
+			GPUStatistics.cudaDouble2FloatCount.add(1);
+			if(DMLScript.FINEGRAINED_STATISTICS && instName != null)
+				GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_HOST_TO_DEVICE, totalTime);
+		}
 	}
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/4d321667/src/main/java/org/apache/sysml/utils/GPUStatistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/GPUStatistics.java b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
index d12f4dd..f7bee4f 100644
--- a/src/main/java/org/apache/sysml/utils/GPUStatistics.java
+++ b/src/main/java/org/apache/sysml/utils/GPUStatistics.java
@@ -52,6 +52,10 @@ public class GPUStatistics {
 	public static LongAdder cudaToDevTime = new LongAdder(); // time spent in copying data from host (CPU) to device (GPU) memory
 	public static LongAdder cudaFromDevTime = new LongAdder(); // time spent in copying data from device to host
 	public static LongAdder cudaEvictTime = new LongAdder(); // time spent in eviction
+	public static LongAdder cudaEvictMallocTime = new LongAdder(); // time spent in malloc during eviction
+	public static LongAdder cudaFloat2DoubleTime = new LongAdder(); // time spent in converting float to double during eviction
+	public static LongAdder cudaDouble2FloatTime = new LongAdder(); // time spent in converting double to float during eviction
+	public static LongAdder cudaEvictMemcpyTime = new LongAdder(); // time spent in cudaMemcpy during eviction
 	public static LongAdder cudaForcedClearLazyFreedEvictTime = new LongAdder(); // time spent in forced lazy eviction
 	public static LongAdder cudaForcedClearUnpinnedEvictTime = new LongAdder(); // time spent in forced unpinned eviction
 	public static LongAdder cudaAllocCount = new LongAdder();
@@ -60,6 +64,9 @@ public class GPUStatistics {
 	public static LongAdder cudaToDevCount = new LongAdder();
 	public static LongAdder cudaFromDevCount = new LongAdder();
 	public static LongAdder cudaEvictionCount = new LongAdder();
+	public static LongAdder cudaFloat2DoubleCount = new LongAdder();
+	public static LongAdder cudaDouble2FloatCount = new LongAdder();
+	public static LongAdder cudaEvictionMallocCount = new LongAdder();
 
 	// Per instruction miscellaneous timers.
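The new conversion counters follow the same LongAdder pattern as the existing fields: hot paths on concurrent GPU instructions call add() without contending on a single atomic, and the reporting side sums once with longValue(). A minimal, self-contained sketch:

    import java.util.concurrent.atomic.LongAdder;

    public class ConversionStatsDemo {
        static final LongAdder cudaFloat2DoubleTime  = new LongAdder();
        static final LongAdder cudaFloat2DoubleCount = new LongAdder();

        public static void main(String[] args) {
            long t0 = System.nanoTime();
            // ... a float -> double transfer would run here ...
            cudaFloat2DoubleTime.add(System.nanoTime() - t0);
            cudaFloat2DoubleCount.add(1);
            System.out.printf("fromDev(f2d): %.3f sec (%d ops)%n",
                    cudaFloat2DoubleTime.longValue() * 1e-9, cudaFloat2DoubleCount.longValue());
        }
    }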
 	// Used to record events in a CP Heavy Hitter instruction and
@@ -88,6 +95,11 @@ public class GPUStatistics {
 		cudaToDevTime.reset();
 		cudaFromDevTime.reset();
 		cudaEvictTime.reset();
+		cudaEvictMallocTime.reset();
+		cudaFloat2DoubleTime.reset();
+		cudaDouble2FloatTime.reset();
+		cudaFloat2DoubleCount.reset();
+		cudaDouble2FloatCount.reset();
 		cudaForcedClearLazyFreedEvictTime.reset();
 		cudaForcedClearUnpinnedEvictTime.reset();
 		cudaAllocCount.reset();
@@ -95,6 +107,7 @@ public class GPUStatistics {
 		cudaToDevCount.reset();
 		cudaFromDevCount.reset();
 		cudaEvictionCount.reset();
+		cudaEvictionMallocCount.reset();
 
 		resetMiscTimers();
 	}
@@ -193,21 +206,27 @@ public class GPUStatistics {
 		sb.append("CUDA/CuLibraries init time:\t" + String.format("%.3f", cudaInitTime*1e-9) + "/"
 				+ String.format("%.3f", cudaLibrariesInitTime*1e-9) + " sec.\n");
 		sb.append("Number of executed GPU inst:\t" + getNoOfExecutedGPUInst() + ".\n");
-		sb.append("GPU mem tx time (alloc/dealloc/set0/toDev/fromDev/evict):\t"
+		sb.append("GPU mem tx time (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
 				+ String.format("%.3f", cudaAllocTime.longValue()*1e-9) + "/"
 				+ String.format("%.3f", cudaDeAllocTime.longValue()*1e-9) + "/"
 				+ String.format("%.3f", cudaMemSet0Time.longValue()*1e-9) + "/"
-				+ String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "/"
-				+ String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "/"
-				+ String.format("%.3f", cudaEvictTime.longValue()*1e-9) + " sec.\n");
-		sb.append("GPU mem tx count (alloc/dealloc/set0/toDev/fromDev/evict):\t"
+				+ String.format("%.3f", cudaToDevTime.longValue()*1e-9) + "("
+				+ String.format("%.3f", cudaDouble2FloatTime.longValue()*1e-9) + ")/"
+				+ String.format("%.3f", cudaFromDevTime.longValue()*1e-9) + "("
+				+ String.format("%.3f", cudaFloat2DoubleTime.longValue()*1e-9) + ")/"
+				+ String.format("%.3f", cudaEvictTime.longValue()*1e-9) + "("
+				+ String.format("%.3f", cudaEvictMallocTime.longValue()*1e-9) + ") sec.\n");
+		sb.append("GPU mem tx count (alloc/dealloc/set0/toDev(d2f)/fromDev(f2d)/evict(alloc)):\t"
 				+ cudaAllocCount.longValue() + "/"
 				+ cudaDeAllocCount.longValue() + "/"
 				+ cudaMemSet0Count.longValue() + "/"
 				+ cudaSparseConversionCount.longValue() + "/"
-				+ cudaToDevCount.longValue() + "/"
-				+ cudaFromDevCount.longValue() + "/"
-				+ cudaEvictionCount.longValue() + ".\n");
+				+ cudaToDevCount.longValue() + "("
+				+ cudaDouble2FloatCount.longValue() + ")/"
+				+ cudaFromDevCount.longValue() + "("
+				+ cudaFloat2DoubleCount.longValue() + ")/"
+				+ cudaEvictionCount.longValue() + "("
+				+ cudaEvictionMallocCount.longValue() + ").\n");
 		sb.append("GPU conversion time (sparseConv/sp2dense/dense2sp):\t"
 				+ String.format("%.3f", cudaSparseConversionTime.longValue()*1e-9) + "/"
 				+ String.format("%.3f", cudaSparseToDenseTime.longValue()*1e-9) + "/"
@@ -221,4 +240,4 @@ public class GPUStatistics {
 	}
 
-}
+}
\ No newline at end of file
