[SYSTEMML-540] Added a rewrite to support a common tensor operation (sum over channels)
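For illustration, a minimal DML sketch of the pattern this rewrite targets is shown below; the sizes N, C, and HW are placeholders chosen for the example, and channel_sums itself is an internal instruction rather than a user-facing DML builtin.

# A: one image per row in NCHW layout, i.e. C channels of HW pixels per row
N = 16
C = 3
HW = 64
A = rand(rows=N, cols=C*HW)
# Pattern recognized by the rewrite and replaced by a single channel_sums operator:
out = rowSums(matrix(colSums(A), rows=C, cols=HW))   # shape (C, 1)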
- Added a rewrite to convert out = rowSums(matrix(colSums(A), rows=C, cols=HW))
  into out = channel_sums(A) when nrow(A) > 1 and the execution type is CP or GPU.
- This avoids unnecessary intermediates and the GPU-CP-GPU transfer required for
  the reshape, saving roughly 150 seconds on the sentence CNN over 200 epochs.
- When we move to a newer CuDNN version, we can replace the custom channel_sums
  kernel with the (possibly more optimized) CuDNN reduce-tensor kernel.
- Added the corresponding CPU and GPU tests.
- Updated T_MAX(val) to MAX(). Interestingly, nvcc was already smart enough to
  optimize away the unused parameter, so the generated PTX remained unchanged.

Closes #693.

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d916ba5b
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d916ba5b
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d916ba5b

Branch: refs/heads/master
Commit: d916ba5bd8ceec591a04f4d16c6d24f3985e3e4f
Parents: 2896f33
Author: Niketan Pansare <[email protected]>
Authored: Mon Oct 30 10:32:53 2017 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Mon Oct 30 10:32:53 2017 -0700

----------------------------------------------------------------------
 src/main/cpp/kernels/Makefile                   |   2 +-
 src/main/cpp/kernels/SystemML.cu                | 308 ++++++++++---------
 .../java/org/apache/sysml/hops/AggUnaryOp.java  |  95 ++++--
 .../apache/sysml/lops/ConvolutionTransform.java |  42 ++-
 .../instructions/CPInstructionParser.java       |   1 +
 .../instructions/GPUInstructionParser.java      |   1 +
 .../cp/ConvolutionCPInstruction.java            |  86 ++++++
 .../gpu/ConvolutionGPUInstruction.java          |  47 +++
 .../spark/QuantilePickSPInstruction.java        |   2 +-
 .../runtime/matrix/data/LibMatrixCUDA.java      |  31 ++
 .../runtime/matrix/data/LibMatrixCuDNN.java     |   4 +-
 .../sysml/test/gpu/AggregateUnaryOpTests.java   |  31 ++
 .../apache/sysml/test/gpu/UnaryOpTestsBase.java |   8 +-
 .../functions/tensor/ChannelSumTest.java        | 146 +++++++++
 .../scripts/functions/tensor/ChannelSumTest.R   |  39 +++
 .../scripts/functions/tensor/ChannelSumTest.dml |  35 +++
 16 files changed, 690 insertions(+), 188 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/cpp/kernels/Makefile
----------------------------------------------------------------------
diff --git a/src/main/cpp/kernels/Makefile b/src/main/cpp/kernels/Makefile
index 5feae69..ec10317 100644
--- a/src/main/cpp/kernels/Makefile
+++ b/src/main/cpp/kernels/Makefile
@@ -16,7 +16,7 @@ # under the License.
NVCC=nvcc -CUDAFLAGS= -ptx -c -arch=sm_30 +CUDAFLAGS= -ptx -c -arch=sm_30 --std c++11 # Use these flags for precise math #CUDAFLAGS= -ptx -c -arch=sm_30 -ftz=false -prec-div=true -prec-sqrt=true http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/cpp/kernels/SystemML.cu ---------------------------------------------------------------------- diff --git a/src/main/cpp/kernels/SystemML.cu b/src/main/cpp/kernels/SystemML.cu index d176f8f..ade2dd1 100644 --- a/src/main/cpp/kernels/SystemML.cu +++ b/src/main/cpp/kernels/SystemML.cu @@ -20,7 +20,7 @@ /********************************** When updating a kernel or adding a new one, please compile the ptx file and commit it: -nvcc -ptx -arch=sm_30 SystemML.cu +nvcc -ptx -arch=sm_30 --std c++11 SystemML.cu ***********************************/ #include <cfloat> @@ -29,7 +29,8 @@ nvcc -ptx -arch=sm_30 SystemML.cu extern "C" __global__ void double2float_f(double *A, float *ret, int N) { int tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < N) { - // TODO: Use __double2float_rd or __double2float_rn or __double2float_ru or __double2float_rz after + // TODO: Use __double2float_rd or __double2float_rn or __double2float_ru or + // __double2float_rz after ret[tid] = (float)A[tid]; } } @@ -84,15 +85,14 @@ __device__ void slice_sparse_dense_row(T *inVal, int *inRowPtr, int *colInd, * * int size = inRowPtr[rowIndex+1] - inRowPtr[rowIndex]; * double numThreads = (double)min(size, MAX_NUM_THREADS_CHILD_KERNEL); - * slice_sparse_dense_row_helper<<< ceil(numThreads/ -*MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>>(inVal, inRowPtr, -*colInd, ret, -* rl, ru, cl, cu, retClen, inRowPtr[rowIndex], -*inRowPtr[rowIndex+1], index); -* -* Two-step compilation and linking process in JCudaKernels's constructor: -* cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY, -*"/usr/local/cuda/lib64/libcudadevrt.a", jitOptions); + * slice_sparse_dense_row_helper + * <<< ceil(numThreads/MAX_NUM_THREADS_CHILD_KERNEL), MAX_NUM_THREADS_CHILD_KERNEL>>> + * (inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen, inRowPtr[rowIndex], + * inRowPtr[rowIndex+1], index); + * + * Two-step compilation and linking process in JCudaKernels's constructor: + * cuLinkAddFile(linkState, CUjitInputType.CU_JIT_INPUT_LIBRARY, + * "/usr/local/cuda/lib64/libcudadevrt.a", jitOptions); */ // Iterate over elements of the row 'rowIndex'. 
for (int i = inRowPtr[rowIndex]; i < inRowPtr[rowIndex + 1]; i++) { @@ -104,17 +104,18 @@ __device__ void slice_sparse_dense_row(T *inVal, int *inRowPtr, int *colInd, } } -extern "C" __global__ void slice_sparse_dense_row_d(double *inVal, int *inRowPtr, - int *colInd, double *ret, - int rl, int ru, int cl, - int cu, int retClen) { +extern "C" __global__ void slice_sparse_dense_row_d(double *inVal, + int *inRowPtr, int *colInd, + double *ret, int rl, int ru, + int cl, int cu, + int retClen) { slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen); } extern "C" __global__ void slice_sparse_dense_row_f(float *inVal, int *inRowPtr, - int *colInd, float *ret, - int rl, int ru, int cl, - int cu, int retClen) { + int *colInd, float *ret, + int rl, int ru, int cl, + int cu, int retClen) { slice_sparse_dense_row(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen); } @@ -153,17 +154,18 @@ __device__ void slice_sparse_dense_nnz(T *inVal, int *inRowPtr, int *colInd, } } -extern "C" __global__ void slice_sparse_dense_nnz_d(double *inVal, int *inRowPtr, - int *colInd, double *ret, - int rl, int ru, int cl, - int cu, int retClen) { +extern "C" __global__ void slice_sparse_dense_nnz_d(double *inVal, + int *inRowPtr, int *colInd, + double *ret, int rl, int ru, + int cl, int cu, + int retClen) { slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen); } extern "C" __global__ void slice_sparse_dense_nnz_f(float *inVal, int *inRowPtr, - int *colInd, float *ret, - int rl, int ru, int cl, - int cu, int retClen) { + int *colInd, float *ret, + int rl, int ru, int cl, + int cu, int retClen) { slice_sparse_dense_nnz(inVal, inRowPtr, colInd, ret, rl, ru, cl, cu, retClen); } @@ -194,16 +196,16 @@ __device__ void slice_dense_dense(T *in, T *ret, int rl, int ru, int cl, int cu, } extern "C" __global__ void slice_dense_dense_d(double *in, double *ret, int rl, - int ru, int cl, int cu, - int inClen, int retRlen, - int retClen) { + int ru, int cl, int cu, + int inClen, int retRlen, + int retClen) { slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen); } extern "C" __global__ void slice_dense_dense_f(float *in, float *ret, int rl, - int ru, int cl, int cu, - int inClen, int retRlen, - int retClen) { + int ru, int cl, int cu, + int inClen, int retRlen, + int retClen) { slice_dense_dense(in, ret, rl, ru, cl, cu, inClen, retRlen, retClen); } @@ -236,15 +238,15 @@ extern "C" __global__ void copy_u2l_dense_f(float *ret, int dim, int N) { // Use this method in templates to fetch the maximum value for a given datatype template <typename T> -__forceinline__ __device__ T T_MAX(T x) { - return (T)DBL_MAX; +__forceinline__ __device__ T MAX() { + return T(); } template <> -__forceinline__ __device__ float T_MAX(float x) { +__forceinline__ __device__ float MAX <float>() { return FLT_MAX; } template <> -__forceinline__ __device__ double T_MAX(double x) { +__forceinline__ __device__ double MAX <double>() { return DBL_MAX; } @@ -311,7 +313,7 @@ __forceinline__ __device__ T binaryOp(T x, T y, int op) { } } default: - return T_MAX(x); + return MAX<T>(); } } @@ -342,7 +344,8 @@ extern "C" __global__ void relu_f(float *A, float *ret, int rlen, int clen) { } /** - * This method computes the backpropagation errors for previous layer of relu operation + * This method computes the backpropagation errors for previous layer of relu + * operation * * @param X input activation array allocated on the GPU * @param dout errors from previous layer @@ -361,12 +364,12 @@ __device__ void 
relu_backward(T *X, T *dout, T *ret, int rlen, int clen) { } extern "C" __global__ void relu_backward_d(double *X, double *dout, double *ret, - int rlen, int clen) { + int rlen, int clen) { relu_backward(X, dout, ret, rlen, clen); } extern "C" __global__ void relu_backward_f(float *X, float *dout, float *ret, - int rlen, int clen) { + int rlen, int clen) { relu_backward(X, dout, ret, rlen, clen); } @@ -389,12 +392,12 @@ __device__ void inplace_add(T *input, T *ret, int rlen, int clen) { } extern "C" __global__ void inplace_add_d(double *input, double *ret, int rlen, - int clen) { + int clen) { inplace_add(input, ret, rlen, clen); } extern "C" __global__ void inplace_add_f(float *input, float *ret, int rlen, - int clen) { + int clen) { inplace_add(input, ret, rlen, clen); } @@ -416,12 +419,12 @@ __device__ void bias_add(T *input, T *bias, T *ret, int rlen, int clen, } extern "C" __global__ void bias_add_d(double *input, double *bias, double *ret, - int rlen, int clen, int PQ) { + int rlen, int clen, int PQ) { bias_add(input, bias, ret, rlen, clen, PQ); } extern "C" __global__ void bias_add_f(float *input, float *bias, float *ret, - int rlen, int clen, int PQ) { + int rlen, int clen, int PQ) { bias_add(input, bias, ret, rlen, clen, PQ); } @@ -443,16 +446,16 @@ __device__ void daxpy_matrix_vector(T *A, T *B, double alpha, T *ret, int rlenA, } extern "C" __global__ void daxpy_matrix_vector_d(double *A, double *B, - double alpha, double *ret, - int rlenA, int clenA, int rlenB, - int clenB) { + double alpha, double *ret, + int rlenA, int clenA, + int rlenB, int clenB) { daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB); } extern "C" __global__ void daxpy_matrix_vector_f(float *A, float *B, - double alpha, float *ret, - int rlenA, int clenA, int rlenB, - int clenB) { + double alpha, float *ret, + int rlenA, int clenA, + int rlenB, int clenB) { daxpy_matrix_vector(A, B, alpha, ret, rlenA, clenA, rlenB, clenB); } @@ -471,13 +474,14 @@ __device__ void bias_multiply(T *input, T *bias, T *ret, int rlen, int clen, } extern "C" __global__ void bias_multiply_d(double *input, double *bias, - double *ret, int rlen, int clen, - int PQ) { + double *ret, int rlen, int clen, + int PQ) { bias_multiply(input, bias, ret, rlen, clen, PQ); } -extern "C" __global__ void bias_multiply_f(float *input, float *bias, float *ret, - int rlen, int clen, int PQ) { +extern "C" __global__ void bias_multiply_f(float *input, float *bias, + float *ret, int rlen, int clen, + int PQ) { bias_multiply(input, bias, ret, rlen, clen, PQ); } @@ -563,14 +567,14 @@ __device__ void matrix_scalar_op(T *A, T scalar, T *C, int size, int op, } extern "C" __global__ void matrix_scalar_op_d(double *A, double scalar, - double *C, int size, int op, - int isLeftScalar) { + double *C, int size, int op, + int isLeftScalar) { matrix_scalar_op(A, scalar, C, size, op, isLeftScalar); } extern "C" __global__ void matrix_scalar_op_f(float *A, double scalar, float *C, - int size, int op, - int isLeftScalar) { + int size, int op, + int isLeftScalar) { matrix_scalar_op(A, (float)scalar, C, size, op, isLeftScalar); } @@ -635,12 +639,12 @@ __device__ void cbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB, } extern "C" __global__ void cbind_d(double *A, double *B, double *C, int rowsA, - int colsA, int rowsB, int colsB) { + int colsA, int rowsB, int colsB) { cbind(A, B, C, rowsA, colsA, rowsB, colsB); } extern "C" __global__ void cbind_f(float *A, float *B, float *C, int rowsA, - int colsA, int rowsB, int colsB) { + int colsA, int rowsB, int 
colsB) { cbind(A, B, C, rowsA, colsA, rowsB, colsB); } @@ -684,12 +688,12 @@ __device__ void rbind(T *A, T *B, T *C, int rowsA, int colsA, int rowsB, } extern "C" __global__ void rbind_d(double *A, double *B, double *C, int rowsA, - int colsA, int rowsB, int colsB) { + int colsA, int rowsB, int colsB) { rbind(A, B, C, rowsA, colsA, rowsB, colsB); } extern "C" __global__ void rbind_f(float *A, float *B, float *C, int rowsA, - int colsA, int rowsB, int colsB) { + int colsA, int rowsB, int colsB) { rbind(A, B, C, rowsA, colsA, rowsB, colsB); } @@ -828,15 +832,15 @@ template <typename ReductionOp, typename AssignmentOp, typename T> __device__ void reduce_row( T *g_idata, ///< input data stored in device memory (of size rows*cols) T *g_odata, ///< output/temporary array store in device memory (of size - ///rows*cols) + /// rows*cols) unsigned int rows, ///< rows in input and temporary/output arrays unsigned int cols, ///< columns in input and temporary/output arrays ReductionOp reduction_op, ///< Reduction operation to perform (functor object) AssignmentOp assignment_op, ///< Operation to perform before assigning this - ///to its final location in global memory for - ///each row - T initialValue) { ///< initial value for the reduction variable + /// to its final location in global memory for + /// each row + T initialValue) { ///< initial value for the reduction variable // extern __shared__ T sdata[]; extern __shared__ __align__(sizeof(T)) unsigned char my_sdata[]; T *sdata = reinterpret_cast<T *>(my_sdata); @@ -935,15 +939,15 @@ template <typename ReductionOp, typename AssignmentOp, typename T> __device__ void reduce_col( T *g_idata, ///< input data stored in device memory (of size rows*cols) T *g_odata, ///< output/temporary array store in device memory (of size - ///rows*cols) + /// rows*cols) unsigned int rows, ///< rows in input and temporary/output arrays unsigned int cols, ///< columns in input and temporary/output arrays ReductionOp reduction_op, ///< Reduction operation to perform (functor object) AssignmentOp assignment_op, ///< Operation to perform before assigning this - ///to its final location in global memory for - ///each column - T initialValue) ///< initial value for the reduction variable + /// to its final location in global memory for + /// each column + T initialValue) ///< initial value for the reduction variable { unsigned int global_tid = blockIdx.x * blockDim.x + threadIdx.x; if (global_tid >= cols) { @@ -990,12 +994,12 @@ __device__ void reduce_sum(T *g_idata, T *g_odata, unsigned int n) { } extern "C" __global__ void reduce_sum_d(double *g_idata, double *g_odata, - unsigned int n) { + unsigned int n) { reduce_sum(g_idata, g_odata, n); } extern "C" __global__ void reduce_sum_f(float *g_idata, float *g_odata, - unsigned int n) { + unsigned int n) { reduce_sum(g_idata, g_odata, n); } @@ -1016,14 +1020,14 @@ __device__ void reduce_row_sum(T *g_idata, T *g_odata, unsigned int rows, } extern "C" __global__ void reduce_row_sum_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_sum(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_row_sum_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_sum(g_idata, g_odata, rows, cols); } @@ -1044,14 +1048,14 @@ __device__ void reduce_col_sum(T *g_idata, T *g_odata, unsigned int rows, } extern "C" __global__ void reduce_col_sum_d(double *g_idata, double 
*g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_sum(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_col_sum_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_sum(g_idata, g_odata, rows, cols); } @@ -1063,12 +1067,13 @@ struct MaxOp { __device__ __forceinline__ T operator()(T a, T b) const { return fmax(a, b); } }; -template<> +template <> struct MaxOp<float> { - __device__ __forceinline__ float operator()(float a, float b) const { return fmaxf(a, b); } + __device__ __forceinline__ float operator()(float a, float b) const { + return fmaxf(a, b); + } }; - /** * Do a max over all elements of an array/matrix * @param g_idata input data stored in device memory (of size n) @@ -1078,16 +1083,16 @@ struct MaxOp<float> { template <typename T> __device__ void reduce_max(T *g_idata, T *g_odata, unsigned int n) { MaxOp<T> op; - reduce<MaxOp<T>, T>(g_idata, g_odata, n, op, -T_MAX(g_idata[0])); + reduce<MaxOp<T>, T>(g_idata, g_odata, n, op, -MAX<T>()); } extern "C" __global__ void reduce_max_d(double *g_idata, double *g_odata, - unsigned int n) { + unsigned int n) { reduce_max(g_idata, g_odata, n); } extern "C" __global__ void reduce_max_f(float *g_idata, float *g_odata, - unsigned int n) { + unsigned int n) { reduce_max(g_idata, g_odata, n); } @@ -1104,18 +1109,18 @@ __device__ void reduce_row_max(T *g_idata, T *g_odata, unsigned int rows, MaxOp<T> op; IdentityOp<T> aop; reduce_row<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop, - -T_MAX(g_idata[0])); + -MAX<T>()); } extern "C" __global__ void reduce_row_max_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_max(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_row_max_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_max(g_idata, g_odata, rows, cols); } @@ -1132,18 +1137,18 @@ __device__ void reduce_col_max(T *g_idata, T *g_odata, unsigned int rows, MaxOp<T> op; IdentityOp<T> aop; reduce_col<MaxOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop, - (T)-T_MAX(g_idata[0])); + -MAX<T>()); } extern "C" __global__ void reduce_col_max_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_max(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_col_max_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_max(g_idata, g_odata, rows, cols); } @@ -1164,16 +1169,16 @@ struct MinOp { template <typename T> __device__ void reduce_min(T *g_idata, T *g_odata, unsigned int n) { MinOp<T> op; - reduce<MinOp<T>, T>(g_idata, g_odata, n, op, T_MAX(g_idata[0])); + reduce<MinOp<T>, T>(g_idata, g_odata, n, op, MAX<T>()); } extern "C" __global__ void reduce_min_d(double *g_idata, double *g_odata, - unsigned int n) { + unsigned int n) { reduce_min(g_idata, g_odata, n); } extern "C" __global__ void reduce_min_f(float *g_idata, float *g_odata, - unsigned int n) { + unsigned int n) { reduce_min(g_idata, g_odata, n); } @@ -1190,18 +1195,18 @@ __device__ void reduce_row_min(T *g_idata, T *g_odata, unsigned int rows, MinOp<T> op; IdentityOp<T> aop; reduce_row<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop, - 
T_MAX(g_idata[0])); + MAX<T>()); } extern "C" __global__ void reduce_row_min_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_min(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_row_min_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_min(g_idata, g_odata, rows, cols); } @@ -1218,18 +1223,18 @@ __device__ void reduce_col_min(T *g_idata, T *g_odata, unsigned int rows, MinOp<T> op; IdentityOp<T> aop; reduce_col<MinOp<T>, IdentityOp<T>, T>(g_idata, g_odata, rows, cols, op, aop, - T_MAX(g_idata[0])); + MAX<T>()); } extern "C" __global__ void reduce_col_min_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_min(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_col_min_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_min(g_idata, g_odata, rows, cols); } @@ -1254,12 +1259,12 @@ __device__ void reduce_prod(T *g_idata, T *g_odata, unsigned int n) { } extern "C" __global__ void reduce_prod_d(double *g_idata, double *g_odata, - unsigned int n) { + unsigned int n) { reduce_prod(g_idata, g_odata, n); } extern "C" __global__ void reduce_prod_f(float *g_idata, float *g_odata, - unsigned int n) { + unsigned int n) { reduce_prod(g_idata, g_odata, n); } @@ -1293,14 +1298,14 @@ __device__ void reduce_row_mean(T *g_idata, T *g_odata, unsigned int rows, } extern "C" __global__ void reduce_row_mean_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_mean(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_row_mean_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_row_mean(g_idata, g_odata, rows, cols); } @@ -1321,14 +1326,14 @@ __device__ void reduce_col_mean(T *g_idata, T *g_odata, unsigned int rows, } extern "C" __global__ void reduce_col_mean_d(double *g_idata, double *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_mean(g_idata, g_odata, rows, cols); } extern "C" __global__ void reduce_col_mean_f(float *g_idata, float *g_odata, - unsigned int rows, - unsigned int cols) { + unsigned int rows, + unsigned int cols) { reduce_col_mean(g_idata, g_odata, rows, cols); } @@ -1347,7 +1352,7 @@ __device__ void matrix_exp(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_exp_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_exp(A, C, size); } @@ -1370,11 +1375,12 @@ __device__ void matrix_sqrt(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_sqrt_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_sqrt(A, C, size); } -extern "C" __global__ void matrix_sqrt_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_sqrt_f(float *A, float *C, + unsigned int size) { matrix_sqrt(A, C, size); } @@ -1393,12 +1399,12 @@ __device__ void matrix_round(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_round_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_round(A, C, size); } extern "C" __global__ void matrix_round_f(float *A, float *C, - unsigned int size) { + unsigned int 
size) { matrix_round(A, C, size); } @@ -1417,7 +1423,7 @@ __device__ void matrix_abs(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_abs_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_abs(A, C, size); } @@ -1440,7 +1446,7 @@ __device__ void matrix_log(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_log_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_log(A, C, size); } @@ -1463,12 +1469,12 @@ __device__ void matrix_floor(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_floor_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_floor(A, C, size); } extern "C" __global__ void matrix_floor_f(float *A, float *C, - unsigned int size) { + unsigned int size) { matrix_floor(A, C, size); } @@ -1487,11 +1493,12 @@ __device__ void matrix_ceil(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_ceil_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_ceil(A, C, size); } -extern "C" __global__ void matrix_ceil_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_ceil_f(float *A, float *C, + unsigned int size) { matrix_ceil(A, C, size); } @@ -1510,7 +1517,7 @@ __device__ void matrix_sin(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_sin_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_sin(A, C, size); } @@ -1533,11 +1540,12 @@ __device__ void matrix_sinh(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_sinh_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_sinh(A, C, size); } -extern "C" __global__ void matrix_sinh_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_sinh_f(float *A, float *C, + unsigned int size) { matrix_sinh(A, C, size); } @@ -1556,7 +1564,7 @@ __device__ void matrix_cos(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_cos_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_cos(A, C, size); } @@ -1579,11 +1587,12 @@ __device__ void matrix_cosh(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_cosh_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_cosh(A, C, size); } -extern "C" __global__ void matrix_cosh_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_cosh_f(float *A, float *C, + unsigned int size) { matrix_cosh(A, C, size); } @@ -1602,7 +1611,7 @@ __device__ void matrix_tan(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_tan_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_tan(A, C, size); } @@ -1625,11 +1634,12 @@ __device__ void matrix_tanh(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_tanh_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_tanh(A, C, size); } -extern "C" __global__ void matrix_tanh_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_tanh_f(float *A, float *C, + unsigned int size) { matrix_tanh(A, C, size); } @@ -1648,11 +1658,12 @@ __device__ void matrix_asin(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_asin_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_asin(A, C, size); } -extern "C" __global__ void matrix_asin_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_asin_f(float *A, float *C, + unsigned int size) { 
matrix_asin(A, C, size); } @@ -1671,11 +1682,12 @@ __device__ void matrix_acos(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_acos_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_acos(A, C, size); } -extern "C" __global__ void matrix_acos_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_acos_f(float *A, float *C, + unsigned int size) { matrix_acos(A, C, size); } @@ -1694,11 +1706,12 @@ __device__ void matrix_atan(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_atan_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_atan(A, C, size); } -extern "C" __global__ void matrix_atan_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_atan_f(float *A, float *C, + unsigned int size) { matrix_atan(A, C, size); } @@ -1722,10 +1735,11 @@ __device__ void matrix_sign(T *A, T *C, unsigned int size) { } extern "C" __global__ void matrix_sign_d(double *A, double *C, - unsigned int size) { + unsigned int size) { matrix_sign(A, C, size); } -extern "C" __global__ void matrix_sign_f(float *A, float *C, unsigned int size) { +extern "C" __global__ void matrix_sign_f(float *A, float *C, + unsigned int size) { matrix_sign(A, C, size); -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java index 04a32bd..9b9406a 100644 --- a/src/main/java/org/apache/sysml/hops/AggUnaryOp.java +++ b/src/main/java/org/apache/sysml/hops/AggUnaryOp.java @@ -26,6 +26,7 @@ import org.apache.sysml.hops.rewrite.HopRewriteUtils; import org.apache.sysml.lops.Aggregate; import org.apache.sysml.lops.Aggregate.OperationTypes; import org.apache.sysml.lops.Binary; +import org.apache.sysml.lops.ConvolutionTransform; import org.apache.sysml.lops.Group; import org.apache.sysml.lops.Lop; import org.apache.sysml.lops.LopsException; @@ -131,6 +132,20 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop return false; } + /** + * Checks if channels sum rewrite is applicable + * + * @return returns true for pattern rowSums(matrix(colSums(X), rows=.., cols=..)) else false + */ + private boolean isChannelSumRewriteApplicable() { + if( OptimizerUtils.ALLOW_OPERATOR_FUSION && _op == AggOp.SUM && _direction == Direction.Row + && getInput().get(0) instanceof ReorgOp && ((ReorgOp)getInput().get(0)).getOp() == ReOrgOp.RESHAPE) { + Hop input1 = getInput().get(0).getInput().get(0); + return input1 instanceof AggUnaryOp && ((AggUnaryOp)input1)._op == AggOp.SUM && ((AggUnaryOp)input1)._direction == Direction.Col; + } + return false; + } + @Override public Lop constructLops() throws HopsException, LopsException @@ -147,41 +162,57 @@ public class AggUnaryOp extends Hop implements MultiThreadedHop if ( et == ExecType.CP || et == ExecType.GPU ) { Lop agg1 = null; - if( isTernaryAggregateRewriteApplicable() ) { - agg1 = constructLopsTernaryAggregateRewrite(et); + long numChannels = isChannelSumRewriteApplicable() ? Hop.computeSizeInformation(getInput().get(0).getInput().get(1)) : -1; + if(numChannels > 0 && numChannels < 1000000) { + // Apply channel sums only if rewrite is applicable and if the dimension of C is known at compile time + // and if numChannels is less than 8 MB. 
+ ReorgOp in = ((ReorgOp)getInput().get(0)); + agg1 = new ConvolutionTransform( + in.getInput().get(0).getInput().get(0).constructLops(), + in.getInput().get(1).constructLops(), + in.getInput().get(2).constructLops(), + ConvolutionTransform.OperationTypes.CHANNEL_SUMS, getDataType(), getValueType(), et, -1); + agg1.getOutputParameters().setDimensions(numChannels, 1, getRowsInBlock(), getColsInBlock(), -1); + setLineNumbers(agg1); + setLops(agg1); } - else if( isUnaryAggregateOuterCPRewriteApplicable() ) - { - OperationTypes op = HopsAgg2Lops.get(_op); - DirectionTypes dir = HopsDirection2Lops.get(_direction); - - BinaryOp binput = (BinaryOp)getInput().get(0); - agg1 = new UAggOuterChain( binput.getInput().get(0).constructLops(), - binput.getInput().get(1).constructLops(), op, dir, - HopsOpOp2LopsB.get(binput.getOp()), DataType.MATRIX, getValueType(), ExecType.CP); - PartialAggregate.setDimensionsBasedOnDirection(agg1, getDim1(), getDim2(), input.getRowsInBlock(), input.getColsInBlock(), dir); - + else { + if( isTernaryAggregateRewriteApplicable() ) { + agg1 = constructLopsTernaryAggregateRewrite(et); + } + else if( isUnaryAggregateOuterCPRewriteApplicable() ) + { + OperationTypes op = HopsAgg2Lops.get(_op); + DirectionTypes dir = HopsDirection2Lops.get(_direction); + + BinaryOp binput = (BinaryOp)getInput().get(0); + agg1 = new UAggOuterChain( binput.getInput().get(0).constructLops(), + binput.getInput().get(1).constructLops(), op, dir, + HopsOpOp2LopsB.get(binput.getOp()), DataType.MATRIX, getValueType(), ExecType.CP); + PartialAggregate.setDimensionsBasedOnDirection(agg1, getDim1(), getDim2(), input.getRowsInBlock(), input.getColsInBlock(), dir); + + if (getDataType() == DataType.SCALAR) { + UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR), + getDataType(), getValueType()); + unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1); + setLineNumbers(unary1); + setLops(unary1); + } + + } + else { //general case + int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); + agg1 = new PartialAggregate(input.constructLops(), + HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k); + } + + setOutputDimensions(agg1); + setLineNumbers(agg1); + setLops(agg1); + if (getDataType() == DataType.SCALAR) { - UnaryCP unary1 = new UnaryCP(agg1, HopsOpOp1LopsUS.get(OpOp1.CAST_AS_SCALAR), - getDataType(), getValueType()); - unary1.getOutputParameters().setDimensions(0, 0, 0, 0, -1); - setLineNumbers(unary1); - setLops(unary1); + agg1.getOutputParameters().setDimensions(1, 1, getRowsInBlock(), getColsInBlock(), getNnz()); } - - } - else { //general case - int k = OptimizerUtils.getConstrainedNumThreads(_maxNumThreads); - agg1 = new PartialAggregate(input.constructLops(), - HopsAgg2Lops.get(_op), HopsDirection2Lops.get(_direction), getDataType(),getValueType(), et, k); - } - - setOutputDimensions(agg1); - setLineNumbers(agg1); - setLops(agg1); - - if (getDataType() == DataType.SCALAR) { - agg1.getOutputParameters().setDimensions(1, 1, getRowsInBlock(), getColsInBlock(), getNnz()); } } else if( et == ExecType.MR ) http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java index dfc187c..94a67f0 100644 --- 
a/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java +++ b/src/main/java/org/apache/sysml/lops/ConvolutionTransform.java @@ -32,7 +32,7 @@ public class ConvolutionTransform extends Lop public enum OperationTypes { MAX_POOLING, MAX_POOLING_BACKWARD, RELU_MAX_POOLING, RELU_BACKWARD, RELU_MAX_POOLING_BACKWARD, DIRECT_CONV2D, DIRECT_CONV2D_BACKWARD_FILTER, DIRECT_CONV2D_BACKWARD_DATA, - BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY + BIAS_ADD, DIRECT_CONV2D_BIAS_ADD, BIAS_MULTIPLY, CHANNEL_SUMS } private OperationTypes operation = null; @@ -67,6 +67,18 @@ public class ConvolutionTransform extends Lop input2.addOutput(this); setLevel(); } + + public ConvolutionTransform(Lop input1, Lop input2, Lop input3, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et, int k) + { + super(Lop.Type.Transform, dt, vt); + init(input1, op, dt, vt, et); + numThreads = k; + this.addInput(input2); + input2.addOutput(this); + this.addInput(input3); + input3.addOutput(this); + setLevel(); + } private void init (Lop input, ConvolutionTransform.OperationTypes op, DataType dt, ValueType vt, ExecType et) { @@ -142,6 +154,9 @@ public class ConvolutionTransform extends Lop case DIRECT_CONV2D_BACKWARD_DATA: return "conv2d_backward_data"; + case CHANNEL_SUMS: + return "channel_sums"; + default: throw new UnsupportedOperationException(this.printErrorLocation() + "Instruction is not defined for Transform operation " + operation); @@ -180,6 +195,31 @@ public class ConvolutionTransform extends Lop } @Override + public String getInstructions(String input, String C, String HW, String output) throws LopsException { + if(operation == OperationTypes.CHANNEL_SUMS) { + StringBuilder sb = new StringBuilder(); + sb.append( getExecType() ); + + sb.append( OPERAND_DELIMITOR ); + sb.append( getOpcode() ); + sb.append( OPERAND_DELIMITOR ); + sb.append( getInputs().get(0).prepInputOperand(input)); + sb.append( OPERAND_DELIMITOR ); + sb.append( getInputs().get(1).prepInputOperand(C)); + sb.append( OPERAND_DELIMITOR ); + sb.append( getInputs().get(2).prepInputOperand(HW)); + //output + sb.append( OPERAND_DELIMITOR ); + sb.append( this.prepOutputOperand(output)); + + return sb.toString(); + } + else { + throw new LopsException("The operation is not supported with three operands:" + operation.name()); + } + } + + @Override public String getInstructions(String[] inputs, String output) throws LopsException { StringBuilder sb = new StringBuilder(); appendOpcode(sb); http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java index 4e66042..d0bc429 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/CPInstructionParser.java @@ -233,6 +233,7 @@ public class CPInstructionParser extends InstructionParser String2CPInstructionType.put( "conv2d_backward_data" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "bias_add" , CPINSTRUCTION_TYPE.Convolution); String2CPInstructionType.put( "bias_multiply" , CPINSTRUCTION_TYPE.Convolution); + String2CPInstructionType.put( "channel_sums" , CPINSTRUCTION_TYPE.Convolution); // Quaternary instruction opcodes String2CPInstructionType.put( "wsloss" , 
CPINSTRUCTION_TYPE.Quaternary); http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java index 503576f..ae19969 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/GPUInstructionParser.java @@ -53,6 +53,7 @@ public class GPUInstructionParser extends InstructionParser String2GPUInstructionType.put( "maxpooling_backward", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "bias_add", GPUINSTRUCTION_TYPE.Convolution); String2GPUInstructionType.put( "bias_multiply", GPUINSTRUCTION_TYPE.Convolution); + String2GPUInstructionType.put( "channel_sums", GPUINSTRUCTION_TYPE.Convolution); // Matrix Multiply Operators String2GPUInstructionType.put( "ba+*", GPUINSTRUCTION_TYPE.AggregateBinary); http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java index c6b4698..36422d9 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/cp/ConvolutionCPInstruction.java @@ -27,12 +27,14 @@ import org.apache.sysml.api.DMLScript; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; +import org.apache.sysml.runtime.functionobjects.KahanPlus; import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.matrix.data.ConvolutionParameters; import org.apache.sysml.runtime.matrix.data.LibMatrixDNN; import org.apache.sysml.runtime.matrix.data.LibMatrixNative; import org.apache.sysml.runtime.matrix.data.MatrixBlock; +import org.apache.sysml.runtime.matrix.data.SparseBlock; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; import org.apache.sysml.runtime.util.ConvolutionUtils; import org.apache.sysml.utils.NativeHelper; @@ -59,6 +61,19 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { _numThreads = numThreads; _intermediateMemoryBudget = intermediateMemoryBudget; } + + public ConvolutionCPInstruction(CPOperand in, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, int numThreads, double intermediateMemoryBudget) throws DMLRuntimeException { + super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), in, out, + opcode, istr); + if( !opcode.equals("channel_sums") ) { + throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be channel_sums, but found " + opcode); + } + _in2 = in2; + _in3 = in3; + _cptype = CPINSTRUCTION_TYPE.Convolution; + _numThreads = numThreads; + _intermediateMemoryBudget = intermediateMemoryBudget; + } private ConvolutionCPInstruction(CPOperand in, CPOperand out, String opcode, String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, @@ -212,6 +227,14 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { int k = Integer.parseInt(parts[4]); return new ConvolutionCPInstruction(in, in2, out, opcode, str, k, Double.parseDouble(parts[5])); } + else if (opcode.equalsIgnoreCase("channel_sums")) { + InstructionUtils.checkNumFields(parts, 4); + CPOperand in = new CPOperand(parts[1]); + CPOperand in2 = new CPOperand(parts[2]); + CPOperand in3 = new CPOperand(parts[3]); + CPOperand out = new CPOperand(parts[4]); + return new ConvolutionCPInstruction(in, in2, in3, out, opcode, str, -1, 0); + } else { throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionCPInstruction: " + str); } @@ -297,6 +320,65 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode()); } + public void processChannelSumsInstruction(ExecutionContext ec) throws DMLRuntimeException { + MatrixBlock input = ec.getMatrixInput(input1.getName(), getExtendedOpcode()); + int C = (int) ec.getScalarInput(_in2.getName(), _in2.getValueType(), _in2.isLiteral()).getLongValue(); + int HW = (int) ec.getScalarInput(_in3.getName(), _in3.getValueType(), _in3.isLiteral()).getLongValue(); + if(C*HW != input.getNumColumns()) { + throw new DMLRuntimeException("Expected rows*cols" + C + "*" + HW + " to be equal to number of columns of input " + input.getNumColumns()); + } + MatrixBlock outputBlock = null; + if(input.isEmpty()) { + outputBlock = new MatrixBlock(C, 1, true); + } + else { + outputBlock = new MatrixBlock(C, 1, false).allocateBlock(); + double [] output = outputBlock.getDenseBlock(); + if(input.isInSparseFormat()) { + SparseBlock sblock = input.getSparseBlock(); + for(int n = 0; n < input.getNumRows(); n++) { + if( sblock.isEmpty(n) ) + continue; + int apos = sblock.pos(n); + int alen = sblock.size(n); + int[] aix = sblock.indexes(n); + double[] avals = sblock.values(n); + + // Iterate over the sparse block + for(int j=apos; j<apos+alen; j++) { + // Note: the input is of shape [N, CHW] + int chw = aix[j]; + + // Get individual zero-based c,h,w indexes from zero-based 'chw' + int c = chw / HW; + output[c] += avals[j]; + } + } + } + else { + double [] inArr = input.getDenseBlock(); + if(inArr != null) { + KahanPlus kplus = KahanPlus.getKahanPlusFnObject(); + for(int c = 0; c < C; c++) { + KahanObject sum = new KahanObject(0.0, 0.0); + for(int n = 0; n < input.getNumRows(); n++) { + int index = n*C*HW + c*HW; + for(int hw = 0; hw < HW; hw++, index++) { + kplus.execute2(sum, inArr[index]); + } + } + output[c] = sum._sum; + } + } + } + outputBlock.recomputeNonZeros(getExtendedOpcode()); + } + + // release inputs/outputs + ec.releaseMatrixInput(input1.getName(), getExtendedOpcode()); + ec.setMatrixOutput(getOutputVariableName(), outputBlock, getExtendedOpcode()); + } + // Assumption: enableNative && NativeHelper.isNativeLibraryLoaded() is true // This increases the number of native calls. 
For example:the cases where filter is sparse but input is dense private static boolean isFilterSparse(MatrixBlock filter) throws DMLRuntimeException { @@ -324,6 +406,10 @@ public class ConvolutionCPInstruction extends UnaryCPInstruction { processReluBackwardInstruction(ec); return; } + else if (instOpcode.equalsIgnoreCase("channel_sums")) { + processChannelSumsInstruction(ec); + return; + } // acquire inputs MatrixBlock outputBlock = null; http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java index 8565b5a..fdb208e 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/gpu/ConvolutionGPUInstruction.java @@ -20,12 +20,17 @@ package org.apache.sysml.runtime.instructions.gpu; import java.util.ArrayList; +import jcuda.Pointer; + import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.controlprogram.caching.MatrixObject; import org.apache.sysml.runtime.controlprogram.context.ExecutionContext; import org.apache.sysml.runtime.functionobjects.SwapIndex; import org.apache.sysml.runtime.instructions.InstructionUtils; import org.apache.sysml.runtime.instructions.cp.CPOperand; +import org.apache.sysml.runtime.instructions.cp.ConvolutionCPInstruction; +import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig; +import org.apache.sysml.runtime.instructions.gpu.context.GPUContext; import org.apache.sysml.runtime.matrix.data.LibMatrixCUDA; import org.apache.sysml.runtime.matrix.data.LibMatrixCuDNN; import org.apache.sysml.runtime.matrix.operators.ReorgOperator; @@ -57,6 +62,19 @@ public class ConvolutionGPUInstruction extends GPUInstruction { _intermediateMemoryBudget = intermediateMemoryBudget; } + public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, double intermediateMemoryBudget) throws DMLRuntimeException { + super(new ReorgOperator(SwapIndex.getSwapIndexFnObject()), opcode, istr); + if( !opcode.equals("channel_sums") ) { + throw new DMLRuntimeException("Incorrect usage. 
Expected the opcode to be channel_sums, but found " + opcode); + } + _input1 = in1; + _input2 = in2; + _input3 = in3; + _gputype = GPUINSTRUCTION_TYPE.Convolution; + _output = out; + _intermediateMemoryBudget = intermediateMemoryBudget; + } + public ConvolutionGPUInstruction(CPOperand in1, CPOperand in2, CPOperand in3, CPOperand out, String opcode, String istr, ArrayList<CPOperand> stride, ArrayList<CPOperand> padding, ArrayList<CPOperand> input_shape, @@ -210,6 +228,14 @@ public class ConvolutionGPUInstruction extends GPUInstruction { CPOperand out = new CPOperand(parts[3]); return new ConvolutionGPUInstruction(in1, in2, out, opcode, str, Double.parseDouble(parts[4])); } + else if (opcode.equalsIgnoreCase("channel_sums")) { + InstructionUtils.checkNumFields(parts, 4); + CPOperand in = new CPOperand(parts[1]); + CPOperand in2 = new CPOperand(parts[2]); + CPOperand in3 = new CPOperand(parts[3]); + CPOperand out = new CPOperand(parts[4]); + return new ConvolutionGPUInstruction(in, in2, in3, out, opcode, str, 0); + } else { throw new DMLRuntimeException("Unknown opcode while parsing a ConvolutionGPUInstruction: " + str); } @@ -246,6 +272,23 @@ public class ConvolutionGPUInstruction extends GPUInstruction { ec.releaseMatrixOutputForGPUInstruction(_output.getName()); } + public void processChannelSumsInstruction(ExecutionContext ec) throws DMLRuntimeException { + GPUStatistics.incrementNoOfExecutedGPUInst(); + MatrixObject input = getMatrixInputForGPUInstruction(ec, _input1.getName()); + int C = (int) ec.getScalarInput(_input2.getName(), _input2.getValueType(), _input2.isLiteral()).getLongValue(); + int HW = (int) ec.getScalarInput(_input3.getName(), _input3.getValueType(), _input3.isLiteral()).getLongValue(); + if(C*HW != input.getNumColumns()) { + throw new DMLRuntimeException("Expected rows*cols" + C + "*" + HW + " to be equal to number of columns of input " + input.getNumColumns()); + } + MatrixObject outputBlock = getDenseMatrixOutputForGPUInstruction(ec, _output.getName(), C, 1); + + LibMatrixCUDA.channelSums(ec.getGPUContext(0), getExtendedOpcode(), input, outputBlock, C, HW); + + // release inputs/outputs + ec.releaseMatrixInputForGPUInstruction(_input1.getName()); + ec.releaseMatrixOutputForGPUInstruction(_output.getName()); + } + @Override public void processInstruction(ExecutionContext ec) throws DMLRuntimeException @@ -258,6 +301,10 @@ public class ConvolutionGPUInstruction extends GPUInstruction { processReLUBackwardInstruction(ec); return; } + else if (instOpcode.equalsIgnoreCase("channel_sums")) { + processChannelSumsInstruction(ec); + return; + } GPUStatistics.incrementNoOfExecutedGPUInst(); http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java index e7f515a..caaa9e8 100644 --- a/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java +++ b/src/main/java/org/apache/sysml/runtime/instructions/spark/QuantilePickSPInstruction.java @@ -316,7 +316,7 @@ public class QuantilePickSPInstruction extends BinarySPInstruction { sum += v2.next()._2().sumWeightForQuantile(); //return tuple for partition aggregate - return Arrays.asList(new Tuple2<>(v1,sum)).iterator(); + return Arrays.asList(new 
Tuple2<Integer, Double>(v1,sum)).iterator(); } } http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java index 2cccde0..c0091c8 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCUDA.java @@ -322,6 +322,37 @@ public class LibMatrixCUDA { if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1); } + + /** + * Perform channel_sums operations: out = rowSums(matrix(colSums(A), rows=C, cols=HW)) + * + * @param gCtx a valid {@link GPUContext} + * @param instName the invoking instruction's name for record {@link Statistics}. + * @param input input image + * @param outputBlock output + * @param C number of channels + * @param HW height*width + * @throws DMLRuntimeException if DMLRuntimeException occurs + */ + public static void channelSums(GPUContext gCtx, String instName, MatrixObject input, MatrixObject outputBlock, long C, long HW) throws DMLRuntimeException { + if(LOG.isTraceEnabled()) { + LOG.trace("GPU : channelSums" + ", GPUContext=" + gCtx); + } + int N = toInt(input.getNumRows()); + int cols = toInt(input.getNumColumns()); + if(cols != C*HW) { + throw new DMLRuntimeException("Incorrect parameters, number of columns " + cols + " != " + C + "*" + HW); + } + Pointer imagePointer = getDensePointer(gCtx, input, instName); + Pointer outputPointer = getDensePointer(gCtx, outputBlock, instName); + + // We can replace this with CuDNN tensor reduce + Pointer tmp = gCtx.allocate(instName, cols*sizeOfDataType); + reduceCol(gCtx, instName, "reduce_col_sum", imagePointer, tmp, N, cols); + reduceRow(gCtx, instName, "reduce_row_sum", tmp, outputPointer, toInt(C), toInt(HW)); + gCtx.cudaFreeHelper(tmp); + + } /** * Performs the operation corresponding to the DML script: http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java index e0a6a57..5935285 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixCuDNN.java @@ -64,7 +64,7 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { protected static cudnnHandle getCudnnHandle(GPUContext gCtx) throws DMLRuntimeException { return gCtx.getCudnnHandle(); } - + /** * Does a 2D convolution followed by a bias_add * @@ -722,4 +722,4 @@ public class LibMatrixCuDNN extends LibMatrixCUDA { if(status != cudnnStatus.CUDNN_STATUS_SUCCESS) throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status)); } -} \ No newline at end of file +} http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java 
b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java index 0b229f0..59e9cb1 100644 --- a/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java +++ b/src/test/java/org/apache/sysml/test/gpu/AggregateUnaryOpTests.java @@ -45,6 +45,37 @@ public class AggregateUnaryOpTests extends UnaryOpTestsBase { public void colSums() { testSimpleUnaryOpMatrixOutput("colSums", "gpu_uack+"); } + + @Test + public void channelSums() { + int[] rows = rowSizes; + int[] C = new int[] { 2, 5, 10, 50 }; + int[] HW = new int[] { 10, 12, 21, 51 }; + double[] sparsities = this.sparsities; + int seed = this.seed; + + for (int k = 0; k < sparsities.length; k++) { + double sparsity = sparsities[k]; + if(sparsity == 0) + continue; // sparsity == 0 has been independently tested but it fails with non-informative mlcontext error + for (int i = 0; i < rows.length; i++) { + int row = rows[i]; + if(row == 1) + continue; // Currently channel_sums rewrite is enabled only for row > 1 + for (int c : C) { + if(c == 1) + continue; // C == 1 will result in scalar value, but this case has been independently tested + for (int hw : HW) { + // Skip the case of a scalar unary op + // System.out.println("Started channelSum test for " + row + " " + c + " " + hw + " " + sparsity); + String scriptStr = "out = rowSums(matrix(colSums(in1), rows=" + c + ", cols=" + hw + "));"; + testUnaryOpMatrixOutput(scriptStr, "gpu_channel_sums", "in1", "out", seed, row, c*hw, sparsity); + // System.out.println("Ended channelSum test for " + row + " " + c + " " + hw + " " + sparsity); + } + } + } + } + } @Test public void rowSums() { http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java ---------------------------------------------------------------------- diff --git a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java index 0051dd4..0f6b59c 100644 --- a/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java +++ b/src/test/java/org/apache/sysml/test/gpu/UnaryOpTestsBase.java @@ -31,10 +31,10 @@ import org.apache.sysml.api.mlcontext.Matrix; public abstract class UnaryOpTestsBase extends GPUTests { // Set of rows and column sizes & sparsities to test unary ops - private final int[] rowSizes = new int[] { 2049, 1024, 140, 64, 1 }; - private final int[] columnSizes = new int[] { 2049, 1024, 140, 64, 1 }; - private final double[] sparsities = new double[] { 0.9, 0.3, 0.03, 0.0 }; - private final int seed = 42; + protected final int[] rowSizes = new int[] { 2049, 1024, 140, 64, 1 }; + protected final int[] columnSizes = new int[] { 2049, 1024, 150, 64, 1 }; + protected final double[] sparsities = new double[] { 0.9, 0.3, 0.03, 0.0 }; + protected final int seed = 42; /** * Tests unary ops with a variety of matrix shapes and sparsities. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java b/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java
new file mode 100644
index 0000000..61ca370
--- /dev/null
+++ b/src/test/java/org/apache/sysml/test/integration/functions/tensor/ChannelSumTest.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.sysml.test.integration.functions.tensor;
+
+import java.util.HashMap;
+
+import org.apache.sysml.api.DMLScript;
+import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
+import org.apache.sysml.lops.LopProperties.ExecType;
+import org.apache.sysml.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysml.test.integration.AutomatedTestBase;
+import org.apache.sysml.test.integration.TestConfiguration;
+import org.apache.sysml.test.utils.TestUtils;
+import org.junit.Test;
+
+public class ChannelSumTest extends AutomatedTestBase
+{
+
+	private final static String TEST_NAME = "ChannelSumTest";
+	private final static String TEST_DIR = "functions/tensor/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + ChannelSumTest.class.getSimpleName() + "/";
+	private final static double epsilon=0.0000000001;
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,
+			new String[] {"B"}));
+	}
+
+	@Test
+	public void testChannelSumDense1()
+	{
+		int numImg = 10; int imgSize = 9; int numChannels = 5;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+
+	@Test
+	public void testChannelSumDense2()
+	{
+		int numImg = 2; int imgSize = 5; int numChannels = 3;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+
+	@Test
+	public void testChannelSumDense3()
+	{
+		int numImg = 9; int imgSize = 4; int numChannels = 11;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+
+	@Test
+	public void testChannelSumDense4()
+	{
+		int numImg = 7; int imgSize = 8; int numChannels = 12;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, false);
+	}
+
+	@Test
+	public void testChannelSumSparse1()
+	{
+		int numImg = 4; int imgSize = 10; int numChannels = 5;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+
+	@Test
+	public void testChannelSumSparse2()
+	{
+		int numImg = 2; int imgSize = 10; int numChannels = 8;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+
+	@Test
+	public void testChannelSumSparse3()
+	{
+		int numImg = 4; int imgSize = 10; int numChannels = 11;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+
+	@Test
+	public void testChannelSumSparse4()
+	{
+		int numImg = 9; int imgSize = 6; int numChannels = 8;
+		runChannelSumTest(ExecType.CP, imgSize, numImg, numChannels, true);
+	}
+
+	public void runChannelSumTest( ExecType et, int imgSize, int numImg, int numChannels, boolean sparse)
+	{
+		RUNTIME_PLATFORM platformOld = rtplatform;
+		switch( et ){
+			case MR: rtplatform = RUNTIME_PLATFORM.HADOOP; break;
+			case SPARK: rtplatform = RUNTIME_PLATFORM.SPARK; break;
+			default: rtplatform = RUNTIME_PLATFORM.HYBRID; break;
+		}
+		boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
+		if( rtplatform == RUNTIME_PLATFORM.SPARK )
+			DMLScript.USE_LOCAL_SPARK_CONFIG = true;
+
+		try
+		{
+			String sparseVal = String.valueOf(sparse).toUpperCase();
+
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+
+			String RI_HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = RI_HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{"-explain", "hops", "-args", String.valueOf(imgSize),
+				String.valueOf(numImg), String.valueOf(numChannels),
+				output("B"), sparseVal};
+
+			fullRScriptName = RI_HOME + TEST_NAME + ".R";
+			rCmd = "Rscript" + " " + fullRScriptName + " " + imgSize + " " + numImg +
+				" " + numChannels + " " + expectedDir() + " " + sparseVal;
+
+			// run scripts
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			// compare DML output against the R reference
+			HashMap<CellIndex, Double> bHM = readRMatrixFromFS("B");
+			HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("B");
+			TestUtils.compareMatrices(dmlfile, bHM, epsilon, "B-DML", "B-R");
+		}
+		finally {
+			rtplatform = platformOld;
+			DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
+		}
+	}
+
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/scripts/functions/tensor/ChannelSumTest.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/ChannelSumTest.R b/src/test/scripts/functions/tensor/ChannelSumTest.R
new file mode 100644
index 0000000..c605074
--- /dev/null
+++ b/src/test/scripts/functions/tensor/ChannelSumTest.R
@@ -0,0 +1,39 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+args <- commandArgs(TRUE)
+library("Matrix")
+library("matrixStats")
+imgSize=as.integer(args[1])
+numImg=as.integer(args[2])
+numChannels=as.integer(args[3])
+
+# Assumption: NCHW image format
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), numImg, numChannels*imgSize*imgSize, byrow=TRUE)
+if(as.logical(args[5])) {
+	zero_mask = (x - 1.5*mean(x)) > 0
+	x = x * zero_mask
+} else {
+	x = x - mean(x)
+}
+
+output = rowSums(matrix(colSums(x), numChannels, imgSize*imgSize, byrow=TRUE));
+
+writeMM(as(output,"CsparseMatrix"), paste(args[4], "B", sep=""))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/d916ba5b/src/test/scripts/functions/tensor/ChannelSumTest.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/tensor/ChannelSumTest.dml b/src/test/scripts/functions/tensor/ChannelSumTest.dml
new file mode 100644
index 0000000..7810a12
--- /dev/null
+++ b/src/test/scripts/functions/tensor/ChannelSumTest.dml
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+imgSize=$1
+numImg=$2
+numChannels=$3
+
+# Assumption: NCHW image format
+x=matrix(seq(1, numImg*numChannels*imgSize*imgSize), rows=numImg, cols=numChannels*imgSize*imgSize)
+if($5) {
+	zero_mask = (x - 1.5*mean(x)) > 0
+	x = x * zero_mask
+}
+else {
+	x = x - mean(x)
+}
+output = rowSums(matrix(colSums(x), rows=numChannels, cols=imgSize*imgSize)) # shape (C, 1)
+write(output, $4, format="text")
\ No newline at end of file
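Usage note (not part of the commit above): a minimal DML sketch of the colSums/rowSums reshape pattern exercised by ChannelSumTest.dml, written as a standalone script. The names X, C, Hin, and Win are illustrative placeholders; running such a script with "-explain hops" (as the test's programArgs do) is one way to inspect whether the per-channel-sum pattern is recognized in the compiled plan.

    # Hypothetical NCHW input: 16 images with C channels of Hin x Win each
    C = 4
    Hin = 8
    Win = 8
    X = rand(rows=16, cols=C*Hin*Win)
    # colSums yields a 1 x (C*Hin*Win) row vector, the reshape groups it by
    # channel, and rowSums reduces each channel to a single value.
    perChannelSum = rowSums(matrix(colSums(X), rows=C, cols=Hin*Win))  # shape (C, 1)
    print(toString(perChannelSum))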
