http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp new file mode 100644 index 0000000..45d6987 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp @@ -0,0 +1,1468 @@ +#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_ +#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/matrix_operations_row.hpp + @brief Implementations of row-major dense matrix related operations, including matrix-vector products, using CUDA. 
*/

namespace viennacl
{
namespace linalg
{
namespace cuda
{

/** @brief Element-wise copy A <- B with numeric conversion from SrcNumericT to DestNumericT.
*
* Rows are distributed over blocks (blockIdx.x), columns over the threads of a
* block (threadIdx.x); both loops are strided so any launch configuration covers
* the full matrix. All offsets/strides are in elements of the row-major layout.
*/
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_row_kernel(
    DestNumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const SrcNumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // row_gid reduces to blockIdx.x and col_gid to threadIdx.x (flat-id form kept
  // for consistency with the other kernels in this file).
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
}

/** @brief Matrix transpose kernel: writes B(col, row) = A(row, col).
*
* data_major == true uses row-major index arithmetic (multiplying by
* *_internal_size2); data_major == false uses column-major arithmetic
* (multiplying by *_internal_size1).
*/
template<typename NumericT>
__global__ void trans_kernel(
    const NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_stride1, unsigned int A_stride2,

    NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,
    unsigned int B_stride1, unsigned int B_stride2,
    bool data_major)
{
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      if (data_major)
        B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
      else
        B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];
    }
  }
}

//
// am
//

/** @brief A = alpha * B (or B / alpha), scalar alpha passed by value from the host.
*
* options2 bit 0: negate alpha before use; bit 1: divide by alpha instead of
* multiplying. Same grid/block mapping as convert_row_kernel.
*/
template<typename NumericT>
__global__ void am_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    NumericT fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
  }
  else
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
  }
}

/** @brief A = alpha * B (or B / alpha), with alpha residing in GPU memory (dereferenced in-kernel). */
template<typename NumericT>
__global__ void am_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int
B_internal_size2)
{
  // alpha is fetched from device memory; options2 bit 0 negates it,
  // bit 1 selects division instead of multiplication.
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
  }
  else
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
  }
}


//
// ambm
//

/** @brief A = alpha * B + beta * C with both scalars passed by value from the host.
*
* options2/options3 bit 0: negate alpha/beta; bit 1: divide by alpha/beta
* instead of multiplying. The four multiply/divide combinations are unrolled
* into separate loop nests so the branch is taken once, outside the loops.
*/
template<typename NumericT>
__global__ void ambm_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    NumericT fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    NumericT fac3,
    unsigned int options3,
    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha on CPU, beta on GPU
template<typename NumericT>
__global__ void ambm_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

NumericT fac2, + unsigned int options2, + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2, + + const NumericT * fac3, + unsigned int options3, + const NumericT * C, + unsigned int C_start1, unsigned int C_start2, + unsigned int C_inc1, unsigned int C_inc2, + unsigned int C_internal_size1, unsigned int C_internal_size2) +{ + NumericT alpha = fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + + NumericT beta = *fac3; + if (options3 & (1 << 0)) + beta = -beta; + + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + if (options2 & (1 << 1)) + { + if (options3 & (1 << 1)) + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; + } + else + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; + } + } + else + { + if (options3 & (1 << 1)) + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 
+ col * C_inc2 + C_start2] / beta; + } + else + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; + } + } +} + +// alpha on GPU, beta on CPU +template<typename NumericT> +__global__ void ambm_row_kernel( + NumericT * A, + unsigned int A_start1, unsigned int A_start2, + unsigned int A_inc1, unsigned int A_inc2, + unsigned int A_size1, unsigned int A_size2, + unsigned int A_internal_size1, unsigned int A_internal_size2, + + const NumericT * fac2, + unsigned int options2, + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2, + + NumericT fac3, + unsigned int options3, + const NumericT * C, + unsigned int C_start1, unsigned int C_start2, + unsigned int C_inc1, unsigned int C_inc2, + unsigned int C_internal_size1, unsigned int C_internal_size2) +{ + NumericT alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + + NumericT beta = fac3; + if (options3 & (1 << 0)) + beta = -beta; + + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + if (options2 & (1 << 1)) + { + if (options3 & (1 << 1)) + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; + } + else + { + for (unsigned int 
row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; + } + } + else + { + if (options3 & (1 << 1)) + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; + } + else + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; + } + } +} + + +// alpha and beta on GPU +template<typename NumericT> +__global__ void ambm_row_kernel( + NumericT * A, + unsigned int A_start1, unsigned int A_start2, + unsigned int A_inc1, unsigned int A_inc2, + unsigned int A_size1, unsigned int A_size2, + unsigned int A_internal_size1, unsigned int A_internal_size2, + + const NumericT * fac2, + unsigned int options2, + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2, + + const NumericT * fac3, + unsigned int options3, + const NumericT * C, + unsigned int C_start1, unsigned int C_start2, + unsigned int C_inc1, unsigned int C_inc2, + unsigned int C_internal_size1, unsigned int C_internal_size2) +{ + NumericT 
alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + + NumericT beta = *fac3; + if (options3 & (1 << 0)) + beta = -beta; + + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + if (options2 & (1 << 1)) + { + if (options3 & (1 << 1)) + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; + } + else + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; + } + } + else + { + if (options3 & (1 << 1)) + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; + } + else + { + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] + = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha + + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; + } + } +} + + +// +// 
// ambm_m
//

/** @brief A += alpha * B + beta * C with both scalars passed by value from the host.
*
* Identical structure to ambm_row_kernel, but accumulates into A (+=) instead
* of overwriting it. options2/options3 bit 0: negate alpha/beta; bit 1: divide
* by the scalar instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    NumericT fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    NumericT fac3,
    unsigned int options3,
    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha on CPU, beta on GPU
template<typename NumericT>
__global__ void ambm_m_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    NumericT fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const NumericT * fac3,
    unsigned int options3,
    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  // alpha by value; beta dereferenced from device memory.
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}

// alpha on GPU, beta on CPU
template<typename NumericT>
__global__ void ambm_m_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int
B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    NumericT fac3,
    unsigned int options3,
    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  // alpha dereferenced from device memory; beta passed by value.
  // options bits: bit 0 negates the scalar, bit 1 selects division.
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha and beta on GPU
template<typename NumericT>
__global__ void ambm_m_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * fac2,
    unsigned int options2,
    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const NumericT * fac3,
    unsigned int options3,
    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  // Both scalars dereferenced from device memory.
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
          + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}

//
// assignments
//

/** @brief Sets every element of the (sub-)matrix A to the scalar alpha. */
template<typename NumericT>
__global__ void matrix_row_assign_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,
    NumericT alpha)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
}


/** @brief Sets the diagonal entries A(i, i) to alpha; off-diagonal entries are untouched. */
template<typename NumericT>
__global__ void matrix_row_diagonal_assign_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int
A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,
    NumericT alpha)
{
  // One-dimensional grid-stride walk over the diagonal.
  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);

  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
    A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
}

//
// binary element-wise operations
//

/** @brief Element-wise A = B op C for floating-point types.
*
* op_type selects the operation: 0 -> product, 1 -> division, 2 -> pow.
*/
template<typename NumericT>
__global__ void element_op_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2,

    unsigned int op_type) //0: product, 1: division, 2: pow
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (op_type == 2)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
              C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
  }
  else if (op_type == 1)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
        / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
  else if (op_type == 0)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
        * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
}

/** @brief Element-wise A = B op C for integer types.
*
* Only op_type 0 (product) and 1 (division) are implemented here; op_type 2
* (pow) is not supported for integers and leaves A unchanged.
*/
template<typename NumericT>
__global__ void element_op_int_row_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const NumericT * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1,   unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2,

    unsigned int op_type) //0: product, 1: division, 2: pow
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (op_type == 1)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
        / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
  else if (op_type == 0)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
        * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
}

//
// unary element-wise operations
//

// abs: A = |B|, element-wise.
template<typename NumericT>
__global__ void matrix_row_element_abs_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// acos
template<typename NumericT>
__global__ void matrix_row_element_acos_kernel(
    NumericT * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1,   unsigned int A_inc2,
    unsigned int A_size1,  unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const NumericT * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1,   unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
+ A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); +} + + +// asin +template<typename NumericT> +__global__ void matrix_row_element_asin_kernel( + NumericT * A, + unsigned int A_start1, unsigned int A_start2, + unsigned int A_inc1, unsigned int A_inc2, + unsigned int A_size1, unsigned int A_size2, + unsigned int A_internal_size1, unsigned int A_internal_size2, + + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2) +{ + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); +} + + +// atan +template<typename NumericT> +__global__ void matrix_row_element_atan_kernel( + NumericT * A, + unsigned int A_start1, unsigned int A_start2, + unsigned int A_inc1, unsigned int A_inc2, + unsigned int A_size1, unsigned int A_size2, + unsigned int A_internal_size1, unsigned int A_internal_size2, + + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2) +{ + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * 
B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); +} + + +// ceil +template<typename NumericT> +__global__ void matrix_row_element_ceil_kernel( + NumericT * A, + unsigned int A_start1, unsigned int A_start2, + unsigned int A_inc1, unsigned int A_inc2, + unsigned int A_size1, unsigned int A_size2, + unsigned int A_internal_size1, unsigned int A_internal_size2, + + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2) +{ + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); +} + + +// cos +template<typename NumericT> +__global__ void matrix_row_element_cos_kernel( + NumericT * A, + unsigned int A_start1, unsigned int A_start2, + unsigned int A_inc1, unsigned int A_inc2, + unsigned int A_size1, unsigned int A_size2, + unsigned int A_internal_size1, unsigned int A_internal_size2, + + const NumericT * B, + unsigned int B_start1, unsigned int B_start2, + unsigned int B_inc1, unsigned int B_inc2, + unsigned int B_internal_size1, unsigned int B_internal_size2) +{ + unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; + unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; + + for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) + for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) + A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]); +} + + +// cosh 
// Generator for the remaining elementwise unary kernels A <- FUNC(B) on
// row-major sub-matrices (same layout conventions as the kernels above).
// Expands to matrix_row_element_<FUNC>_kernel.
#define VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(FUNC)                                            \
template<typename NumericT>                                                                   \
__global__ void matrix_row_element_##FUNC##_kernel(                                           \
          NumericT * A,                                                                       \
          unsigned int A_start1, unsigned int A_start2,                                       \
          unsigned int A_inc1, unsigned int A_inc2,                                           \
          unsigned int A_size1, unsigned int A_size2,                                         \
          unsigned int A_internal_size1, unsigned int A_internal_size2,                       \
                                                                                              \
          const NumericT * B,                                                                 \
          unsigned int B_start1, unsigned int B_start2,                                       \
          unsigned int B_inc1, unsigned int B_inc2,                                           \
          unsigned int B_internal_size1, unsigned int B_internal_size2)                       \
{                                                                                             \
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;                \
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;                \
                                                                                              \
  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)                           \
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)                        \
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]               \
        = FUNC(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);    \
}

VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(cosh)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(exp)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(fabs)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(floor)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(log)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(log10)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(sin)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(sinh)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(sqrt)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(tan)
VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B(tanh)

#undef VIENNACL_CUDA_GEN_ROW_UNARY_KERNEL_B



//
// matrix-vector product
//

/** Computes result = A * v for a row-major (sub-)matrix A.
 *
 * Each block processes one row at a time (grid-stride over rows): the threads
 * of the block privately accumulate partial dot products over the columns and
 * then combine them with a shared-memory tree reduction; lane 0 writes the
 * row's result.
 *
 * Preconditions: blockDim.x must be a power of two (tree reduction halves the
 * stride) and must not exceed 128, since `work` holds one partial sum per
 * thread.  NOTE(review): these constraints must be guaranteed by the host-side
 * launch code, which is outside this file -- confirm there.
 */
template<typename NumericT>
__global__ void vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  __shared__ NumericT work[128]; // one partial sum per thread of the block

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int lid = threadIdx.x;

  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
  {
    // per-thread partial dot product over a strided subset of the columns
    NumericT dot_prod = 0;
    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    // tree reduction of the block's partial sums in shared memory
    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1)
    {
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid+stride];
    }

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];

    // BUGFIX: barrier before the next iteration overwrites work[].  Without
    // it, a thread that has already left the reduction loop can store its
    // partial sum for the *next* row into work[lid] while lane 0 (or a lane
    // still inside the reduction) is reading work[] for the *current* row --
    // a shared-memory race yielding nondeterministic results.
    __syncthreads();
  }
}


/** Computes result = A^T * v for a row-major (sub-)matrix A.
 *
 * One thread per output entry (grid-stride over the columns of A); each
 * thread walks the rows serially, so no shared memory or synchronization is
 * needed.
 */
template<typename NumericT>
__global__ void trans_vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = 0;
    for (unsigned int col = 0; col < A_row_size; ++col)
      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
    result[row * result_inc + result_start] = dot_prod;
  }
}


//
// matrix-matrix products
//




//
// scaled rank-1-update
//

/** A += alpha * vec1 * vec2^T, with alpha passed by value from the host.
 *
 * options2 encodes modifiers for alpha: bit 0 flips the sign, bit 1 replaces
 * alpha by its reciprocal (applied in that order).  Blocks stride over rows,
 * threads over columns; the row factor alpha*vec1[row] is hoisted out of the
 * column loop.
 */
// alpha on CPU
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          NumericT val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    NumericT tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}


/** Same as above, but alpha resides in device memory (dereferenced on the
 *  GPU); option bits are identical. */
// alpha on GPU
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = *val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    NumericT tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}



} // namespace cuda
} //namespace linalg
} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
new file mode 100644
index 0000000..4821f5b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
@@ -0,0 +1,91 @@
#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

   -----------------
   ViennaCL - The Vienna Computing Library
   -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/misc_operations.hpp
    @brief Implementations of miscellaneous operations using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"


namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace detail
{

/** Substitution step of a level-scheduling triangular solve.
 *
 * Grid-stride loop over the rows of the current level: for each row, the
 * products of already-known entries of `vec` (gathered through the CSR arrays
 * row_indices / column_indices / elements) are subtracted from the entry
 * vec[row_index_array[row]].  NOTE(review): correctness relies on the rows
 * passed in one launch being mutually independent (that is what level
 * scheduling provides) -- confirmed by the caller, not by this kernel.
 */
template<typename NumericT>
__global__ void level_scheduling_substitute_kernel(
          const unsigned int * row_index_array,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * vec,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int eq_row = row_index_array[row];
    NumericT vec_entry = vec[eq_row];
    unsigned int row_end = row_indices[row+1];

    for (unsigned int j = row_indices[row]; j < row_end; ++j)
      vec_entry -= vec[column_indices[j]] * elements[j];

    vec[eq_row] = vec_entry;
  }
}



/** Host wrapper: launches the substitution kernel for one level of rows.
 *
 * @param vec            The right-hand-side / solution vector (updated in place)
 * @param row_index_array  Maps level-local row numbers to equation indices
 * @param row_buffer     CSR row pointers for the level's rows
 * @param col_buffer     CSR column indices
 * @param element_buffer CSR values
 * @param num_rows       Number of rows in this level
 */
template<typename NumericT>
void level_scheduling_substitute(vector<NumericT> & vec,
                                 viennacl::backend::mem_handle const & row_index_array,
                                 viennacl::backend::mem_handle const & row_buffer,
                                 viennacl::backend::mem_handle const & col_buffer,
                                 viennacl::backend::mem_handle const & element_buffer,
                                 vcl_size_t num_rows
                                )
{
  level_scheduling_substitute_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(row_index_array),
                                                   viennacl::cuda_arg<unsigned int>(row_buffer),
                                                   viennacl::cuda_arg<unsigned int>(col_buffer),
                                                   viennacl::cuda_arg<NumericT>(element_buffer),
                                                   viennacl::cuda_arg(vec),
                                                   static_cast<unsigned int>(num_rows)
                                                  );
  // BUGFIX: every other kernel launch in this CUDA backend is followed by a
  // launch-error check (see e.g. nmf_operations.hpp); without it a failed
  // launch here would be silently ignored.
  VIENNACL_CUDA_LAST_ERROR_CHECK("level_scheduling_substitute_kernel");
}

} //namespace detail
} //namespace cuda
} //namespace linalg
} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
new file mode 100644
index 0000000..109f74f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
@@ -0,0 +1,152 @@
#ifndef VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/vector_operations.hpp + @brief Implementations of NMF operations using CUDA + */ + +#include "viennacl/linalg/host_based/nmf_operations.hpp" + +#include "viennacl/linalg/cuda/common.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ + +/** @brief Main CUDA kernel for nonnegative matrix factorization of a dense matrices. */ +template<typename NumericT> +__global__ void el_wise_mul_div(NumericT * matrix1, + NumericT const * matrix2, + NumericT const * matrix3, + unsigned int size) +{ + for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i +=gridDim.x * blockDim.x) + { + NumericT val = matrix1[i] * matrix2[i]; + NumericT divisor = matrix3[i]; + matrix1[i] = (divisor > (NumericT) 0.00001) ? (val / divisor) : NumericT(0); + } +} + +/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized. 
+ * + * @param V Input matrix + * @param W First factor + * @param H Second factor + * @param conf A configuration object holding tolerances and the like + */ +template<typename NumericT> +void nmf(viennacl::matrix_base<NumericT> const & V, + viennacl::matrix_base<NumericT> & W, + viennacl::matrix_base<NumericT> & H, + viennacl::linalg::nmf_config const & conf) +{ + vcl_size_t k = W.size2(); + conf.iters_ = 0; + + if (!viennacl::linalg::norm_frobenius(W)) + W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0)); + + if (!viennacl::linalg::norm_frobenius(H)) + H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0)); + + viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major()); + viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major()); + viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major()); + + viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major()); + viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major()); + viennacl::matrix_base<NumericT> htmp(k, k, H.row_major()); + + viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major()); + + viennacl::vector<NumericT> diff(V.size1() * V.size2()); + + NumericT last_diff = 0; + NumericT diff_init = 0; + bool stagnation_flag = false; + + for (vcl_size_t i = 0; i < conf.max_iterations(); i++) + { + conf.iters_ = i + 1; + + hn = viennacl::linalg::prod(trans(W), V); + htmp = viennacl::linalg::prod(trans(W), W); + hd = viennacl::linalg::prod(htmp, H); + + el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(H), + viennacl::cuda_arg<NumericT>(hn), + viennacl::cuda_arg<NumericT>(hd), + static_cast<unsigned int>(H.internal_size1() * H.internal_size2())); + VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div"); + + wn = viennacl::linalg::prod(V, trans(H)); + wtmp = viennacl::linalg::prod(W, H); + wd = viennacl::linalg::prod(wtmp, trans(H)); + + el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(W), + 
viennacl::cuda_arg<NumericT>(wn), + viennacl::cuda_arg<NumericT>(wd), + static_cast<unsigned int>( W.internal_size1() * W.internal_size2())); + VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div"); + + if (i % conf.check_after_steps() == 0) //check for convergence + { + appr = viennacl::linalg::prod(W, H); + + appr -= V; + NumericT diff_val = viennacl::linalg::norm_frobenius(appr); + + if (i == 0) + diff_init = diff_val; + + if (conf.print_relative_error()) + std::cout << diff_val / diff_init << std::endl; + + // Approximation check + if (diff_val / diff_init < conf.tolerance()) + break; + + // Stagnation check + if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates + { + if (stagnation_flag) // iteration stagnates (two iterates with no notable progress) + break; + else + // record stagnation in this iteration + stagnation_flag = true; + } else + // good progress in this iteration, so unset stagnation flag + stagnation_flag = false; + + // prepare for next iterate: + last_diff = diff_val; + } + } +} + +} //namespace cuda +} //namespace linalg +} //namespace viennacl + +#endif /* VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_ */ http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp new file mode 100644 index 0000000..3adaca2 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp @@ -0,0 +1,375 @@ +#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_ +#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + 
Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/scalar_operations.hpp + @brief Implementations of scalar operations using CUDA +*/ + +#include "viennacl/forwards.h" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/traits/stride.hpp" +#include "viennacl/linalg/cuda/common.hpp" + +// includes CUDA +#include <cuda_runtime.h> + + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ + +/////////////////// as ///////////////////////////// + +template<typename NumericT> +__global__ void as_kernel(NumericT * s1, const NumericT * fac2, unsigned int options2, const NumericT * s2) +{ + NumericT alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + *s1 = *s2 * alpha; +} + +template<typename NumericT> +__global__ void as_kernel(NumericT * s1, NumericT fac2, unsigned int options2, const NumericT * s2) +{ + NumericT alpha = fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + *s1 = *s2 * alpha; +} + +template<typename ScalarT1, + typename ScalarT2, typename NumericT> +typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value + && viennacl::is_scalar<ScalarT2>::value + && viennacl::is_any_scalar<NumericT>::value + >::type +as(ScalarT1 & s1, + ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha) +{ + typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; + + unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); + + value_type temporary_alpha = 0; + if (viennacl::is_cpu_scalar<NumericT>::value) + temporary_alpha = alpha; + + as_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), + viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), + options_alpha, + viennacl::cuda_arg(s2)); + VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel"); +} + +//////////////////// asbs //////////////////////////// + +// alpha and beta on GPU +template<typename NumericT> +__global__ void asbs_kernel(NumericT * s1, + const NumericT * fac2, unsigned int options2, const NumericT * s2, + const NumericT * fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = *fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 = *s2 * alpha + *s3 * beta; +} + +// alpha on CPU, beta on GPU +template<typename NumericT> +__global__ void asbs_kernel(NumericT * s1, + NumericT fac2, unsigned int options2, const NumericT * s2, + NumericT const * fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = *fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 = *s2 * alpha + *s3 * beta; +} + +// alpha on GPU, beta on CPU +template<typename NumericT> +__global__ void asbs_kernel(NumericT * s1, + NumericT const * fac2, unsigned int options2, const NumericT * s2, + NumericT fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if 
(options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 = *s2 * alpha + *s3 * beta; +} + +// alpha and beta on CPU +template<typename NumericT> +__global__ void asbs_kernel(NumericT * s1, + NumericT fac2, unsigned int options2, const NumericT * s2, + NumericT fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 = *s2 * alpha + *s3 * beta; +} + + +template<typename ScalarT1, + typename ScalarT2, typename NumericT1, + typename ScalarT3, typename NumericT2> +typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value + && viennacl::is_scalar<ScalarT2>::value + && viennacl::is_scalar<ScalarT3>::value + && viennacl::is_any_scalar<NumericT1>::value + && viennacl::is_any_scalar<NumericT2>::value + >::type +asbs(ScalarT1 & s1, + ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, + ScalarT3 const & s3, NumericT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) +{ + typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; + + unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); + unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); + + value_type temporary_alpha = 0; + if (viennacl::is_cpu_scalar<NumericT1>::value) + temporary_alpha = alpha; + + value_type temporary_beta = 0; + if (viennacl::is_cpu_scalar<NumericT2>::value) + temporary_beta = beta; + + asbs_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), + viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), + 
options_alpha, + viennacl::cuda_arg(s2), + viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), + options_beta, + viennacl::cuda_arg(s3) ); + VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel"); +} + +//////////////////// asbs_s //////////////////// + +// alpha and beta on GPU +template<typename NumericT> +__global__ void asbs_s_kernel(NumericT * s1, + const NumericT * fac2, unsigned int options2, const NumericT * s2, + const NumericT * fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = *fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 += *s2 * alpha + *s3 * beta; +} + +// alpha on CPU, beta on GPU +template<typename NumericT> +__global__ void asbs_s_kernel(NumericT * s1, + NumericT fac2, unsigned int options2, const NumericT * s2, + NumericT const * fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = *fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 += *s2 * alpha + *s3 * beta; +} + +// alpha on GPU, beta on CPU +template<typename NumericT> +__global__ void asbs_s_kernel(NumericT * s1, + NumericT const * fac2, unsigned int options2, const NumericT * s2, + NumericT fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = *fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 += *s2 * alpha + *s3 * beta; +} + +// alpha and beta on CPU +template<typename NumericT> +__global__ void asbs_s_kernel(NumericT * s1, + NumericT fac2, 
unsigned int options2, const NumericT * s2, + NumericT fac3, unsigned int options3, const NumericT * s3) +{ + NumericT alpha = fac2; + if (options2 & (1 << 0)) + alpha = -alpha; + if (options2 & (1 << 1)) + alpha = NumericT(1) / alpha; + + NumericT beta = fac3; + if (options3 & (1 << 0)) + beta = -beta; + if (options3 & (1 << 1)) + beta = NumericT(1) / beta; + + *s1 += *s2 * alpha + *s3 * beta; +} + + +template<typename ScalarT1, + typename ScalarT2, typename NumericT1, + typename ScalarT3, typename NumericT2> +typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value + && viennacl::is_scalar<ScalarT2>::value + && viennacl::is_scalar<ScalarT3>::value + && viennacl::is_any_scalar<NumericT1>::value + && viennacl::is_any_scalar<NumericT2>::value + >::type +asbs_s(ScalarT1 & s1, + ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, + ScalarT3 const & s3, NumericT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) +{ + typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; + + unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); + unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); + + value_type temporary_alpha = 0; + if (viennacl::is_cpu_scalar<NumericT1>::value) + temporary_alpha = alpha; + + value_type temporary_beta = 0; + if (viennacl::is_cpu_scalar<NumericT2>::value) + temporary_beta = beta; + + std::cout << "Launching asbs_s_kernel..." 
<< std::endl; + asbs_s_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), + viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), + options_alpha, + viennacl::cuda_arg(s2), + viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), + options_beta, + viennacl::cuda_arg(s3) ); + VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel"); +} + +///////////////// swap ////////////////// + +template<typename NumericT> +__global__ void scalar_swap_kernel(NumericT * s1, NumericT * s2) +{ + NumericT tmp = *s2; + *s2 = *s1; + *s1 = tmp; +} + +/** @brief Swaps the contents of two scalars, data is copied +* +* @param s1 The first scalar +* @param s2 The second scalar +*/ +template<typename ScalarT1, typename ScalarT2> +typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value + && viennacl::is_scalar<ScalarT2>::value + >::type +swap(ScalarT1 & s1, ScalarT2 & s2) +{ + typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; + + scalar_swap_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), viennacl::cuda_arg(s2)); +} + + + +} //namespace single_threaded +} //namespace linalg +} //namespace viennacl + + +#endif
