http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp deleted file mode 100644 index 45d6987..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp +++ /dev/null @@ -1,1468 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_ -#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/cuda/matrix_operations_row.hpp - @brief Implementations of row-major dense matrix related operations, including matrix-vector products, using CUDA. 
-*/ - - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ - -template<typename DestNumericT, typename SrcNumericT> -__global__ void convert_row_kernel( - DestNumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - const SrcNumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2) -{ - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]; -} - -//Matrix transpose kernel -template<typename NumericT> -__global__ void trans_kernel( - const NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_stride1, unsigned int A_stride2, - - NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - unsigned int B_stride1, unsigned int B_stride2, - bool data_major) -{ - for(unsigned int row = blockIdx.x; row<A_size1; row+=gridDim.x) - { - for(unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x) - { - if(data_major) - B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)]; - else - B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) 
+ (A_start2 + A_stride2 * col) * A_internal_size1]; - } - } -} - -// -// am -// - -// alpha on CPU -template<typename NumericT> -__global__ void am_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - NumericT fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha; - } -} - -// alpha on GPU -template<typename NumericT> -__global__ void am_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - const NumericT * fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int 
B_internal_size2) -{ - NumericT alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha; - } -} - - -// -// ambm -// - -// alpha and beta on CPU -template<typename NumericT> -__global__ void ambm_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - NumericT fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - NumericT fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & 
(1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - - -// alpha on CPU, beta on GPU -template<typename NumericT> -__global__ void ambm_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - 
NumericT fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - const NumericT * fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = *fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 
+ col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - -// alpha on GPU, beta on CPU -template<typename NumericT> -__global__ void ambm_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - const NumericT * fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - NumericT fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int 
row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - - -// alpha and beta on GPU -template<typename NumericT> -__global__ void ambm_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - const NumericT * fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - const NumericT * fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT 
alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = *fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - - -// -// 
ambm_m -// - -// alpha and beta on CPU -template<typename NumericT> -__global__ void ambm_m_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - NumericT fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - NumericT fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = 
row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - - -// alpha on CPU, beta on GPU -template<typename NumericT> -__global__ void ambm_m_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - NumericT fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - const NumericT * fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = *fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col 
= col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - -// alpha on GPU, beta on CPU -template<typename NumericT> -__global__ void ambm_m_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - const NumericT * fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int 
B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - NumericT fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for 
(unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - - -// alpha and beta on GPU -template<typename NumericT> -__global__ void ambm_m_row_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - - const NumericT * fac2, - unsigned int options2, - const NumericT * B, - unsigned int B_start1, unsigned int B_start2, - unsigned int B_inc1, unsigned int B_inc2, - unsigned int B_internal_size1, unsigned int B_internal_size2, - - const NumericT * fac3, - unsigned int options3, - const NumericT * C, - unsigned int C_start1, unsigned int C_start2, - unsigned int C_inc1, unsigned int C_inc2, - unsigned int C_internal_size1, unsigned int C_internal_size2) -{ - NumericT alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - - NumericT beta = *fac3; - if (options3 & (1 << 0)) - beta = -beta; - - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - if (options2 & (1 << 1)) - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += 
blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } - else - { - if (options3 & (1 << 1)) - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta; - } - else - { - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] - += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha - + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta; - } - } -} - -// -// assignments -// - -template<typename NumericT> -__global__ void matrix_row_assign_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int A_inc1, unsigned int A_inc2, - unsigned int A_size1, unsigned int A_size2, - unsigned int A_internal_size1, unsigned int A_internal_size2, - NumericT alpha) -{ - unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x; - unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x; - - for (unsigned int row = row_gid; row < A_size1; row += gridDim.x) - for (unsigned int col = col_gid; col < A_size2; col += blockDim.x) - A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha; -} - - -template<typename NumericT> -__global__ void matrix_row_diagonal_assign_kernel( - NumericT * A, - unsigned int A_start1, unsigned int A_start2, - unsigned int 
          A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          NumericT alpha)
{
  // NOTE(review): the head of this kernel's signature lies in a preceding
  // chunk of the file. From the body: writes 'alpha' to the diagonal entries
  // A(row, row) of a strided row-major sub-matrix (row index is used for both
  // the row and the column offset).
  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);

  // Grid-stride loop: one thread per diagonal element.
  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
    A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
}

//
// binary element-wise operations
//

/** @brief Element-wise binary operation A = B op C on strided row-major
*          sub-matrices; op is selected at runtime via 'op_type'
*          (0: product, 1: division, 2: pow).
*
* Indexing scheme shared by the row-major kernels in this file: the flat
* thread id is split as
*   row_gid = gid / blockDim.x  (identical to blockIdx.x, since threadIdx.x < blockDim.x)
*   col_gid = gid % blockDim.x  (identical to threadIdx.x)
* so each block walks whole rows (stride gridDim.x) while its threads walk
* the columns of a row (stride blockDim.x). Columns of a row are contiguous
* up to the 'inc2' stride. A_internal_size1 is unused by the row-major
* address computation; it is kept for a uniform kernel signature.
*/
template<typename NumericT>
__global__ void element_op_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (op_type == 2)       // A = pow(B, C)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
         = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
               C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
  }
  else if (op_type == 1)  // A = B / C
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
         = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
         / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
  else if (op_type == 0)  // A = B * C (element-wise)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
         = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
         * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
}

/** @brief Integer variant of element_op_row_kernel: A = B op C with
*          op_type 0 (product) or 1 (division). No pow branch (pow is not
*          defined for integer NumericT); an op_type of 2 is silently ignored.
*/
template<typename NumericT>
__global__ void element_op_int_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (op_type == 1)       // A = B / C
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
         = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
         / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
  else if (op_type == 0)  // A = B * C (element-wise)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
         = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
         * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
}

//
// unary element-wise operations
//
// Each kernel below computes A = f(B) element-wise on strided row-major
// sub-matrices, using the same row_gid/col_gid scheme documented at
// element_op_row_kernel. The math function resolves via the usual CUDA
// device overloads for the instantiated NumericT.

// abs: A = abs(B)
template<typename NumericT>
__global__ void matrix_row_element_abs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// acos: A = acos(B)
template<typename NumericT>
__global__ void matrix_row_element_acos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// asin: A = asin(B)
template<typename NumericT>
__global__ void matrix_row_element_asin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// atan: A = atan(B)
template<typename NumericT>
__global__ void matrix_row_element_atan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// ceil: A = ceil(B)
template<typename NumericT>
__global__ void matrix_row_element_ceil_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// cos: A = cos(B)
template<typename NumericT>
__global__ void matrix_row_element_cos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// cosh
// cosh: A = cosh(B) element-wise on strided row-major sub-matrices.
// Indexing: row_gid == blockIdx.x, col_gid == threadIdx.x (blocks stride
// over rows, threads over columns); A_internal_size1 is unused here.
template<typename NumericT>
__global__ void matrix_row_element_cosh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// exp: A = exp(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_exp_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// fabs: A = fabs(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_fabs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// floor: A = floor(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_floor_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// log: A = log(B) (natural logarithm), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_log_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// log10: A = log10(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_log10_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// sin: A = sin(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_sin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// sinh: A = sinh(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_sinh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// sqrt: A = sqrt(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_sqrt_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// tan: A = tan(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_tan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// tanh: A = tanh(B), same scheme as above
template<typename NumericT>
__global__ void matrix_row_element_tanh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}



//
// matrix-vector product
//

/** @brief result = A * v for a strided row-major sub-matrix A.
*
* One block per matrix row (blocks stride over rows with gridDim.x); the
* threads of a block accumulate partial dot products over the columns, then
* reduce them in shared memory; lane 0 writes the row result.
*
* Preconditions implied by the shared buffer and the halving reduction:
* blockDim.x must be a power of two and at most 128 —
* NOTE(review): confirm all launch sites guarantee this.
*/
template<typename NumericT>
__global__ void vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  __shared__ NumericT work[128];

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int lid = threadIdx.x;

  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
  {
    // Per-thread partial dot product over this row's columns.
    NumericT dot_prod = 0;
    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    // Tree reduction in shared memory; the barrier precedes each halving so
    // all writes of the previous step are visible. Loop bounds are uniform
    // across the block, so __syncthreads() is not in divergent control flow.
    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid+stride];
    }

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];
  }
}


/** @brief result = A^T * v for a strided row-major sub-matrix A.
*
* Grid-stride loop with one thread per entry of the result (i.e. per column
* of A); each thread serially accumulates the full dot product, so no shared
* memory or synchronization is needed.
*/
template<typename NumericT>
__global__ void trans_vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = 0;
    for (unsigned int col = 0; col < A_row_size; ++col)
      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
    result[row * result_inc + result_start] = dot_prod;
  }
}


//
// matrix-matrix products
//




//
// scaled rank-1-update
//

/** @brief A += alpha * vec1 * vec2^T (rank-1 update), alpha passed by value
*          from the host.
*
* options2 encodes alpha post-processing: bit 0 set -> negate alpha,
* bit 1 set -> use 1/alpha. Blocks stride over rows, threads over columns
* (row_gid == blockIdx.x, col_gid == threadIdx.x).
*/
// alpha on CPU
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          NumericT val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    // Hoist the row-constant factor alpha * vec1[row] out of the column loop.
    NumericT tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}


/** @brief Same rank-1 update as above, but alpha resides in device memory
*          (dereferenced on the GPU); options2 semantics are identical.
*/
// alpha on GPU
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = *val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    NumericT tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}



} // namespace cuda
} //namespace linalg
} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
deleted file mode 100644
index 4821f5b..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/misc_operations.hpp
    @brief Implementations of miscellaneous operations using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"


namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace detail
{

/** @brief Substitution step for one level of a level-scheduled sparse
*          triangular solve.
*
* Rows within one level are independent, so a grid-stride loop assigns one
* thread per row of the level. For row r of the level: looks up its equation
* index eq_row, subtracts the CSR row's off-level contributions
* (column_indices/elements) from vec[eq_row], and writes the result back.
* Correctness relies on all columns referenced by this level having been
* finalized by earlier kernel launches (one launch per level).
*/
template<typename NumericT>
__global__ void level_scheduling_substitute_kernel(
          const unsigned int * row_index_array,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * vec,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int eq_row = row_index_array[row];
    NumericT vec_entry = vec[eq_row];
    unsigned int row_end = row_indices[row+1];

    for (unsigned int j = row_indices[row]; j < row_end; ++j)
      vec_entry -= vec[column_indices[j]] * elements[j];

    vec[eq_row] = vec_entry;
  }
}



/** @brief Host wrapper: launches the substitution kernel for one level
*          (fixed 128x128 launch configuration; the grid-stride loop inside
*          the kernel handles arbitrary num_rows).
*
* NOTE(review): unlike other CUDA wrappers in this code base, no
* VIENNACL_CUDA_LAST_ERROR_CHECK follows this launch — confirm whether that
* is intentional.
*/
template<typename NumericT>
void level_scheduling_substitute(vector<NumericT> & vec,
                                 viennacl::backend::mem_handle const & row_index_array,
                                 viennacl::backend::mem_handle const & row_buffer,
                                 viennacl::backend::mem_handle const & col_buffer,
                                 viennacl::backend::mem_handle const & element_buffer,
                                 vcl_size_t num_rows
                                )
{
  level_scheduling_substitute_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(row_index_array),
                                                   viennacl::cuda_arg<unsigned int>(row_buffer),
                                                   viennacl::cuda_arg<unsigned int>(col_buffer),
                                                   viennacl::cuda_arg<NumericT>(element_buffer),
                                                   viennacl::cuda_arg(vec),
                                                   static_cast<unsigned int>(num_rows)
                                                  );
}

} //namespace detail
} //namespace cuda
} //namespace linalg
} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
deleted file mode 100644
index 109f74f..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
#ifndef VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.
                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/nmf_operations.hpp
    @brief Implementations of NMF operations using CUDA
 */

#include "viennacl/linalg/host_based/nmf_operations.hpp"

#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{

/** @brief Element-wise multiplicative-update kernel for nonnegative matrix
*          factorization of dense matrices:
*          matrix1[i] = matrix1[i] * matrix2[i] / matrix3[i],
*          with the quotient clamped to 0 whenever the divisor is not
*          greater than 1e-5 (guards against division by ~0).
*/
template<typename NumericT>
__global__ void el_wise_mul_div(NumericT * matrix1,
                                NumericT const * matrix2,
                                NumericT const * matrix3,
                                unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT val = matrix1[i] * matrix2[i];
    NumericT divisor = matrix3[i];
    matrix1[i] = (divisor > (NumericT) 0.00001) ? (val / divisor) : NumericT(0);
  }
}

/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
 *
 * @param V     Input matrix
 * @param W     First factor
 * @param H     Second factor
 * @param conf  A configuration object holding tolerances and the like
 */
template<typename NumericT>
void nmf(viennacl::matrix_base<NumericT> const & V,
         viennacl::matrix_base<NumericT> & W,
         viennacl::matrix_base<NumericT> & H,
         viennacl::linalg::nmf_config const & conf)
{
  vcl_size_t k = W.size2();
  conf.iters_ = 0;

  // Seed all-ones factors when a factor is identically zero (Frobenius norm
  // of 0), since multiplicative updates cannot escape an all-zero start.
  if (!viennacl::linalg::norm_frobenius(W))
    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));

  if (!viennacl::linalg::norm_frobenius(H))
    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));

  // Work buffers: numerators (wn/hn), denominators (wd/hd), and
  // intermediate products, all matching the layout of their target factor.
  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());

  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());

  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());

  viennacl::vector<NumericT> diff(V.size1() * V.size2());

  NumericT last_diff = 0;
  NumericT diff_init = 0;
  bool stagnation_flag = false;

  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
  {
    conf.iters_ = i + 1;

    // H update: H <- H .* (W^T V) ./ (W^T W H)
    hn = viennacl::linalg::prod(trans(W), V);
    htmp = viennacl::linalg::prod(trans(W), W);
    hd = viennacl::linalg::prod(htmp, H);

    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(H),
                                  viennacl::cuda_arg<NumericT>(hn),
                                  viennacl::cuda_arg<NumericT>(hd),
                                  static_cast<unsigned int>(H.internal_size1() * H.internal_size2()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");

    // W update: W <- W .* (V H^T) ./ (W H H^T)
    wn = viennacl::linalg::prod(V, trans(H));
    wtmp = viennacl::linalg::prod(W, H);
    wd = viennacl::linalg::prod(wtmp, trans(H));

    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(W),
                                  viennacl::cuda_arg<NumericT>(wn),
                                  viennacl::cuda_arg<NumericT>(wd),
                                  static_cast<unsigned int>( W.internal_size1() * W.internal_size2()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");

    if (i % conf.check_after_steps() == 0) //check for convergence
    {
      appr = viennacl::linalg::prod(W, H);

      appr -= V;
      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);

      if (i == 0)
        diff_init = diff_val;

      if (conf.print_relative_error())
        std::cout << diff_val / diff_init << std::endl;

      // Approximation check
      if (diff_val / diff_init < conf.tolerance())
        break;

      // Stagnation check
      if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
      {
        if (stagnation_flag) // iteration stagnates (two iterates with no notable progress)
          break;
        else
          // record stagnation in this iteration
          stagnation_flag = true;
      } else
        // good progress in this iteration, so unset stagnation flag
        stagnation_flag = false;

      // prepare for next iterate:
      last_diff = diff_val;
    }
  }
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif /* VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
deleted file mode 100644
index 3adaca2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
+++ /dev/null
@@ -1,375 +0,0 @@
#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for
                                              Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/scalar_operations.hpp
    @brief Implementations of scalar operations using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/stride.hpp"
#include "viennacl/linalg/cuda/common.hpp"

// includes CUDA
#include <cuda_runtime.h>


namespace viennacl
{
namespace linalg
{
namespace cuda
{

/////////////////// as /////////////////////////////

// Single-thread kernel: *s1 = alpha * (*s2), with alpha read from device
// memory. options2 bit 0: negate alpha; bit 1: use 1/alpha. Launched <<<1,1>>>.
template<typename NumericT>
__global__ void as_kernel(NumericT * s1, const NumericT * fac2, unsigned int options2, const NumericT * s2)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  *s1 = *s2 * alpha;
}

// Overload with alpha passed by value from the host; same options2 semantics.
template<typename NumericT>
__global__ void as_kernel(NumericT * s1, NumericT fac2, unsigned int options2, const NumericT * s2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  *s1 = *s2 * alpha;
}

/** @brief s1 = alpha * s2 for GPU scalars; alpha may live on host or device.
*
* flip_sign_alpha / reciprocal_alpha are folded into an options bitfield that
* the kernel decodes (negate / reciprocate). For a CPU alpha, the value is
* staged in temporary_alpha so arg_reference can hand the kernel a uniform
* argument; overload resolution on as_kernel then picks the by-value or
* by-pointer variant.
*/
template<typename ScalarT1,
         typename ScalarT2, typename NumericT>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_any_scalar<NumericT>::value
                            >::type
as(ScalarT1 & s1,
   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    temporary_alpha = alpha;

  as_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                      viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                      options_alpha,
                      viennacl::cuda_arg(s2));
  VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel");
}

//////////////////// asbs ////////////////////////////

// Single-thread kernels: *s1 = alpha * (*s2) + beta * (*s3).
// options2/options3 decode alpha/beta respectively: bit 0 -> negate,
// bit 1 -> reciprocal. The three overloads cover host/device placement
// combinations of alpha and beta.

// alpha and beta on GPU
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            const NumericT * fac2, unsigned int options2, const NumericT * s2,
                            const NumericT * fac3, unsigned int options3, const NumericT * s3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;
  if (options3 & (1 << 1))
    beta = NumericT(1) / beta;

  *s1 = *s2 * alpha + *s3 * beta;
}

// alpha on CPU, beta on GPU
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT fac2, unsigned int options2, const NumericT * s2,
                            NumericT const * fac3, unsigned int options3, const NumericT * s3)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  NumericT beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;
  if (options3 & (1 << 1))
    beta = NumericT(1) / beta;

  *s1 = *s2 * alpha + *s3 * beta;
}

// alpha on GPU, beta on CPU
// NOTE(review): this overload continues beyond the end of this chunk; the
// remainder of its body is not visible here.
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT const * fac2, unsigned int options2, const NumericT * s2,
                            NumericT fac3, unsigned int options3, const NumericT * s3)
{
  NumericT alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;
- if (options2 & (1 << 1)) - alpha = NumericT(1) / alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - if (options3 & (1 << 1)) - beta = NumericT(1) / beta; - - *s1 = *s2 * alpha + *s3 * beta; -} - -// alpha and beta on CPU -template<typename NumericT> -__global__ void asbs_kernel(NumericT * s1, - NumericT fac2, unsigned int options2, const NumericT * s2, - NumericT fac3, unsigned int options3, const NumericT * s3) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - if (options2 & (1 << 1)) - alpha = NumericT(1) / alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - if (options3 & (1 << 1)) - beta = NumericT(1) / beta; - - *s1 = *s2 * alpha + *s3 * beta; -} - - -template<typename ScalarT1, - typename ScalarT2, typename NumericT1, - typename ScalarT3, typename NumericT2> -typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value - && viennacl::is_scalar<ScalarT2>::value - && viennacl::is_scalar<ScalarT3>::value - && viennacl::is_any_scalar<NumericT1>::value - && viennacl::is_any_scalar<NumericT2>::value - >::type -asbs(ScalarT1 & s1, - ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, - ScalarT3 const & s3, NumericT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) -{ - typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; - - unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); - unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); - - value_type temporary_alpha = 0; - if (viennacl::is_cpu_scalar<NumericT1>::value) - temporary_alpha = alpha; - - value_type temporary_beta = 0; - if (viennacl::is_cpu_scalar<NumericT2>::value) - temporary_beta = beta; - - asbs_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), - viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), - 
options_alpha, - viennacl::cuda_arg(s2), - viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), - options_beta, - viennacl::cuda_arg(s3) ); - VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel"); -} - -//////////////////// asbs_s //////////////////// - -// alpha and beta on GPU -template<typename NumericT> -__global__ void asbs_s_kernel(NumericT * s1, - const NumericT * fac2, unsigned int options2, const NumericT * s2, - const NumericT * fac3, unsigned int options3, const NumericT * s3) -{ - NumericT alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - if (options2 & (1 << 1)) - alpha = NumericT(1) / alpha; - - NumericT beta = *fac3; - if (options3 & (1 << 0)) - beta = -beta; - if (options3 & (1 << 1)) - beta = NumericT(1) / beta; - - *s1 += *s2 * alpha + *s3 * beta; -} - -// alpha on CPU, beta on GPU -template<typename NumericT> -__global__ void asbs_s_kernel(NumericT * s1, - NumericT fac2, unsigned int options2, const NumericT * s2, - NumericT const * fac3, unsigned int options3, const NumericT * s3) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - if (options2 & (1 << 1)) - alpha = NumericT(1) / alpha; - - NumericT beta = *fac3; - if (options3 & (1 << 0)) - beta = -beta; - if (options3 & (1 << 1)) - beta = NumericT(1) / beta; - - *s1 += *s2 * alpha + *s3 * beta; -} - -// alpha on GPU, beta on CPU -template<typename NumericT> -__global__ void asbs_s_kernel(NumericT * s1, - NumericT const * fac2, unsigned int options2, const NumericT * s2, - NumericT fac3, unsigned int options3, const NumericT * s3) -{ - NumericT alpha = *fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - if (options2 & (1 << 1)) - alpha = NumericT(1) / alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - if (options3 & (1 << 1)) - beta = NumericT(1) / beta; - - *s1 += *s2 * alpha + *s3 * beta; -} - -// alpha and beta on CPU -template<typename NumericT> -__global__ void asbs_s_kernel(NumericT * s1, - NumericT fac2, 
unsigned int options2, const NumericT * s2, - NumericT fac3, unsigned int options3, const NumericT * s3) -{ - NumericT alpha = fac2; - if (options2 & (1 << 0)) - alpha = -alpha; - if (options2 & (1 << 1)) - alpha = NumericT(1) / alpha; - - NumericT beta = fac3; - if (options3 & (1 << 0)) - beta = -beta; - if (options3 & (1 << 1)) - beta = NumericT(1) / beta; - - *s1 += *s2 * alpha + *s3 * beta; -} - - -template<typename ScalarT1, - typename ScalarT2, typename NumericT1, - typename ScalarT3, typename NumericT2> -typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value - && viennacl::is_scalar<ScalarT2>::value - && viennacl::is_scalar<ScalarT3>::value - && viennacl::is_any_scalar<NumericT1>::value - && viennacl::is_any_scalar<NumericT2>::value - >::type -asbs_s(ScalarT1 & s1, - ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, - ScalarT3 const & s3, NumericT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) -{ - typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; - - unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); - unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); - - value_type temporary_alpha = 0; - if (viennacl::is_cpu_scalar<NumericT1>::value) - temporary_alpha = alpha; - - value_type temporary_beta = 0; - if (viennacl::is_cpu_scalar<NumericT2>::value) - temporary_beta = beta; - - std::cout << "Launching asbs_s_kernel..." 
<< std::endl; - asbs_s_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), - viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), - options_alpha, - viennacl::cuda_arg(s2), - viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), - options_beta, - viennacl::cuda_arg(s3) ); - VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel"); -} - -///////////////// swap ////////////////// - -template<typename NumericT> -__global__ void scalar_swap_kernel(NumericT * s1, NumericT * s2) -{ - NumericT tmp = *s2; - *s2 = *s1; - *s1 = tmp; -} - -/** @brief Swaps the contents of two scalars, data is copied -* -* @param s1 The first scalar -* @param s2 The second scalar -*/ -template<typename ScalarT1, typename ScalarT2> -typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value - && viennacl::is_scalar<ScalarT2>::value - >::type -swap(ScalarT1 & s1, ScalarT2 & s2) -{ - typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type value_type; - - scalar_swap_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), viennacl::cuda_arg(s2)); -} - - - -} //namespace single_threaded -} //namespace linalg -} //namespace viennacl - - -#endif
