http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp new file mode 100644 index 0000000..24cb4a6 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp @@ -0,0 +1,2887 @@ +#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_ +#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/matrix_operations_prod.hpp + @brief Dense matrix-matrix product CUDA kernels reside here. + + Note: File created semi-automatically from OpenCL kernels. +*/ + +#include "viennacl/forwards.h" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ + +// matrix-matrix multiplication C = A * B +// matrix layouts: C...col_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_col_prod_AA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; + vcl_size_t bStep = block_size * B_row_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A * B^T +// matrix layouts: C...col_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_col_prod_AT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; + vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A^T * B +// matrix layouts: C...col_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_col_prod_TA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; + vcl_size_t bStep = block_size * B_row_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A^T * B^T +// matrix layouts: C...col_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_col_prod_TT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; + vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + + + +//////////////////////////////////////////////////////////////////////////// + + + + +// matrix-matrix multiplication C = A * B +// matrix layouts: C...row_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_col_prod_AA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; + vcl_size_t bStep = block_size * B_row_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + +// matrix-matrix multiplication C = A * B^T +// matrix layouts: C...row_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_col_prod_AT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; + vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + +// matrix-matrix multiplication C = A^T * B +// matrix layouts: C...row_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_col_prod_TA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; + vcl_size_t bStep = block_size * B_row_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + +// matrix-matrix multiplication C = A^T * B^T +// matrix layouts: C...row_major, A...col_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_col_prod_TT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; + vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + + + + +//////////////////////////////////////////////////////////////////////////// + + + + +// matrix-matrix multiplication C = A * B +// matrix layouts: C...col_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_row_prod_AA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; + vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A * B^T +// matrix layouts: C...col_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_row_prod_AT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; + vcl_size_t bStep = block_size * B_col_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A^T * B +// matrix layouts: C...col_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_row_prod_TA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; + vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A^T * B^T +// matrix layouts: C...col_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_col_col_row_prod_TT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; + vcl_size_t bStep = block_size * B_col_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + + + +//////////////////////////////////////////////////////////////////////////// + + + + +// matrix-matrix multiplication C = A * B +// matrix layouts: C...row_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_row_prod_AA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; + vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + +// matrix-matrix multiplication C = A * B^T +// matrix layouts: C...row_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_row_prod_AT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + A_col_start * A_internal_rows; + vcl_size_t aStep = block_size * A_col_inc * A_internal_rows; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; + vcl_size_t bStep = block_size * B_col_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_col_size) && (row_block_id * block_size + row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + +// matrix-matrix multiplication C = A^T * B +// matrix layouts: C...row_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_row_prod_TA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + B_row_start * B_internal_cols; + vcl_size_t bStep = block_size * B_internal_cols * B_row_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_row_size) && (col_block_id * block_size + row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + +// matrix-matrix multiplication C = A^T * B^T +// matrix layouts: C...row_major, A...col_major, B...row_major +template<typename NumericT> +__global__ void matrix_matrix_row_col_row_prod_TT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * A_internal_rows + A_row_start; + vcl_size_t aStep = block_size * A_row_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * B_internal_cols + B_col_start; + vcl_size_t bStep = block_size * B_col_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * A_internal_rows; + vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * B_internal_cols; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_row_size) && (row_block_id * block_size + col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_col_size) && (col_block_id * block_size + col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start]; +} + + + + + +//////////////////////////////////////////////////////////////////////////// + + + + + + +// matrix-matrix multiplication C = A * B +// matrix layouts: C...col_major, A...row_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_row_col_prod_AA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; + vcl_size_t aStep = block_size * A_col_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; + vcl_size_t bStep = block_size * B_row_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A * B^T +// matrix layouts: C...col_major, A...row_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_row_col_prod_AT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * A_internal_cols + A_col_start; + vcl_size_t aStep = block_size * A_col_inc; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; + vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; + vcl_size_t block_num = (A_col_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < A_col_size) && (row_block_id * block_size + col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0; + bufB[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < B_col_size) && (col_block_id * block_size + row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_row_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A^T * B +// matrix layouts: C...col_major, A...row_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_row_col_prod_TA_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; + vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; + vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * B_internal_rows + B_row_start; + vcl_size_t bStep = block_size * B_row_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) / block_size; + NumericT Csub = 0; + vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * A_internal_cols; + vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * B_internal_rows; + + vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1); + vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1); + for (vcl_size_t block = 0; + block < block_num; + ++block) + { + bufA[row_thread_id_times_block_size + col_thread_id] = ((block * block_size + col_thread_id < A_row_size) && (row_block_id * block_size + row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0; + bufB[col_thread_id_times_block_size + row_thread_id] = ((block * block_size + row_thread_id < B_row_size) && (col_block_id * block_size + col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0; + __syncthreads(); + NumericT * bufAptr = bufA + row_thread_id_times_block_size; + NumericT * bufBptr = bufB + col_thread_id_times_block_size; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr; + __syncthreads(); + aBegin += aStep; + bBegin += bStep; + } + if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * blockDim.y + threadIdx.y) < B_col_size) + C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + ((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * C_internal_rows]; +} + +// matrix-matrix multiplication C = A^T * B^T +// matrix layouts: C...col_major, A...row_major, B...col_major +template<typename NumericT> +__global__ void matrix_matrix_col_row_col_prod_TT_kernel( + NumericT alpha, + const NumericT * A, + unsigned int A_row_start, + unsigned int A_col_start, + unsigned int A_row_inc, + unsigned int A_col_inc, + unsigned int A_row_size, + unsigned int A_col_size, + unsigned int A_internal_rows, + unsigned int A_internal_cols, + const NumericT * B, + unsigned int B_row_start, + unsigned int B_col_start, + unsigned int B_row_inc, + unsigned int B_col_inc, + unsigned int B_row_size, + unsigned int B_col_size, + unsigned int B_internal_rows, + unsigned int B_internal_cols, + NumericT beta, + NumericT * C, + unsigned int C_row_start, + unsigned int C_col_start, + unsigned int C_row_inc, + unsigned int C_col_inc, + unsigned int C_row_size, + unsigned int C_col_size, + unsigned int C_internal_rows, + unsigned int C_internal_cols) +{ + + __shared__ NumericT bufA[272]; + __shared__ NumericT bufB[272]; + + vcl_size_t block_size = 16;//get_local_size(0); + vcl_size_t row_block_id = blockIdx.x; + vcl_size_t col_block_id = blockIdx.y; + vcl_size_t row_thread_id = threadIdx.x; + vcl_size_t col_thread_id = threadIdx.y; + vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + A_row_start * A_internal_cols; + vcl_size_t aStep = block_size * A_row_inc * A_internal_cols; + vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + B_col_start * B_internal_rows; + vcl_size_t bStep = block_size * B_internal_rows * B_col_inc; + vcl_size_t block_num = (A_row_size + block_size - 1) /
<TRUNCATED>
