http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
deleted file mode 100644
index 24cb4a6..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_prod.hpp
+++ /dev/null
@@ -1,2887 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
-#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_PROD_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file  viennacl/linalg/cuda/matrix_operations_prod.hpp
-    @brief Dense matrix-matrix product CUDA kernels reside here.
-
-    Note: File created semi-automatically from OpenCL kernels.
-*/
-
-#include "viennacl/forwards.h"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
-// matrix-matrix multiplication C = A * B
-// matrix layouts: C...col_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_col_prod_AA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * 
B_internal_rows + B_row_start;
-  vcl_size_t bStep = block_size * B_row_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_row_size) && (col_block_id * block_size + 
col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A * B^T
-// matrix layouts: C...col_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_col_prod_AT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + 
B_col_start * B_internal_rows;
-  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_col_size) && (col_block_id * block_size + 
row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A^T * B
-// matrix layouts: C...col_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_col_prod_TA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * 
B_internal_rows + B_row_start;
-  vcl_size_t bStep = block_size * B_row_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_row_size) && (col_block_id * block_size + 
col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A^T * B^T
-// matrix layouts: C...col_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_col_prod_TT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + 
B_col_start * B_internal_rows;
-  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_col_size) && (col_block_id * block_size + 
row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-
-
-////////////////////////////////////////////////////////////////////////////
-
-
-
-
-// matrix-matrix multiplication C = A * B
-// matrix layouts: C...row_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_col_prod_AA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * 
B_internal_rows + B_row_start;
-  vcl_size_t bStep = block_size * B_row_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_row_size) && (col_block_id * block_size + 
col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-// matrix-matrix multiplication C = A * B^T
-// matrix layouts: C...row_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_col_prod_AT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + 
B_col_start * B_internal_rows;
-  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_col_size) && (col_block_id * block_size + 
row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-// matrix-matrix multiplication C = A^T * B
-// matrix layouts: C...row_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_col_prod_TA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * 
B_internal_rows + B_row_start;
-  vcl_size_t bStep = block_size * B_row_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_row_size) && (col_block_id * block_size + 
col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-// matrix-matrix multiplication C = A^T * B^T
-// matrix layouts: C...row_major, A...col_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_col_prod_TT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + 
B_col_start * B_internal_rows;
-  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_col_size) && (col_block_id * block_size + 
row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-
-
-
-////////////////////////////////////////////////////////////////////////////
-
-
-
-
-// matrix-matrix multiplication C = A * B
-// matrix layouts: C...col_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_row_prod_AA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + 
B_row_start * B_internal_cols;
-  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_row_size) && (col_block_id * block_size + 
row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A * B^T
-// matrix layouts: C...col_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_row_prod_AT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * 
B_internal_cols + B_col_start;
-  vcl_size_t bStep = block_size * B_col_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_col_size) && (col_block_id * block_size + 
col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A^T * B
-// matrix layouts: C...col_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_row_prod_TA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + 
B_row_start * B_internal_cols;
-  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_row_size) && (col_block_id * block_size + 
row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A^T * B^T
-// matrix layouts: C...col_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_col_row_prod_TT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * 
B_internal_cols + B_col_start;
-  vcl_size_t bStep = block_size * B_col_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_col_size) && (col_block_id * block_size + 
col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-
-
-////////////////////////////////////////////////////////////////////////////
-
-
-
-
-// matrix-matrix multiplication C = A * B
-// matrix layouts: C...row_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_row_prod_AA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + 
B_row_start * B_internal_cols;
-  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_row_size) && (col_block_id * block_size + 
row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-// matrix-matrix multiplication C = A * B^T
-// matrix layouts: C...row_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_row_prod_AT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) + 
A_col_start * A_internal_rows;
-  vcl_size_t aStep = block_size * A_col_inc * A_internal_rows;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * 
B_internal_cols + B_col_start;
-  vcl_size_t bStep = block_size * B_col_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_col_size) && (row_block_id * block_size + 
row_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_col_size) && (col_block_id * block_size + 
col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-// matrix-matrix multiplication C = A^T * B
-// matrix layouts: C...row_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_row_prod_TA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) + 
B_row_start * B_internal_cols;
-  vcl_size_t bStep = block_size * B_internal_cols * B_row_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_row_size) && (col_block_id * block_size + 
row_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-// matrix-matrix multiplication C = A^T * B^T
-// matrix layouts: C...row_major, A...col_major, B...row_major
-template<typename NumericT>
-__global__ void matrix_matrix_row_col_row_prod_TT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) * 
A_internal_rows + A_row_start;
-  vcl_size_t aStep = block_size * A_row_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) * 
B_internal_cols + B_col_start;
-  vcl_size_t bStep = block_size * B_col_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_row_inc + col_thread_id * A_col_inc * 
A_internal_rows;
-  vcl_size_t bOffset = row_thread_id * B_col_inc + col_thread_id * B_row_inc * 
B_internal_cols;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_row_size) && (row_block_id * block_size + 
col_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_col_size) && (col_block_id * block_size + 
col_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[((blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start) * 
C_internal_cols + (blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + 
C_col_start];
-}
-
-
-
-
-
-////////////////////////////////////////////////////////////////////////////
-
-
-
-
-
-
-// matrix-matrix multiplication C = A * B
-// matrix layouts: C...col_major, A...row_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_row_col_prod_AA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * 
A_internal_cols + A_col_start;
-  vcl_size_t aStep = block_size * A_col_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * 
B_internal_rows + B_row_start;
-  vcl_size_t bStep = block_size * B_row_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * 
A_internal_cols;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_col_size) && (row_block_id * block_size + 
col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_row_size) && (col_block_id * block_size + 
col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A * B^T
-// matrix layouts: C...col_major, A...row_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_row_col_prod_AT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_row_inc + A_row_start) * 
A_internal_cols + A_col_start;
-  vcl_size_t aStep = block_size * A_col_inc;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + 
B_col_start * B_internal_rows;
-  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
-  vcl_size_t block_num = (A_col_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * 
A_internal_cols;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < A_col_size) && (row_block_id * block_size + 
col_thread_id < A_row_size)) ? A[aBegin + aOffset] : 0;
-    bufB[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < B_col_size) && (col_block_id * block_size + 
row_thread_id < B_row_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_row_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_row_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A^T * B
-// matrix layouts: C...col_major, A...row_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_row_col_prod_TA_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + 
A_row_start * A_internal_cols;
-  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
-  vcl_size_t bBegin = (col_block_id * block_size * B_col_inc + B_col_start) * 
B_internal_rows + B_row_start;
-  vcl_size_t bStep = block_size * B_row_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1) / block_size;
-  NumericT Csub = 0;
-  vcl_size_t aOffset = row_thread_id * A_col_inc + col_thread_id * A_row_inc * 
A_internal_cols;
-  vcl_size_t bOffset = row_thread_id * B_row_inc + col_thread_id * B_col_inc * 
 B_internal_rows;
-
-  vcl_size_t row_thread_id_times_block_size = row_thread_id * (block_size + 1);
-  vcl_size_t col_thread_id_times_block_size = col_thread_id * (block_size + 1);
-  for (vcl_size_t block = 0;
-          block < block_num;
-          ++block)
-  {
-    bufA[row_thread_id_times_block_size + col_thread_id] = ((block * 
block_size + col_thread_id < A_row_size) && (row_block_id * block_size + 
row_thread_id < A_col_size)) ? A[aBegin + aOffset] : 0;
-    bufB[col_thread_id_times_block_size + row_thread_id] = ((block * 
block_size + row_thread_id < B_row_size) && (col_block_id * block_size + 
col_thread_id < B_col_size)) ? B[bBegin + bOffset] : 0;
-    __syncthreads();
-    NumericT * bufAptr = bufA + row_thread_id_times_block_size;
-    NumericT * bufBptr = bufB + col_thread_id_times_block_size;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-      Csub += (*bufAptr) * (*bufBptr); ++bufAptr; ++bufBptr;
-    __syncthreads();
-    aBegin += aStep;
-    bBegin += bStep;
-  }
-  if ((blockIdx.x * blockDim.x + threadIdx.x) < A_col_size && (blockIdx.y * 
blockDim.y + threadIdx.y) < B_col_size)
-    C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows] = (beta == 0) ? alpha * Csub : alpha * Csub + beta * 
C[(blockIdx.x * blockDim.x + threadIdx.x) * C_row_inc + C_row_start + 
((blockIdx.y * blockDim.y + threadIdx.y) * C_col_inc + C_col_start) * 
C_internal_rows];
-}
-
-// matrix-matrix multiplication C = A^T * B^T
-// matrix layouts: C...col_major, A...row_major, B...col_major
-template<typename NumericT>
-__global__ void matrix_matrix_col_row_col_prod_TT_kernel(
-          NumericT alpha,
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * B,
-          unsigned int B_row_start,
-          unsigned int B_col_start,
-          unsigned int B_row_inc,
-          unsigned int B_col_inc,
-          unsigned int B_row_size,
-          unsigned int B_col_size,
-          unsigned int B_internal_rows,
-          unsigned int B_internal_cols,
-          NumericT beta,
-          NumericT * C,
-          unsigned int C_row_start,
-          unsigned int C_col_start,
-          unsigned int C_row_inc,
-          unsigned int C_col_inc,
-          unsigned int C_row_size,
-          unsigned int C_col_size,
-          unsigned int C_internal_rows,
-          unsigned int C_internal_cols)
-{
-
-  __shared__ NumericT bufA[272];
-  __shared__ NumericT bufB[272];
-
-  vcl_size_t block_size = 16;//get_local_size(0);
-  vcl_size_t row_block_id = blockIdx.x;
-  vcl_size_t col_block_id = blockIdx.y;
-  vcl_size_t row_thread_id = threadIdx.x;
-  vcl_size_t col_thread_id = threadIdx.y;
-  vcl_size_t aBegin = (row_block_id * block_size * A_col_inc + A_col_start) + 
A_row_start * A_internal_cols;
-  vcl_size_t aStep = block_size * A_row_inc * A_internal_cols;
-  vcl_size_t bBegin = (col_block_id * block_size * B_row_inc + B_row_start) + 
B_col_start * B_internal_rows;
-  vcl_size_t bStep = block_size * B_internal_rows * B_col_inc;
-  vcl_size_t block_num = (A_row_size + block_size - 1

<TRUNCATED>

Reply via email to