http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
deleted file mode 100644
index 45d6987..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
+++ /dev/null
@@ -1,1468 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
-#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file  viennacl/linalg/cuda/matrix_operations_row.hpp
-    @brief Implementations of row-major dense matrix related operations, 
including matrix-vector products, using CUDA.
-*/
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
/** @brief Elementwise copy of a row-major (sub-)matrix B into A, converting each
 *         entry from SrcNumericT to DestNumericT.
 *
 * Launch layout: 1D grid, 1D blocks. blockIdx.x walks rows (grid-strided),
 * threadIdx.x walks columns within a row (block-strided), so consecutive threads
 * touch consecutive columns — coalesced for row-major storage.
 * *_start / *_inc describe the submatrix offset and stride; *_internal_size2 is
 * the padded row length of the underlying allocation.
 */
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_row_kernel(
          DestNumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const SrcNumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // The original computed (gid / blockDim.x, gid % blockDim.x) with
  // gid = blockIdx.x * blockDim.x + threadIdx.x; since threadIdx.x < blockDim.x
  // this is exactly (blockIdx.x, threadIdx.x), so use the built-ins directly and
  // skip the integer division/modulo.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
}
-
/** @brief Matrix transpose kernel: B <- A^T for strided submatrices.
 *
 * Launch layout: 1D grid, 1D blocks; blockIdx.x strides over rows of A,
 * threadIdx.x over columns. data_major switches between the two addressing
 * schemes (row-of-internal_size2 vs. column-of-internal_size1) used below.
 */
template<typename NumericT>
__global__ void trans_kernel(
          const NumericT * A,
          unsigned int A_start1,          unsigned int A_start2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          unsigned int A_size1,           unsigned int A_size2,
          unsigned int A_stride1,         unsigned int A_stride2,

          NumericT * B,
          unsigned int B_start1,          unsigned int B_start2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,
          unsigned int B_stride1,         unsigned int B_stride2,
          bool data_major)
{
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
  {
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      // Effective (row, col) coordinates after start/stride adjustment.
      unsigned int a_row = A_start1 + A_stride1 * row;
      unsigned int a_col = A_start2 + A_stride2 * col;
      unsigned int b_row = B_start1 + B_stride1 * col;  // transposed: col becomes row
      unsigned int b_col = B_start2 + B_stride2 * row;

      if (data_major)
        B[b_row * B_internal_size2 + b_col] = A[a_row * A_internal_size2 + a_col];
      else
        B[b_row + b_col * B_internal_size1] = A[a_row + a_col * A_internal_size1];
    }
  }
}
-
-//
-// am
-//
-
// am: A = +/- B (* or /) alpha, scalar alpha passed by value from the host.
// options2 bit 0: negate alpha; bit 1: divide by alpha instead of multiplying.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void am_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  const bool div_alpha = (options2 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = div_alpha ? (b_val / alpha) : (b_val * alpha);
    }
}
-
// am: A = +/- B (* or /) alpha, scalar alpha read from device memory (*fac2).
// options2 bit 0: negate alpha; bit 1: divide by alpha instead of multiplying.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void am_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  const bool div_alpha = (options2 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = div_alpha ? (b_val / alpha) : (b_val * alpha);
    }
}
-
-
-//
-// ambm
-//
-
// ambm: A = +/- B (* or /) alpha + +/- C (* or /) beta; alpha and beta passed
// by value from the host.
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  // The mul/div choice is folded into the element expression; results are
  // identical to branching once outside the loops.
  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
-
// ambm: A = +/- B (* or /) alpha + +/- C (* or /) beta; alpha passed by value
// from the host, beta read from device memory (*fac3).
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
// ambm: A = +/- B (* or /) alpha + +/- C (* or /) beta; alpha read from device
// memory (*fac2), beta passed by value from the host.
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
-
// ambm: A = +/- B (* or /) alpha + +/- C (* or /) beta; alpha and beta both
// read from device memory (*fac2, *fac3).
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
        = (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
-
-//
-// ambm_m
-//
-
// ambm_m: A += +/- B (* or /) alpha + +/- C (* or /) beta (accumulating
// variant of ambm); alpha and beta passed by value from the host.
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
       += (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
-
// ambm_m: A += +/- B (* or /) alpha + +/- C (* or /) beta (accumulating
// variant); alpha passed by value from the host, beta read from device (*fac3).
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
       += (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
// ambm_m: A += +/- B (* or /) alpha + +/- C (* or /) beta (accumulating
// variant); alpha read from device (*fac2), beta passed by value from the host.
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
       += (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
-
// ambm_m: A += +/- B (* or /) alpha + +/- C (* or /) beta (accumulating
// variant); alpha and beta both read from device memory (*fac2, *fac3).
// options2/options3 bit 0: negate alpha/beta; bit 1: divide instead of multiply.
// 1D launch: blockIdx.x strides rows, threadIdx.x strides columns.
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  const bool div_alpha = (options2 & (1 << 1)) != 0;
  const bool div_beta  = (options3 & (1 << 1)) != 0;

  for (unsigned int row = blockIdx.x; row < A_size1; row += gridDim.x)
    for (unsigned int col = threadIdx.x; col < A_size2; col += blockDim.x)
    {
      NumericT b_val = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2];
      NumericT c_val = C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
       += (div_alpha ? b_val / alpha : b_val * alpha)
        + (div_beta  ? c_val / beta  : c_val * beta);
    }
}
-
-//
-// assignments
-//
-
-template<typename NumericT>
-__global__ void matrix_row_assign_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-          NumericT alpha)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = alpha;
-}
-
-
-template<typename NumericT>
-__global__ void matrix_row_diagonal_assign_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-          NumericT alpha)
-{
-  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
-
-  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
-    A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] 
= alpha;
-}
-
-//
-// binary element-wise operations
-//
-
-template<typename NumericT>
-__global__ void element_op_row_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2,
-
-          const NumericT * C,
-          unsigned int C_start1, unsigned int C_start2,
-          unsigned int C_inc1,   unsigned int C_inc2,
-          unsigned int C_internal_size1,  unsigned int C_internal_size2,
-
-          unsigned int op_type) //0: product, 1: division, 2: pow
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  if (op_type == 2)
-  {
-    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2]
-      = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2],
-            C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + 
C_start2]);
-  }
-  else if (op_type == 1)
-  {
-    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2]
-      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]
-      / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + 
C_start2];
-  }
-  else if (op_type == 0)
-  {
-    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2]
-      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]
-      * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + 
C_start2];
-  }
-}
-
-template<typename NumericT>
-__global__ void element_op_int_row_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2,
-
-          const NumericT * C,
-          unsigned int C_start1, unsigned int C_start2,
-          unsigned int C_inc1,   unsigned int C_inc2,
-          unsigned int C_internal_size1,  unsigned int C_internal_size2,
-
-          unsigned int op_type) //0: product, 1: division, 2: pow
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  if (op_type == 1)
-  {
-    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2]
-      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]
-      / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + 
C_start2];
-  }
-  else if (op_type == 0)
-  {
-    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2]
-      = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]
-      * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + 
C_start2];
-  }
-}
-
-//
-// unary element-wise operations
-//
-
-// abs
-template<typename NumericT>
-__global__ void matrix_row_element_abs_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]);
-}
-
-
-// acos
-template<typename NumericT>
-__global__ void matrix_row_element_acos_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// asin
-template<typename NumericT>
-__global__ void matrix_row_element_asin_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// atan
-template<typename NumericT>
-__global__ void matrix_row_element_atan_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// ceil
-template<typename NumericT>
-__global__ void matrix_row_element_ceil_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// cos
-template<typename NumericT>
-__global__ void matrix_row_element_cos_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]);
-}
-
-
-// cosh
-template<typename NumericT>
-__global__ void matrix_row_element_cosh_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// exp
-template<typename NumericT>
-__global__ void matrix_row_element_exp_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]);
-}
-
-
-// fabs
-template<typename NumericT>
-__global__ void matrix_row_element_fabs_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// floor
-template<typename NumericT>
-__global__ void matrix_row_element_floor_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// log
-template<typename NumericT>
-__global__ void matrix_row_element_log_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]);
-}
-
-
-// log10
-template<typename NumericT>
-__global__ void matrix_row_element_log10_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// sin
-template<typename NumericT>
-__global__ void matrix_row_element_sin_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]);
-}
-
-
-// sinh
-template<typename NumericT>
-__global__ void matrix_row_element_sinh_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// sqrt
-template<typename NumericT>
-__global__ void matrix_row_element_sqrt_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-// tan
-template<typename NumericT>
-__global__ void matrix_row_element_tan_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + 
B_start2]);
-}
-
-
-// tanh
-template<typename NumericT>
-__global__ void matrix_row_element_tanh_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * B,
-          unsigned int B_start1, unsigned int B_start2,
-          unsigned int B_inc1,   unsigned int B_inc2,
-          unsigned int B_internal_size1,  unsigned int B_internal_size2)
-{
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 
+ B_start2]);
-}
-
-
-
-//
-// matrix-vector product
-//
-
-template<typename NumericT>
-__global__ void vec_mul_row_kernel(
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          NumericT * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size)
-{
-  __shared__ NumericT work[128];
-
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-  unsigned int lid = threadIdx.x;
-
-  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
-  {
-    NumericT dot_prod = 0;
-    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
-      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * 
A_col_inc + A_col_start] * v[v_start + v_inc * col];
-    work[lid] = dot_prod;
-
-    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1){
-      __syncthreads();
-      if (lid < stride)
-        work[lid] += work[lid+stride];
-    }
-
-    if (lid == 0)
-      result[row * result_inc + result_start] = work[0];
-  }
-}
-
-
-template<typename NumericT>
-__global__ void trans_vec_mul_row_kernel(
-          const NumericT * A,
-          unsigned int A_row_start,
-          unsigned int A_col_start,
-          unsigned int A_row_inc,
-          unsigned int A_col_inc,
-          unsigned int A_row_size,
-          unsigned int A_col_size,
-          unsigned int A_internal_rows,
-          unsigned int A_internal_cols,
-          const NumericT * v,
-          unsigned int v_start,
-          unsigned int v_inc,
-          unsigned int v_size,
-          NumericT * result,
-          unsigned int result_start,
-          unsigned int result_inc,
-          unsigned int result_size)
-{
-  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < 
A_col_size; row += gridDim.x * blockDim.x)
-  {
-    NumericT dot_prod = 0;
-    for (unsigned int col = 0; col < A_row_size; ++col)
-      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + 
A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
-    result[row * result_inc + result_start] = dot_prod;
-  }
-}
-
-
-//
-// matrix-matrix products
-//
-
-
-
-
-//
-// scaled rank-1-update
-//
-
-// alpha on CPU
-template<typename NumericT>
-__global__ void scaled_rank1_update_row_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          NumericT val,
-          unsigned int options2,
-
-          const NumericT * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-
-          const NumericT * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2)
-{
-  NumericT alpha = val;
-  if (options2 & (1 << 0))
-    alpha = -alpha;
-  if (options2 & (1 << 1))
-    alpha = NumericT(1) / alpha;
-
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-  {
-    NumericT tmp = alpha * vec1[row * inc1 + start1];
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] += tmp * vec2[col * inc2 + start2];
-  }
-}
-
-
-// alpha on GPU
-template<typename NumericT>
-__global__ void scaled_rank1_update_row_kernel(
-          NumericT * A,
-          unsigned int A_start1, unsigned int A_start2,
-          unsigned int A_inc1,   unsigned int A_inc2,
-          unsigned int A_size1,  unsigned int A_size2,
-          unsigned int A_internal_size1,  unsigned int A_internal_size2,
-
-          const NumericT * val,
-          unsigned int options2,
-
-          const NumericT * vec1,
-          unsigned int start1,
-          unsigned int inc1,
-          unsigned int size1,
-
-          const NumericT * vec2,
-          unsigned int start2,
-          unsigned int inc2,
-          unsigned int size2)
-{
-  NumericT alpha = *val;
-  if (options2 & (1 << 0))
-    alpha = -alpha;
-  if (options2 & (1 << 1))
-    alpha = NumericT(1) / alpha;
-
-  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
-  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
-
-  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
-  {
-    NumericT tmp = alpha * vec1[row * inc1 + start1];
-    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
-      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + 
A_start2] += tmp * vec2[col * inc2 + start2];
-  }
-}
-
-
-
-} // namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
deleted file mode 100644
index 4821f5b..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/misc_operations.hpp
-    @brief Implementations of miscellaneous operations using CUDA
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/cuda/common.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-namespace detail
-{
-
-template<typename NumericT>
-__global__ void level_scheduling_substitute_kernel(
-          const unsigned int * row_index_array,
-          const unsigned int * row_indices,
-          const unsigned int * column_indices,
-          const NumericT * elements,
-          NumericT * vec,
-          unsigned int size)
-{
-  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
-                    row  < size;
-                    row += gridDim.x * blockDim.x)
-  {
-    unsigned int eq_row = row_index_array[row];
-    NumericT vec_entry = vec[eq_row];
-    unsigned int row_end = row_indices[row+1];
-
-    for (unsigned int j = row_indices[row]; j < row_end; ++j)
-      vec_entry -= vec[column_indices[j]] * elements[j];
-
-    vec[eq_row] = vec_entry;
-  }
-}
-
-
-
-template<typename NumericT>
-void level_scheduling_substitute(vector<NumericT> & vec,
-                             viennacl::backend::mem_handle const & 
row_index_array,
-                             viennacl::backend::mem_handle const & row_buffer,
-                             viennacl::backend::mem_handle const & col_buffer,
-                             viennacl::backend::mem_handle const & 
element_buffer,
-                             vcl_size_t num_rows
-                            )
-{
-  level_scheduling_substitute_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned 
int>(row_index_array),
-                                                   viennacl::cuda_arg<unsigned 
int>(row_buffer),
-                                                   viennacl::cuda_arg<unsigned 
int>(col_buffer),
-                                                   
viennacl::cuda_arg<NumericT>(element_buffer),
-                                                   viennacl::cuda_arg(vec),
-                                                   static_cast<unsigned 
int>(num_rows)
-                                                  );
-}
-
-} //namespace detail
-} //namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
deleted file mode 100644
index 109f74f..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
+++ /dev/null
@@ -1,152 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/nmf_operations.hpp
- @brief Implementations of NMF operations using CUDA
- */
-
-#include "viennacl/linalg/host_based/nmf_operations.hpp"
-
-#include "viennacl/linalg/cuda/common.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
/** @brief Main CUDA kernel for nonnegative matrix factorization of dense matrices.
 *
 * Elementwise multiplicative update: matrix1[i] <- matrix1[i] * matrix2[i] / matrix3[i],
 * with entries whose divisor is (near-)zero clamped to zero.
 * Uses a grid-stride loop, so any launch configuration is valid.
 */
template<typename NumericT>
__global__ void el_wise_mul_div(NumericT       * matrix1,
                                NumericT const * matrix2,
                                NumericT const * matrix3,
                                unsigned int size)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
  {
    NumericT numerator   = matrix1[idx] * matrix2[idx];
    NumericT denominator = matrix3[idx];
    // Guard against division by tiny values; such entries are set to zero.
    matrix1[idx] = (denominator > (NumericT) 0.00001) ? (numerator / denominator) : NumericT(0);
  }
}
-
/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
 *
 * Multiplicative-update scheme: H and W are updated in place on the device via
 * the el_wise_mul_div kernel; convergence is checked every conf.check_after_steps()
 * iterations against the relative Frobenius norm of the residual.
 *
 * @param V     Input matrix
 * @param W     First factor
 * @param H     Second factor
 * @param conf  A configuration object holding tolerances and the like
 */
template<typename NumericT>
void nmf(viennacl::matrix_base<NumericT> const & V,
         viennacl::matrix_base<NumericT> & W,
         viennacl::matrix_base<NumericT> & H,
         viennacl::linalg::nmf_config const & conf)
{
  vcl_size_t k = W.size2();
  conf.iters_ = 0;  // iteration count is reported back through the config object

  // All-zero factors would stall the multiplicative updates at zero,
  // so reinitialize them with ones.
  if (!viennacl::linalg::norm_frobenius(W))
    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));

  if (!viennacl::linalg::norm_frobenius(H))
    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));

  // Temporaries: update numerators (wn/hn), denominators (wd/hd) and
  // intermediate products; each follows the layout of its factor.
  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());

  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());

  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());

  viennacl::vector<NumericT> diff(V.size1() * V.size2());

  NumericT last_diff = 0;
  NumericT diff_init = 0;
  bool stagnation_flag = false;

  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
  {
    conf.iters_ = i + 1;

    // H update: H <- H .* (W^T V) ./ (W^T W H)
    hn = viennacl::linalg::prod(trans(W), V);
    htmp = viennacl::linalg::prod(trans(W), W);
    hd = viennacl::linalg::prod(htmp, H);

    // Elementwise multiply/divide over the full internal (padded) buffer.
    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(H),
                                  viennacl::cuda_arg<NumericT>(hn),
                                  viennacl::cuda_arg<NumericT>(hd),
                                  static_cast<unsigned int>(H.internal_size1() * H.internal_size2()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");

    // W update: W <- W .* (V H^T) ./ (W H H^T)
    wn   = viennacl::linalg::prod(V, trans(H));
    wtmp = viennacl::linalg::prod(W, H);
    wd   = viennacl::linalg::prod(wtmp, trans(H));

    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(W),
                                  viennacl::cuda_arg<NumericT>(wn),
                                  viennacl::cuda_arg<NumericT>(wd),
                                  static_cast<unsigned int>( W.internal_size1() * W.internal_size2()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");

    if (i % conf.check_after_steps() == 0)  //check for convergence
    {
      appr = viennacl::linalg::prod(W, H);

      appr -= V;
      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);

      // The first residual normalizes the relative error used below.
      if (i == 0)
        diff_init = diff_val;

      if (conf.print_relative_error())
        std::cout << diff_val / diff_init << std::endl;

      // Approximation check
      if (diff_val / diff_init < conf.tolerance())
        break;

      // Stagnation check
      if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
      {
        if (stagnation_flag)  // iteration stagnates (two iterates with no notable progress)
          break;
        else
          // record stagnation in this iteration
          stagnation_flag = true;
      } else
        // good progress in this iteration, so unset stagnation flag
        stagnation_flag = false;

      // prepare for next iterate:
      last_diff = diff_val;
    }
  }
}
-
-} //namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-#endif /* VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
deleted file mode 100644
index 3adaca2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
+++ /dev/null
@@ -1,375 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/scalar_operations.hpp
-    @brief Implementations of scalar operations using CUDA
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/traits/stride.hpp"
-#include "viennacl/linalg/cuda/common.hpp"
-
-// includes CUDA
-#include <cuda_runtime.h>
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
-/////////////////// as /////////////////////////////
-
/** @brief Computes *s1 = alpha * (*s2), with alpha read from GPU memory.
 *  Option bit 0 flips the sign of alpha; bit 1 uses its reciprocal. */
template<typename NumericT>
__global__ void as_kernel(NumericT * s1, const NumericT * fac2, unsigned int options2, const NumericT * s2)
{
  NumericT alpha = *fac2;
  alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
  alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

  *s1 = *s2 * alpha;
}
-
/** @brief Computes *s1 = alpha * (*s2), with alpha passed by value from the host.
 *  Option bit 0 flips the sign of alpha; bit 1 uses its reciprocal. */
template<typename NumericT>
__global__ void as_kernel(NumericT * s1, NumericT fac2, unsigned int options2, const NumericT * s2)
{
  NumericT alpha = fac2;
  alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
  alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

  *s1 = *s2 * alpha;
}
-
/** @brief Assigns s1 = alpha * s2 for device-resident scalars; alpha may live on host or device.
 *
 * @param s1               Destination scalar (device)
 * @param s2               Source scalar (device)
 * @param alpha            Scaling factor (host or device scalar)
 * @param len_alpha        Length encoded into the kernel option bitfield
 * @param reciprocal_alpha If true, the kernel uses 1/alpha
 * @param flip_sign_alpha  If true, the kernel uses -alpha
 */
template<typename ScalarT1,
         typename ScalarT2, typename NumericT>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_any_scalar<NumericT>::value
                            >::type
as(ScalarT1 & s1,
   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;

  // Encode reciprocal/flip-sign behavior into the bitfield consumed by as_kernel.
  unsigned int const alpha_options = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // If alpha is a host scalar, stage it so a uniform argument can be passed.
  value_type staged_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    staged_alpha = alpha;

  as_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                      viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, staged_alpha)),
                      alpha_options,
                      viennacl::cuda_arg(s2));
  VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel");
}
-
-//////////////////// asbs ////////////////////////////
-
/** @brief Computes *s1 = alpha * (*s2) + beta * (*s3); alpha and beta read from GPU memory.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            const NumericT * fac2, unsigned int options2, const NumericT * s2,
                            const NumericT * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
-
/** @brief Computes *s1 = alpha * (*s2) + beta * (*s3); alpha passed by value, beta read from GPU memory.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT fac2,         unsigned int options2, const NumericT * s2,
                            NumericT const * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
-
/** @brief Computes *s1 = alpha * (*s2) + beta * (*s3); alpha read from GPU memory, beta passed by value.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT const * fac2, unsigned int options2, const NumericT * s2,
                            NumericT         fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
-
/** @brief Computes *s1 = alpha * (*s2) + beta * (*s3); both factors passed by value from the host.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT fac2, unsigned int options2, const NumericT * s2,
                            NumericT fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
-
-
/** @brief Assigns s1 = alpha * s2 + beta * s3 for device-resident scalars; each factor may live on host or device.
 *
 * @param s1    Destination scalar (device)
 * @param s2    First source scalar (device)
 * @param alpha First factor, together with its length/reciprocal/sign-flip flags
 * @param s3    Second source scalar (device)
 * @param beta  Second factor, together with its length/reciprocal/sign-flip flags
 */
template<typename ScalarT1,
         typename ScalarT2, typename NumericT1,
         typename ScalarT3, typename NumericT2>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_scalar<ScalarT3>::value
                              && viennacl::is_any_scalar<NumericT1>::value
                              && viennacl::is_any_scalar<NumericT2>::value
                            >::type
asbs(ScalarT1 & s1,
     ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
     ScalarT3 const & s3, NumericT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;

  // Pack the reciprocal/flip-sign flags into the bitfields the kernel decodes.
  unsigned int const alpha_options = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int const beta_options  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // Stage host-resident factors so a uniform kernel argument can be formed.
  value_type staged_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT1>::value)
    staged_alpha = alpha;

  value_type staged_beta = 0;
  if (viennacl::is_cpu_scalar<NumericT2>::value)
    staged_beta = beta;

  asbs_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                        viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, staged_alpha)),
                        alpha_options,
                        viennacl::cuda_arg(s2),
                        viennacl::cuda_arg<value_type>(detail::arg_reference(beta, staged_beta)),
                        beta_options,
                        viennacl::cuda_arg(s3) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel");
}
-
-//////////////////// asbs_s ////////////////////
-
/** @brief Computes *s1 += alpha * (*s2) + beta * (*s3); alpha and beta read from GPU memory.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              const NumericT * fac2, unsigned int options2, const NumericT * s2,
                              const NumericT * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
-
/** @brief Computes *s1 += alpha * (*s2) + beta * (*s3); alpha passed by value, beta read from GPU memory.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              NumericT         fac2, unsigned int options2, const NumericT * s2,
                              NumericT const * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
-
/** @brief Computes *s1 += alpha * (*s2) + beta * (*s3); alpha read from GPU memory, beta passed by value.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              NumericT const * fac2, unsigned int options2, const NumericT * s2,
                              NumericT         fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
-
/** @brief Computes *s1 += alpha * (*s2) + beta * (*s3); both factors passed by value from the host.
 *  For each factor: option bit 0 flips the sign, bit 1 uses the reciprocal. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              NumericT fac2, unsigned int options2, const NumericT * s2,
                              NumericT fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta              : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
-
-
/** @brief Computes s1 += alpha * s2 + beta * s3 for device-resident scalars; each factor may live on host or device.
 *
 * @param s1    Accumulation target (device scalar)
 * @param s2    First source scalar (device)
 * @param alpha First factor, with length / reciprocal / sign-flip flags
 * @param s3    Second source scalar (device)
 * @param beta  Second factor, with length / reciprocal / sign-flip flags
 */
template<typename ScalarT1,
         typename ScalarT2, typename NumericT1,
         typename ScalarT3, typename NumericT2>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_scalar<ScalarT3>::value
                              && viennacl::is_any_scalar<NumericT1>::value
                              && viennacl::is_any_scalar<NumericT2>::value
                            >::type
asbs_s(ScalarT1 & s1,
       ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
       ScalarT3 const & s3, NumericT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;

  // Pack the reciprocal/flip-sign flags into the bitfields the kernel decodes.
  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // Stage host-resident factors so a uniform kernel argument can be formed.
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT1>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<NumericT2>::value)
    temporary_beta = beta;

  // Fix: removed stray debug output (std::cout << "Launching asbs_s_kernel...")
  // that polluted stdout on every call of this library routine.
  asbs_s_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          viennacl::cuda_arg(s2),
                          viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                          options_beta,
                          viennacl::cuda_arg(s3) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel");
}
-
-///////////////// swap //////////////////
-
/** @brief Device kernel exchanging the values pointed to by s1 and s2. */
template<typename NumericT>
__global__ void scalar_swap_kernel(NumericT * s1, NumericT * s2)
{
  NumericT const old_s1 = *s1;
  *s1 = *s2;
  *s2 = old_s1;
}
-
/** @brief Swaps the contents of two scalars, data is copied
*
* @param s1   The first scalar
* @param s2   The second scalar
*/
template<typename ScalarT1, typename ScalarT2>
typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                            >::type
swap(ScalarT1 & s1, ScalarT2 & s2)
{
  // Single-thread launch: swapping two scalar values is inherently serial.
  scalar_swap_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), viennacl::cuda_arg(s2));
  // Fix: check the launch like every other kernel launch in this file
  // (was missing); also removed an unused local typedef.
  VIENNACL_CUDA_LAST_ERROR_CHECK("scalar_swap_kernel");
}
-
-
-
-} //namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

Reply via email to