http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
new file mode 100644
index 0000000..45d6987
--- /dev/null
+++ 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations_row.hpp
@@ -0,0 +1,1468 @@
+#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file  viennacl/linalg/cuda/matrix_operations_row.hpp
+    @brief Implementations of row-major dense matrix related operations, 
including matrix-vector products, using CUDA.
+*/
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
/** @brief Element-wise copy of a row-major matrix range B into A, converting entries from SrcNumericT to DestNumericT.
*
* Blocks stride over the rows of the range, threads within a block stride over the columns.
*/
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_row_kernel(
          DestNumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const SrcNumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;   // equals blockIdx.x for in-range launches
  unsigned int col0   = glb_id % blockDim.x;   // equals threadIdx.x for in-range launches

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
  {
    // Hoist the per-row base offsets out of the column loop.
    unsigned int rowA = (r * A_inc1 + A_start1) * A_internal_size2;
    unsigned int rowB = (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
      A[rowA + c * A_inc2 + A_start2] = B[rowB + c * B_inc2 + B_start2];
  }
}
+
//Matrix transpose kernel
/** @brief Writes the transpose of the matrix range A (A_size1 x A_size2) into the matrix range B.
*
* Each block handles one (grid-strided) row of A; threads within the block walk its columns.
*
* @param data_major  Selects the addressing scheme applied to BOTH matrices:
*                    true  -> index = row * internal_size2 + col (row-major layout),
*                    false -> index = row + col * internal_size1 (column-major layout).
*                    NOTE(review): inferred from the two indexing expressions below — confirm against the host-side launcher.
*/
template<typename NumericT>
__global__ void trans_kernel(
          const NumericT * A,
          unsigned int A_start1,          unsigned int A_start2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          unsigned int A_size1,           unsigned int A_size2,
          unsigned int A_stride1,         unsigned int A_stride2,

          NumericT * B,
          unsigned int B_start1,          unsigned int B_start2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,
          unsigned int B_stride1,         unsigned int B_stride2,
          bool data_major)
{
  for(unsigned int row = blockIdx.x; row<A_size1; row+=gridDim.x)
  {
    for(unsigned int col = threadIdx.x; col<A_size2; col+=blockDim.x)
    {
      if(data_major)
        // Row-major addressing: B(col,row) <- A(row,col).
        B[(B_start1 + B_stride1 * col) * B_internal_size2 + (B_start2 + B_stride2 * row)] = A[(A_start1 + A_stride1 * row) * A_internal_size2 + (A_start2 + A_stride2 * col)];
      else
        // Column-major addressing: B(col,row) <- A(row,col).
        B[(B_start1 + B_stride1 * col) + (B_start2 + B_stride2 * row) * B_internal_size1] = A[(A_start1 + A_stride1 * row) + (A_start2 + A_stride2 * col) * A_internal_size1];
    }
  }
}
+
+//
+// am
+//
+
// alpha on CPU
/** @brief A = B * alpha for row-major ranges. options2 bit 0: negate alpha; bit 1: divide by alpha instead of multiplying. */
template<typename NumericT>
__global__ void am_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  NumericT alpha   = (options2 & (1 << 0)) ? -fac2 : fac2;
  bool use_inverse = (options2 & (1 << 1)) != 0;  // uniform across all threads

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2] = use_inverse ? (val / alpha) : (val * alpha);
    }
}
+
// alpha on GPU
/** @brief A = B * alpha for row-major ranges, with alpha read from device memory. options2 bit 0: negate alpha; bit 1: divide instead of multiply. */
template<typename NumericT>
__global__ void am_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  NumericT alpha   = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  bool use_inverse = (options2 & (1 << 1)) != 0;  // uniform across all threads

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2] = use_inverse ? (val / alpha) : (val * alpha);
    }
}
+
+
+//
+// ambm
+//
+
// alpha and beta on CPU
/** @brief A = B * alpha + C * beta for row-major ranges.
*
* options2/options3 encode the scalar handling for alpha/beta respectively:
* bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -fac3 : fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
        = (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
+
// alpha on CPU, beta on GPU
/** @brief A = B * alpha + C * beta for row-major ranges; alpha passed by value, beta read from device memory.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
        = (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
// alpha on GPU, beta on CPU
/** @brief A = B * alpha + C * beta for row-major ranges; alpha read from device memory, beta passed by value.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -fac3 : fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
        = (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
+
// alpha and beta on GPU
/** @brief A = B * alpha + C * beta for row-major ranges; both scalars read from device memory.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
        = (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
+
+//
+// ambm_m
+//
+
// alpha and beta on CPU
/** @brief A += B * alpha + C * beta for row-major ranges.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -fac3 : fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      // Accumulating variant: note += rather than =.
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
       += (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
+
// alpha on CPU, beta on GPU
/** @brief A += B * alpha + C * beta for row-major ranges; alpha passed by value, beta read from device memory.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      // Accumulating variant: note += rather than =.
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
       += (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
// alpha on GPU, beta on CPU
/** @brief A += B * alpha + C * beta for row-major ranges; alpha read from device memory, beta passed by value.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -fac3 : fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      // Accumulating variant: note += rather than =.
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
       += (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
+
// alpha and beta on GPU
/** @brief A += B * alpha + C * beta for row-major ranges; both scalars read from device memory.
*
* options2/options3: bit 0 negates the scalar, bit 1 divides by it instead of multiplying.
*/
template<typename NumericT>
__global__ void ambm_m_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2)
{
  NumericT alpha  = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta   = (options3 & (1 << 0)) ? -(*fac3) : *fac3;
  bool alpha_recip = (options2 & (1 << 1)) != 0;  // uniform selectors, no warp divergence
  bool beta_recip  = (options3 & (1 << 1)) != 0;

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row0   = glb_id / blockDim.x;
  unsigned int col0   = glb_id % blockDim.x;

  for (unsigned int r = row0; r < A_size1; r += gridDim.x)
    for (unsigned int c = col0; c < A_size2; c += blockDim.x)
    {
      NumericT b_val = B[(r * B_inc1 + B_start1) * B_internal_size2 + c * B_inc2 + B_start2];
      NumericT c_val = C[(r * C_inc1 + C_start1) * C_internal_size2 + c * C_inc2 + C_start2];
      // Accumulating variant: note += rather than =.
      A[(r * A_inc1 + A_start1) * A_internal_size2 + c * A_inc2 + A_start2]
       += (alpha_recip ? b_val / alpha : b_val * alpha)
        + (beta_recip  ? c_val / beta  : c_val * beta);
    }
}
+
+//
+// assignments
+//
+
/** @brief Assigns the scalar 'alpha' to every entry of a row-major (sub-)matrix A.
*
* Launch layout: each block strides over rows, the threads of a block stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_assign_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          NumericT alpha)
{
  // blockIdx.x equals (gid / blockDim.x) and threadIdx.x equals (gid % blockDim.x)
  // of the original flat-index formulation.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT * row_ptr = A + (r * A_inc1 + A_start1) * A_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_ptr[c * A_inc2 + A_start2] = alpha;
  }
}
+
+
/** @brief Assigns the scalar 'alpha' to the main diagonal of a row-major (sub-)matrix A.
*
* A plain grid-stride loop over the diagonal entries; assumes the caller sizes
* the launch so that A_size1 diagonal elements exist within the submatrix.
*/
template<typename NumericT>
__global__ void matrix_row_diagonal_assign_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,
          NumericT alpha)
{
  unsigned int stride = blockDim.x * gridDim.x;
  for (unsigned int d = blockIdx.x * blockDim.x + threadIdx.x; d < A_size1; d += stride)
    A[(d * A_inc1 + A_start1) * A_internal_size2 + d * A_inc2 + A_start2] = alpha;
}
+
+//
+// binary element-wise operations
+//
+
/** @brief Elementwise binary operation A = B op C for row-major (sub-)matrices.
*
* op_type selects the operation: 0 -> product, 1 -> division, 2 -> pow.
* Any other op_type leaves A untouched (same as the original dispatch).
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void element_op_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  // op_type is uniform across the grid, so the per-row switch does not diverge.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    NumericT const * row_C = C + (r * C_inc1 + C_start1) * C_internal_size2;

    switch (op_type)
    {
    case 2:
      for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
        row_A[c * A_inc2 + A_start2] = pow(row_B[c * B_inc2 + B_start2],
                                           row_C[c * C_inc2 + C_start2]);
      break;
    case 1:
      for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
        row_A[c * A_inc2 + A_start2] = row_B[c * B_inc2 + B_start2]
                                     / row_C[c * C_inc2 + C_start2];
      break;
    case 0:
      for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
        row_A[c * A_inc2 + A_start2] = row_B[c * B_inc2 + B_start2]
                                     * row_C[c * C_inc2 + C_start2];
      break;
    default:
      break;
    }
  }
}
+
/** @brief Elementwise binary operation A = B op C for row-major integer (sub-)matrices.
*
* op_type: 0 -> product, 1 -> division. The pow case (op_type == 2) is deliberately
* absent in this integer variant, matching the original dispatch; other values of
* op_type leave A untouched.
*/
template<typename NumericT>
__global__ void element_op_int_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2,

          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1,  unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  // op_type is uniform across the grid, so this branch does not diverge.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    NumericT const * row_C = C + (r * C_inc1 + C_start1) * C_internal_size2;

    if (op_type == 1)
    {
      for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
        row_A[c * A_inc2 + A_start2] = row_B[c * B_inc2 + B_start2]
                                     / row_C[c * C_inc2 + C_start2];
    }
    else if (op_type == 0)
    {
      for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
        row_A[c * A_inc2 + A_start2] = row_B[c * B_inc2 + B_start2]
                                     * row_C[c * C_inc2 + C_start2];
    }
  }
}
+
+//
+// unary element-wise operations
+//
+
+// abs
/** @brief Elementwise A = abs(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_abs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = abs(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// acos
/** @brief Elementwise A = acos(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_acos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = acos(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// asin
/** @brief Elementwise A = asin(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_asin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = asin(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// atan
/** @brief Elementwise A = atan(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_atan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = atan(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// ceil
/** @brief Elementwise A = ceil(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_ceil_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = ceil(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// cos
/** @brief Elementwise A = cos(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_cos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = cos(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// cosh
/** @brief Elementwise A = cosh(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_cosh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = cosh(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// exp
/** @brief Elementwise A = exp(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_exp_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = exp(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// fabs
/** @brief Elementwise A = fabs(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_fabs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = fabs(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// floor
/** @brief Elementwise A = floor(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_floor_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = floor(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// log
/** @brief Elementwise A = log(B) (natural logarithm) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_log_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = log(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// log10
/** @brief Elementwise A = log10(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_log10_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = log10(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// sin
/** @brief Elementwise A = sin(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_sin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = sin(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// sinh
/** @brief Elementwise A = sinh(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_sinh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = sinh(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// sqrt
/** @brief Elementwise A = sqrt(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_sqrt_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = sqrt(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// tan
/** @brief Elementwise A = tan(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_tan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = tan(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+// tanh
/** @brief Elementwise A = tanh(B) for row-major (sub-)matrices.
*
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void matrix_row_element_tanh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1,  unsigned int B_internal_size2)
{
  // blockIdx.x == (gid / blockDim.x) and threadIdx.x == (gid % blockDim.x) of the original.
  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    NumericT       * row_A = A + (r * A_inc1 + A_start1) * A_internal_size2;
    NumericT const * row_B = B + (r * B_inc1 + B_start1) * B_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_A[c * A_inc2 + A_start2] = tanh(row_B[c * B_inc2 + B_start2]);
  }
}
+
+
+
+//
+// matrix-vector product
+//
+
/** @brief Row-major matrix-vector product: result = A * v.
*
* Launch layout: each block processes rows blockIdx.x, blockIdx.x + gridDim.x, ...;
* the threads of a block accumulate strided partial dot products which are then
* combined with a shared-memory tree reduction, and lane 0 writes the row result.
*
* Preconditions: blockDim.x must be a power of two (required by the halving
* reduction) and at most 128 (the size of the shared 'work' buffer).
*/
template<typename NumericT>
__global__ void vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  __shared__ NumericT work[128];

  unsigned int lid = threadIdx.x;

  // All threads of a block share the same 'row', so loop trip counts (and hence
  // barrier counts) are uniform within the block.
  for (unsigned int row = blockIdx.x; row < A_row_size; row += gridDim.x)
  {
    NumericT dot_prod = 0;
    for (unsigned int col = threadIdx.x; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    // Tree reduction in shared memory:
    for (unsigned int stride = blockDim.x/2; stride>0; stride>>=1)
    {
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid+stride];
    }

    // Bug fix: a barrier is required here. Without it, threads with lid > 0 may
    // race ahead to the next row iteration and overwrite work[lid] while lane 0
    // is still reading work[1] in the final (stride == 1) reduction step above,
    // producing nondeterministically corrupted results.
    __syncthreads();

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];
  }
}
+
+
/** @brief Transposed row-major matrix-vector product: result = A^T * v.
*
* One thread per result entry (grid-stride loop over the columns of A); each
* thread walks down a full column of A sequentially.
*/
template<typename NumericT>
__global__ void trans_vec_mul_row_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  unsigned int step = gridDim.x * blockDim.x;
  for (unsigned int out_idx = blockIdx.x * blockDim.x + threadIdx.x; out_idx < A_col_size; out_idx += step)
  {
    unsigned int col_offset = out_idx * A_col_inc + A_col_start; // fixed column of A
    NumericT accum = 0;
    for (unsigned int k = 0; k < A_row_size; ++k)
      accum += A[col_offset + (k * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * k];
    result[out_idx * result_inc + result_start] = accum;
  }
}
+
+
+//
+// matrix-matrix products
+//
+
+
+
+
+//
+// scaled rank-1-update
+//
+
+// alpha on CPU
/** @brief Scaled rank-1 update A += alpha * vec1 * vec2^T for a row-major (sub-)matrix (alpha passed by value from the host).
*
* options2 bit 0: negate alpha; bit 1: use the reciprocal of alpha.
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          NumericT val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -val : val;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    // Hoist the row-invariant factor alpha * vec1[r]:
    NumericT scaled_v1 = alpha * vec1[r * inc1 + start1];
    NumericT * row_ptr = A + (r * A_inc1 + A_start1) * A_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_ptr[c * A_inc2 + A_start2] += scaled_v1 * vec2[c * inc2 + start2];
  }
}
+
+
+// alpha on GPU
/** @brief Scaled rank-1 update A += alpha * vec1 * vec2^T for a row-major (sub-)matrix (alpha read from device memory).
*
* options2 bit 0: negate alpha; bit 1: use the reciprocal of alpha.
* Launch layout: each block strides over rows, threads stride over columns.
*/
template<typename NumericT>
__global__ void scaled_rank1_update_row_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1,  unsigned int A_internal_size2,

          const NumericT * val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -*val : *val;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  for (unsigned int r = blockIdx.x; r < A_size1; r += gridDim.x)
  {
    // Hoist the row-invariant factor alpha * vec1[r]:
    NumericT scaled_v1 = alpha * vec1[r * inc1 + start1];
    NumericT * row_ptr = A + (r * A_inc1 + A_start1) * A_internal_size2;
    for (unsigned int c = threadIdx.x; c < A_size2; c += blockDim.x)
      row_ptr[c * A_inc2 + A_start2] += scaled_v1 * vec2[c * inc2 + start2];
  }
}
+
+
+
+} // namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
new file mode 100644
index 0000000..4821f5b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/misc_operations.hpp
@@ -0,0 +1,91 @@
+#ifndef VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/cuda/misc_operations.hpp
+    @brief Implementations of miscellaneous operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace detail
+{
+
/** @brief Performs one substitution sweep over a level of independent rows (CSR-like layout).
*
* For each entry of the level, subtracts the dot product of the stored row with
* 'vec' from vec[row_index_array[row]] and writes the result back. Rows within
* one launch are assumed independent (that is the point of level scheduling) —
* no cross-thread synchronization is performed.
*/
template<typename NumericT>
__global__ void level_scheduling_substitute_kernel(
          const unsigned int * row_index_array,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * vec,
          unsigned int size)
{
  unsigned int stride = gridDim.x * blockDim.x;
  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size; idx += stride)
  {
    unsigned int target_row = row_index_array[idx];
    NumericT value = vec[target_row];

    unsigned int j_stop = row_indices[idx + 1];
    for (unsigned int j = row_indices[idx]; j < j_stop; ++j)
      value -= elements[j] * vec[column_indices[j]];

    vec[target_row] = value;
  }
}
+
+
+
/** @brief Host wrapper launching level_scheduling_substitute_kernel for one level of independent rows.
*
* @param vec              Vector to substitute into (modified in place on the device)
* @param row_index_array  Device buffer: for each row of the level, the target index in vec
* @param row_buffer       Device buffer: CSR-like row start offsets
* @param col_buffer       Device buffer: CSR-like column indices
* @param element_buffer   Device buffer: matrix entries
* @param num_rows         Number of rows in this level
*/
template<typename NumericT>
void level_scheduling_substitute(vector<NumericT> & vec,
                             viennacl::backend::mem_handle const & row_index_array,
                             viennacl::backend::mem_handle const & row_buffer,
                             viennacl::backend::mem_handle const & col_buffer,
                             viennacl::backend::mem_handle const & element_buffer,
                             vcl_size_t num_rows
                            )
{
  level_scheduling_substitute_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(row_index_array),
                                                   viennacl::cuda_arg<unsigned int>(row_buffer),
                                                   viennacl::cuda_arg<unsigned int>(col_buffer),
                                                   viennacl::cuda_arg<NumericT>(element_buffer),
                                                   viennacl::cuda_arg(vec),
                                                   static_cast<unsigned int>(num_rows)
                                                  );
  // Bug fix: kernel launches fail silently without this check. The macro comes
  // from viennacl/linalg/cuda/common.hpp (included above) and is the library's
  // standard post-launch error check.
  VIENNACL_CUDA_LAST_ERROR_CHECK("level_scheduling_substitute_kernel");
}
+
+} //namespace detail
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
new file mode 100644
index 0000000..109f74f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/nmf_operations.hpp
@@ -0,0 +1,152 @@
+#ifndef VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/cuda/nmf_operations.hpp
+ @brief Implementations of NMF operations using CUDA
+ */
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+#include "viennacl/linalg/cuda/common.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
/** @brief Main CUDA kernel for nonnegative matrix factorization: elementwise update matrix1 <- matrix1 .* matrix2 ./ matrix3, with tiny denominators mapped to zero. */
template<typename NumericT>
__global__ void el_wise_mul_div(NumericT       * matrix1,
                                NumericT const * matrix2,
                                NumericT const * matrix3,
                                unsigned int size)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride)
  {
    NumericT denom = matrix3[idx];
    NumericT numer = matrix1[idx] * matrix2[idx];
    // Guard against division by (near-)zero denominators:
    matrix1[idx] = (denom > (NumericT) 0.00001) ? (numer / denom) : NumericT(0);
  }
}
+
/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
 *
 * @param V     Input matrix
 * @param W     First factor
 * @param H     Second factor
 * @param conf  A configuration object holding tolerances and the like
 */
template<typename NumericT>
void nmf(viennacl::matrix_base<NumericT> const & V,
         viennacl::matrix_base<NumericT> & W,
         viennacl::matrix_base<NumericT> & H,
         viennacl::linalg::nmf_config const & conf)
{
  vcl_size_t k = W.size2();
  conf.iters_ = 0;

  // Seed a factor with all-ones if it was passed in with zero Frobenius norm (i.e. uninitialized):
  if (!viennacl::linalg::norm_frobenius(W))
    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1.0));

  if (!viennacl::linalg::norm_frobenius(H))
    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1.0));

  // Work buffers for the multiplicative update rules (numerator, denominator, intermediate product):
  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major());
  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major());
  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major());

  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major());
  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major());
  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major());

  // Buffer for the current approximation W*H when checking convergence:
  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major());
  // NOTE(review): a 'diff' vector of size V.size1()*V.size2() was previously allocated here but
  // never used; the allocation has been removed to save device memory.

  NumericT last_diff = 0;
  NumericT diff_init = 0;
  bool stagnation_flag = false;

  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
  {
    conf.iters_ = i + 1;

    // Multiplicative update for H:  H <- H .* (W^T V) ./ (W^T W H)
    hn = viennacl::linalg::prod(trans(W), V);
    htmp = viennacl::linalg::prod(trans(W), W);
    hd = viennacl::linalg::prod(htmp, H);

    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(H),
                                  viennacl::cuda_arg<NumericT>(hn),
                                  viennacl::cuda_arg<NumericT>(hd),
                                  static_cast<unsigned int>(H.internal_size1() * H.internal_size2()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");

    // Multiplicative update for W:  W <- W .* (V H^T) ./ (W H H^T)
    wn   = viennacl::linalg::prod(V, trans(H));
    wtmp = viennacl::linalg::prod(W, H);
    wd   = viennacl::linalg::prod(wtmp, trans(H));

    el_wise_mul_div<<<128, 128>>>(viennacl::cuda_arg<NumericT>(W),
                                  viennacl::cuda_arg<NumericT>(wn),
                                  viennacl::cuda_arg<NumericT>(wd),
                                  static_cast<unsigned int>( W.internal_size1() * W.internal_size2()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("el_wise_mul_div");

    if (i % conf.check_after_steps() == 0)  //check for convergence
    {
      appr = viennacl::linalg::prod(W, H);

      appr -= V;
      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);

      // First convergence check establishes the baseline error for the relative tolerance:
      if (i == 0)
        diff_init = diff_val;

      if (conf.print_relative_error())
        std::cout << diff_val / diff_init << std::endl;

      // Approximation check
      if (diff_val / diff_init < conf.tolerance())
        break;

      // Stagnation check
      if (std::fabs(diff_val - last_diff) / (diff_val * conf.check_after_steps()) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
      {
        if (stagnation_flag)  // iteration stagnates (two iterates with no notable progress)
          break;
        else
          // record stagnation in this iteration
          stagnation_flag = true;
      } else
        // good progress in this iteration, so unset stagnation flag
        stagnation_flag = false;

      // prepare for next iterate:
      last_diff = diff_val;
    }
  }
}
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_CUDA_NMF_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
new file mode 100644
index 0000000..3adaca2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/scalar_operations.hpp
@@ -0,0 +1,375 @@
+#ifndef VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_CUDA_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/cuda/scalar_operations.hpp
+    @brief Implementations of scalar operations using CUDA
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/linalg/cuda/common.hpp"
+
+// includes CUDA
+#include <cuda_runtime.h>
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+
+/////////////////// as /////////////////////////////
+
/** @brief Kernel computing *s1 = alpha * (*s2), where alpha resides on the GPU; bit 0 of options2 flips the sign, bit 1 requests the reciprocal. */
template<typename NumericT>
__global__ void as_kernel(NumericT * s1, const NumericT * fac2, unsigned int options2, const NumericT * s2)
{
  NumericT alpha = *fac2;
  alpha = (options2 & (1 << 0)) ? -alpha               : alpha;  // flip sign
  alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha  : alpha;  // reciprocal

  *s1 = *s2 * alpha;
}
+
/** @brief Kernel computing *s1 = alpha * (*s2), where alpha is passed by value from the host; bit 0 of options2 flips the sign, bit 1 requests the reciprocal. */
template<typename NumericT>
__global__ void as_kernel(NumericT * s1, NumericT fac2, unsigned int options2, const NumericT * s2)
{
  NumericT alpha = fac2;
  alpha = (options2 & (1 << 0)) ? -alpha               : alpha;  // flip sign
  alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha  : alpha;  // reciprocal

  *s1 = *s2 * alpha;
}
+
/** @brief Computes s1 = alpha * s2 for GPU scalars, where alpha may live on host or device and may be sign-flipped and/or inverted. */
template<typename ScalarT1,
         typename ScalarT2, typename NumericT>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_any_scalar<NumericT>::value
                            >::type
as(ScalarT1 & s1,
   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;

  // Pack the sign-flip/reciprocal flags into the option bitfield consumed by the kernel:
  unsigned int const alpha_options = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // Stage a CPU scalar so a device-compatible argument reference can be formed:
  value_type staged_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT>::value)
    staged_alpha = alpha;

  as_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                      viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, staged_alpha)),
                      alpha_options,
                      viennacl::cuda_arg(s2));
  VIENNACL_CUDA_LAST_ERROR_CHECK("as_kernel");
}
+
+//////////////////// asbs ////////////////////////////
+
// alpha and beta on GPU
/** @brief Kernel computing *s1 = alpha * (*s2) + beta * (*s3); each option bitfield requests sign flip (bit 0) and/or reciprocal (bit 1). */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            const NumericT * fac2, unsigned int options2, const NumericT * s2,
                            const NumericT * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
+
// alpha on CPU, beta on GPU
/** @brief Kernel computing *s1 = alpha * (*s2) + beta * (*s3) with alpha passed by value and beta on the device. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT fac2,         unsigned int options2, const NumericT * s2,
                            NumericT const * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
+
// alpha on GPU, beta on CPU
/** @brief Kernel computing *s1 = alpha * (*s2) + beta * (*s3) with alpha on the device and beta passed by value. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT const * fac2, unsigned int options2, const NumericT * s2,
                            NumericT         fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
+
// alpha and beta on CPU
/** @brief Kernel computing *s1 = alpha * (*s2) + beta * (*s3) with both factors passed by value from the host. */
template<typename NumericT>
__global__ void asbs_kernel(NumericT * s1,
                            NumericT fac2, unsigned int options2, const NumericT * s2,
                            NumericT fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 = *s2 * alpha + *s3 * beta;
}
+
+
/** @brief Computes s1 = alpha * s2 + beta * s3 for GPU scalars; alpha and beta may each live on host or device and may be sign-flipped and/or inverted. */
template<typename ScalarT1,
         typename ScalarT2, typename NumericT1,
         typename ScalarT3, typename NumericT2>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_scalar<ScalarT3>::value
                              && viennacl::is_any_scalar<NumericT1>::value
                              && viennacl::is_any_scalar<NumericT2>::value
                            >::type
asbs(ScalarT1 & s1,
     ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
     ScalarT3 const & s3, NumericT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;

  // Encode the flip-sign/reciprocal requests as bitfields understood by the kernel:
  unsigned int const alpha_options = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int const beta_options  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // Stage CPU scalars so device-compatible argument references can be formed:
  value_type staged_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT1>::value)
    staged_alpha = alpha;

  value_type staged_beta = 0;
  if (viennacl::is_cpu_scalar<NumericT2>::value)
    staged_beta = beta;

  asbs_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                        viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, staged_alpha)),
                        alpha_options,
                        viennacl::cuda_arg(s2),
                        viennacl::cuda_arg<value_type>(detail::arg_reference(beta, staged_beta)),
                        beta_options,
                        viennacl::cuda_arg(s3) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_kernel");
}
+
+//////////////////// asbs_s ////////////////////
+
// alpha and beta on GPU
/** @brief Kernel computing *s1 += alpha * (*s2) + beta * (*s3); each option bitfield requests sign flip (bit 0) and/or reciprocal (bit 1). */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              const NumericT * fac2, unsigned int options2, const NumericT * s2,
                              const NumericT * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
+
// alpha on CPU, beta on GPU
/** @brief Kernel computing *s1 += alpha * (*s2) + beta * (*s3) with alpha passed by value and beta on the device. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              NumericT         fac2, unsigned int options2, const NumericT * s2,
                              NumericT const * fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = *fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
+
// alpha on GPU, beta on CPU
/** @brief Kernel computing *s1 += alpha * (*s2) + beta * (*s3) with alpha on the device and beta passed by value. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              NumericT const * fac2, unsigned int options2, const NumericT * s2,
                              NumericT         fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = *fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
+
// alpha and beta on CPU
/** @brief Kernel computing *s1 += alpha * (*s2) + beta * (*s3) with both factors passed by value from the host. */
template<typename NumericT>
__global__ void asbs_s_kernel(NumericT * s1,
                              NumericT fac2, unsigned int options2, const NumericT * s2,
                              NumericT fac3, unsigned int options3, const NumericT * s3)
{
    NumericT alpha = fac2;
    alpha = (options2 & (1 << 0)) ? -alpha              : alpha;
    alpha = (options2 & (1 << 1)) ? NumericT(1) / alpha : alpha;

    NumericT beta = fac3;
    beta = (options3 & (1 << 0)) ? -beta               : beta;
    beta = (options3 & (1 << 1)) ? NumericT(1) / beta  : beta;

    *s1 += *s2 * alpha + *s3 * beta;
}
+
+
/** @brief Computes s1 += alpha * s2 + beta * s3 for GPU scalars; alpha and beta may each live on host or device and may be sign-flipped and/or inverted. */
template<typename ScalarT1,
         typename ScalarT2, typename NumericT1,
         typename ScalarT3, typename NumericT2>
typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                              && viennacl::is_scalar<ScalarT3>::value
                              && viennacl::is_any_scalar<NumericT1>::value
                              && viennacl::is_any_scalar<NumericT2>::value
                            >::type
asbs_s(ScalarT1 & s1,
       ScalarT2 const & s2, NumericT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
       ScalarT3 const & s3, NumericT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        value_type;

  // Encode the flip-sign/reciprocal requests as bitfields understood by the kernel:
  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // Stage CPU scalars so device-compatible argument references can be formed:
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<NumericT1>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<NumericT2>::value)
    temporary_beta = beta;

  // (Removed a leftover debug print to std::cout that does not belong in a library header.)
  asbs_s_kernel<<<1, 1>>>(viennacl::cuda_arg(s1),
                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          viennacl::cuda_arg(s2),
                          viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                          options_beta,
                          viennacl::cuda_arg(s3) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("asbs_s_kernel");
}
+
+///////////////// swap //////////////////
+
/** @brief Kernel exchanging the values pointed to by s1 and s2. */
template<typename NumericT>
__global__ void scalar_swap_kernel(NumericT * s1, NumericT * s2)
{
  NumericT old_s1 = *s1;
  *s1 = *s2;
  *s2 = old_s1;
}
+
/** @brief Swaps the contents of two scalars, data is copied
*
* @param s1   The first scalar
* @param s2   The second scalar
*/
template<typename ScalarT1, typename ScalarT2>
typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
                              && viennacl::is_scalar<ScalarT2>::value
                            >::type
swap(ScalarT1 & s1, ScalarT2 & s2)
{
  // Note: an unused local typedef was removed; the kernel operates directly on the device pointers.
  scalar_swap_kernel<<<1, 1>>>(viennacl::cuda_arg(s1), viennacl::cuda_arg(s2));
  // Check for launch-configuration errors, consistent with the other launchers in this file:
  VIENNACL_CUDA_LAST_ERROR_CHECK("scalar_swap_kernel");
}
+
+
+
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

Reply via email to