http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
new file mode 100644
index 0000000..bd5116d
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
@@ -0,0 +1,1019 @@
+#ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file  viennacl/linalg/opencl/matrix_operations.hpp
+    @brief Implementations of dense matrix related operations, including 
matrix-vector products, using OpenCL.
+*/
+
+#include "viennacl/forwards.h"
+
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/linalg/opencl/kernels/svd.hpp"
+#include "viennacl/linalg/opencl/kernels/vector.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix.hpp"
+#include "viennacl/linalg/opencl/kernels/matrix_element.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+namespace detail
+{
+
+  template<typename NumericT>
+  viennacl::ocl::kernel & kernel_for_matrix(matrix_base<NumericT> const & M, 
std::string const & kernel_name)
+  {
+    viennacl::ocl::context & ctx = traits::opencl_context(M);
+    viennacl::ocl::program * program;
+    if (M.row_major())
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix<NumericT, row_major>  
KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    else
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix<NumericT, 
column_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    return program->get_kernel(kernel_name);
+  }
+
+  template<typename NumericT>
+  viennacl::ocl::kernel & element_kernel_for_matrix(matrix_base<NumericT> 
const & M, std::string const & kernel_name)
+  {
+    viennacl::ocl::context & ctx = traits::opencl_context(M);
+    viennacl::ocl::program * program;
+    if (M.row_major())
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, 
row_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    else
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, 
column_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    return program->get_kernel(kernel_name);
+  }
+
+  template<typename NumericT>
+  viennacl::ocl::kernel & legacy_kernel_for_matrix(matrix_base<NumericT> const 
& M, std::string const & kernel_name)
+  {
+    viennacl::ocl::context & ctx = traits::opencl_context(M);
+    viennacl::ocl::program * program;
+    if (M.row_major())
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, 
row_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    else
+    {
+      typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, 
column_major>  KernelClass;
+      KernelClass::init(ctx);
+      program = &ctx.get_program(KernelClass::program_name());
+    }
+    return program->get_kernel(kernel_name);
+  }
+
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the 
dispatcher frontend. No need to double-check again in here!
+//
+
// Names of the OpenCL kernels used by the SVD implementation; they must match the
// kernel names generated in viennacl/linalg/opencl/kernels/svd.hpp (included above).
const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = "house_update_A_right";
const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
const std::string SVD_GIVENS_NEXT_KERNEL = "givens_next";
const std::string SVD_COPY_COL_KERNEL = "copy_col";
const std::string SVD_COPY_ROW_KERNEL = "copy_row";
+
+template<typename DestNumericT, typename SrcNumericT>
+void convert(matrix_base<DestNumericT> & dest, matrix_base<SrcNumericT> const 
& src)
+{
+  assert(dest.row_major() == src.row_major() && bool("Addition/subtraction on 
mixed matrix layouts not supported yet!"));
+
+  assert(viennacl::traits::opencl_handle(dest).context() == 
viennacl::traits::opencl_handle(src).context() && bool("Matrices do not reside 
in the same OpenCL context. Automatic migration not yet supported!"));
+
+  std::string kernel_name("convert_");
+  kernel_name += dest.row_major() ? "row_" : "col_";
+  kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
+  kernel_name += "_";
+  kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
+
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(dest).context());
+  viennacl::linalg::opencl::kernels::matrix_convert::init(ctx);
+  viennacl::ocl::kernel& k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_convert::program_name(),
 kernel_name);
+
+  viennacl::ocl::enqueue(k( dest, cl_uint(dest.start1()), 
cl_uint(dest.stride1()), cl_uint(dest.size1()), cl_uint(dest.internal_size1()), 
cl_uint(dest.start2()), cl_uint(dest.stride2()), cl_uint(dest.size2()), 
cl_uint(dest.internal_size2()),
+                            src,  cl_uint( src.start1()), cl_uint( 
src.stride1()), cl_uint( src.size1()), cl_uint( src.internal_size1()), cl_uint( 
src.start2()), cl_uint( src.stride2()), cl_uint( src.size2()), cl_uint( 
src.internal_size2())
+                        ) );
+}
+
+//
+// Introductory note: By convention, all dimensions are already checked in the 
dispatcher frontend. No need to double-check again in here!
+//
+
/** @brief Implementation of mat1 = alpha * mat2 (with optional reciprocal and sign flip of alpha).
*
* @param mat1             The result matrix
* @param mat2             The right-hand-side matrix
* @param alpha            The scaling factor (host scalar or device scalar)
* @param len_alpha        Length of the scalar vector (for vectorized scalar handling)
* @param reciprocal_alpha Use 1/alpha instead of alpha
* @param flip_sign_alpha  Use -alpha instead of alpha
*/
template <typename NumericT,
          typename ScalarT1>
void am(matrix_base<NumericT> & mat1,
        matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  // "am_cpu" receives alpha by value; "am_gpu" reads it from a device buffer.
  viennacl::ocl::kernel & k= detail::kernel_for_matrix(mat1, (viennacl::is_cpu_scalar<ScalarT1>::value ? "am_cpu" : "am_gpu"));

  // Pack length / reciprocal / sign-flip flags into the option bitfield the kernel expects.
  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // NOTE: the argument order below must match the generated kernel signature exactly.
  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
                          cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
                          cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
                          cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
                          cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),

                          viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
                          options_alpha,
                          viennacl::traits::opencl_handle(mat2),
                          cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
                          cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
                          cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2))
                          )
                        );
}
+
+
+template <typename NumericT,
+          typename ScalarT1, typename ScalarT2>
+void ambm(matrix_base<NumericT> & mat1,
+          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, 
vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  
vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  std::string kernel_name;
+  if      ( viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_cpu_cpu";
+  else if ( viennacl::is_cpu_scalar<ScalarT1>::value && 
!viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_gpu_cpu";
+  else
+    kernel_name = "ambm_gpu_gpu";
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                          cl_uint(viennacl::traits::start1(mat1)),           
cl_uint(viennacl::traits::start2(mat1)),
+                          cl_uint(viennacl::traits::stride1(mat1)),          
cl_uint(viennacl::traits::stride2(mat1)),
+                          cl_uint(viennacl::traits::size1(mat1)),            
cl_uint(viennacl::traits::size2(mat1)),
+                          cl_uint(viennacl::traits::internal_size1(mat1)),   
cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                          options_alpha,
+                          viennacl::traits::opencl_handle(mat2),
+                          cl_uint(viennacl::traits::start1(mat2)),           
cl_uint(viennacl::traits::start2(mat2)),
+                          cl_uint(viennacl::traits::stride1(mat2)),          
cl_uint(viennacl::traits::stride2(mat2)),
+                          cl_uint(viennacl::traits::internal_size1(mat2)),   
cl_uint(viennacl::traits::internal_size2(mat2)),
+
+                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
+                          options_beta,
+                          viennacl::traits::opencl_handle(mat3),
+                          cl_uint(viennacl::traits::start1(mat3)),           
cl_uint(viennacl::traits::start2(mat3)),
+                          cl_uint(viennacl::traits::stride1(mat3)),          
cl_uint(viennacl::traits::stride2(mat3)),
+                          cl_uint(viennacl::traits::internal_size1(mat3)),   
cl_uint(viennacl::traits::internal_size2(mat3))
+                          )
+                        );
+}
+
+
+template <typename NumericT,
+          typename ScalarT1, typename ScalarT2>
+void ambm_m(matrix_base<NumericT> & mat1,
+            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, 
vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
+            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  
vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
+{
+  std::string kernel_name;
+  if      ( viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_m_cpu_cpu";
+  else if ( viennacl::is_cpu_scalar<ScalarT1>::value && 
!viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_m_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
+    kernel_name = "ambm_m_gpu_cpu";
+  else
+    kernel_name = "ambm_m_gpu_gpu";
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
+                          cl_uint(viennacl::traits::start1(mat1)),           
cl_uint(viennacl::traits::start2(mat1)),
+                          cl_uint(viennacl::traits::stride1(mat1)),          
cl_uint(viennacl::traits::stride2(mat1)),
+                          cl_uint(viennacl::traits::size1(mat1)),            
cl_uint(viennacl::traits::size2(mat1)),
+                          cl_uint(viennacl::traits::internal_size1(mat1)),   
cl_uint(viennacl::traits::internal_size2(mat1)),
+
+                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                          options_alpha,
+                          viennacl::traits::opencl_handle(mat2),
+                          cl_uint(viennacl::traits::start1(mat2)),           
cl_uint(viennacl::traits::start2(mat2)),
+                          cl_uint(viennacl::traits::stride1(mat2)),          
cl_uint(viennacl::traits::stride2(mat2)),
+                          cl_uint(viennacl::traits::internal_size1(mat2)),   
cl_uint(viennacl::traits::internal_size2(mat2)),
+
+                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
+                          options_beta,
+                          viennacl::traits::opencl_handle(mat3),
+                          cl_uint(viennacl::traits::start1(mat3)),           
cl_uint(viennacl::traits::start2(mat3)),
+                          cl_uint(viennacl::traits::stride1(mat3)),          
cl_uint(viennacl::traits::stride2(mat3)),
+                          cl_uint(viennacl::traits::internal_size1(mat3)),   
cl_uint(viennacl::traits::internal_size2(mat3))
+                          )
+                        );
+}
+
/** @brief Writes the transpose of the matrix wrapped by proxy into temp_trans.
*
* @param proxy       Expression wrapping the matrix to be transposed (trans(A))
* @param temp_trans  The matrix receiving the transposed result
*/
template<typename NumericT,
          typename SizeT, typename DistanceT>
void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy,
           matrix_base<NumericT> & temp_trans)
{
  // "trans_kernel" lives in the legacy kernel collection, hence the legacy lookup helper.
  std::string kernel_name("trans_kernel");
  viennacl::ocl::kernel& kernel = detail::legacy_kernel_for_matrix(proxy.lhs(),kernel_name);
  // NOTE: the argument order below must match the generated kernel signature exactly.
  viennacl::ocl::enqueue(kernel(proxy.lhs(),
                                static_cast<cl_uint>(proxy.lhs().start1()),         static_cast<cl_uint>(proxy.lhs().start2()),
                                static_cast<cl_uint>(proxy.lhs().internal_size1()), static_cast<cl_uint>(proxy.lhs().internal_size2()),
                                static_cast<cl_uint>(proxy.lhs().size1()),          static_cast<cl_uint>(proxy.lhs().size2()),
                                static_cast<cl_uint>(proxy.lhs().stride1()),        static_cast<cl_uint>(proxy.lhs().stride2()),

                                temp_trans,
                                static_cast<cl_uint>(temp_trans.start1()),         static_cast<cl_uint>(temp_trans.start2()),
                                static_cast<cl_uint>(temp_trans.internal_size1()), static_cast<cl_uint>(temp_trans.internal_size2()),
                                static_cast<cl_uint>(temp_trans.stride1()),        static_cast<cl_uint>(temp_trans.stride2())));
}
+
+template <typename NumericT>
+void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
+{
+  cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : 
cl_uint(viennacl::traits::size1(mat));
+  cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : 
cl_uint(viennacl::traits::size2(mat));
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "assign_cpu");
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
+                           cl_uint(viennacl::traits::start1(mat)),           
cl_uint(viennacl::traits::start2(mat)),
+                           cl_uint(viennacl::traits::stride1(mat)),          
cl_uint(viennacl::traits::stride2(mat)),
+                           s1,                                               
s2,
+                           cl_uint(viennacl::traits::internal_size1(mat)),   
cl_uint(viennacl::traits::internal_size2(mat)),
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
+                          )
+                        );
+}
+
/** @brief Sets the diagonal entries of the matrix to the value s (off-diagonal entries are untouched).
*
* @param mat  The matrix whose diagonal is written
* @param s    The value assigned to each diagonal entry
*/
template <typename NumericT>
void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
{
  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "diagonal_assign_cpu");
  // NOTE: the argument order below must match the generated kernel signature exactly.
  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
                           cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
                           cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
                           cl_uint(viennacl::traits::size1(mat)),            cl_uint(viennacl::traits::size2(mat)),
                           cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
                          )
                        );
}
+
+template <typename NumericT>
+void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, 
matrix_base<NumericT> & mat)
+{
+  // Step 1: set everything to zero
+  matrix_assign(mat, NumericT(0));
+
+  // Step 2: set the diagonal:
+
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)) * 
viennacl::traits::internal_size2(mat)
+                              + viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * 
viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)
+                              + (viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat),
+                              size_mat,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(vec),
+                              size_vec)
+                        );
+}
+
+template <typename NumericT>
+void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, 
vector_base<NumericT> & vec)
+{
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)) * 
viennacl::traits::internal_size2(mat)
+                              + viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * 
viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    vcl_size_t first_row_index = 0;
+    vcl_size_t first_col_index = 0;
+    if (k < 0)
+      first_row_index = vcl_size_t(-k);
+    else
+      first_col_index = vcl_size_t(k);
+    size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)
+                              + (viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                              size_vec,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(mat),
+                              size_mat)
+                        );
+}
+
/** @brief Copies row i of the matrix into the vector: vec = mat(i, :).
*
* @param mat  The matrix to read from
* @param i    The (logical) row index
* @param vec  The result vector
*/
template <typename NumericT>
void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
{
  // The row is described as a strided vector; the copy reuses the vector "av" kernel.
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
  KernelClass::init(ctx);

  cl_uint options_alpha = 0;
  viennacl::ocl::packed_cl_uint size_mat;
  if (mat.row_major())
  {
    // Row-major: row i starts (start1 + i*stride1) full rows (internal_size2 elements each) in;
    // consecutive row entries are stride2 apart.
    size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat));
    size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
    size_mat.size   = cl_uint(viennacl::traits::size(vec));
    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
  }
  else
  {
    // Column-major: consecutive row entries are a full column (internal_size1 elements) times stride2 apart.
    size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat));
    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat));
    size_mat.size   = cl_uint(viennacl::traits::size(vec));
    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
  }

  viennacl::ocl::packed_cl_uint size_vec;
  size_vec.start  = cl_uint(viennacl::traits::start(vec));
  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
  size_vec.size   = cl_uint(viennacl::traits::size(vec));
  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));


  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
                              size_vec,

                              viennacl::traits::opencl_handle(NumericT(1)),
                              options_alpha,
                              viennacl::traits::opencl_handle(mat),
                              size_mat)
                        );
}
+
+template <typename NumericT>
+void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, 
vector_base<NumericT> & vec)
+{
+  // reuse vector ambm kernel for assigning the elements:
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
+  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
+  KernelClass::init(ctx);
+
+  cl_uint options_alpha = 0;
+  viennacl::ocl::packed_cl_uint size_mat;
+  if (mat.row_major())
+  {
+    size_mat.start  = cl_uint(viennacl::traits::start1(mat) * 
viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * 
viennacl::traits::stride2(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+  else
+  {
+    size_mat.start  = cl_uint(viennacl::traits::start1(mat) + 
(viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat));
+    size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
+    size_mat.size   = cl_uint(viennacl::traits::size(vec));
+    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+  }
+
+  viennacl::ocl::packed_cl_uint size_vec;
+  size_vec.start  = cl_uint(viennacl::traits::start(vec));
+  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
+  size_vec.size   = cl_uint(viennacl::traits::size(vec));
+  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
+
+
+  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
+  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
+                              size_vec,
+
+                              viennacl::traits::opencl_handle(NumericT(1)),
+                              options_alpha,
+                              viennacl::traits::opencl_handle(mat),
+                              size_mat)
+                        );
+}
+
+
+//
+///////////////////////// Element-wise operation 
//////////////////////////////////
+//
+
+// Binary operations A = B .* C and A = B ./ C
+/** @brief Implementation of binary element-wise operations A = OP(B,C)
+*
+* @param A      The result matrix (or -range, or -slice)
+* @param proxy  The proxy object holding B, C, and the operation
+*/
+template <typename T, typename OP>
+void element_op(matrix_base<T> & A,
+                matrix_expression<const matrix_base<T>, const matrix_base<T>, 
op_element_binary<OP> > const & proxy)
+{
+  assert(viennacl::traits::opencl_handle(A).context() == 
viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not 
reside in the same OpenCL context. Automatic migration not yet supported!"));
+  assert(viennacl::traits::opencl_handle(A).context() == 
viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not 
reside in the same OpenCL context. Automatic migration not yet supported!"));
+
+  viennacl::ocl::kernel & k = detail::kernel_for_matrix(A, "element_op");
+
+  cl_uint op_type = 2; //0: product, 1: division, 2: power
+  if (viennacl::is_division<OP>::value)
+    op_type = 1;
+  else if (viennacl::is_product<OP>::value)
+    op_type = 0;
+
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
+                          cl_uint(viennacl::traits::start1(A)),           
cl_uint(viennacl::traits::start2(A)),
+                          cl_uint(viennacl::traits::stride1(A)),          
cl_uint(viennacl::traits::stride2(A)),
+                          cl_uint(viennacl::traits::size1(A)),            
cl_uint(viennacl::traits::size2(A)),
+                          cl_uint(viennacl::traits::internal_size1(A)),   
cl_uint(viennacl::traits::internal_size2(A)),
+
+                          viennacl::traits::opencl_handle(proxy.lhs()),
+                          cl_uint(viennacl::traits::start1(proxy.lhs())),      
     cl_uint(viennacl::traits::start2(proxy.lhs())),
+                          cl_uint(viennacl::traits::stride1(proxy.lhs())),     
     cl_uint(viennacl::traits::stride2(proxy.lhs())),
+                          
cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   
cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
+
+                          viennacl::traits::opencl_handle(proxy.rhs()),
+                          cl_uint(viennacl::traits::start1(proxy.rhs())),      
     cl_uint(viennacl::traits::start2(proxy.rhs())),
+                          cl_uint(viennacl::traits::stride1(proxy.rhs())),     
     cl_uint(viennacl::traits::stride2(proxy.rhs())),
+                          
cl_uint(viennacl::traits::internal_size1(proxy.rhs())),   
cl_uint(viennacl::traits::internal_size2(proxy.rhs())),
+
+                          op_type)
+                        );
+}
+
+
+// Unary operations
+
/** @brief Implementation of unary element-wise operations A = OP(B)
*
* The kernel name is derived from OP (e.g. "sqrt_assign") via detail::op_to_string().
*
* @param A      The result matrix (or -range, or -slice)
* @param proxy  The proxy object holding B and the operation
*/
template <typename T, typename OP>
void element_op(matrix_base<T> & A,
                matrix_expression<const matrix_base<T>, const matrix_base<T>, op_element_unary<OP> > const & proxy)
{
  assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
  assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));

  viennacl::ocl::kernel & k = detail::element_kernel_for_matrix(A, detail::op_to_string(OP()) + "_assign");

  // NOTE: the argument order below must match the generated kernel signature exactly.
  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
                           cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
                           cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
                           cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
                           cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),

                           viennacl::traits::opencl_handle(proxy.lhs()),
                           cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
                           cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
                           cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())))
                        );
}
+
+
+//
+/////////////////////////   matrix-vector products 
/////////////////////////////////
+//
+
+// A * x
+
/** @brief Carries out matrix-vector multiplication
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat     The matrix
* @param trans_A Whether mat is to be taken as transposed (selects the "trans_vec_mul" kernel instead of "vec_mul")
* @param vec     The vector
* @param result  The result vector; must not share its buffer with vec
*/
template <typename NumericT>
void prod_impl(const matrix_base<NumericT> & mat, bool trans_A,
               const vector_base<NumericT> & vec,
                     vector_base<NumericT> & result)
{
  // vec and result must be distinct buffers; in-place operation is rejected:
  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));

  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, trans_A ? "trans_vec_mul" : "vec_mul");

  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
                          cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)),
                          cl_uint(viennacl::traits::stride1(mat)),        cl_uint(viennacl::traits::stride2(mat)),
                          cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
                          cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),

                          viennacl::traits::opencl_handle(vec),
                          cl_uint(viennacl::traits::start(vec)),
                          cl_uint(viennacl::traits::stride(vec)),
                          cl_uint(viennacl::traits::size(vec)),

                          viennacl::traits::opencl_handle(result),
                          cl_uint(viennacl::traits::start(result)),
                          cl_uint(viennacl::traits::stride(result)),
                          cl_uint(viennacl::traits::size(result)),

                          // one NumericT of local scratch per work item (presumably for the
                          // per-row reduction -- confirm against the kernel source)
                          viennacl::ocl::local_mem(sizeof(NumericT) * k.local_work_size())
                        ) );
}
+
+
+//
+
+
/** @brief Carries out matrix-matrix multiplication
*
* Implementation of C = prod(A, B);
*
* @param A        First factor
* @param A_trans  Whether A is to be taken as transposed
* @param B        Second factor
* @param B_trans  Whether B is to be taken as transposed
* @param C        The result matrix
* @param alpha    Scaling factor applied to the product
* @param beta     Scaling factor applied to the previous value of C
*/
template<typename NumericT, typename ScalarType >
void prod_impl(matrix_base<NumericT> const & A, bool A_trans,
               matrix_base<NumericT> const & B, bool B_trans,
               matrix_base<NumericT>       & C,
               ScalarType alpha,
               ScalarType beta)
{
    // Fold the storage layout into the transpose flag: a row-major operand flips
    // its effective transposition (presumably because the generated kernels treat
    // operands in a column-major view -- confirm against the kernel generator).
    bool effective_A_trans = A_trans ^ A.row_major();
    bool effective_B_trans = B_trans ^ B.row_major();

    // Selects one of the four kernels prod_NN, prod_NT, prod_TN, prod_TT:
    char cAt = effective_A_trans ? 'T' : 'N';
    char cBt = effective_B_trans ? 'T' : 'N';

    std::string kernel_prefix("prod_");
    kernel_prefix+=cAt;
    kernel_prefix+=cBt;

    // Build a scheduler statement for the scaled product and hand it to the
    // device-specific execution handler for matrix products:
    scheduler::statement statement = scheduler::preset::mat_mat_prod(alpha, &A, effective_A_trans, &B, effective_B_trans, beta, &C);
    kernels::matrix_prod<NumericT>::execution_handler(C.row_major(), viennacl::traits::opencl_context(C)).execute(kernel_prefix, statement);
}
+
+//
+/////////////////////////   miscellaneous operations 
/////////////////////////////////
+//
+
+
+/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, 
i.e. a scaled rank 1 update
+*
+* Implementation of the convenience expression result += alpha * 
outer_prod(vec1, vec2);
+*
+* @param A    The matrix to be updated
+* @param alpha            The scaling factor (either a viennacl::scalar<>, 
float, or double)
+* @param len_alpha        Length of the buffer for an eventual final reduction 
step (currently always '1')
+* @param reciprocal_alpha Use 1/alpha instead of alpha
+* @param flip_sign_alpha  Use -alpha instead of alpha
+* @param vec1    The first vector
+* @param vec2    The second vector
+*/
+template<typename NumericT, typename ScalarT1>
+void scaled_rank_1_update(matrix_base<NumericT> & A,
+                          ScalarT1 const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha,
+                          const vector_base<NumericT> & vec1,
+                          const vector_base<NumericT> & vec2)
+{
+  assert( (viennacl::traits::size1(A) == viennacl::traits::size(vec1)) && 
bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
+  assert( (viennacl::traits::size2(A) == viennacl::traits::size(vec2)) && 
bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
+  bool is_cpu = viennacl::is_cpu_scalar<ScalarT1>::value;
+  viennacl::ocl::kernel& kernel= detail::legacy_kernel_for_matrix(A, is_cpu ? 
"scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");
+
+  viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(A),
+                           cl_uint(viennacl::traits::start1(A)),           
cl_uint(viennacl::traits::start2(A)),
+                           cl_uint(viennacl::traits::stride1(A)),          
cl_uint(viennacl::traits::stride2(A)),
+                           cl_uint(viennacl::traits::size1(A)),            
cl_uint(viennacl::traits::size2(A)),
+                           cl_uint(viennacl::traits::internal_size1(A)),   
cl_uint(viennacl::traits::internal_size2(A)),
+
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
+                           options_alpha,
+
+                           viennacl::traits::opencl_handle(vec1),
+                           cl_uint(viennacl::traits::start(vec1)),
+                           cl_uint(viennacl::traits::stride(vec1)),
+                           cl_uint(viennacl::traits::size(vec1)),
+
+                           viennacl::traits::opencl_handle(vec2),
+                           cl_uint(viennacl::traits::start(vec2)),
+                           cl_uint(viennacl::traits::stride(vec2)),
+                           cl_uint(viennacl::traits::size(vec2))
+                          )
+                        );
+}
+
+//
/** @brief Collects the two bidiagonal bands of A into the host vectors dh and sh
*   via the SVD_BIDIAG_PACK_KERNEL (presumably diagonal into dh, superdiagonal into
*   sh -- confirm against the kernel source).
*
* NOTE(review): unlike bidiag_pack() below, this variant neither calls
* svd<...>::init(ctx) nor passes a layout template argument -- it assumes the SVD
* program is already built for this context; verify for first-use scenarios.
*
* @param A   The matrix whose bands are packed
* @param dh  Host vector receiving the contents of device buffer D
* @param sh  Host vector receiving the contents of device buffer S
*/
template <typename SCALARTYPE, typename VectorType>
void bidiag_pack_svd(viennacl::matrix<SCALARTYPE>& A,
                 VectorType & dh,
                 VectorType & sh
                )
{
  // Device-side staging buffers, filled by the kernel below:
  viennacl::vector<SCALARTYPE> D(dh.size());
  viennacl::vector<SCALARTYPE> S(sh.size());

  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
  viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_BIDIAG_PACK_KERNEL);

  viennacl::ocl::enqueue(kernel(
                                A,
                                D,
                                S,
                                static_cast<cl_uint>(A.size1()),
                                static_cast<cl_uint>(A.size2()),
                                static_cast<cl_uint>(A.internal_size2())
                              ));

  // Transfer the packed bands back to the host-side vectors:
  fast_copy(D, dh);
  fast_copy(S, sh);
}
+
+
/** @brief Device-side variant of bidiag_pack_svd(): packs the bands of A directly
*   into the device vectors dh and sh, dispatching on the storage layout of A.
*
* @param A   The matrix whose bands are packed
* @param dh  Device vector receiving the first band
* @param sh  Device vector receiving the second band
*/
template <typename NumericT>
void bidiag_pack(matrix_base<NumericT> & A,
                 viennacl::vector<NumericT> & dh,
                 viennacl::vector<NumericT> & sh
                )
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

  if(A.row_major())
  {
      // Ensure the row-major SVD program is compiled for this context:
      viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);

      viennacl::ocl::enqueue(kernel(
                                    A,
                                    dh,
                                    sh,
                                    cl_uint(viennacl::traits::size1(A)),
                                    cl_uint(viennacl::traits::size2(A)),
                                    cl_uint(viennacl::traits::internal_size2(A))
                                  ));
  }
  else
  {
      // Column-major storage uses a separately generated program:
      viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
      viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);

      viennacl::ocl::enqueue(kernel(
                                    A,
                                    dh,
                                    sh,
                                    cl_uint(viennacl::traits::size1(A)),
                                    cl_uint(viennacl::traits::size2(A)),
                                    cl_uint(viennacl::traits::internal_size2(A))
                                  ));
  }
}
+
+
+template <typename NumericT>
+void house_update_A_left(matrix_base<NumericT> & A,
+                         vector_base<NumericT> & D,
+                         vcl_size_t start)
+{
+
+    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(A).context());
+    if(A.row_major())
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      static_cast<cl_uint>(start + 1),
+                                      static_cast<cl_uint>(start),
+                                      cl_uint(viennacl::traits::size1(A)),
+                                      cl_uint(viennacl::traits::size2(A)),
+                                      
cl_uint(viennacl::traits::internal_size2(A)),
+                                      
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
+                              ));
+    }
+    else
+    {
+        viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
+        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
+        viennacl::ocl::enqueue(kernel(
+                                      A,
+                                      D,
+                                      static_cast<cl_uint>(start + 1),
+                                      static_cast<cl_uint>(start),
+                                      cl_uint(viennacl::traits::size1(A)),
+                                      cl_uint(viennacl::traits::size2(A)),
+                                      
cl_uint(viennacl::traits::internal_size2(A)),
+                                      
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
+                              ));
+    }
+
+
+
+
+}
+
/** @brief Applies the Householder vector D to A from the right via
*   SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL (exact update semantics are kernel-defined).
*
* @param A  The matrix to be updated (row- or column-major)
* @param D  The Householder vector
*/
template <typename NumericT>
void house_update_A_right(matrix_base<NumericT> & A,
                          vector_base<NumericT> & D)
{
    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

    if(A.row_major())
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);

        viennacl::ocl::enqueue(kernel(
                                      A,
                                      D,
                                      // offsets are fixed at zero here (cf. house_update_A_left):
                                      static_cast<cl_uint>(0),
                                      static_cast<cl_uint>(0),
                                      cl_uint(viennacl::traits::size1(A)),
                                      cl_uint(viennacl::traits::size2(A)),
                                      cl_uint(viennacl::traits::internal_size2(A)),
                                      // local scratch for 128 entries of NumericT:
                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
                              ));
    }
    else
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);

        viennacl::ocl::enqueue(kernel(
                                      A,
                                      D,
                                      static_cast<cl_uint>(0),
                                      static_cast<cl_uint>(0),
                                      cl_uint(viennacl::traits::size1(A)),
                                      cl_uint(viennacl::traits::size2(A)),
                                      cl_uint(viennacl::traits::internal_size2(A)),
                                      viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
                              ));
    }
}
+
+
+
/** @brief Updates the matrix Q with the Householder vector D via
*   SVD_HOUSEHOLDER_UPDATE_QL_KERNEL (exact update semantics are kernel-defined).
*
* @param Q        The matrix to be updated (row- or column-major)
* @param D        The Householder vector
* @param A_size1  Row count of the original matrix A, forwarded to the kernel
*/
template <typename NumericT>
void house_update_QL(matrix_base<NumericT> & Q,
                     vector_base<NumericT> & D,
                     vcl_size_t A_size1)

{
    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(Q).context());

    if(Q.row_major())
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);

        viennacl::ocl::enqueue(kernel(
                                        Q,
                                        D,
                                        cl_uint(A_size1),
                                        cl_uint(viennacl::traits::internal_size2(Q)),
                                        // local scratch for 128 entries of NumericT:
                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
                                    ));
    }
    else
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);

        viennacl::ocl::enqueue(kernel(
                                        Q,
                                        D,
                                        cl_uint(A_size1),
                                        cl_uint(viennacl::traits::internal_size2(Q)),
                                        viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
                                    ));
    }

}
+
+
/** @brief Launches SVD_GIVENS_NEXT_KERNEL on the matrix with coefficient vectors
*   tmp1 and tmp2 and index range [l, m-1] (exact rotation semantics are kernel-defined;
*   presumably tmp1/tmp2 hold the cosine/sine coefficients of Givens rotations --
*   confirm against the kernel source).
*
* @param matrix  The matrix to be updated
* @param tmp1    First coefficient vector consumed by the kernel
* @param tmp2    Second coefficient vector consumed by the kernel
* @param l       Lower index forwarded to the kernel
* @param m       Upper bound; the kernel receives m - 1
*/
template<typename NumericT>
  void givens_next(matrix_base<NumericT> & matrix,
                  vector_base<NumericT>& tmp1,
                  vector_base<NumericT>& tmp2,
                  int l,
                  int m
                )
  {
    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());

    if(matrix.row_major())
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
        // One work item per row, rounded up to full work groups of 256:
        kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
        kernel.local_work_size(0, 256);

        viennacl::ocl::enqueue(kernel(
                                      matrix,
                                      tmp1,
                                      tmp2,
                                      cl_uint(viennacl::traits::size1(matrix)),
                                      cl_uint(viennacl::traits::internal_size2(matrix)),
                                      static_cast<cl_uint>(l),
                                      static_cast<cl_uint>(m - 1)
                              ));
    }
    else
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
        kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
        kernel.local_work_size(0, 256);

        viennacl::ocl::enqueue(kernel(
                                      matrix,
                                      tmp1,
                                      tmp2,
                                      cl_uint(viennacl::traits::size1(matrix)),
                                      cl_uint(viennacl::traits::internal_size2(matrix)),
                                      static_cast<cl_uint>(l),
                                      static_cast<cl_uint>(m - 1)
                              ));
    }


  }
+
  /** @brief Copies one row or one column of A into the vector V, starting at the
  *   given offsets.
  *
  * @param A          Source matrix
  * @param V          Destination vector
  * @param row_start  Row offset at which the copy starts
  * @param col_start  Column offset at which the copy starts
  * @param copy_col   If true, a column is copied (length size1(A), kernel
  *                   SVD_COPY_COL_KERNEL); otherwise a row (length size2(A),
  *                   kernel SVD_COPY_ROW_KERNEL)
  */
  template <typename NumericT>
  void copy_vec(matrix_base<NumericT>& A,
                vector_base<NumericT> & V,
                vcl_size_t row_start,
                vcl_size_t col_start,
                bool copy_col
  )
  {
    std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : SVD_COPY_ROW_KERNEL;
    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

    if(A.row_major())
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::program_name(), kernel_name);

        viennacl::ocl::enqueue(kernel(
                                      A,
                                      V,
                                      static_cast<cl_uint>(row_start),
                                      static_cast<cl_uint>(col_start),
                                      // element count: rows for a column copy, columns for a row copy
                                      copy_col ? cl_uint(viennacl::traits::size1(A))
                                               : cl_uint(viennacl::traits::size2(A)),
                                      static_cast<cl_uint>(A.internal_size2())
                              ));
    }
    else
    {
        viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::init(ctx);
        viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, column_major>::program_name(), kernel_name);

        viennacl::ocl::enqueue(kernel(
                                      A,
                                      V,
                                      static_cast<cl_uint>(row_start),
                                      static_cast<cl_uint>(col_start),
                                      copy_col ? cl_uint(viennacl::traits::size1(A))
                                               : cl_uint(viennacl::traits::size2(A)),
                                      static_cast<cl_uint>(A.internal_size2())
                              ));
    }


  }
+
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
new file mode 100644
index 0000000..83a3db7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
@@ -0,0 +1,69 @@
+#ifndef VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/opencl/misc_operations.hpp
+    @brief Implementations of miscellaneous operations using OpenCL, e.g. the ILU level-scheduling substitution step on compressed-matrix buffers
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/ilu.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+namespace detail
+{
+
/** @brief One substitution sweep of ILU level scheduling: launches the
*   'level_scheduling_substitute' kernel for a single level of rows, updating x in place.
*
* @param x                Vector updated by the kernel
* @param row_index_array  Buffer holding the indices of the rows belonging to this level
* @param row_buffer       Row pointer buffer of the sparse factor (CSR-style layout presumed -- confirm against the kernel source)
* @param col_buffer       Column index buffer of the nonzeros
* @param element_buffer   Buffer holding the nonzero values
* @param num_rows         Number of rows in this level
*/
template<typename NumericT>
void level_scheduling_substitute(vector<NumericT> & x,
                                 viennacl::backend::mem_handle const & row_index_array,
                                 viennacl::backend::mem_handle const & row_buffer,
                                 viennacl::backend::mem_handle const & col_buffer,
                                 viennacl::backend::mem_handle const & element_buffer,
                                 vcl_size_t num_rows
                                )
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());

  // Ensure the ILU kernels are compiled for the context of x:
  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
  viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "level_scheduling_substitute");

  viennacl::ocl::enqueue(k(row_index_array.opencl_handle(), row_buffer.opencl_handle(), col_buffer.opencl_handle(), element_buffer.opencl_handle(),
                           x,
                           static_cast<cl_uint>(num_rows)));
}
+
+} //namespace detail
+} // namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
new file mode 100644
index 0000000..5daf297
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
@@ -0,0 +1,139 @@
+#ifndef VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
+
+/* =========================================================================
+ Copyright (c) 2010-2016, Institute for Microelectronics,
+ Institute for Analysis and Scientific Computing,
+ TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC.
+
+ -----------------
+ ViennaCL - The Vienna Computing Library
+ -----------------
+
+ Project Head:    Karl Rupp                   [email protected]
+
+ (A list of authors and contributors can be found in the manual)
+
+ License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= 
*/
+
+/** @file viennacl/linalg/opencl/nmf_operations.hpp
+ @brief Implementations of NMF operations using OpenCL
+ */
+
+#include "viennacl/linalg/opencl/kernels/nmf.hpp"
+#include "viennacl/linalg/opencl/nmf_operations.hpp"
+
+#include "viennacl/linalg/host_based/nmf_operations.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
/** @brief The nonnegative matrix factorization (approximation) algorithm as suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into matrices W and H such that ||V - W*H|| is minimized.
 *
 * @param V     Input matrix
 * @param W     First factor
 * @param H     Second factor
 * @param conf  A configuration object holding tolerances and the like
 */
template<typename NumericT>
void nmf(viennacl::matrix_base<NumericT> const & V,
         viennacl::matrix_base<NumericT>       & W,
         viennacl::matrix_base<NumericT>       & H,
         viennacl::linalg::nmf_config const & conf)
{
  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(V).context());

  const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";

  viennacl::linalg::opencl::kernels::nmf<NumericT>::init(ctx);

  // Inner dimension of the factorization V (m x n) ~ W (m x k) * H (k x n):
  vcl_size_t k = W.size2();
  conf.iters_ = 0;  // NOTE(review): conf is const, so iters_ is presumably a mutable member -- verify in nmf_config

  // All-zero factors cannot be improved by multiplicative updates; re-seed with ones:
  if (viennacl::linalg::norm_frobenius(W) <= 0)
    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1), ctx);

  if (viennacl::linalg::norm_frobenius(H) <= 0)
    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1), ctx);

  // Work buffers: numerators (wn/hn) and denominators (wd/hd) of the update
  // rules, temporaries for the intermediate products, and the approximation W*H:
  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major(), ctx);
  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major(), ctx);
  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major(), ctx);

  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major(), ctx);
  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major(), ctx);
  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major(), ctx);

  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major(), ctx);

  NumericT last_diff = 0;
  NumericT diff_init = 0;
  bool stagnation_flag = false;

  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
  {
    conf.iters_ = i + 1;
    {
      // H update: numerator W^T V, denominator (W^T W) H, then the elementwise
      // multiply-divide kernel applies H <- H .* hn ./ hd (Lee-Seung rule):
      hn = viennacl::linalg::prod(trans(W), V);
      htmp = viennacl::linalg::prod(trans(W), W);
      hd = viennacl::linalg::prod(htmp, H);

      viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);
      viennacl::ocl::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2())));
    }
    {
      // W update: numerator V H^T, denominator (W H) H^T, then W <- W .* wn ./ wd:
      wn = viennacl::linalg::prod(V, trans(H));
      wtmp = viennacl::linalg::prod(W, H);
      wd = viennacl::linalg::prod(wtmp, trans(H));

      viennacl::ocl::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL);

      viennacl::ocl::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2())));
    }

    if (i % conf.check_after_steps() == 0)  //check for convergence
    {
      appr = viennacl::linalg::prod(W, H);

      // Residual ||W*H - V||_F, relative to the residual of the first iterate:
      appr -= V;
      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);

      if (i == 0)
        diff_init = diff_val;

      if (conf.print_relative_error())
        std::cout << diff_val / diff_init << std::endl;

      // Approximation check
      if (diff_val / diff_init < conf.tolerance())
        break;

      // Stagnation check
      if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates
      {
        if (stagnation_flag)    // iteration stagnates (two iterates with no notable progress)
          break;
        else
          // record stagnation in this iteration
          stagnation_flag = true;
      } else
        // good progress in this iteration, so unset stagnation flag
        stagnation_flag = false;

      // prepare for next iterate:
      last_diff = diff_val;
    }
  }
}
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+#endif /* VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
new file mode 100644
index 0000000..a94681f
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
@@ -0,0 +1,205 @@
+#ifndef VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+#define VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/opencl/scalar_operations.hpp
+    @brief Implementations of scalar operations using OpenCL
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/ocl/device.hpp"
+#include "viennacl/ocl/handle.hpp"
+#include "viennacl/ocl/kernel.hpp"
+#include "viennacl/tools/tools.hpp"
+#include "viennacl/linalg/opencl/kernels/scalar.hpp"
+#include "viennacl/linalg/opencl/common.hpp"
+#include "viennacl/meta/predicate.hpp"
+#include "viennacl/meta/result_of.hpp"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/traits/size.hpp"
+#include "viennacl/traits/start.hpp"
+#include "viennacl/traits/handle.hpp"
+#include "viennacl/traits/stride.hpp"
+#include "viennacl/traits/handle.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace opencl
+{
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_any_scalar<NumericT>::value
+                            >::type
+as(ScalarT1 & s1,
+   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
+
+  bool is_cpu = viennacl::is_cpu_scalar<NumericT>::value;
+  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 is_cpu ? "as_cpu" : "as_gpu");
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(s2) )
+                        );
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT2,
+         typename ScalarT3, typename NumericT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<NumericT2>::value
+                              && viennacl::is_any_scalar<NumericT3>::value
+                            >::type
+asbs(ScalarT1 & s1,
+     ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha,
+     ScalarT3 const & s3, NumericT3 const & beta,  vcl_size_t len_beta,  bool 
reciprocal_beta,  bool flip_sign_beta)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
+  assert( &viennacl::traits::opencl_handle(s2).context() == 
&viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the 
same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  std::string kernel_name;
+  bool is_cpu_2 = viennacl::is_cpu_scalar<NumericT2>::value;
+  bool is_cpu_3 = viennacl::is_cpu_scalar<NumericT3>::value;
+  if (is_cpu_2 && is_cpu_3)
+    kernel_name = "asbs_cpu_cpu";
+  else if (is_cpu_2 && !is_cpu_3)
+    kernel_name = "asbs_cpu_gpu";
+  else if (!is_cpu_2 && is_cpu_3)
+    kernel_name = "asbs_gpu_cpu";
+  else
+    kernel_name = "asbs_gpu_gpu";
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
+
+  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 kernel_name);
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(s2),
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                           options_beta,
+                           viennacl::traits::opencl_handle(s3) )
+                        );
+}
+
+
+template<typename ScalarT1,
+         typename ScalarT2, typename NumericT2,
+         typename ScalarT3, typename NumericT3>
+typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                              && viennacl::is_scalar<ScalarT3>::value
+                              && viennacl::is_any_scalar<NumericT2>::value
+                              && viennacl::is_any_scalar<NumericT3>::value
+                            >::type
+asbs_s(ScalarT1 & s1,
+       ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, 
bool reciprocal_alpha, bool flip_sign_alpha,
+       ScalarT3 const & s3, NumericT3 const & beta,  vcl_size_t len_beta,  
bool reciprocal_beta,  bool flip_sign_beta)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
+  assert( &viennacl::traits::opencl_handle(s2).context() == 
&viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the 
same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  std::string kernel_name;
+  if (viennacl::is_cpu_scalar<NumericT2>::value && 
viennacl::is_cpu_scalar<NumericT3>::value)
+    kernel_name = "asbs_s_cpu_cpu";
+  else if (viennacl::is_cpu_scalar<NumericT2>::value && 
!viennacl::is_cpu_scalar<NumericT3>::value)
+    kernel_name = "asbs_s_cpu_gpu";
+  else if (!viennacl::is_cpu_scalar<NumericT2>::value && 
viennacl::is_cpu_scalar<NumericT3>::value)
+    kernel_name = "asbs_s_gpu_cpu";
+  else
+    kernel_name = "asbs_s_gpu_gpu";
+
+  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
+  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
+
+  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 kernel_name);
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
+                           options_alpha,
+                           viennacl::traits::opencl_handle(s2),
+                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
+                           options_beta,
+                           viennacl::traits::opencl_handle(s3) )
+                        );
+}
+
+
+/** @brief Swaps the contents of two scalars, data is copied
+*
+* @param s1   The first scalar
+* @param s2   The second scalar
+*/
+template<typename ScalarT1, typename ScalarT2>
+typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
+                              && viennacl::is_scalar<ScalarT2>::value
+                            >::type
+swap(ScalarT1 & s1, ScalarT2 & s2)
+{
+  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
+
+  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
+  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
+  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
+
+  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 "swap");
+  k.local_work_size(0, 1);
+  k.global_work_size(0, 1);
+  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
+                           viennacl::traits::opencl_handle(s2))
+                        );
+}
+
+
+
+} //namespace opencl
+} //namespace linalg
+} //namespace viennacl
+
+
+#endif

Reply via email to