http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
deleted file mode 100644
index bd5116d..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/matrix_operations.hpp
+++ /dev/null
@@ -1,1019 +0,0 @@
-#ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file  viennacl/linalg/opencl/matrix_operations.hpp
-    @brief Implementations of dense matrix related operations, including 
matrix-vector products, using OpenCL.
-*/
-
-#include "viennacl/forwards.h"
-
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/vector_proxy.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/result_of.hpp"
-
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/traits/handle.hpp"
-#include "viennacl/traits/stride.hpp"
-
-#include "viennacl/linalg/opencl/common.hpp"
-#include "viennacl/linalg/opencl/kernels/svd.hpp"
-#include "viennacl/linalg/opencl/kernels/vector.hpp"
-#include "viennacl/linalg/opencl/kernels/matrix.hpp"
-#include "viennacl/linalg/opencl/kernels/matrix_element.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace opencl
-{
-
-namespace detail
-{
-
-  template<typename NumericT>
-  viennacl::ocl::kernel & kernel_for_matrix(matrix_base<NumericT> const & M, 
std::string const & kernel_name)
-  {
-    viennacl::ocl::context & ctx = traits::opencl_context(M);
-    viennacl::ocl::program * program;
-    if (M.row_major())
-    {
-      typedef viennacl::linalg::opencl::kernels::matrix<NumericT, row_major>  
KernelClass;
-      KernelClass::init(ctx);
-      program = &ctx.get_program(KernelClass::program_name());
-    }
-    else
-    {
-      typedef viennacl::linalg::opencl::kernels::matrix<NumericT, 
column_major>  KernelClass;
-      KernelClass::init(ctx);
-      program = &ctx.get_program(KernelClass::program_name());
-    }
-    return program->get_kernel(kernel_name);
-  }
-
-  template<typename NumericT>
-  viennacl::ocl::kernel & element_kernel_for_matrix(matrix_base<NumericT> 
const & M, std::string const & kernel_name)
-  {
-    viennacl::ocl::context & ctx = traits::opencl_context(M);
-    viennacl::ocl::program * program;
-    if (M.row_major())
-    {
-      typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, 
row_major>  KernelClass;
-      KernelClass::init(ctx);
-      program = &ctx.get_program(KernelClass::program_name());
-    }
-    else
-    {
-      typedef viennacl::linalg::opencl::kernels::matrix_element<NumericT, 
column_major>  KernelClass;
-      KernelClass::init(ctx);
-      program = &ctx.get_program(KernelClass::program_name());
-    }
-    return program->get_kernel(kernel_name);
-  }
-
-  template<typename NumericT>
-  viennacl::ocl::kernel & legacy_kernel_for_matrix(matrix_base<NumericT> const 
& M, std::string const & kernel_name)
-  {
-    viennacl::ocl::context & ctx = traits::opencl_context(M);
-    viennacl::ocl::program * program;
-    if (M.row_major())
-    {
-      typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, 
row_major>  KernelClass;
-      KernelClass::init(ctx);
-      program = &ctx.get_program(KernelClass::program_name());
-    }
-    else
-    {
-      typedef viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, 
column_major>  KernelClass;
-      KernelClass::init(ctx);
-      program = &ctx.get_program(KernelClass::program_name());
-    }
-    return program->get_kernel(kernel_name);
-  }
-
-}
-
-//
-// Introductory note: By convention, all dimensions are already checked in the 
dispatcher frontend. No need to double-check again in here!
-//
-
-const std::string SVD_BIDIAG_PACK_KERNEL = "bidiag_pack";
-const std::string SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL = "house_update_A_left";
-const std::string SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL = 
"house_update_A_right";
-const std::string SVD_HOUSEHOLDER_UPDATE_QL_KERNEL = "house_update_QL";
-const std::string SVD_GIVENS_NEXT_KERNEL = "givens_next";
-const std::string SVD_COPY_COL_KERNEL = "copy_col";
-const std::string SVD_COPY_ROW_KERNEL = "copy_row";
-
-template<typename DestNumericT, typename SrcNumericT>
-void convert(matrix_base<DestNumericT> & dest, matrix_base<SrcNumericT> const 
& src)
-{
-  assert(dest.row_major() == src.row_major() && bool("Addition/subtraction on 
mixed matrix layouts not supported yet!"));
-
-  assert(viennacl::traits::opencl_handle(dest).context() == 
viennacl::traits::opencl_handle(src).context() && bool("Matrices do not reside 
in the same OpenCL context. Automatic migration not yet supported!"));
-
-  std::string kernel_name("convert_");
-  kernel_name += dest.row_major() ? "row_" : "col_";
-  kernel_name += viennacl::ocl::type_to_string<DestNumericT>::apply();
-  kernel_name += "_";
-  kernel_name += viennacl::ocl::type_to_string<SrcNumericT>::apply();
-
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(dest).context());
-  viennacl::linalg::opencl::kernels::matrix_convert::init(ctx);
-  viennacl::ocl::kernel& k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_convert::program_name(),
 kernel_name);
-
-  viennacl::ocl::enqueue(k( dest, cl_uint(dest.start1()), 
cl_uint(dest.stride1()), cl_uint(dest.size1()), cl_uint(dest.internal_size1()), 
cl_uint(dest.start2()), cl_uint(dest.stride2()), cl_uint(dest.size2()), 
cl_uint(dest.internal_size2()),
-                            src,  cl_uint( src.start1()), cl_uint( 
src.stride1()), cl_uint( src.size1()), cl_uint( src.internal_size1()), cl_uint( 
src.start2()), cl_uint( src.stride2()), cl_uint( src.size2()), cl_uint( 
src.internal_size2())
-                        ) );
-}
-
-//
-// Introductory note: By convention, all dimensions are already checked in the 
dispatcher frontend. No need to double-check again in here!
-//
-
-template <typename NumericT,
-          typename ScalarT1>
-void am(matrix_base<NumericT> & mat1,
-        matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t 
len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
-{
-  viennacl::ocl::kernel & k= detail::kernel_for_matrix(mat1, 
(viennacl::is_cpu_scalar<ScalarT1>::value ? "am_cpu" : "am_gpu"));
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
-                          cl_uint(viennacl::traits::start1(mat1)),           
cl_uint(viennacl::traits::start2(mat1)),
-                          cl_uint(viennacl::traits::stride1(mat1)),          
cl_uint(viennacl::traits::stride2(mat1)),
-                          cl_uint(viennacl::traits::size1(mat1)),            
cl_uint(viennacl::traits::size2(mat1)),
-                          cl_uint(viennacl::traits::internal_size1(mat1)),   
cl_uint(viennacl::traits::internal_size2(mat1)),
-
-                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
-                          options_alpha,
-                          viennacl::traits::opencl_handle(mat2),
-                          cl_uint(viennacl::traits::start1(mat2)),           
cl_uint(viennacl::traits::start2(mat2)),
-                          cl_uint(viennacl::traits::stride1(mat2)),          
cl_uint(viennacl::traits::stride2(mat2)),
-                          cl_uint(viennacl::traits::internal_size1(mat2)),   
cl_uint(viennacl::traits::internal_size2(mat2))
-                          )
-                        );
-}
-
-
-template <typename NumericT,
-          typename ScalarT1, typename ScalarT2>
-void ambm(matrix_base<NumericT> & mat1,
-          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, 
vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
-          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  
vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
-{
-  std::string kernel_name;
-  if      ( viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
-    kernel_name = "ambm_cpu_cpu";
-  else if ( viennacl::is_cpu_scalar<ScalarT1>::value && 
!viennacl::is_cpu_scalar<ScalarT2>::value)
-    kernel_name = "ambm_cpu_gpu";
-  else if (!viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
-    kernel_name = "ambm_gpu_cpu";
-  else
-    kernel_name = "ambm_gpu_gpu";
-
-  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
-
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
-                          cl_uint(viennacl::traits::start1(mat1)),           
cl_uint(viennacl::traits::start2(mat1)),
-                          cl_uint(viennacl::traits::stride1(mat1)),          
cl_uint(viennacl::traits::stride2(mat1)),
-                          cl_uint(viennacl::traits::size1(mat1)),            
cl_uint(viennacl::traits::size2(mat1)),
-                          cl_uint(viennacl::traits::internal_size1(mat1)),   
cl_uint(viennacl::traits::internal_size2(mat1)),
-
-                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
-                          options_alpha,
-                          viennacl::traits::opencl_handle(mat2),
-                          cl_uint(viennacl::traits::start1(mat2)),           
cl_uint(viennacl::traits::start2(mat2)),
-                          cl_uint(viennacl::traits::stride1(mat2)),          
cl_uint(viennacl::traits::stride2(mat2)),
-                          cl_uint(viennacl::traits::internal_size1(mat2)),   
cl_uint(viennacl::traits::internal_size2(mat2)),
-
-                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
-                          options_beta,
-                          viennacl::traits::opencl_handle(mat3),
-                          cl_uint(viennacl::traits::start1(mat3)),           
cl_uint(viennacl::traits::start2(mat3)),
-                          cl_uint(viennacl::traits::stride1(mat3)),          
cl_uint(viennacl::traits::stride2(mat3)),
-                          cl_uint(viennacl::traits::internal_size1(mat3)),   
cl_uint(viennacl::traits::internal_size2(mat3))
-                          )
-                        );
-}
-
-
-template <typename NumericT,
-          typename ScalarT1, typename ScalarT2>
-void ambm_m(matrix_base<NumericT> & mat1,
-            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, 
vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
-            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  
vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
-{
-  std::string kernel_name;
-  if      ( viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
-    kernel_name = "ambm_m_cpu_cpu";
-  else if ( viennacl::is_cpu_scalar<ScalarT1>::value && 
!viennacl::is_cpu_scalar<ScalarT2>::value)
-    kernel_name = "ambm_m_cpu_gpu";
-  else if (!viennacl::is_cpu_scalar<ScalarT1>::value &&  
viennacl::is_cpu_scalar<ScalarT2>::value)
-    kernel_name = "ambm_m_gpu_cpu";
-  else
-    kernel_name = "ambm_m_gpu_gpu";
-
-  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat1, kernel_name);
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
-
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
-                          cl_uint(viennacl::traits::start1(mat1)),           
cl_uint(viennacl::traits::start2(mat1)),
-                          cl_uint(viennacl::traits::stride1(mat1)),          
cl_uint(viennacl::traits::stride2(mat1)),
-                          cl_uint(viennacl::traits::size1(mat1)),            
cl_uint(viennacl::traits::size2(mat1)),
-                          cl_uint(viennacl::traits::internal_size1(mat1)),   
cl_uint(viennacl::traits::internal_size2(mat1)),
-
-                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
-                          options_alpha,
-                          viennacl::traits::opencl_handle(mat2),
-                          cl_uint(viennacl::traits::start1(mat2)),           
cl_uint(viennacl::traits::start2(mat2)),
-                          cl_uint(viennacl::traits::stride1(mat2)),          
cl_uint(viennacl::traits::stride2(mat2)),
-                          cl_uint(viennacl::traits::internal_size1(mat2)),   
cl_uint(viennacl::traits::internal_size2(mat2)),
-
-                          
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(beta)),
-                          options_beta,
-                          viennacl::traits::opencl_handle(mat3),
-                          cl_uint(viennacl::traits::start1(mat3)),           
cl_uint(viennacl::traits::start2(mat3)),
-                          cl_uint(viennacl::traits::stride1(mat3)),          
cl_uint(viennacl::traits::stride2(mat3)),
-                          cl_uint(viennacl::traits::internal_size1(mat3)),   
cl_uint(viennacl::traits::internal_size2(mat3))
-                          )
-                        );
-}
-
-template<typename NumericT,
-          typename SizeT, typename DistanceT>
-void trans(const matrix_expression<const matrix_base<NumericT, SizeT, 
DistanceT>,const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy,
-           matrix_base<NumericT> & temp_trans)
-{
-  std::string kernel_name("trans_kernel");
-  viennacl::ocl::kernel& kernel = 
detail::legacy_kernel_for_matrix(proxy.lhs(),kernel_name);
-  viennacl::ocl::enqueue(kernel(proxy.lhs(),
-                                static_cast<cl_uint>(proxy.lhs().start1()),    
     static_cast<cl_uint>(proxy.lhs().start2()),
-                                
static_cast<cl_uint>(proxy.lhs().internal_size1()), 
static_cast<cl_uint>(proxy.lhs().internal_size2()),
-                                static_cast<cl_uint>(proxy.lhs().size1()),     
     static_cast<cl_uint>(proxy.lhs().size2()),
-                                static_cast<cl_uint>(proxy.lhs().stride1()),   
     static_cast<cl_uint>(proxy.lhs().stride2()),
-
-                                temp_trans,
-                                static_cast<cl_uint>(temp_trans.start1()),     
    static_cast<cl_uint>(temp_trans.start2()),
-                                
static_cast<cl_uint>(temp_trans.internal_size1()), 
static_cast<cl_uint>(temp_trans.internal_size2()),
-                                static_cast<cl_uint>(temp_trans.stride1()),    
    static_cast<cl_uint>(temp_trans.stride2())));
-}
-
-template <typename NumericT>
-void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
-{
-  cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : 
cl_uint(viennacl::traits::size1(mat));
-  cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : 
cl_uint(viennacl::traits::size2(mat));
-
-  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, "assign_cpu");
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
-                           cl_uint(viennacl::traits::start1(mat)),           
cl_uint(viennacl::traits::start2(mat)),
-                           cl_uint(viennacl::traits::stride1(mat)),          
cl_uint(viennacl::traits::stride2(mat)),
-                           s1,                                               
s2,
-                           cl_uint(viennacl::traits::internal_size1(mat)),   
cl_uint(viennacl::traits::internal_size2(mat)),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
-                          )
-                        );
-}
-
-template <typename NumericT>
-void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
-{
-  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, 
"diagonal_assign_cpu");
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
-                           cl_uint(viennacl::traits::start1(mat)),           
cl_uint(viennacl::traits::start2(mat)),
-                           cl_uint(viennacl::traits::stride1(mat)),          
cl_uint(viennacl::traits::stride2(mat)),
-                           cl_uint(viennacl::traits::size1(mat)),            
cl_uint(viennacl::traits::size2(mat)),
-                           cl_uint(viennacl::traits::internal_size1(mat)),   
cl_uint(viennacl::traits::internal_size2(mat)),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(s))
-                          )
-                        );
-}
-
-template <typename NumericT>
-void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, 
matrix_base<NumericT> & mat)
-{
-  // Step 1: set everything to zero
-  matrix_assign(mat, NumericT(0));
-
-  // Step 2: set the diagonal:
-
-  // reuse vector ambm kernel for assigning the elements:
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
-  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
-  KernelClass::init(ctx);
-
-  cl_uint options_alpha = 0;
-  viennacl::ocl::packed_cl_uint size_mat;
-  if (mat.row_major())
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)) * 
viennacl::traits::internal_size2(mat)
-                              + viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * 
viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-  else
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)
-                              + (viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-
-  viennacl::ocl::packed_cl_uint size_vec;
-  size_vec.start  = cl_uint(viennacl::traits::start(vec));
-  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
-  size_vec.size   = cl_uint(viennacl::traits::size(vec));
-  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-
-  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
-  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat),
-                              size_mat,
-
-                              viennacl::traits::opencl_handle(NumericT(1)),
-                              options_alpha,
-                              viennacl::traits::opencl_handle(vec),
-                              size_vec)
-                        );
-}
-
-template <typename NumericT>
-void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, 
vector_base<NumericT> & vec)
-{
-  // reuse vector ambm kernel for assigning the elements:
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
-  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
-  KernelClass::init(ctx);
-
-  cl_uint options_alpha = 0;
-  viennacl::ocl::packed_cl_uint size_mat;
-  if (mat.row_major())
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)) * 
viennacl::traits::internal_size2(mat)
-                              + viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * 
viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-  else
-  {
-    vcl_size_t first_row_index = 0;
-    vcl_size_t first_col_index = 0;
-    if (k < 0)
-      first_row_index = vcl_size_t(-k);
-    else
-      first_col_index = vcl_size_t(k);
-    size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + 
first_row_index * viennacl::traits::stride1(mat)
-                              + (viennacl::traits::start2(mat) + 
first_col_index * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-
-  viennacl::ocl::packed_cl_uint size_vec;
-  size_vec.start  = cl_uint(viennacl::traits::start(vec));
-  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
-  size_vec.size   = cl_uint(viennacl::traits::size(vec));
-  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-
-
-  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
-  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
-                              size_vec,
-
-                              viennacl::traits::opencl_handle(NumericT(1)),
-                              options_alpha,
-                              viennacl::traits::opencl_handle(mat),
-                              size_mat)
-                        );
-}
-
-template <typename NumericT>
-void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, 
vector_base<NumericT> & vec)
-{
-  // reuse vector ambm kernel for assigning the elements:
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
-  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
-  KernelClass::init(ctx);
-
-  cl_uint options_alpha = 0;
-  viennacl::ocl::packed_cl_uint size_mat;
-  if (mat.row_major())
-  {
-    size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * 
viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + 
viennacl::traits::start2(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-  else
-  {
-    size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * 
viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * 
viennacl::traits::internal_size1(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size1(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-
-  viennacl::ocl::packed_cl_uint size_vec;
-  size_vec.start  = cl_uint(viennacl::traits::start(vec));
-  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
-  size_vec.size   = cl_uint(viennacl::traits::size(vec));
-  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-
-
-  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
-  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
-                              size_vec,
-
-                              viennacl::traits::opencl_handle(NumericT(1)),
-                              options_alpha,
-                              viennacl::traits::opencl_handle(mat),
-                              size_mat)
-                        );
-}
-
-template <typename NumericT>
-void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, 
vector_base<NumericT> & vec)
-{
-  // reuse vector ambm kernel for assigning the elements:
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(mat).context());
-  typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
-  KernelClass::init(ctx);
-
-  cl_uint options_alpha = 0;
-  viennacl::ocl::packed_cl_uint size_mat;
-  if (mat.row_major())
-  {
-    size_mat.start  = cl_uint(viennacl::traits::start1(mat) * 
viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * 
viennacl::traits::stride2(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * 
viennacl::traits::internal_size2(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-  else
-  {
-    size_mat.start  = cl_uint(viennacl::traits::start1(mat) + 
(viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat));
-    size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
-    size_mat.size   = cl_uint(viennacl::traits::size(vec));
-    size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-  }
-
-  viennacl::ocl::packed_cl_uint size_vec;
-  size_vec.start  = cl_uint(viennacl::traits::start(vec));
-  size_vec.stride = cl_uint(viennacl::traits::stride(vec));
-  size_vec.size   = cl_uint(viennacl::traits::size(vec));
-  size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
-
-
-  viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), 
"av_cpu");
-  viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
-                              size_vec,
-
-                              viennacl::traits::opencl_handle(NumericT(1)),
-                              options_alpha,
-                              viennacl::traits::opencl_handle(mat),
-                              size_mat)
-                        );
-}
-
-
-//
-///////////////////////// Element-wise operation 
//////////////////////////////////
-//
-
-// Binary operations A = B .* C and A = B ./ C
-/** @brief Implementation of binary element-wise operations A = OP(B,C)
-*
-* @param A      The result matrix (or -range, or -slice)
-* @param proxy  The proxy object holding B, C, and the operation
-*/
-template <typename T, typename OP>
-void element_op(matrix_base<T> & A,
-                matrix_expression<const matrix_base<T>, const matrix_base<T>, 
op_element_binary<OP> > const & proxy)
-{
-  assert(viennacl::traits::opencl_handle(A).context() == 
viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not 
reside in the same OpenCL context. Automatic migration not yet supported!"));
-  assert(viennacl::traits::opencl_handle(A).context() == 
viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not 
reside in the same OpenCL context. Automatic migration not yet supported!"));
-
-  viennacl::ocl::kernel & k = detail::kernel_for_matrix(A, "element_op");
-
-  cl_uint op_type = 2; //0: product, 1: division, 2: power
-  if (viennacl::is_division<OP>::value)
-    op_type = 1;
-  else if (viennacl::is_product<OP>::value)
-    op_type = 0;
-
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
-                          cl_uint(viennacl::traits::start1(A)),           
cl_uint(viennacl::traits::start2(A)),
-                          cl_uint(viennacl::traits::stride1(A)),          
cl_uint(viennacl::traits::stride2(A)),
-                          cl_uint(viennacl::traits::size1(A)),            
cl_uint(viennacl::traits::size2(A)),
-                          cl_uint(viennacl::traits::internal_size1(A)),   
cl_uint(viennacl::traits::internal_size2(A)),
-
-                          viennacl::traits::opencl_handle(proxy.lhs()),
-                          cl_uint(viennacl::traits::start1(proxy.lhs())),      
     cl_uint(viennacl::traits::start2(proxy.lhs())),
-                          cl_uint(viennacl::traits::stride1(proxy.lhs())),     
     cl_uint(viennacl::traits::stride2(proxy.lhs())),
-                          
cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   
cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
-
-                          viennacl::traits::opencl_handle(proxy.rhs()),
-                          cl_uint(viennacl::traits::start1(proxy.rhs())),      
     cl_uint(viennacl::traits::start2(proxy.rhs())),
-                          cl_uint(viennacl::traits::stride1(proxy.rhs())),     
     cl_uint(viennacl::traits::stride2(proxy.rhs())),
-                          
cl_uint(viennacl::traits::internal_size1(proxy.rhs())),   
cl_uint(viennacl::traits::internal_size2(proxy.rhs())),
-
-                          op_type)
-                        );
-}
-
-
-// Unary operations
-
-/** @brief Implementation of unary element-wise operations A = OP(B)
-*
-* @param A      The result matrix (or -range, or -slice)
-* @param proxy  The proxy object holding B and the operation
-*/
-template <typename T, typename OP>
-void element_op(matrix_base<T> & A,
-                matrix_expression<const matrix_base<T>, const matrix_base<T>, 
op_element_unary<OP> > const & proxy)
-{
-  assert(viennacl::traits::opencl_handle(A).context() == 
viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not 
reside in the same OpenCL context. Automatic migration not yet supported!"));
-  assert(viennacl::traits::opencl_handle(A).context() == 
viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not 
reside in the same OpenCL context. Automatic migration not yet supported!"));
-
-  viennacl::ocl::kernel & k = detail::element_kernel_for_matrix(A, 
detail::op_to_string(OP()) + "_assign");
-
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
-                           cl_uint(viennacl::traits::start1(A)),           
cl_uint(viennacl::traits::start2(A)),
-                           cl_uint(viennacl::traits::stride1(A)),          
cl_uint(viennacl::traits::stride2(A)),
-                           cl_uint(viennacl::traits::size1(A)),            
cl_uint(viennacl::traits::size2(A)),
-                           cl_uint(viennacl::traits::internal_size1(A)),   
cl_uint(viennacl::traits::internal_size2(A)),
-
-                           viennacl::traits::opencl_handle(proxy.lhs()),
-                           cl_uint(viennacl::traits::start1(proxy.lhs())),     
      cl_uint(viennacl::traits::start2(proxy.lhs())),
-                           cl_uint(viennacl::traits::stride1(proxy.lhs())),    
      cl_uint(viennacl::traits::stride2(proxy.lhs())),
-                           
cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   
cl_uint(viennacl::traits::internal_size2(proxy.lhs())))
-                        );
-}
-
-
-//
-/////////////////////////   matrix-vector products 
/////////////////////////////////
-//
-
-// A * x
-
-/** @brief Carries out matrix-vector multiplication
-*
-* Implementation of the convenience expression result = prod(mat, vec);
-*
-* @param mat    The matrix
-* @param vec    The vector
-* @param result The result vector
-*/
-template <typename NumericT>
-void prod_impl(const matrix_base<NumericT> & mat, bool trans_A,
-               const vector_base<NumericT> & vec,
-                     vector_base<NumericT> & result)
-{
-  assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && 
bool("No direct inplace transposed matrix-vector product possible. Introduce a 
temporary!"));
-
-  viennacl::ocl::kernel & k = detail::kernel_for_matrix(mat, trans_A ? 
"trans_vec_mul" : "vec_mul");
-
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
-                          cl_uint(viennacl::traits::start1(mat)),         
cl_uint(viennacl::traits::start2(mat)),
-                          cl_uint(viennacl::traits::stride1(mat)),        
cl_uint(viennacl::traits::stride2(mat)),
-                          cl_uint(viennacl::traits::size1(mat)),          
cl_uint(viennacl::traits::size2(mat)),
-                          cl_uint(viennacl::traits::internal_size1(mat)), 
cl_uint(viennacl::traits::internal_size2(mat)),
-
-                          viennacl::traits::opencl_handle(vec),
-                          cl_uint(viennacl::traits::start(vec)),
-                          cl_uint(viennacl::traits::stride(vec)),
-                          cl_uint(viennacl::traits::size(vec)),
-
-                          viennacl::traits::opencl_handle(result),
-                          cl_uint(viennacl::traits::start(result)),
-                          cl_uint(viennacl::traits::stride(result)),
-                          cl_uint(viennacl::traits::size(result)),
-
-                          viennacl::ocl::local_mem(sizeof(NumericT) * 
k.local_work_size())
-                        ) );
-}
-
-
-//
-
-
-/** @brief Carries out matrix-matrix multiplication
-*
-* Implementation of C = prod(A, B);
-*
-*/
-template<typename NumericT, typename ScalarType >
-void prod_impl(matrix_base<NumericT> const & A, bool A_trans,
-               matrix_base<NumericT> const & B, bool B_trans,
-               matrix_base<NumericT>       & C,
-               ScalarType alpha,
-               ScalarType beta)
-{
-    bool effective_A_trans = A_trans ^ A.row_major();
-    bool effective_B_trans = B_trans ^ B.row_major();
-
-    char cAt = effective_A_trans ? 'T' : 'N';
-    char cBt = effective_B_trans ? 'T' : 'N';
-
-    std::string kernel_prefix("prod_");
-    kernel_prefix+=cAt;
-    kernel_prefix+=cBt;
-
-    scheduler::statement statement = scheduler::preset::mat_mat_prod(alpha, 
&A, effective_A_trans, &B, effective_B_trans, beta, &C);
-    kernels::matrix_prod<NumericT>::execution_handler(C.row_major(), 
viennacl::traits::opencl_context(C)).execute(kernel_prefix, statement);
-}
-
-//
-/////////////////////////   miscellaneous operations 
/////////////////////////////////
-//
-
-
-/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, 
i.e. a scaled rank 1 update
-*
-* Implementation of the convenience expression result += alpha * 
outer_prod(vec1, vec2);
-*
-* @param A    The matrix to be updated
-* @param alpha            The scaling factor (either a viennacl::scalar<>, 
float, or double)
-* @param len_alpha        Length of the buffer for an eventual final reduction 
step (currently always '1')
-* @param reciprocal_alpha Use 1/alpha instead of alpha
-* @param flip_sign_alpha  Use -alpha instead of alpha
-* @param vec1    The first vector
-* @param vec2    The second vector
-*/
-template<typename NumericT, typename ScalarT1>
-void scaled_rank_1_update(matrix_base<NumericT> & A,
-                          ScalarT1 const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha,
-                          const vector_base<NumericT> & vec1,
-                          const vector_base<NumericT> & vec2)
-{
-  assert( (viennacl::traits::size1(A) == viennacl::traits::size(vec1)) && 
bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
-  assert( (viennacl::traits::size2(A) == viennacl::traits::size(vec2)) && 
bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-  bool is_cpu = viennacl::is_cpu_scalar<ScalarT1>::value;
-  viennacl::ocl::kernel& kernel= detail::legacy_kernel_for_matrix(A, is_cpu ? 
"scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");
-
-  viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(A),
-                           cl_uint(viennacl::traits::start1(A)),           
cl_uint(viennacl::traits::start2(A)),
-                           cl_uint(viennacl::traits::stride1(A)),          
cl_uint(viennacl::traits::stride2(A)),
-                           cl_uint(viennacl::traits::size1(A)),            
cl_uint(viennacl::traits::size2(A)),
-                           cl_uint(viennacl::traits::internal_size1(A)),   
cl_uint(viennacl::traits::internal_size2(A)),
-
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
-                           options_alpha,
-
-                           viennacl::traits::opencl_handle(vec1),
-                           cl_uint(viennacl::traits::start(vec1)),
-                           cl_uint(viennacl::traits::stride(vec1)),
-                           cl_uint(viennacl::traits::size(vec1)),
-
-                           viennacl::traits::opencl_handle(vec2),
-                           cl_uint(viennacl::traits::start(vec2)),
-                           cl_uint(viennacl::traits::stride(vec2)),
-                           cl_uint(viennacl::traits::size(vec2))
-                          )
-                        );
-}
-
-//
-template <typename SCALARTYPE, typename VectorType>
-void bidiag_pack_svd(viennacl::matrix<SCALARTYPE>& A,
-                 VectorType & dh,
-                 VectorType & sh
-                )
-{
-  viennacl::vector<SCALARTYPE> D(dh.size());
-  viennacl::vector<SCALARTYPE> S(sh.size());
-
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(A).context());
-  viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(),
 SVD_BIDIAG_PACK_KERNEL);
-
-  viennacl::ocl::enqueue(kernel(
-                                A,
-                                D,
-                                S,
-                                static_cast<cl_uint>(A.size1()),
-                                static_cast<cl_uint>(A.size2()),
-                                static_cast<cl_uint>(A.internal_size2())
-                              ));
-
-  fast_copy(D, dh);
-  fast_copy(S, sh);
-}
-
-
-template <typename NumericT>
-void bidiag_pack(matrix_base<NumericT> & A,
-                 viennacl::vector<NumericT> & dh,
-                 viennacl::vector<NumericT> & sh
-                )
-{
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(A).context());
-
-  if(A.row_major())
-  {
-      viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
-      viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);
-
-      viennacl::ocl::enqueue(kernel(
-                                    A,
-                                    dh,
-                                    sh,
-                                    cl_uint(viennacl::traits::size1(A)),
-                                    cl_uint(viennacl::traits::size2(A)),
-                                    
cl_uint(viennacl::traits::internal_size2(A))
-                                  ));
-  }
-  else
-  {
-      viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
-      viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), SVD_BIDIAG_PACK_KERNEL);
-
-      viennacl::ocl::enqueue(kernel(
-                                    A,
-                                    dh,
-                                    sh,
-                                    cl_uint(viennacl::traits::size1(A)),
-                                    cl_uint(viennacl::traits::size2(A)),
-                                    
cl_uint(viennacl::traits::internal_size2(A))
-                                  ));
-  }
-}
-
-
-template <typename NumericT>
-void house_update_A_left(matrix_base<NumericT> & A,
-                         vector_base<NumericT> & D,
-                         vcl_size_t start)
-{
-
-    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(A).context());
-    if(A.row_major())
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      D,
-                                      static_cast<cl_uint>(start + 1),
-                                      static_cast<cl_uint>(start),
-                                      cl_uint(viennacl::traits::size1(A)),
-                                      cl_uint(viennacl::traits::size2(A)),
-                                      
cl_uint(viennacl::traits::internal_size2(A)),
-                                      
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
-                              ));
-    }
-    else
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      D,
-                                      static_cast<cl_uint>(start + 1),
-                                      static_cast<cl_uint>(start),
-                                      cl_uint(viennacl::traits::size1(A)),
-                                      cl_uint(viennacl::traits::size2(A)),
-                                      
cl_uint(viennacl::traits::internal_size2(A)),
-                                      
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
-                              ));
-    }
-
-
-
-
-}
-
-template <typename NumericT>
-void house_update_A_right(matrix_base<NumericT> & A,
-                          vector_base<NumericT> & D)
-{
-    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(A).context());
-
-    if(A.row_major())
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      D,
-                                      static_cast<cl_uint>(0),
-                                      static_cast<cl_uint>(0),
-                                      cl_uint(viennacl::traits::size1(A)),
-                                      cl_uint(viennacl::traits::size2(A)),
-                                      
cl_uint(viennacl::traits::internal_size2(A)),
-                                      
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
-                              ));
-    }
-    else
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      D,
-                                      static_cast<cl_uint>(0),
-                                      static_cast<cl_uint>(0),
-                                      cl_uint(viennacl::traits::size1(A)),
-                                      cl_uint(viennacl::traits::size2(A)),
-                                      
cl_uint(viennacl::traits::internal_size2(A)),
-                                      
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
-                              ));
-    }
-
-
-}
-
-
-
-template <typename NumericT>
-void house_update_QL(matrix_base<NumericT> & Q,
-                     vector_base<NumericT> & D,
-                     vcl_size_t A_size1)
-
-{
-    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(Q).context());
-
-    if(Q.row_major())
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
-
-        viennacl::ocl::enqueue(kernel(
-                                        Q,
-                                        D,
-                                        cl_uint(A_size1),
-                                        
cl_uint(viennacl::traits::internal_size2(Q)),
-                                        
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
-                                    ));
-    }
-    else
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
-
-        viennacl::ocl::enqueue(kernel(
-                                        Q,
-                                        D,
-                                        cl_uint(A_size1),
-                                        
cl_uint(viennacl::traits::internal_size2(Q)),
-                                        
viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(NumericT)))
-                                    ));
-    }
-
-}
-
-
-template<typename NumericT>
-  void givens_next(matrix_base<NumericT> & matrix,
-                  vector_base<NumericT>& tmp1,
-                  vector_base<NumericT>& tmp2,
-                  int l,
-                  int m
-                )
-  {
-    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(matrix).context());
-
-    if(matrix.row_major())
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
-        kernel.global_work_size(0, 
viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)),
 256));
-        kernel.local_work_size(0, 256);
-
-        viennacl::ocl::enqueue(kernel(
-                                      matrix,
-                                      tmp1,
-                                      tmp2,
-                                      cl_uint(viennacl::traits::size1(matrix)),
-                                      
cl_uint(viennacl::traits::internal_size2(matrix)),
-                                      static_cast<cl_uint>(l),
-                                      static_cast<cl_uint>(m - 1)
-                              ));
-    }
-    else
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), SVD_GIVENS_NEXT_KERNEL);
-        kernel.global_work_size(0, 
viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)),
 256));
-        kernel.local_work_size(0, 256);
-
-        viennacl::ocl::enqueue(kernel(
-                                      matrix,
-                                      tmp1,
-                                      tmp2,
-                                      cl_uint(viennacl::traits::size1(matrix)),
-                                      
cl_uint(viennacl::traits::internal_size2(matrix)),
-                                      static_cast<cl_uint>(l),
-                                      static_cast<cl_uint>(m - 1)
-                              ));
-    }
-
-
-  }
-
-  template <typename NumericT>
-  void copy_vec(matrix_base<NumericT>& A,
-                vector_base<NumericT> & V,
-                vcl_size_t row_start,
-                vcl_size_t col_start,
-                bool copy_col
-  )
-  {
-    std::string kernel_name = copy_col ? SVD_COPY_COL_KERNEL : 
SVD_COPY_ROW_KERNEL;
-    viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(A).context());
-
-    if(A.row_major())
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, row_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
row_major>::program_name(), kernel_name);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      V,
-                                      static_cast<cl_uint>(row_start),
-                                      static_cast<cl_uint>(col_start),
-                                      copy_col ? 
cl_uint(viennacl::traits::size1(A))
-                                               : 
cl_uint(viennacl::traits::size2(A)),
-                                      static_cast<cl_uint>(A.internal_size2())
-                              ));
-    }
-    else
-    {
-        viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::init(ctx);
-        viennacl::ocl::kernel& kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<NumericT, 
column_major>::program_name(), kernel_name);
-
-        viennacl::ocl::enqueue(kernel(
-                                      A,
-                                      V,
-                                      static_cast<cl_uint>(row_start),
-                                      static_cast<cl_uint>(col_start),
-                                      copy_col ? 
cl_uint(viennacl::traits::size1(A))
-                                               : 
cl_uint(viennacl::traits::size2(A)),
-                                      static_cast<cl_uint>(A.internal_size2())
-                              ));
-    }
-
-
-  }
-
-} // namespace opencl
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
deleted file mode 100644
index 83a3db7..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/misc_operations.hpp
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
-#define VIENNACL_LINALG_OPENCL_MISC_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/opencl/misc_operations.hpp
-    @brief Implementations of operations using compressed_matrix and OpenCL
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/opencl/kernels/ilu.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace opencl
-{
-namespace detail
-{
-
-template<typename NumericT>
-void level_scheduling_substitute(vector<NumericT> & x,
-                                 viennacl::backend::mem_handle const & 
row_index_array,
-                                 viennacl::backend::mem_handle const & 
row_buffer,
-                                 viennacl::backend::mem_handle const & 
col_buffer,
-                                 viennacl::backend::mem_handle const & 
element_buffer,
-                                 vcl_size_t num_rows
-                                )
-{
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(x).context());
-
-  viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx);
-  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(),
 "level_scheduling_substitute");
-
-  viennacl::ocl::enqueue(k(row_index_array.opencl_handle(), 
row_buffer.opencl_handle(), col_buffer.opencl_handle(), 
element_buffer.opencl_handle(),
-                           x,
-                           static_cast<cl_uint>(num_rows)));
-}
-
-} //namespace detail
-} // namespace opencl
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
deleted file mode 100644
index 5daf297..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/nmf_operations.hpp
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
-#define VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_
-
-/* =========================================================================
- Copyright (c) 2010-2016, Institute for Microelectronics,
- Institute for Analysis and Scientific Computing,
- TU Wien.
- Portions of this software are copyright by UChicago Argonne, LLC.
-
- -----------------
- ViennaCL - The Vienna Computing Library
- -----------------
-
- Project Head:    Karl Rupp                   [email protected]
-
- (A list of authors and contributors can be found in the manual)
-
- License:         MIT (X11), see file LICENSE in the base directory
- ============================================================================= 
*/
-
-/** @file viennacl/linalg/opencl/vector_operations.hpp
- @brief Implementations of NMF operations using OpenCL
- */
-
-#include "viennacl/linalg/opencl/kernels/nmf.hpp"
-#include "viennacl/linalg/opencl/nmf_operations.hpp"
-
-#include "viennacl/linalg/host_based/nmf_operations.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace opencl
-{
-
-/** @brief The nonnegative matrix factorization (approximation) algorithm as 
suggested by Lee and Seung. Factorizes a matrix V with nonnegative entries into 
matrices W and H such that ||V - W*H|| is minimized.
- *
- * @param V     Input matrix
- * @param W     First factor
- * @param H     Second factor
- * @param conf  A configuration object holding tolerances and the like
- */
-template<typename NumericT>
-void nmf(viennacl::matrix_base<NumericT> const & V,
-         viennacl::matrix_base<NumericT>       & W,
-         viennacl::matrix_base<NumericT>       & H,
-         viennacl::linalg::nmf_config const & conf)
-{
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(V).context());
-
-  const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div";
-
-  viennacl::linalg::opencl::kernels::nmf<NumericT>::init(ctx);
-
-  vcl_size_t k = W.size2();
-  conf.iters_ = 0;
-
-  if (viennacl::linalg::norm_frobenius(W) <= 0)
-    W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1), 
ctx);
-
-  if (viennacl::linalg::norm_frobenius(H) <= 0)
-    H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1), 
ctx);
-
-  viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major(), ctx);
-  viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major(), ctx);
-  viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major(), 
ctx);
-
-  viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major(), ctx);
-  viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major(), ctx);
-  viennacl::matrix_base<NumericT> htmp(k, k, H.row_major(), ctx);
-
-  viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major(), 
ctx);
-
-  NumericT last_diff = 0;
-  NumericT diff_init = 0;
-  bool stagnation_flag = false;
-
-  for (vcl_size_t i = 0; i < conf.max_iterations(); i++)
-  {
-    conf.iters_ = i + 1;
-    {
-      hn = viennacl::linalg::prod(trans(W), V);
-      htmp = viennacl::linalg::prod(trans(W), W);
-      hd = viennacl::linalg::prod(htmp, H);
-
-      viennacl::ocl::kernel & mul_div_kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(),
 NMF_MUL_DIV_KERNEL);
-      viennacl::ocl::enqueue(mul_div_kernel(H, hn, hd, 
cl_uint(H.internal_size1() * H.internal_size2())));
-    }
-    {
-      wn = viennacl::linalg::prod(V, trans(H));
-      wtmp = viennacl::linalg::prod(W, H);
-      wd = viennacl::linalg::prod(wtmp, trans(H));
-
-      viennacl::ocl::kernel & mul_div_kernel = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(),
 NMF_MUL_DIV_KERNEL);
-
-      viennacl::ocl::enqueue(mul_div_kernel(W, wn, wd, 
cl_uint(W.internal_size1() * W.internal_size2())));
-    }
-
-    if (i % conf.check_after_steps() == 0)  //check for convergence
-    {
-      appr = viennacl::linalg::prod(W, H);
-
-      appr -= V;
-      NumericT diff_val = viennacl::linalg::norm_frobenius(appr);
-
-      if (i == 0)
-        diff_init = diff_val;
-
-      if (conf.print_relative_error())
-        std::cout << diff_val / diff_init << std::endl;
-
-      // Approximation check
-      if (diff_val / diff_init < conf.tolerance())
-        break;
-
-      // Stagnation check
-      if (std::fabs(diff_val - last_diff) / (diff_val * 
NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid 
situations where convergence stagnates
-      {
-        if (stagnation_flag)    // iteration stagnates (two iterates with no 
notable progress)
-          break;
-        else
-          // record stagnation in this iteration
-          stagnation_flag = true;
-      } else
-        // good progress in this iteration, so unset stagnation flag
-        stagnation_flag = false;
-
-      // prepare for next iterate:
-      last_diff = diff_val;
-    }
-  }
-}
-
-} //namespace opencl
-} //namespace linalg
-} //namespace viennacl
-
-#endif /* VIENNACL_LINALG_OPENCL_NMF_OPERATIONS_HPP_ */

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
deleted file mode 100644
index a94681f..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/scalar_operations.hpp
+++ /dev/null
@@ -1,205 +0,0 @@
-#ifndef VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
-#define VIENNACL_LINALG_OPENCL_SCALAR_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/opencl/scalar_operations.hpp
-    @brief Implementations of scalar operations using OpenCL
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/handle.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/opencl/kernels/scalar.hpp"
-#include "viennacl/linalg/opencl/common.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/result_of.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/traits/handle.hpp"
-#include "viennacl/traits/stride.hpp"
-#include "viennacl/traits/handle.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace opencl
-{
-
-template<typename ScalarT1,
-         typename ScalarT2, typename NumericT>
-typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
-                              && viennacl::is_scalar<ScalarT2>::value
-                              && viennacl::is_any_scalar<NumericT>::value
-                            >::type
-as(ScalarT1 & s1,
-   ScalarT2 const & s2, NumericT const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha)
-{
-  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
-
-  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
-  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-
-  bool is_cpu = viennacl::is_cpu_scalar<NumericT>::value;
-  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 is_cpu ? "as_cpu" : "as_gpu");
-  k.local_work_size(0, 1);
-  k.global_work_size(0, 1);
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
-                           options_alpha,
-                           viennacl::traits::opencl_handle(s2) )
-                        );
-}
-
-
-template<typename ScalarT1,
-         typename ScalarT2, typename NumericT2,
-         typename ScalarT3, typename NumericT3>
-typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
-                              && viennacl::is_scalar<ScalarT2>::value
-                              && viennacl::is_scalar<ScalarT3>::value
-                              && viennacl::is_any_scalar<NumericT2>::value
-                              && viennacl::is_any_scalar<NumericT3>::value
-                            >::type
-asbs(ScalarT1 & s1,
-     ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, bool 
reciprocal_alpha, bool flip_sign_alpha,
-     ScalarT3 const & s3, NumericT3 const & beta,  vcl_size_t len_beta,  bool 
reciprocal_beta,  bool flip_sign_beta)
-{
-  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
-  assert( &viennacl::traits::opencl_handle(s2).context() == 
&viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the 
same OpenCL context!"));
-
-  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
-  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
-
-  std::string kernel_name;
-  bool is_cpu_2 = viennacl::is_cpu_scalar<NumericT2>::value;
-  bool is_cpu_3 = viennacl::is_cpu_scalar<NumericT3>::value;
-  if (is_cpu_2 && is_cpu_3)
-    kernel_name = "asbs_cpu_cpu";
-  else if (is_cpu_2 && !is_cpu_3)
-    kernel_name = "asbs_cpu_gpu";
-  else if (!is_cpu_2 && is_cpu_3)
-    kernel_name = "asbs_gpu_cpu";
-  else
-    kernel_name = "asbs_gpu_gpu";
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
-
-  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 kernel_name);
-  k.local_work_size(0, 1);
-  k.global_work_size(0, 1);
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
-                           options_alpha,
-                           viennacl::traits::opencl_handle(s2),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
-                           options_beta,
-                           viennacl::traits::opencl_handle(s3) )
-                        );
-}
-
-
-template<typename ScalarT1,
-         typename ScalarT2, typename NumericT2,
-         typename ScalarT3, typename NumericT3>
-typename viennacl::enable_if< viennacl::is_scalar<ScalarT1>::value
-                              && viennacl::is_scalar<ScalarT2>::value
-                              && viennacl::is_scalar<ScalarT3>::value
-                              && viennacl::is_any_scalar<NumericT2>::value
-                              && viennacl::is_any_scalar<NumericT3>::value
-                            >::type
-asbs_s(ScalarT1 & s1,
-       ScalarT2 const & s2, NumericT2 const & alpha, vcl_size_t len_alpha, 
bool reciprocal_alpha, bool flip_sign_alpha,
-       ScalarT3 const & s3, NumericT3 const & beta,  vcl_size_t len_beta,  
bool reciprocal_beta,  bool flip_sign_beta)
-{
-  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
-  assert( &viennacl::traits::opencl_handle(s2).context() == 
&viennacl::traits::opencl_handle(s3).context() && bool("Operands not in the 
same OpenCL context!"));
-
-  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
-  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
-
-  std::string kernel_name;
-  if (viennacl::is_cpu_scalar<NumericT2>::value && 
viennacl::is_cpu_scalar<NumericT3>::value)
-    kernel_name = "asbs_s_cpu_cpu";
-  else if (viennacl::is_cpu_scalar<NumericT2>::value && 
!viennacl::is_cpu_scalar<NumericT3>::value)
-    kernel_name = "asbs_s_cpu_gpu";
-  else if (!viennacl::is_cpu_scalar<NumericT2>::value && 
viennacl::is_cpu_scalar<NumericT3>::value)
-    kernel_name = "asbs_s_gpu_cpu";
-  else
-    kernel_name = "asbs_s_gpu_gpu";
-
-  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, 
flip_sign_alpha);
-  cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  
flip_sign_beta);
-
-  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 kernel_name);
-  k.local_work_size(0, 1);
-  k.global_work_size(0, 1);
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
-                           options_alpha,
-                           viennacl::traits::opencl_handle(s2),
-                           
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
-                           options_beta,
-                           viennacl::traits::opencl_handle(s3) )
-                        );
-}
-
-
-/** @brief Swaps the contents of two scalars, data is copied
-*
-* @param s1   The first scalar
-* @param s2   The second scalar
-*/
-template<typename ScalarT1, typename ScalarT2>
-typename viennacl::enable_if<    viennacl::is_scalar<ScalarT1>::value
-                              && viennacl::is_scalar<ScalarT2>::value
-                            >::type
-swap(ScalarT1 & s1, ScalarT2 & s2)
-{
-  assert( &viennacl::traits::opencl_handle(s1).context() == 
&viennacl::traits::opencl_handle(s2).context() && bool("Operands not in the 
same OpenCL context!"));
-
-  typedef typename viennacl::result_of::cpu_value_type<ScalarT1>::type        
value_type;
-  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context 
&>(viennacl::traits::opencl_handle(s1).context());
-  viennacl::linalg::opencl::kernels::scalar<value_type>::init(ctx);
-
-  viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::scalar<value_type>::program_name(),
 "swap");
-  k.local_work_size(0, 1);
-  k.global_work_size(0, 1);
-  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(s1),
-                           viennacl::traits::opencl_handle(s2))
-                        );
-}
-
-
-
-} //namespace opencl
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

Reply via email to