http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
deleted file mode 100644
index b7eaeb4..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp
+++ /dev/null
@@ -1,3252 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
-#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/cuda/vector_operations.hpp
-    @brief Implementations of vector operations using CUDA
-*/
-
-#include <cmath>
-#include "viennacl/forwards.h"
-#include "viennacl/scalar.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/meta/predicate.hpp"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/start.hpp"
-#include "viennacl/traits/stride.hpp"
-
-#include "viennacl/linalg/cuda/common.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-
-//
-// Introductory note: By convention, all dimensions are already checked in the 
dispatcher frontend. No need to double-check again in here!
-//
/** @brief Kernel copying (and implicitly converting) entries from src into dest.
*
* Uses a grid-stride loop, so any launch configuration covers all size_dest entries.
*/
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_kernel(DestNumericT      * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
                               SrcNumericT const * src,  unsigned int start_src,  unsigned int inc_src)
{
  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int idx = first; idx < size_dest; idx += stride)
    dest[idx * inc_dest + start_dest] = src[idx * inc_src + start_src];
}
-
-
/** @brief Entry point for converting the entries of src into dest (possibly with a change of the numeric type).
*
* @param dest  Destination vector (or -range/-slice)
* @param src   Source vector (or -range/-slice)
*/
template<typename DestNumericT, typename SrcNumericT>
void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
{
  // Gather the launch parameters up front instead of casting inline.
  unsigned int dest_start = static_cast<unsigned int>(viennacl::traits::start(dest));
  unsigned int dest_inc   = static_cast<unsigned int>(viennacl::traits::stride(dest));
  unsigned int dest_size  = static_cast<unsigned int>(viennacl::traits::size(dest));
  unsigned int src_start  = static_cast<unsigned int>(viennacl::traits::start(src));
  unsigned int src_inc    = static_cast<unsigned int>(viennacl::traits::stride(src));

  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest), dest_start, dest_inc, dest_size,
                               viennacl::cuda_arg(src),  src_start,  src_inc);
  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
}
-
-
-//////////////////////// av /////////////////////////////
-
-// gpu scalar
/** @brief Kernel for vec1 = vec2 * alpha (or vec2 / alpha), with alpha read from device memory.
*
* Bit 0 of options2 negates alpha, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1,
                          unsigned int start1, unsigned int inc1, unsigned int size1,
                          const NumericT * fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT a = *fac2;
  if (options2 & (1 << 0))
    a = -a;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int idx = first; idx < size1; idx += stride)
      vec1[idx * inc1 + start1] = vec2[idx * inc2 + start2] / a;
  }
  else
  {
    for (unsigned int idx = first; idx < size1; idx += stride)
      vec1[idx * inc1 + start1] = vec2[idx * inc2 + start2] * a;
  }
}
-
-// cpu scalar
/** @brief Kernel for vec1 = vec2 * alpha (or vec2 / alpha), with alpha passed by value from the host.
*
* Bit 0 of options2 negates alpha, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1,
                          unsigned int start1, unsigned int inc1, unsigned int size1,
                          NumericT fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  NumericT a = fac2;
  if (options2 & (1 << 0))
    a = -a;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int idx = first; idx < size1; idx += stride)
      vec1[idx * inc1 + start1] = vec2[idx * inc2 + start2] / a;
  }
  else
  {
    for (unsigned int idx = first; idx < size1; idx += stride)
      vec1[idx * inc1 + start1] = vec2[idx * inc2 + start2] * a;
  }
}
-
-
-
/** @brief Computes vec1 = alpha * vec2 on the device, with optional sign flip and reciprocal of alpha.
*
* @param vec1              Destination vector (or -range/-slice)
* @param vec2              Source vector (or -range/-slice)
* @param alpha             Scaling factor (CPU scalar or GPU scalar)
* @param len_alpha         Length of the scalar buffer (1 for plain scalars)
* @param reciprocal_alpha  If true, vec2 is divided by alpha instead of multiplied
* @param flip_sign_alpha   If true, the sign of alpha is flipped
*/
template<typename NumericT, typename ScalarType1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT        value_type;

  // The kernel applies the sign-flip and reciprocal itself via these option bits,
  // so no host-side transformation of alpha is needed. (A previously present
  // 'data_alpha' computation was dead code and has been removed.)
  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // CPU scalars are forwarded by value, GPU scalars by device pointer (see arg_reference).
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                          static_cast<unsigned int>(viennacl::traits::size(vec1)),

                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          viennacl::cuda_arg(vec2),
                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}
-
-
-///////////////////// avbv //////////////////////////////////
-
-// alpha and beta on GPU
/** @brief Kernel for vec1 = vec2 {*,/} alpha + vec3 {*,/} beta, with both factors read from device memory.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1,
                            unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = *fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = *fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-// alpha on CPU, beta on GPU
/** @brief Kernel for vec1 = vec2 {*,/} alpha + vec3 {*,/} beta, with alpha passed by value and beta read from device memory.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1,
                            unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = *fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-// alpha on GPU, beta on CPU
/** @brief Kernel for vec1 = vec2 {*,/} alpha + vec3 {*,/} beta, with alpha read from device memory and beta passed by value.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1,
                            unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = *fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-// alpha and beta on CPU
/** @brief Kernel for vec1 = vec2 {*,/} alpha + vec3 {*,/} beta, with both factors passed by value from the host.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1,
                            unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] = vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-
-
-
/** @brief Computes vec1 = alpha * vec2 + beta * vec3 on the device, with optional sign flip / reciprocal for each factor.
*
* @param vec1              Destination vector (or -range/-slice)
* @param vec2, vec3        Source vectors
* @param alpha, beta       Scaling factors (CPU scalars or GPU scalars)
* @param len_alpha, len_beta            Length of the scalar buffers (1 for plain scalars)
* @param reciprocal_alpha, reciprocal_beta  If true, divide instead of multiply
* @param flip_sign_alpha, flip_sign_beta    If true, flip the sign of the factor
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT        value_type;

  // The kernel applies sign-flip/reciprocal itself via these option bits, so no
  // host-side transformation of alpha is needed. (The former unused 'data_alpha'
  // computation was dead code and has been removed.)
  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // CPU scalars are forwarded by value, GPU scalars by device pointer (see arg_reference).
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;


  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                            static_cast<unsigned int>(viennacl::traits::size(vec1)),

                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                            options_alpha,
                            viennacl::cuda_arg(vec2),
                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),

                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                            options_beta,
                            viennacl::cuda_arg(vec3),
                            static_cast<unsigned int>(viennacl::traits::start(vec3)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
}
-
-
-////////////////////////// avbv_v //////////////////////////////////////
-
-
-// alpha and beta on GPU
/** @brief Kernel for vec1 += vec2 {*,/} alpha + vec3 {*,/} beta, with both factors read from device memory.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1,
                              unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = *fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = *fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-// alpha on CPU, beta on GPU
/** @brief Kernel for vec1 += vec2 {*,/} alpha + vec3 {*,/} beta, with alpha passed by value and beta read from device memory.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1,
                              unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = *fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-// alpha on GPU, beta on CPU
/** @brief Kernel for vec1 += vec2 {*,/} alpha + vec3 {*,/} beta, with alpha read from device memory and beta passed by value.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1,
                              unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = *fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-// alpha and beta on CPU
/** @brief Kernel for vec1 += vec2 {*,/} alpha + vec3 {*,/} beta, with both factors passed by value from the host.
*
* Bit 0 of options2/options3 negates the respective factor, bit 1 selects division instead of multiplication.
*/
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1,
                              unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  NumericT a = fac2;
  if (options2 & (1 << 0))
    a = -a;

  NumericT b = fac3;
  if (options3 & (1 << 0))
    b = -b;

  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;
  const bool divide_a = (options2 & (1 << 1)) != 0;
  const bool divide_b = (options3 & (1 << 1)) != 0;

  if (divide_a)
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] / a + vec3[idx*inc3+start3] * b;
    }
  }
  else
  {
    if (divide_b)
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] / b;
    }
    else
    {
      for (unsigned int idx = first; idx < size1; idx += stride)
        vec1[idx*inc1+start1] += vec2[idx*inc2+start2] * a + vec3[idx*inc3+start3] * b;
    }
  }
}
-
-
/** @brief Computes vec1 += alpha * vec2 + beta * vec3 on the device, with optional sign flip / reciprocal for each factor.
*
* @param vec1              Accumulation target (or -range/-slice)
* @param vec2, vec3        Source vectors
* @param alpha, beta       Scaling factors (CPU scalars or GPU scalars)
* @param len_alpha, len_beta            Length of the scalar buffers (1 for plain scalars)
* @param reciprocal_alpha, reciprocal_beta  If true, divide instead of multiply
* @param flip_sign_alpha, flip_sign_beta    If true, flip the sign of the factor
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  typedef NumericT        value_type;

  // The kernel applies sign-flip/reciprocal itself via these option bits, so no
  // host-side transformation of alpha is needed. (The former unused 'data_alpha'
  // computation was dead code and has been removed.)
  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // CPU scalars are forwarded by value, GPU scalars by device pointer (see arg_reference).
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;


  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                              static_cast<unsigned int>(viennacl::traits::size(vec1)),

                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                              options_alpha,
                              viennacl::cuda_arg(vec2),
                              static_cast<unsigned int>(viennacl::traits::start(vec2)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec2)),

                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                              options_beta,
                              viennacl::cuda_arg(vec3),
                              static_cast<unsigned int>(viennacl::traits::start(vec3)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  // This check was missing here (all sibling wrappers have it): surface launch errors.
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
}
-
-
-//////////////////////////
-
/** @brief Kernel assigning alpha to the logical range [0, size1) and zero to the padding region [size1, internal_size1).
*
* Fix: the loop previously ran only to size1, which made the internal_size1
* parameter unused and the (i < size1) selection dead code; iterating to
* internal_size1 restores the intended clearing of padded entries.
*/
template<typename NumericT>
__global__ void vector_assign_kernel(NumericT * vec1,
                                     unsigned int start1,
                                     unsigned int inc1,
                                     unsigned int size1,
                                     unsigned int internal_size1,

                                     NumericT alpha)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < internal_size1;
                    i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] =  (i < size1) ? alpha : 0;
}
-
/** @brief Assign a constant value to a vector (-range/-slice)
*
* @param vec1   The vector to which the value should be assigned
* @param alpha  The value to be assigned
* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
*/
template<typename NumericT, typename ScalarT1>
void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
{
  typedef NumericT        value_type;

  // CPU scalars are forwarded by value, GPU scalars by device pointer (see arg_reference).
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  // Either cover the whole (padded) buffer or just the logical range.
  unsigned int size = up_to_internal_size
                    ? static_cast<unsigned int>(vec1.internal_size())
                    : static_cast<unsigned int>(viennacl::traits::size(vec1));

  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                     size,
                                     // Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
                                     static_cast<unsigned int>(vec1.internal_size()),

                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
}
-
-//////////////////////////
-
/** @brief Kernel exchanging the entries of two (strided) vectors element by element. */
template<typename NumericT>
__global__ void vector_swap_kernel(NumericT * vec1,
                                   unsigned int start1, unsigned int inc1, unsigned int size1,
                                   NumericT * vec2,
                                   unsigned int start2, unsigned int inc2)
{
  const unsigned int first  = blockDim.x * blockIdx.x + threadIdx.x;
  const unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int idx = first; idx < size1; idx += stride)
  {
    NumericT const old_entry2 = vec2[idx*inc2+start2];
    vec2[idx*inc2+start2] = vec1[idx*inc1+start1];
    vec1[idx*inc1+start1] = old_entry2;
  }
}
-
-
/** @brief Swaps the contents of two vectors, data is copied
*
* @param vec1   The first vector (or -range, or -slice)
* @param vec2   The second vector (or -range, or -slice)
*/
template<typename NumericT>
void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
{
  // Collect launch parameters up front instead of casting inline.
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(vec2));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(vec2));

  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                   viennacl::cuda_arg(vec2), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
}
-
-///////////////////////// Binary Elementwise operations /////////////
-
/** @brief CUDA kernel for binary elementwise operations on floating-point vectors.
*
* Writes vec1 = vec2 OP vec3, where op_type selects the operation:
* 0 -> product, 1 -> division, 2 -> power (pow). op_type is a kernel-wide
* constant, so the branch is uniform across all threads; each branch runs a
* grid-stride loop over size1 entries.
*/
template<typename NumericT>
__global__ void element_op_kernel(NumericT * vec1,
                                   unsigned int start1,
                                   unsigned int inc1,
                                   unsigned int size1,

                                   NumericT const * vec2,
                                   unsigned int start2,
                                   unsigned int inc2,

                                   NumericT const * vec3,
                                   unsigned int start3,
                                   unsigned int inc3,

                                   unsigned int op_type
                                 )
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  if (op_type == 0)       // elementwise product
  {
    for (unsigned int i = idx; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
  else if (op_type == 1)  // elementwise division
  {
    for (unsigned int i = idx; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 2)  // elementwise power
  {
    for (unsigned int i = idx; i < size1; i += stride)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
  }
}
-
/** @brief CUDA kernel for binary elementwise operations on integer vectors.
*
* Writes vec1 = vec2 OP vec3, where op_type selects the operation:
* 0 -> product, 1 -> division. Note that op_type == 2 (power) has no branch
* here, so a power request leaves vec1 untouched. Each branch runs a
* grid-stride loop over size1 entries.
*/
template<typename NumericT>
__global__ void element_op_int_kernel(NumericT * vec1,
                                   unsigned int start1,
                                   unsigned int inc1,
                                   unsigned int size1,

                                   NumericT const * vec2,
                                   unsigned int start2,
                                   unsigned int inc2,

                                   NumericT const * vec3,
                                   unsigned int start3,
                                   unsigned int inc3,

                                   unsigned int op_type
                                 )
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  if (op_type == 0)       // elementwise product
  {
    for (unsigned int i = idx; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
  else if (op_type == 1)  // elementwise division
  {
    for (unsigned int i = idx; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
}
-
/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3    (using MATLAB syntax)
*
* Generic overload, selected for non-float/double (i.e. integer) NumericT: it
* dispatches to element_op_int_kernel, which only implements product and
* division. A power operation (op_type == 2) has no branch in that kernel and
* therefore leaves vec1 unchanged.
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding v2, v3 and the operation
*/
template<typename NumericT, typename OpT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                  op_type
                                 );
  // Fix: report the kernel actually launched (was "element_op_kernel").
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
}
-
/** @brief Elementwise v1 = v2 .* v3, v2 ./ v3, or pow(v2, v3) for float vectors (MATLAB-style syntax).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding v2, v3 and the operation
*/
template<typename OpT>
void element_op(vector_base<float> & vec1,
                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
{
  // 0: product, 1: division, 2: power (default when neither trait matches)
  unsigned int op_type = 2;
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));
  unsigned int start3 = static_cast<unsigned int>(viennacl::traits::start(proxy.rhs()));
  unsigned int inc3   = static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs()));

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                  viennacl::cuda_arg(proxy.lhs()), start2, inc2,
                                  viennacl::cuda_arg(proxy.rhs()), start3, inc3,
                                  op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}
-
/** @brief Elementwise v1 = v2 .* v3, v2 ./ v3, or pow(v2, v3) for double vectors (MATLAB-style syntax).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding v2, v3 and the operation
*/
template<typename OpT>
void element_op(vector_base<double> & vec1,
                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
{
  // 0: product, 1: division, 2: power (default when neither trait matches)
  unsigned int op_type = 2;
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));
  unsigned int start3 = static_cast<unsigned int>(viennacl::traits::start(proxy.rhs()));
  unsigned int inc3   = static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs()));

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                  viennacl::cuda_arg(proxy.lhs()), start2, inc2,
                                  viennacl::cuda_arg(proxy.rhs()), start3, inc3,
                                  op_type);
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}
-
-///////////////////////// Unary Elementwise operations /////////////
-
-// Note: Trying to automate things with macros or template metaprogramming 
failed (preprocessor with nvcc did not work as expected), so this is terribly 
hand-rolled code
-// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to 
use it in a library context?
-
// acos
/** @brief CUDA kernel computing vec1 = acos(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_acos_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

/** @brief Implementation of the element-wise operation vec1 = acos(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
{
  // Removed unused local typedef (value_type) for consistency with the sibling unary wrappers.
  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
}
-
// asin
/** @brief CUDA kernel computing vec1 = asin(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_asin_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = asin(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = asin(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
}
-
-
// atan
/** @brief CUDA kernel computing vec1 = atan(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_atan_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = atan(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = atan(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
}
-
-
// ceil
/** @brief CUDA kernel computing vec1 = ceil(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_ceil_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = ceil(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = ceil(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
}
-
-
// cos
/** @brief CUDA kernel computing vec1 = cos(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_cos_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = cos(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = cos(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                       viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
}
-
-
// cosh
/** @brief CUDA kernel computing vec1 = cosh(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_cosh_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = cosh(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = cosh(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
}
-
-
// exp
/** @brief CUDA kernel computing vec1 = exp(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_exp_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = exp(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = exp(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                       viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
}
-
-
// fabs
/** @brief CUDA kernel computing vec1 = fabs(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_fabs_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = fabs(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = fabs(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
}
-
// abs
/** @brief CUDA kernel computing vec1 = abs(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_abs_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = abs(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = abs(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                       viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
}
-
-
-
// floor
/** @brief CUDA kernel computing vec1 = floor(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_floor_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = floor(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = floor(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                         viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
}
-
-
// log
/** @brief CUDA kernel computing vec1 = log(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_log_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = log(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = log(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                       viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
}
-
-
// log10
/** @brief CUDA kernel computing vec1 = log10(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_log10_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = log10(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = log10(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                         viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
}
-
-
// sin
/** @brief CUDA kernel computing vec1 = sin(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_sin_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = sin(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = sin(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                       viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
}
-
-
// sinh
/** @brief CUDA kernel computing vec1 = sinh(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_sinh_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = sinh(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = sinh(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
}
-
-
// sqrt
/** @brief CUDA kernel computing vec1 = sqrt(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_sqrt_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = sqrt(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = sqrt(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                        viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
}
-
-
// tan
/** @brief CUDA kernel computing vec1 = tan(vec2) elementwise (grid-stride loop over size1 entries). */
template<typename NumericT>
__global__ void vec_element_tan_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int idx    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;
  for (; idx < size1; idx += stride)
    vec1[idx * inc1 + start1] = tan(vec2[idx * inc2 + start2]);
}

/** @brief Implementation of the element-wise operation vec1 = tan(v2).
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding the argument vector and the unary operation tag
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
{
  unsigned int start1 = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int inc1   = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int size1  = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int start2 = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int inc2   = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start1, inc1, size1,
                                       viennacl::cuda_arg(proxy.lhs()), start2, inc2);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
}
-
-
// tanh
/** @brief Elementwise hyperbolic tangent: vec1[i*inc1+start1] <- tanh(vec2[i*inc2+start2]) for i in [0, size1).
*
* Grid-stride loop, so any grid/block configuration is valid (launched as <<<128, 128>>> below).
*/
template<typename NumericT>
__global__ void vec_element_tanh_kernel(
    NumericT       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const total_threads = gridDim.x * blockDim.x;
  for (unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x; idx < size1; idx += total_threads)
    vec1[idx * inc1 + start1] = tanh(vec2[idx * inc2 + start2]);
}
-
/** @brief Launches the elementwise-tanh kernel computing vec1 = tanh(proxy.lhs()).
*
* @param vec1  Destination vector (sizes are assumed to have been checked by the caller).
* @param proxy Unary expression wrapper whose lhs() is the source vector.
*/
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
{
  // Name the traits up front so the launch reads cleanly:
  unsigned int const start_out  = static_cast<unsigned int>(viennacl::traits::start(vec1));
  unsigned int const stride_out = static_cast<unsigned int>(viennacl::traits::stride(vec1));
  unsigned int const size_out   = static_cast<unsigned int>(viennacl::traits::size(vec1));
  unsigned int const start_in   = static_cast<unsigned int>(viennacl::traits::start(proxy.lhs()));
  unsigned int const stride_in  = static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()));

  vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), start_out, stride_out, size_out,
                                        viennacl::cuda_arg(proxy.lhs()), start_in, stride_in);
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
}
-
-
-
-///////////////////////// Norms and inner product ///////////////////
-
-
/** @brief First stage of the inner product <vec1, vec2>: each block reduces its contiguous share
*          of the index range and writes one partial sum to group_buffer[blockIdx.x].
*
* The final reduction over the gridDim.x partial sums is done by a second kernel
* (vector_sum_kernel_*) or on the host (see inner_prod_impl / inner_prod_cpu below).
* NOTE(review): tmp_buffer holds 128 entries and the tree reduction halves blockDim.x,
* so this assumes blockDim.x <= 128 and a power of two — true for the <<<128, 128>>>
* launches in this file; confirm for any other launch site.
*/
template<typename NumericT>
__global__ void inner_prod_kernel(const NumericT * vec1,
                                  unsigned int start1,
                                  unsigned int inc1,
                                  unsigned int size1,
                                  const NumericT * vec2,
                                  unsigned int start2,
                                  unsigned int inc2,
                                  unsigned int size2,
                                  NumericT * group_buffer)
{
  __shared__ NumericT tmp_buffer[128];
  // Start offsets of this block's slice in each vector (block b owns indices
  // [b*size/gridDim, (b+1)*size/gridDim) of the logical range):
  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;

  // Number of entries this block processes (slices need not be equally sized):
  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
                               - (  blockIdx.x * size1) / (gridDim.x);


  // Per-thread partial dot product over a blockDim.x-strided sub-slice:
  NumericT tmp = 0;
  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
    tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
  tmp_buffer[threadIdx.x] = tmp;

  // parallel reduction
  // (the barrier at the top of each round orders the previous round's writes
  //  before this round's reads)
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
  }

  // Thread 0 publishes the block's partial sum:
  if (threadIdx.x == 0)
    group_buffer[blockIdx.x] = tmp_buffer[0];

}
-
-
-
// sums the array 'vec1' and writes to result. Makes use of a single work-group only.
/** @brief Single-block reduction for floating-point vectors.
*
* option == 0: *result = max_i |vec1[...]|  (sup-norm via fmax/fabs)
* option == 1: *result = sum_i vec1[...]
* option == 2: *result = sqrt(sum_i vec1[...])  (used to finish a 2-norm)
*
* NOTE(review): must be launched with a single block; tmp_buffer is sized for 128
* threads and the tree reduction assumes blockDim.x is a power of two <= 128 —
* the launchers below use <<<1, 128>>>.
*/
template<typename NumericT>
__global__ void vector_sum_kernel_floats(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  // Per-thread partial (sum or running max of absolute values):
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  // Shared-memory tree reduction; barrier precedes each round's reads:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  // Thread 0 writes the final scalar (taking the square root for the 2-norm case):
  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}
-
/** @brief Single-block reduction for signed-integer vectors.
*
* option == 0: *result = max_i |vec1[...]|  (sup-norm)
* option  > 0: *result = sum_i vec1[...]
*
* Launched with a single block of 128 threads by the launchers below; the tree
* reduction requires blockDim.x to be a power of two not exceeding 128.
*/
template<typename NumericT>
__global__ void vector_sum_kernel_integers(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];

  // Per-thread partial over a blockDim.x-strided slice of the vector:
  NumericT local = 0;
  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
  {
    NumericT entry = vec1[i*inc1+start1];
    if (option > 0)
      local += entry;
    else
    {
      NumericT magnitude = abs(entry);   // load/abs once instead of twice
      if (magnitude > local)
        local = magnitude;
    }
  }

  tmp_buffer[threadIdx.x] = local;

  // Shared-memory tree reduction; the barrier orders the previous round's
  // writes before this round's reads:
  for (unsigned int half = blockDim.x/2; half > 0; half /= 2)
  {
    __syncthreads();
    if (threadIdx.x < half)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + half];
      else if (tmp_buffer[threadIdx.x + half] > tmp_buffer[threadIdx.x])
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x + half];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
-
/** @brief Single-block reduction for unsigned-integer vectors (no abs() needed).
*
* option == 0: *result = max_i vec1[...]
* option  > 0: *result = sum_i vec1[...]
*
* NOTE(review): single-block launch assumed; tmp_buffer is sized for 128 threads
* and the reduction assumes a power-of-two blockDim.x — launchers use <<<1, 128>>>.
*/
template<typename NumericT>
__global__ void vector_sum_kernel_unsigned_integers(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  // Per-thread partial (sum or running maximum):
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  // Shared-memory tree reduction; barrier precedes each round's reads:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}
-
namespace detail
{
  /** \cond */
  // Launcher for signed-integer NumericT: runs the single-block reduction kernel.
  // 'option' selects the reduction (0: max of |.|, >0: sum); 'result' is a GPU scalar.
  struct vector_sum_kernel_launcher_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT        value_type;
      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
                                            static_cast<unsigned int>(option),
                                            viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  // Launcher for unsigned-integer NumericT (kernel uses plain max, no abs()).
  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT        value_type;
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
                                                      static_cast<unsigned int>(option),
                                                      viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  // Launcher for float/double NumericT (kernel additionally supports option == 2:
  // sqrt of the sum, used when finishing a 2-norm).
  struct vector_sum_kernel_launcher_floats
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT        value_type;
      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
                                            static_cast<unsigned int>(viennacl::traits::start(temp)),
                                            static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                            static_cast<unsigned int>(viennacl::traits::size(temp)),
                                            static_cast<unsigned int>(option),
                                            viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  // Compile-time dispatch: pick the right launcher per NumericT.
  // Default: signed-integer path; specializations below override it.
  template<typename NumericT>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned short>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned int>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};

  template<>
  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};

  /** \endcond */
}
-
-
//implementation of inner product:
/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
*
* Two stages: inner_prod_kernel produces 128 per-block partial sums into 'temp',
* then a single-block sum kernel (via the launcher, option == 1) reduces them
* into the GPU scalar 'result'.
*
* @param vec1 The first vector
* @param vec2 The second vector
* @param result The result scalar (on the gpu)
*/
template<typename NumericT, typename ScalarT>
void inner_prod_impl(vector_base<NumericT> const & vec1,
                     vector_base<NumericT> const & vec2,
                     ScalarT & result)
{
  typedef NumericT        value_type;

  static const unsigned int work_groups = 128;
  // NOTE(review): 'temp' is a function-local static device vector, so it is
  // allocated once and reused across calls — presumably a deliberate perf
  // choice, but not thread-safe and tied to the CUDA context's lifetime;
  // confirm single-threaded usage before relying on this.
  static viennacl::vector<value_type> temp(work_groups);

  // Stage 1: one partial sum per block into temp[0..127].
  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp)
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Stage 2: reduce the partial sums on the device (option 1 = plain sum).
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}
-
-
/** @brief Computes the inner product of two vectors, returning the result to the host.
*          Library users should call inner_prod(vec1, vec2).
*
* The device produces 128 per-block partial sums; these are copied back and the
* final accumulation runs on the CPU.
*
* @param vec1 The first vector
* @param vec2 The second vector
* @param result The result scalar (on the host)
*/
template<typename NumericT>
void inner_prod_cpu(vector_base<NumericT> const & vec1,
                    vector_base<NumericT> const & vec2,
                    NumericT & result)
{
  typedef NumericT        value_type;

  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  // Stage 1 on the device: one partial sum per block.
  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp)
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Stage 2 on the host: fetch the partial sums and accumulate them.
  std::vector<value_type> partial_sums(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), partial_sums.begin());

  result = 0;
  for (vcl_size_t i = 0; i < partial_sums.size(); ++i)
    result += partial_sums[i];
}
-
-///////////////////////////////////
-
// Launch configuration shared by the multi-inner-product (mdot) kernels below:
#define VIENNACL_MDOT_WORKGROUP_SIZE  128
#define VIENNACL_MDOT_WORKGROUP_NUM   128
// M = 2:
/** @brief Simultaneously computes partial sums of <x, y0> and <x, y1>.
*
* Each block handles a contiguous chunk of x; each x entry is loaded once and
* multiplied against both y vectors. Per-block partials are written to
* group_results laid out as [M=0 partials (gridDim.x)] then [M=1 partials].
* NOTE(review): tmp_buffer assumes blockDim.x <= VIENNACL_MDOT_WORKGROUP_SIZE
* and a power-of-two blockDim.x for the reduction — matches the launches below.
*/
template<typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  // Contiguous chunking: block b covers [b, b+1) * blockDim.x * entries_per_thread of x.
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  // Two stacked reduction buffers, one per dot product:
  tmp_buffer[threadIdx.x]              = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}
-
// M = 3:
/** @brief Simultaneously computes partial sums of <x, y0>, <x, y1>, <x, y2>.
*
* Same scheme as inner_prod_2_kernel: each x entry is loaded once; per-block
* partials for dot product m are written to group_results[blockIdx.x + m*gridDim.x].
*/
template<typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  // Contiguous chunk of x owned by this block:
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  // Three stacked reduction buffers, one per dot product:
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}
-
// M = 4:
/** @brief Simultaneously computes partial sums of <x, y0> .. <x, y3>.
*
* Same scheme as inner_prod_2_kernel: each x entry is loaded once; per-block
* partials for dot product m are written to group_results[blockIdx.x + m*gridDim.x].
*/
template<typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  // Contiguous chunk of x owned by this block:
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  // Four stacked reduction buffers, one per dot product:
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}
-
// M = 8:
/** @brief Simultaneously computes partial sums of <x, y0> .. <x, y7>.
*
* Same scheme as inner_prod_2_kernel: each x entry is loaded once; per-block
* partials for dot product m are written to group_results[blockIdx.x + m*gridDim.x].
* Shared-memory footprint is 8 * VIENNACL_MDOT_WORKGROUP_SIZE entries per block.
*/
template<typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  // Contiguous chunk of x owned by this block:
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x    = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  // Eight stacked reduction buffers, one per dot product:
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}
-
// sums the array 'vec1' and writes to result. Makes use of a single work-group only.
/** @brief Reduces the per-block partials produced by the inner_prod_M_kernels:
*          block b sums vec1[b*WORKGROUP_SIZE .. (b+1)*WORKGROUP_SIZE) into
*          result[start_result + b*inc_result], i.e. one final dot product per block.
*
* NOTE(review): indexing hard-codes VIENNACL_MDOT_WORKGROUP_SIZE and every thread
* reads one entry, so this assumes blockDim.x == VIENNACL_MDOT_WORKGROUP_SIZE
* (a power of two) — confirm against the launch sites.
*/
template<typename NumericT>
__global__ void vector_multi_sum_kernel(
          NumericT const * vec1,
          unsigned int start_result,
          unsigned int inc_result)
{
  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];

  // Each thread loads one partial sum of this block's segment:
  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];

  // Shared-memory tree reduction; barrier precedes each round's reads:
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
  }

  // Thread 0 writes this block's final (strided) result entry:
  if (threadIdx.x == 0)
    result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
}
-
-template<typename NumericT>
-void inner_prod_impl(vector_base<NumericT> const & x,
-                     vector_tuple<NumericT> const & vec_tuple,
-                     vector_base<NumericT> & result)
-{
-  typedef NumericT        value_type;
-
-  static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
-
-  vcl_size_t current_index = 0;
-  while (vec_tuple.const_size() > current_index)
-  {
-    switch (vec_tuple.const_size() - current_index)
-    {
-      case 7:
-      case 6:
-      case 5:
-      case 4:
-      {
-        vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
-        vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 
1);
-        vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 
2);
-        vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 
3);
-
-        inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
-                              VIENNACL_MDOT_WORKGROUP_SIZE>>>( 
viennacl::cuda_arg(x),
-                                                               
static_cast<unsigned int>(viennacl::traits::start(x)),
-                                                               
static_cast<unsigned int>(viennacl::traits::stride(x)),
-                                                               
static_cast<unsigned int>(viennacl::traits::size(x)),
-                                                               
viennacl::cuda_arg(y0),
-                                                               
static_cast<unsigned int>(viennacl::traits::start(y0)),
-                                                               
static_cast<unsigned int>(viennacl::traits::stride(y0)),
-                                                               
viennacl::cuda_arg(y1),
-                                                               
static_cast<unsigned int>(viennacl::traits::start(y1)),
-                                                               
static_cast<unsigned int>(viennacl::traits::stride(y1)),
-                                                               
viennacl::cuda_arg(y2),
-                                                               
static_cast<unsigned int>(vi

<TRUNCATED>

Reply via email to