http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp new file mode 100644 index 0000000..b7eaeb4 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp @@ -0,0 +1,3252 @@ +#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_ +#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/vector_operations.hpp + @brief Implementations of vector operations using a plain single-threaded execution on CPU +*/ + +#include <cmath> +#include "viennacl/forwards.h" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/traits/stride.hpp" + +#include "viennacl/linalg/cuda/common.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace cuda +{ + +// +// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! 
//

/** @brief Copies (and implicitly converts) the entries of src into dest using a grid-stride loop. */
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_kernel(DestNumericT * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
                               SrcNumericT const * src, unsigned int start_src, unsigned int inc_src)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size_dest;
                    i += gridDim.x * blockDim.x)
    dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
}


/** @brief Host entry point: converts one vector type into another (e.g. float -> double). */
template<typename DestNumericT, typename SrcNumericT>
void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
{
  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
                               static_cast<unsigned int>(viennacl::traits::start(dest)),
                               static_cast<unsigned int>(viennacl::traits::stride(dest)),
                               static_cast<unsigned int>(viennacl::traits::size(dest)),

                               viennacl::cuda_arg(src),
                               static_cast<unsigned int>(viennacl::traits::start(src)),
                               static_cast<unsigned int>(viennacl::traits::stride(src)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
}


//////////////////////// av /////////////////////////////

/** @brief Applies the sign-flip flag (bit 0 of the option bitfield) to a scalar value. */
template<typename NumericT>
__device__ inline NumericT av_apply_sign(NumericT value, unsigned int options)
{
  if (options & (1 << 0))
    value = -value;
  return value;
}

/** @brief Shared device-side implementation of vec1 = vec2 * alpha.
*
* Bit 1 of 'options2' selects element-wise division by alpha instead of multiplication.
* Division is performed per element (rather than multiplying by 1/alpha once) to keep
* rounding behavior identical to the original implementation.
*/
template<typename NumericT>
__device__ void av_impl(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                        NumericT alpha, unsigned int options2,
                        const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}

// gpu scalar
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          const NumericT * fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  av_impl(vec1, start1, inc1, size1, av_apply_sign(*fac2, options2), options2, vec2, start2, inc2);
}

// cpu scalar
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          NumericT fac2, unsigned int options2,
                          const NumericT * vec2, unsigned int start2, unsigned int inc2)
{
  av_impl(vec1, start1, inc1, size1, av_apply_sign(fac2, options2), options2, vec2, start2, inc2);
}


/** @brief Host entry point for vec1 = vec2 * alpha (or / alpha, with optional sign flip).
*
* The flip/reciprocal flags are encoded into an option bitfield and applied on the device.
* CPU scalars are passed by value (via temporary_alpha); GPU scalars are forwarded as device pointers.
*
* Note: the former host-side pre-computation of a flipped/inverted 'data_alpha' was removed —
* it was never used (the kernel applies the flags itself) and, for GPU scalars, the conversion
* presumably forced a device->host read of the scalar.
*/
template<typename NumericT, typename ScalarType1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                          static_cast<unsigned int>(viennacl::traits::size(vec1)),

                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          viennacl::cuda_arg(vec2),
                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}


///////////////////// avbv //////////////////////////////////

/** @brief Shared device-side implementation of vec1 = vec2 * alpha + vec3 * beta.
*
* Bit 1 of options2/options3 selects element-wise division by the respective scalar
* instead of multiplication. The four loop variants keep the option test out of the
* inner loop, exactly as the original hand-unrolled kernels did.
*/
template<typename NumericT>
__device__ void avbv_impl(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                          NumericT alpha, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                          NumericT beta,  unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}

// alpha and beta on GPU
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_impl(vec1, start1, inc1, size1,
            av_apply_sign(*fac2, options2), options2, vec2, start2, inc2,
            av_apply_sign(*fac3, options3), options3, vec3, start3, inc3);
}

// alpha on CPU, beta on GPU
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_impl(vec1, start1, inc1, size1,
            av_apply_sign(fac2, options2), options2, vec2, start2, inc2,
            av_apply_sign(*fac3, options3), options3, vec3, start3, inc3);
}

// alpha on GPU, beta on CPU
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_impl(vec1, start1, inc1, size1,
            av_apply_sign(*fac2, options2), options2, vec2, start2, inc2,
            av_apply_sign(fac3, options3), options3, vec3, start3, inc3);
}

// alpha and beta on CPU
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_impl(vec1, start1, inc1, size1,
            av_apply_sign(fac2, options2), options2, vec2, start2, inc2,
            av_apply_sign(fac3, options3), options3, vec3, start3, inc3);
}


/** @brief Host entry point for vec1 = vec2 * alpha + vec3 * beta (flags as in av()).
*
* Note: the former dead 'data_alpha' pre-computation was removed (see av()).
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;


  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                            static_cast<unsigned int>(viennacl::traits::size(vec1)),

                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                            options_alpha,
                            viennacl::cuda_arg(vec2),
                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),

                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                            options_beta,
                            viennacl::cuda_arg(vec3),
                            static_cast<unsigned int>(viennacl::traits::start(vec3)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
}


////////////////////////// avbv_v //////////////////////////////////////

/** @brief Shared device-side implementation of vec1 += vec2 * alpha + vec3 * beta (flags as in avbv_impl). */
template<typename NumericT>
__device__ void avbv_v_impl(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT alpha, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT beta,  unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = glb_id; i < size1; i += glb_sz)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}

// alpha and beta on GPU
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_impl(vec1, start1, inc1, size1,
              av_apply_sign(*fac2, options2), options2, vec2, start2, inc2,
              av_apply_sign(*fac3, options3), options3, vec3, start3, inc3);
}

// alpha on CPU, beta on GPU
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_impl(vec1, start1, inc1, size1,
              av_apply_sign(fac2, options2), options2, vec2, start2, inc2,
              av_apply_sign(*fac3, options3), options3, vec3, start3, inc3);
}

// alpha on GPU, beta on CPU
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_impl(vec1, start1, inc1, size1,
              av_apply_sign(*fac2, options2), options2, vec2, start2, inc2,
              av_apply_sign(fac3, options3), options3, vec3, start3, inc3);
}

// alpha and beta on CPU
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2, const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3, const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_impl(vec1, start1, inc1, size1,
              av_apply_sign(fac2, options2), options2, vec2, start2, inc2,
              av_apply_sign(fac3, options3), options3, vec3, start3, inc3);
}


/** @brief Host entry point for vec1 += vec2 * alpha + vec3 * beta (flags as in av()).
*
* Fixes: added the missing VIENNACL_CUDA_LAST_ERROR_CHECK (every sibling wrapper has one)
* and removed the dead 'data_alpha' pre-computation (see av()).
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;


  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                              static_cast<unsigned int>(viennacl::traits::size(vec1)),

                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                              options_alpha,
                              viennacl::cuda_arg(vec2),
                              static_cast<unsigned int>(viennacl::traits::start(vec2)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec2)),

                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                              options_beta,
                              viennacl::cuda_arg(vec3),
                              static_cast<unsigned int>(viennacl::traits::start(vec3)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
}


//////////////////////////

template<typename NumericT>
__global__ void vector_assign_kernel(NumericT * vec1,
                                     unsigned int start1,
                                     unsigned int inc1,
                                     unsigned int size1,
                                     unsigned int internal_size1,

                                     NumericT alpha)
{
  // NOTE(review): with the loop bound 'size1' the (i < size1) guard below is redundant and
  // 'internal_size1' is unused. The guard/parameter pattern suggests the bound may originally
  // have been 'internal_size1' (so padding gets zeroed) — confirm against upstream ViennaCL
  // before changing. Code kept exactly as found.
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size1;
                    i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
}

/** @brief Assign a constant value to a vector (-range/-slice)
*
* @param vec1   The vector to which the value should be assigned
* @param alpha  The value to be assigned
* @param up_to_internal_size  Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
*/
template<typename NumericT, typename ScalarT1>
void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
{
  typedef NumericT value_type;

  // CPU scalars travel by value through this temporary; GPU scalars are forwarded as device pointers.
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  // Either cover the whole padded buffer or just the logical extent of the vector.
  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size())
                                          : static_cast<unsigned int>(viennacl::traits::size(vec1));

  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                     size,
                                     static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.

                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
}

//////////////////////////

/** @brief Device kernel exchanging the entries of two (strided) vectors element by element. */
template<typename NumericT>
__global__ void vector_swap_kernel(NumericT * vec1,
                                   unsigned int start1,
                                   unsigned int inc1,
                                   unsigned int size1,

                                   NumericT * vec2,
                                   unsigned int start2,
                                   unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
  {
    NumericT entry = vec2[i*inc2+start2];
    vec2[i*inc2+start2] = vec1[i*inc1+start1];
    vec1[i*inc1+start1] = entry;
  }
}


/** @brief Swaps the contents of two vectors, data is copied
*
* @param vec1   The first vector (or -range, or -slice)
* @param vec2   The second vector (or -range, or -slice)
*/
template<typename NumericT>
void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
{
  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                   static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                   static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                   static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                   viennacl::cuda_arg(vec2),
                                   static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                   static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
}

///////////////////////// Binary Elementwise operations /////////////

/** @brief Device kernel for vec1 = vec2 OP vec3, with OP encoded by op_type: 0 -> product, 1 -> division, 2 -> power. */
template<typename NumericT>
__global__ void element_op_kernel(NumericT * vec1,
                                  unsigned int start1,
                                  unsigned int inc1,
                                  unsigned int size1,

                                  NumericT const * vec2,
                                  unsigned int start2,
                                  unsigned int inc2,

                                  NumericT const * vec3,
                                  unsigned int start3,
                                  unsigned int inc3,

                                  unsigned int op_type
                                 )
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  if (op_type == 2)
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
  }
  else if (op_type == 1)
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}

/** @brief Integer variant of element_op_kernel: op_type 0 -> product, 1 -> division.
*          There is no power branch (pow() is not available for integral types); other op_type values leave vec1 untouched. */
template<typename NumericT>
__global__ void element_op_int_kernel(NumericT * vec1,
                                      unsigned int start1,
                                      unsigned int inc1,
                                      unsigned int size1,

                                      NumericT const * vec2,
                                      unsigned int start2,
                                      unsigned int inc2,

                                      NumericT const * vec3,
                                      unsigned int start3,
                                      unsigned int inc3,

                                      unsigned int op_type
                                     )
{
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  if (op_type == 1)
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
  }
  else if (op_type == 0)
  {
    for (unsigned int i = glb_id; i < size1; i += glb_sz)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
  }
}

/** @brief 
Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
*
* @param vec1   The result vector (or -range, or -slice)
* @param proxy  The proxy object holding v2, v3 and the operation
*/
template<typename NumericT, typename OpT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  // Generic (integer) path: element_op_int_kernel has no power branch, so op_type == 2
  // silently leaves vec1 unchanged for integral element types.
  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                      static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                      static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                      static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                      viennacl::cuda_arg(proxy.lhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                      static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                      viennacl::cuda_arg(proxy.rhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                      static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                      op_type
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel"); // fixed: previously reported the wrong kernel name ("element_op_kernel")
}

/** @brief float overload: dispatches to element_op_kernel, which also supports the element-wise power operation. */
template<typename OpT>
void element_op(vector_base<float> & vec1,
                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                  op_type
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}

/** @brief double overload: dispatches to element_op_kernel, which also supports the element-wise power operation. */
template<typename OpT>
void element_op(vector_base<double> & vec1,
                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                  op_type
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}

///////////////////////// Unary Elementwise operations /////////////

// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?

// acos
/** @brief Device kernel: vec1 = acos(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_acos_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = acos(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
{
  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
}

// asin
/** @brief Device kernel: vec1 = asin(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_asin_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = asin(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
{
  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
}


// atan
/** @brief Device kernel: vec1 = atan(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_atan_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = atan(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
{
  vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
}


// ceil
/** @brief Device kernel: vec1 = ceil(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_ceil_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = ceil(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
{
  vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
}


// cos
/** @brief Device kernel: vec1 = cos(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_cos_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = cos(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
{
  vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       viennacl::cuda_arg(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                       static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
}


// cosh
/** @brief Device kernel: vec1 = cosh(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_cosh_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = cosh(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
{
  vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
}


// exp
/** @brief Device kernel: vec1 = exp(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_exp_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
}

/** @brief Host entry point: vec1 = exp(v2) element-wise, where v2 is proxy.lhs(). */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
{
  vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       viennacl::cuda_arg(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                       static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
}


// fabs
/** @brief Device kernel: vec1 = fabs(vec2), element-wise over the strided range. */
template<typename NumericT>
__global__ void vec_element_fabs_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int glb_sz = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += glb_sz)
    vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
}

template<typename NumericT>
void 
element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy) +{ + vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel"); +} + +// abs +template<typename NumericT> +__global__ void vec_element_abs_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy) +{ + vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel"); +} + + + +// floor +template<typename NumericT> +__global__ void vec_element_floor_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + 
threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy) +{ + vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel"); +} + + +// log +template<typename NumericT> +__global__ void vec_element_log_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = log(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy) +{ + vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel"); +} + + +// log10 +template<typename NumericT> +__global__ void vec_element_log10_kernel( + NumericT * vec1, unsigned int start1, unsigned 
int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy) +{ + vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel"); +} + + +// sin +template<typename NumericT> +__global__ void vec_element_sin_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy) +{ + vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + 
VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel"); +} + + +// sinh +template<typename NumericT> +__global__ void vec_element_sinh_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy) +{ + vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel"); +} + + +// sqrt +template<typename NumericT> +__global__ void vec_element_sqrt_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy) +{ + vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + 
viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel"); +} + + +// tan +template<typename NumericT> +__global__ void vec_element_tan_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy) +{ + vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel"); +} + + +// tanh +template<typename NumericT> +__global__ void vec_element_tanh_kernel( + NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, + NumericT const * vec2, unsigned int start2, unsigned int inc2) +{ + for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) + vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]); +} + +template<typename NumericT> +void element_op(vector_base<NumericT> & vec1, + vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy) +{ + vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), + static_cast<unsigned 
int>(viennacl::traits::start(vec1)), + static_cast<unsigned int>(viennacl::traits::stride(vec1)), + static_cast<unsigned int>(viennacl::traits::size(vec1)), + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel"); +} + + + +///////////////////////// Norms and inner product /////////////////// + + +template<typename NumericT> +__global__ void inner_prod_kernel(const NumericT * vec1, + unsigned int start1, + unsigned int inc1, + unsigned int size1, + const NumericT * vec2, + unsigned int start2, + unsigned int inc2, + unsigned int size2, + NumericT * group_buffer) +{ + __shared__ NumericT tmp_buffer[128]; + unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1; + unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2; + + unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x) + - ( blockIdx.x * size1) / (gridDim.x); + + + NumericT tmp = 0; + for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x) + tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2]; + tmp_buffer[threadIdx.x] = tmp; + + // parallel reduction + for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride]; + } + + if (threadIdx.x == 0) + group_buffer[blockIdx.x] = tmp_buffer[0]; + +} + + + +// sums the array 'vec1' and writes to result. Makes use of a single work-group only. 
// Single-block reduction of 'vec1' into a scalar, floating-point flavor.
// tmp_buffer is fixed at 128 entries, so this kernel must be launched as
// <<<1, 128>>> (see the launcher structs in namespace detail below).
template<typename NumericT>
__global__ void vector_sum_kernel_floats(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];

  // Per-thread partial: sum for option > 0, max of absolute values for option == 0.
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  // Tree reduction in shared memory; requires a power-of-two block size.
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  // Thread 0 publishes the result; option 2 applies sqrt on top of the sum.
  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}

// Single-block reduction of 'vec1', signed-integer flavor (abs() instead of
// fabs(), no sqrt option). Launch as <<<1, 128>>> — tmp_buffer holds 128 entries.
template<typename NumericT>
__global__ void vector_sum_kernel_integers(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];

  // Per-thread partial: sum for option > 0, max of absolute values for option == 0.
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  // Tree reduction in shared memory; requires a power-of-two block size.
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}

// Single-block reduction of 'vec1', unsigned-integer flavor (values are their
// own absolute value, so option 0 is a plain max). Launch as <<<1, 128>>>.
template<typename NumericT>
__global__ void vector_sum_kernel_unsigned_integers(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];

  // Per-thread partial: sum for option > 0, plain max for option == 0.
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  // Tree reduction in shared memory; requires a power-of-two block size.
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}

namespace detail
{
  /** \cond */

  // Launcher objects selecting the reduction kernel appropriate for the numeric
  // type (signed integer / unsigned integer / floating point). All launch the
  // final reduction stage with a single block of 128 threads, matching the
  // fixed 128-entry shared buffer inside the kernels above.
  struct vector_sum_kernel_launcher_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                             static_cast<unsigned int>(viennacl::traits::start(temp)),
                                             static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                             static_cast<unsigned int>(viennacl::traits::size(temp)),
                                             static_cast<unsigned int>(option),
                                             viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
                                                      static_cast<unsigned int>(option),
                                                      viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  struct vector_sum_kernel_launcher_floats
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
                                           static_cast<unsigned int>(viennacl::traits::start(temp)),
                                           static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                           static_cast<unsigned int>(viennacl::traits::size(temp)),
                                           static_cast<unsigned int>(option),
                                           viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  // Type dispatch: default is the signed-integer flavor; unsigned and
  // floating-point types are routed to their launchers via specializations.
  template<typename NumericT>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned int>   : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};

  template<>
  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};

  /** \endcond */
}


//implementation of inner product:
//namespace {
/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
*
* @param vec1   The first vector
* @param vec2   The second vector
* @param result The result scalar (on the gpu)
*/
template<typename NumericT, typename ScalarT>
void inner_prod_impl(vector_base<NumericT> const & vec1,
                     vector_base<NumericT> const & vec2,
                     ScalarT & result)
{
  typedef NumericT value_type;

  // One partial result per block of the stage-1 kernel below.
  static const unsigned int num_groups = 128;
  // NOTE(review): static scratch buffer — allocated once per NumericT and
  // reused across calls (avoids repeated device allocations), at the cost of
  // not being safe for concurrent calls from multiple host threads.
  static viennacl::vector<value_type> partial_results(num_groups);

  // Stage 1: block-wise partial sums of vec1[i] * vec2[i].
  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(partial_results)
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Stage 2: single-block reduction of the partial sums (option 1 = plain sum),
  // writing directly to the GPU-side result scalar.
  detail::vector_sum_kernel_launcher<NumericT>::apply(partial_results, 1, result);
}


/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
*
* @param vec1 The first vector
* @param vec2 The second vector
* @param result The result scalar (on the host)
*/
template<typename NumericT>
void inner_prod_cpu(vector_base<NumericT> const & vec1,
                    vector_base<NumericT> const & vec2,
                    NumericT & result)
{
  typedef NumericT value_type;

  // One partial result per block of the stage-1 kernel below.
  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  // Stage 1 on the device: block-wise partial sums of vec1[i] * vec2[i].
  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp)
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Now copy partial results from GPU back to CPU and run reduction there:
  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}

///////////////////////////////////

// Launch configuration shared by the fused multi-inner-product ("mdot")
// kernels below; the shared-memory buffers inside the kernels are sized from
// VIENNACL_MDOT_WORKGROUP_SIZE, so blockDim.x must equal it.
#define VIENNACL_MDOT_WORKGROUP_SIZE 128
#define VIENNACL_MDOT_WORKGROUP_NUM 128
// M = 2:
// Computes partial sums for two inner products <x,y0>, <x,y1> in a single pass
// over x. Block b writes its partial for product k to group_results[b + k*gridDim.x].
template<typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  // Each block handles a contiguous range of ceil(sizex / (blockDim.x * gridDim.x)) entries per thread.
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  // One shared-memory segment of blockDim.x entries per inner product:
  tmp_buffer[threadIdx.x]              = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}

// M = 3:
// Same as inner_prod_2_kernel, but for three right-hand-side vectors y0..y2.
template<typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}

// M = 4:
// Same as inner_prod_2_kernel, but for four right-hand-side vectors y0..y3.
template<typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}

// M = 8:
// Same as inner_prod_2_kernel, but for eight right-hand-side vectors y0..y7.
template<typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}

// sums the array 'vec1' and writes to result. Makes use of a single work-group only.
/* Block-wise sum kernel: each CUDA block loads one contiguous VIENNACL_MDOT_WORKGROUP_SIZE-element slice of 'vec1' (one element per thread, no bounds check), tree-reduces it in shared memory, and thread 0 of block b writes the slice sum to result[start_result + inc_result * b]. The halving reduction (stride /= 2) requires blockDim.x to be a power of two, and the load assumes blockDim.x == VIENNACL_MDOT_WORKGROUP_SIZE -- NOTE(review): callers must therefore provide vec1 with at least gridDim.x * VIENNACL_MDOT_WORKGROUP_SIZE entries; confirm at the launch sites. */ +template<typename NumericT> +__global__ void vector_multi_sum_kernel( + NumericT const * vec1, + NumericT * result, + unsigned int start_result, + unsigned int inc_result) +{ + __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE]; + /* one element per thread: block b covers vec1[b*WORKGROUP_SIZE .. (b+1)*WORKGROUP_SIZE - 1] */ + tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE]; + + /* shared-memory tree reduction; __syncthreads() precedes each halving step and is reached by all threads of the block (the divergent 'if' contains no barrier) */ + for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) + { + __syncthreads(); + if (threadIdx.x < stride) + tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride]; + } + + /* thread 0 publishes this block's sum into the strided result slot */ + if (threadIdx.x == 0) + result[start_result + inc_result * blockIdx.x] = tmp_buffer[0]; +} + + /* Host dispatcher computing result[i] = <x, vec_tuple[i]> for every vector in the tuple, batching several right-hand vectors per kernel launch (the 4-vector case is visible below; NOTE(review): definition continues beyond this excerpt -- review the remaining cases there). */ +template<typename NumericT> +void inner_prod_impl(vector_base<NumericT> const & x, + vector_tuple<NumericT> const & vec_tuple, + vector_base<NumericT> & result) +{ + typedef NumericT value_type; + + /* NOTE(review): function-local static scratch buffer (8 * VIENNACL_MDOT_WORKGROUP_NUM partial results) is reused across calls and shared by all host threads -- confirm single-threaded host usage. */ + static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM); + + /* consume the tuple in chunks; the switch picks the largest batched kernel that fits the remaining count */ + vcl_size_t current_index = 0; + while (vec_tuple.const_size() > current_index) + { + switch (vec_tuple.const_size() - current_index) + { + case 7: + case 6: + case 5: + case 4: + { + vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index); + vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1); + vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2); + vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3); + + inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, + VIENNACL_MDOT_WORKGROUP_SIZE>>>( viennacl::cuda_arg(x), + static_cast<unsigned int>(viennacl::traits::start(x)), + static_cast<unsigned int>(viennacl::traits::stride(x)), + static_cast<unsigned int>(viennacl::traits::size(x)), + viennacl::cuda_arg(y0), + static_cast<unsigned int>(viennacl::traits::start(y0)), + static_cast<unsigned int>(viennacl::traits::stride(y0)), + viennacl::cuda_arg(y1), + static_cast<unsigned int>(viennacl::traits::start(y1)), + static_cast<unsigned int>(viennacl::traits::stride(y1)), + viennacl::cuda_arg(y2), + static_cast<unsigned
int>(vienna
<TRUNCATED>
