http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp deleted file mode 100644 index b7eaeb4..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/vector_operations.hpp +++ /dev/null @@ -1,3252 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_ -#define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/cuda/vector_operations.hpp - @brief Implementations of vector operations using a plain single-threaded execution on CPU -*/ - -#include <cmath> -#include "viennacl/forwards.h" -#include "viennacl/scalar.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/meta/predicate.hpp" -#include "viennacl/meta/enable_if.hpp" -#include "viennacl/traits/size.hpp" -#include "viennacl/traits/start.hpp" -#include "viennacl/traits/stride.hpp" - -#include "viennacl/linalg/cuda/common.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ - -// -// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! 
//

/** @brief Kernel converting a strided vector of one numeric type into another (dest = src element-wise).
*
* Uses a grid-stride loop, so correctness does not depend on the launch configuration.
*/
template<typename DestNumericT, typename SrcNumericT>
__global__ void convert_kernel(DestNumericT * dest, unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest,
                               SrcNumericT const * src, unsigned int start_src, unsigned int inc_src)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size_dest;
                    i += gridDim.x * blockDim.x)
    dest[i*inc_dest+start_dest] = src[i*inc_src+start_src];
}


/** @brief Assigns src to dest, converting the element type (e.g. float to double).
*
* @param dest   The destination vector (or -range, or -slice)
* @param src    The source vector (or -range, or -slice)
*/
template<typename DestNumericT, typename SrcNumericT>
void convert(vector_base<DestNumericT> & dest, vector_base<SrcNumericT> const & src)
{
  convert_kernel<<<128, 128>>>(viennacl::cuda_arg(dest),
                               static_cast<unsigned int>(viennacl::traits::start(dest)),
                               static_cast<unsigned int>(viennacl::traits::stride(dest)),
                               static_cast<unsigned int>(viennacl::traits::size(dest)),

                               viennacl::cuda_arg(src),
                               static_cast<unsigned int>(viennacl::traits::start(src)),
                               static_cast<unsigned int>(viennacl::traits::stride(src)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("convert_kernel");
}


//////////////////////// av /////////////////////////////

/** @brief Common device-side body of the two av_kernel overloads: vec1 = vec2 * alpha.
*
* options2 bit 0: negate alpha; bit 1: divide by alpha instead of multiplying.
*/
template<typename NumericT>
__device__ inline void av_kernel_body(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                      NumericT alpha, unsigned int options2,
                                      NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  if (options2 & (1 << 0))
    alpha = -alpha;

  if (options2 & (1 << 1))
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
  }
  else
  {
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
  }
}

// av kernel, scalar alpha residing on the GPU (passed as a device pointer)
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1,
                          unsigned int start1,
                          unsigned int inc1,
                          unsigned int size1,

                          const NumericT * fac2,
                          unsigned int options2,
                          const NumericT * vec2,
                          unsigned int start2,
                          unsigned int inc2)
{
  av_kernel_body(vec1, start1, inc1, size1, *fac2, options2, vec2, start2, inc2);
}

// av kernel, scalar alpha residing on the CPU (passed by value)
template<typename NumericT>
__global__ void av_kernel(NumericT * vec1,
                          unsigned int start1,
                          unsigned int inc1,
                          unsigned int size1,

                          NumericT fac2,
                          unsigned int options2,
                          const NumericT * vec2,
                          unsigned int start2,
                          unsigned int inc2)
{
  av_kernel_body(vec1, start1, inc1, size1, fac2, options2, vec2, start2, inc2);
}


/** @brief Computes vec1 = alpha * vec2, where alpha is either a CPU or a GPU scalar.
*
* Sign flip and reciprocal are encoded into options_alpha and applied inside the kernel,
* so no host-side read of a GPU scalar is needed.
*
* @param vec1              The result vector (or -range, or -slice)
* @param vec2              The operand vector (or -range, or -slice)
* @param alpha             The scaling factor (CPU or GPU scalar)
* @param len_alpha         Length of the scalar (1 for a plain scalar)
* @param reciprocal_alpha  If true, divide by alpha instead of multiplying
* @param flip_sign_alpha   If true, negate alpha
*/
template<typename NumericT, typename ScalarType1>
void av(vector_base<NumericT> & vec1,
        vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // For a CPU scalar the value is captured here and passed by value;
  // for a GPU scalar, detail::arg_reference() forwards the device pointer instead.
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarType1>::value)
    temporary_alpha = alpha;

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                          static_cast<unsigned int>(viennacl::traits::size(vec1)),

                          viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                          options_alpha,
                          viennacl::cuda_arg(vec2),
                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}
///////////////////// avbv //////////////////////////////////

/** @brief Common device-side body of the four avbv_kernel overloads: vec1 = vec2 * alpha + vec3 * beta.
*
* options2/options3 encode per-scalar modifiers: bit 0 negates the scalar,
* bit 1 divides by the scalar instead of multiplying.
*/
template<typename NumericT>
__device__ inline void avbv_kernel_body(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                        NumericT alpha, unsigned int options2,
                                        NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                        NumericT beta, unsigned int options3,
                                        NumericT const * vec3, unsigned int start3, unsigned int inc3)
{
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}

// alpha and beta on GPU (both passed as device pointers)
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_kernel_body(vec1, start1, inc1, size1, *fac2, options2, vec2, start2, inc2, *fac3, options3, vec3, start3, inc3);
}

// alpha on CPU (by value), beta on GPU (device pointer)
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            const NumericT * fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_kernel_body(vec1, start1, inc1, size1, fac2, options2, vec2, start2, inc2, *fac3, options3, vec3, start3, inc3);
}

// alpha on GPU (device pointer), beta on CPU (by value)
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            const NumericT * fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_kernel_body(vec1, start1, inc1, size1, *fac2, options2, vec2, start2, inc2, fac3, options3, vec3, start3, inc3);
}

// alpha and beta on CPU (both by value)
template<typename NumericT>
__global__ void avbv_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                            NumericT fac2, unsigned int options2,
                            const NumericT * vec2, unsigned int start2, unsigned int inc2,
                            NumericT fac3, unsigned int options3,
                            const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_kernel_body(vec1, start1, inc1, size1, fac2, options2, vec2, start2, inc2, fac3, options3, vec3, start3, inc3);
}


/** @brief Computes vec1 = alpha * vec2 + beta * vec3, where alpha and beta are each either CPU or GPU scalars.
*
* Sign flip and reciprocal are encoded into the options words and applied inside the kernel,
* so no host-side read of a GPU scalar is needed.
*
* @param vec1   The result vector (or -range, or -slice)
* @param vec2   The first operand vector
* @param alpha  Scaling factor for vec2 (with length/reciprocal/sign-flip modifiers)
* @param vec3   The second operand vector
* @param beta   Scaling factor for vec3 (with length/reciprocal/sign-flip modifiers)
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv(vector_base<NumericT> & vec1,
          vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // CPU scalars are captured by value here; GPU scalars are forwarded as device pointers
  // by detail::arg_reference().
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  avbv_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                            static_cast<unsigned int>(viennacl::traits::size(vec1)),

                            viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                            options_alpha,
                            viennacl::cuda_arg(vec2),
                            static_cast<unsigned int>(viennacl::traits::start(vec2)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec2)),

                            viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                            options_beta,
                            viennacl::cuda_arg(vec3),
                            static_cast<unsigned int>(viennacl::traits::start(vec3)),
                            static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
}
////////////////////////// avbv_v //////////////////////////////////////

/** @brief Common device-side body of the four avbv_v_kernel overloads: vec1 += vec2 * alpha + vec3 * beta.
*
* options2/options3 encode per-scalar modifiers: bit 0 negates the scalar,
* bit 1 divides by the scalar instead of multiplying.
*/
template<typename NumericT>
__device__ inline void avbv_v_kernel_body(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                                          NumericT alpha, unsigned int options2,
                                          NumericT const * vec2, unsigned int start2, unsigned int inc2,
                                          NumericT beta, unsigned int options3,
                                          NumericT const * vec3, unsigned int start3, unsigned int inc3)
{
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options3 & (1 << 0))
    beta = -beta;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
    }
    else
    {
      for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
        vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
    }
  }
}

// alpha and beta on GPU (both passed as device pointers)
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_kernel_body(vec1, start1, inc1, size1, *fac2, options2, vec2, start2, inc2, *fac3, options3, vec3, start3, inc3);
}

// alpha on CPU (by value), beta on GPU (device pointer)
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              const NumericT * fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_kernel_body(vec1, start1, inc1, size1, fac2, options2, vec2, start2, inc2, *fac3, options3, vec3, start3, inc3);
}

// alpha on GPU (device pointer), beta on CPU (by value)
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              const NumericT * fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_kernel_body(vec1, start1, inc1, size1, *fac2, options2, vec2, start2, inc2, fac3, options3, vec3, start3, inc3);
}

// alpha and beta on CPU (both by value)
template<typename NumericT>
__global__ void avbv_v_kernel(NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
                              NumericT fac2, unsigned int options2,
                              const NumericT * vec2, unsigned int start2, unsigned int inc2,
                              NumericT fac3, unsigned int options3,
                              const NumericT * vec3, unsigned int start3, unsigned int inc3)
{
  avbv_v_kernel_body(vec1, start1, inc1, size1, fac2, options2, vec2, start2, inc2, fac3, options3, vec3, start3, inc3);
}


/** @brief Computes vec1 += alpha * vec2 + beta * vec3, where alpha and beta are each either CPU or GPU scalars.
*
* Sign flip and reciprocal are encoded into the options words and applied inside the kernel.
*
* @param vec1   The accumulated result vector (or -range, or -slice)
* @param vec2   The first operand vector
* @param alpha  Scaling factor for vec2 (with length/reciprocal/sign-flip modifiers)
* @param vec3   The second operand vector
* @param beta   Scaling factor for vec3 (with length/reciprocal/sign-flip modifiers)
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void avbv_v(vector_base<NumericT> & vec1,
            vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
{
  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // CPU scalars are captured by value here; GPU scalars are forwarded as device pointers
  // by detail::arg_reference().
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  avbv_v_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                              static_cast<unsigned int>(viennacl::traits::size(vec1)),

                              viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                              options_alpha,
                              viennacl::cuda_arg(vec2),
                              static_cast<unsigned int>(viennacl::traits::start(vec2)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec2)),

                              viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                              options_beta,
                              viennacl::cuda_arg(vec3),
                              static_cast<unsigned int>(viennacl::traits::start(vec3)),
                              static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
  // Consistency fix: av() and avbv() check for launch errors here, avbv_v() did not.
  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
}


//////////////////////////

/** @brief Kernel filling a strided vector with a constant value.
*
* internal_size1 is part of the launch signature but currently unused; the host side
* passes the padded size as size1 when the whole buffer (incl. padding) is to be written.
*/
template<typename NumericT>
__global__ void vector_assign_kernel(NumericT * vec1,
                                     unsigned int start1,
                                     unsigned int inc1,
                                     unsigned int size1,
                                     unsigned int internal_size1,

                                     NumericT alpha)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size1;
                    i += gridDim.x * blockDim.x)
    vec1[i*inc1+start1] = alpha;   // loop guard already ensures i < size1, so the former '(i < size1) ? alpha : 0' was redundant
}

/** @brief Assign a constant value to a vector (-range/-slice)
*
* @param vec1 The vector to which the value should be assigned
* @param alpha The value to be assigned
* @param up_to_internal_size Specifies whether alpha should also be written to padded memory (mostly used for clearing the whole buffer).
*/
template<typename NumericT, typename ScalarT1>
void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
{
  typedef NumericT value_type;

  // CPU scalars are captured by value; GPU scalars are forwarded as device pointers
  // via detail::arg_reference().
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  // When clearing the full buffer, the padded size is used as the element count.
  unsigned int size;
  if (up_to_internal_size)
    size = static_cast<unsigned int>(vec1.internal_size());
  else
    size = static_cast<unsigned int>(viennacl::traits::size(vec1));

  vector_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                     size,
                                     static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.

                                     viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
}

//////////////////////////

/** @brief Kernel exchanging the contents of two strided vectors element by element. */
template<typename NumericT>
__global__ void vector_swap_kernel(NumericT * vec1,
                                   unsigned int start1,
                                   unsigned int inc1,
                                   unsigned int size1,

                                   NumericT * vec2,
                                   unsigned int start2,
                                   unsigned int inc2)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
                    i < size1;
                    i += gridDim.x * blockDim.x)
  {
    NumericT const old_value = vec2[i*inc2+start2];
    vec2[i*inc2+start2] = vec1[i*inc1+start1];
    vec1[i*inc1+start1] = old_value;
  }
}


/** @brief Swaps the contents of two vectors, data is copied
*
* @param vec1 The first vector (or -range, or -slice)
* @param vec2 The second vector (or -range, or -slice)
*/
template<typename NumericT>
void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
{
  vector_swap_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                   static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                   static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                   static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                   viennacl::cuda_arg(vec2),
                                   static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                   static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
}

///////////////////////// Binary Elementwise operations /////////////

/** @brief Kernel for the binary element-wise operations vec1 = vec2 OP vec3.
*
* op_type selects the operation: 0 = product, 1 = division, 2 = power.
* Any other value leaves vec1 untouched.
*/
template<typename NumericT>
__global__ void element_op_kernel(NumericT * vec1,
                                  unsigned int start1,
                                  unsigned int inc1,
                                  unsigned int size1,

                                  NumericT const * vec2,
                                  unsigned int start2,
                                  unsigned int inc2,

                                  NumericT const * vec3,
                                  unsigned int start3,
                                  unsigned int inc3,

                                  unsigned int op_type
                                 )
{
  unsigned int const idx0   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int const stride = gridDim.x * blockDim.x;

  switch (op_type)
  {
  case 0:  // element-wise product
    for (unsigned int i = idx0; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
    break;
  case 1:  // element-wise division
    for (unsigned int i = idx0; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
    break;
  case 2:  // element-wise power
    for (unsigned int i = idx0; i < size1; i += stride)
      vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
    break;
  default:
    break;
  }
}

/** @brief Integer variant of the binary element-wise kernel (no power operation).
*
* op_type: 0 = product, 1 = division. Any other value leaves vec1 untouched.
*/
template<typename NumericT>
__global__ void element_op_int_kernel(NumericT * vec1,
                                      unsigned int start1,
                                      unsigned int inc1,
                                      unsigned int size1,

                                      NumericT const * vec2,
                                      unsigned int start2,
                                      unsigned int inc2,

                                      NumericT const * vec3,
                                      unsigned int start3,
                                      unsigned int inc3,

                                      unsigned int op_type
                                     )
{
  unsigned int const idx0   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int const stride = gridDim.x * blockDim.x;

  switch (op_type)
  {
  case 0:  // element-wise product
    for (unsigned int i = idx0; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
    break;
  case 1:  // element-wise division
    for (unsigned int i = idx0; i < size1; i += stride)
      vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
    break;
  default:
    break;
  }
}
/** @brief Implementation of the element-wise operation v1 = v2 .* v3 and v1 = v2 ./ v3 (using MATLAB syntax)
*
* Generic (integer) version. Note: element_op_int_kernel implements only product and
* division; a power request (op_type == 2) is a silent no-op for integer types.
*
* @param vec1 The result vector (or -range, or -slice)
* @param proxy The proxy object holding v2, v3 and the operation
*/
template<typename NumericT, typename OpT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_int_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                      static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                      static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                      static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                      viennacl::cuda_arg(proxy.lhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                      static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                      viennacl::cuda_arg(proxy.rhs()),
                                      static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                      static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                      op_type
                                     );
  // Fix: report the error under the kernel actually launched (was "element_op_kernel").
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
}

/** @brief Single-precision overload of the binary element-wise operation (supports product, division, power). */
template<typename OpT>
void element_op(vector_base<float> & vec1,
                vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                  op_type
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}

/** @brief Double-precision overload of the binary element-wise operation (supports product, division, power). */
template<typename OpT>
void element_op(vector_base<double> & vec1,
                vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
{
  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  element_op_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),

                                  viennacl::cuda_arg(proxy.lhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),

                                  viennacl::cuda_arg(proxy.rhs()),
                                  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
                                  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),

                                  op_type
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
}

///////////////////////// Unary Elementwise operations /////////////

// Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
// Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?

// acos
template<typename NumericT>
__global__ void vec_element_acos_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
    vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
}

/** @brief Element-wise vec1 = acos(v2) for the expression proxy holding v2. */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
{
  vec_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
}

// asin
template<typename NumericT>
__global__ void vec_element_asin_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
    vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
}

/** @brief Element-wise vec1 = asin(v2) for the expression proxy holding v2. */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
{
  vec_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
}


// atan
template<typename NumericT>
__global__ void vec_element_atan_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
    vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
}

/** @brief Element-wise vec1 = atan(v2) for the expression proxy holding v2. */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
{
  vec_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
}


// ceil
template<typename NumericT>
__global__ void vec_element_ceil_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
    vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
}

/** @brief Element-wise vec1 = ceil(v2) for the expression proxy holding v2. */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
{
  vec_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
}


// cos
template<typename NumericT>
__global__ void vec_element_cos_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
    vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
}

/** @brief Element-wise vec1 = cos(v2) for the expression proxy holding v2. */
template<typename NumericT>
void element_op(vector_base<NumericT> & vec1,
                vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
{
  vec_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                       static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                       static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                       viennacl::cuda_arg(proxy.lhs()),
                                       static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
                                       static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
}


// cosh
template<typename NumericT>
__global__ void vec_element_cosh_kernel(
    NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
    NumericT const * vec2, unsigned int start2, unsigned int inc2)
{
  unsigned int const stride = gridDim.x * blockDim.x;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += stride)
    vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
}
vec_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel"); -} - - -// exp -template<typename NumericT> -__global__ void vec_element_exp_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy) -{ - vec_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel"); -} - - -// fabs -template<typename NumericT> -__global__ void vec_element_fabs_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void 
element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy) -{ - vec_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel"); -} - -// abs -template<typename NumericT> -__global__ void vec_element_abs_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy) -{ - vec_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel"); -} - - - -// floor -template<typename NumericT> -__global__ void vec_element_floor_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + 
threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy) -{ - vec_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel"); -} - - -// log -template<typename NumericT> -__global__ void vec_element_log_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = log(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy) -{ - vec_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel"); -} - - -// log10 -template<typename NumericT> -__global__ void vec_element_log10_kernel( - NumericT * vec1, unsigned int start1, unsigned 
int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy) -{ - vec_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel"); -} - - -// sin -template<typename NumericT> -__global__ void vec_element_sin_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy) -{ - vec_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - 
VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel"); -} - - -// sinh -template<typename NumericT> -__global__ void vec_element_sinh_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy) -{ - vec_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel"); -} - - -// sqrt -template<typename NumericT> -__global__ void vec_element_sqrt_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy) -{ - vec_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - 
viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel"); -} - - -// tan -template<typename NumericT> -__global__ void vec_element_tan_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy) -{ - vec_element_tan_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel"); -} - - -// tanh -template<typename NumericT> -__global__ void vec_element_tanh_kernel( - NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, - NumericT const * vec2, unsigned int start2, unsigned int inc2) -{ - for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) - vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]); -} - -template<typename NumericT> -void element_op(vector_base<NumericT> & vec1, - vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy) -{ - vec_element_tanh_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1), - static_cast<unsigned 
int>(viennacl::traits::start(vec1)), - static_cast<unsigned int>(viennacl::traits::stride(vec1)), - static_cast<unsigned int>(viennacl::traits::size(vec1)), - viennacl::cuda_arg(proxy.lhs()), - static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), - static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) - ); - VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel"); -} - - - -///////////////////////// Norms and inner product /////////////////// - - -template<typename NumericT> -__global__ void inner_prod_kernel(const NumericT * vec1, - unsigned int start1, - unsigned int inc1, - unsigned int size1, - const NumericT * vec2, - unsigned int start2, - unsigned int inc2, - unsigned int size2, - NumericT * group_buffer) -{ - __shared__ NumericT tmp_buffer[128]; - unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1; - unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2; - - unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x) - - ( blockIdx.x * size1) / (gridDim.x); - - - NumericT tmp = 0; - for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x) - tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2]; - tmp_buffer[threadIdx.x] = tmp; - - // parallel reduction - for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) - { - __syncthreads(); - if (threadIdx.x < stride) - tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride]; - } - - if (threadIdx.x == 0) - group_buffer[blockIdx.x] = tmp_buffer[0]; - -} - - - -// sums the array 'vec1' and writes to result. Makes use of a single work-group only. 
/** @brief Single-block reduction of a strided float/double vector.
*
* option == 0: result = max_i |vec1[i]|   (infinity norm, via fmax/fabs)
* option == 1: result = sum_i vec1[i]
* option == 2: result = sqrt(sum_i vec1[i])  (used when the partials are already squared)
* Launched with one block only (see the launchers below, <<<1, 128>>>); the
* stride-halving reduction assumes blockDim.x is a power of two, <= 128.
*/
template<typename NumericT>
__global__ void vector_sum_kernel_floats(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  // per-thread partial (sum, or running max of absolute values)
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  // tree reduction in shared memory; barrier before each halving step
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
    }
  }

  if (threadIdx.x == 0)
  {
    if (option == 2)
      *result = sqrt(tmp_buffer[0]);
    else
      *result = tmp_buffer[0];
  }
}

/** @brief Single-block reduction for signed integer types.
*
* option == 0: result = max_i |vec1[i]| (max of absolute values, via abs + ternary max)
* option  > 0: result = sum_i vec1[i]
*/
template<typename NumericT>
__global__ void vector_sum_kernel_integers(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}

/** @brief Single-block reduction for unsigned integer types.
*
* Same contract as vector_sum_kernel_integers, but no abs() is needed:
* option == 0 takes the plain maximum, option > 0 the sum.
*/
template<typename NumericT>
__global__ void vector_sum_kernel_unsigned_integers(
          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,
          unsigned int option, //0: use max, 1: just sum
          NumericT * result)
{
  __shared__ NumericT tmp_buffer[128];
  NumericT thread_sum = 0;
  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
  {
    if (option > 0)
      thread_sum += vec1[i*inc1+start1];
    else
      thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
  }

  tmp_buffer[threadIdx.x] = thread_sum;

  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      if (option > 0)
        tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
      else
        tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0)
    *result = tmp_buffer[0];
}

namespace detail
{
  /** \cond */
  // Launchers select the reduction kernel matching the numeric type; all run a
  // single block of 128 threads and write the scalar result on the device.
  struct vector_sum_kernel_launcher_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                             static_cast<unsigned int>(viennacl::traits::start(temp)),
                                             static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                             static_cast<unsigned int>(viennacl::traits::size(temp)),
                                             static_cast<unsigned int>(option),
                                             viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  struct vector_sum_kernel_launcher_unsigned_integers
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_unsigned_integers<<<1, 128>>>(viennacl::cuda_arg(temp),
                                                      static_cast<unsigned int>(viennacl::traits::start(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                                      static_cast<unsigned int>(viennacl::traits::size(temp)),
                                                      static_cast<unsigned int>(option),
                                                      viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  struct vector_sum_kernel_launcher_floats
  {
    template<typename NumericT, typename ScalarT>
    static void apply(vector_base<NumericT> const & temp,
                      unsigned int option,
                      ScalarT & result)
    {
      typedef NumericT value_type;
      vector_sum_kernel_floats<<<1, 128>>>(viennacl::cuda_arg(temp),
                                           static_cast<unsigned int>(viennacl::traits::start(temp)),
                                           static_cast<unsigned int>(viennacl::traits::stride(temp)),
                                           static_cast<unsigned int>(viennacl::traits::size(temp)),
                                           static_cast<unsigned int>(option),
                                           viennacl::cuda_arg(result) );
      VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
    }
  };

  // Type dispatch: signed integers are the default, unsigned and floating
  // point types get explicit specializations.
  template<typename NumericT>
  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {};

  template<>
  struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {};

  template<>
  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};

  /** \endcond */
}


//implementation of inner product:
//namespace {
/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
*
* @param vec1 The first vector
* @param vec2 The second vector
* @param result The result scalar (on the gpu)
*/
template<typename NumericT, typename ScalarT>
void inner_prod_impl(vector_base<NumericT> const & vec1,
                     vector_base<NumericT> const & vec2,
                     ScalarT & result)
{
  typedef NumericT value_type;

  static const unsigned int work_groups = 128;
  // NOTE(review): function-local static device vector — shared by all calls of
  // this instantiation, so it is not safe for concurrent host threads and its
  // device allocation lives for the whole program; confirm this is intended
  // (the _cpu variant below uses a plain local instead).
  static viennacl::vector<value_type> temp(work_groups);

  // stage 1: per-block partial dot products into temp (length work_groups)
  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp)
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // stage 2: reduce the partials on the device (option 1 = plain sum)
  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
}


/** @brief Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1, vec2).
*
* @param vec1 The first vector
* @param vec2 The second vector
* @param result The result scalar (on the host)
*/
template<typename NumericT>
void inner_prod_cpu(vector_base<NumericT> const & vec1,
                    vector_base<NumericT> const & vec2,
                    NumericT & result)
{
  typedef NumericT value_type;

  const unsigned int work_groups = 128;
  viennacl::vector<value_type> temp(work_groups);

  // stage 1: per-block partial dot products into temp (length work_groups)
  inner_prod_kernel<<<128, 128>>>(viennacl::cuda_arg(vec1),
                                  static_cast<unsigned int>(viennacl::traits::start(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec1)),
                                  viennacl::cuda_arg(vec2),
                                  static_cast<unsigned int>(viennacl::traits::start(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
                                  static_cast<unsigned int>(viennacl::traits::size(vec2)),
                                  viennacl::cuda_arg(temp)
                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");

  // Now copy partial results from GPU back to CPU and run reduction there:
  std::vector<value_type> temp_cpu(work_groups);
  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());

  result = 0;
  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
    result += *it;
}

///////////////////////////////////

// Multiple inner products sharing one left-hand vector x ("mdot"):
// inner_prod_M_kernel computes M dot products <x, y_k> at once, loading each
// entry of x from global memory only once.  Block b writes its partial sum for
// y_k to group_results[b + k*gridDim.x]; a second pass reduces those partials.
// Shared memory holds M slices of blockDim.x partials, so these kernels
// require blockDim.x == VIENNACL_MDOT_WORKGROUP_SIZE (a power of two for the
// stride-halving reduction).
#define VIENNACL_MDOT_WORKGROUP_SIZE 128
#define VIENNACL_MDOT_WORKGROUP_NUM 128
// M = 2:
template<typename NumericT>
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
  // split x into gridDim.x contiguous chunks of entries_per_thread*blockDim.x entries
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
  }
  tmp_buffer[threadIdx.x] = group_sum0;
  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
      tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
  }
}

// M = 3:
template<typename NumericT>
__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
  }
}

// M = 4:
template<typename NumericT>
__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
  }
}

// M = 8:
template<typename NumericT>
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
                                    const NumericT *y0, unsigned int start0, unsigned int stride0,
                                    const NumericT *y1, unsigned int start1, unsigned int stride1,
                                    const NumericT *y2, unsigned int start2, unsigned int stride2,
                                    const NumericT *y3, unsigned int start3, unsigned int stride3,
                                    const NumericT *y4, unsigned int start4, unsigned int stride4,
                                    const NumericT *y5, unsigned int start5, unsigned int stride5,
                                    const NumericT *y6, unsigned int start6, unsigned int stride6,
                                    const NumericT *y7, unsigned int start7, unsigned int stride7,
                                    NumericT *group_results)
{
  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
  unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size

  NumericT entry_x = 0;
  NumericT group_sum0 = 0;
  NumericT group_sum1 = 0;
  NumericT group_sum2 = 0;
  NumericT group_sum3 = 0;
  NumericT group_sum4 = 0;
  NumericT group_sum5 = 0;
  NumericT group_sum6 = 0;
  NumericT group_sum7 = 0;
  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x = x[i * stridex + startx];   // load only once from global memory!
    group_sum0 += entry_x * y0[i * stride0 + start0];
    group_sum1 += entry_x * y1[i * stride1 + start1];
    group_sum2 += entry_x * y2[i * stride2 + start2];
    group_sum3 += entry_x * y3[i * stride3 + start3];
    group_sum4 += entry_x * y4[i * stride4 + start4];
    group_sum5 += entry_x * y5[i * stride5 + start5];
    group_sum6 += entry_x * y6[i * stride6 + start6];
    group_sum7 += entry_x * y7[i * stride7 + start7];
  }
  tmp_buffer[threadIdx.x]                  = group_sum0;
  tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;

  // parallel reduction
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
      tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
      tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
      tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
      tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
      tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
      tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
      tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
  }
}

// sums the array 'vec1' and writes to result. Makes use of a single work-group only.
// NOTE(review): the text below is flattened git-diff deletion residue — each
// original source line is prefixed with "- " and many lines are collapsed
// together. It is preserved byte-for-byte; only these review comments are added.
//
// vector_multi_sum_kernel: second-stage reduction for the multi-inner-product
// path. Each block copies one contiguous chunk of 'vec1' (offset
// blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE, one entry per thread) into the
// __shared__ tmp_buffer, then performs a stride-halving tree reduction. The
// __syncthreads() barrier sits OUTSIDE the divergent 'threadIdx.x < stride'
// branch, as required, so all threads of the block reach it. Thread 0 writes
// the block's sum to result[start_result + inc_result * blockIdx.x].
//
// Preconditions (not checked in-kernel; the file header states dimensions are
// validated in the dispatcher frontend):
//   - assumes blockDim.x == VIENNACL_MDOT_WORKGROUP_SIZE — both the shared
//     buffer size and the global load offset use that constant; TODO confirm
//     all launch sites pass exactly this block size.
//   - the 'stride = blockDim.x/2; stride /= 2' loop presumably requires
//     blockDim.x to be a power of two, or trailing entries are dropped —
//     verify against the workgroup-size macro definition.
//   - vec1 must hold at least gridDim.x * VIENNACL_MDOT_WORKGROUP_SIZE
//     entries; there is no bounds guard on the load.
//
// The 'inner_prod_impl' definition that begins later on this line is cut off
// by the <TRUNCATED> marker below and is left untouched here.
-template<typename NumericT> -__global__ void vector_multi_sum_kernel( - NumericT const * vec1, - NumericT * result, - unsigned int start_result, - unsigned int inc_result) -{ - __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE]; - - tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE]; - - for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) - { - __syncthreads(); - if (threadIdx.x < stride) - tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride]; - } - - if (threadIdx.x == 0) - result[start_result + inc_result * blockIdx.x] = tmp_buffer[0]; -} - -template<typename NumericT> -void inner_prod_impl(vector_base<NumericT> const & x, - vector_tuple<NumericT> const & vec_tuple, - vector_base<NumericT> & result) -{ - typedef NumericT value_type; - - static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM); - - vcl_size_t current_index = 0; - while (vec_tuple.const_size() > current_index) - { - switch (vec_tuple.const_size() - current_index) - { - case 7: - case 6: - case 5: - case 4: - { - vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index); - vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1); - vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2); - vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3); - - inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, - VIENNACL_MDOT_WORKGROUP_SIZE>>>( viennacl::cuda_arg(x), - static_cast<unsigned int>(viennacl::traits::start(x)), - static_cast<unsigned int>(viennacl::traits::stride(x)), - static_cast<unsigned int>(viennacl::traits::size(x)), - viennacl::cuda_arg(y0), - static_cast<unsigned int>(viennacl::traits::start(y0)), - static_cast<unsigned int>(viennacl::traits::stride(y0)), - viennacl::cuda_arg(y1), - static_cast<unsigned int>(viennacl::traits::start(y1)), - static_cast<unsigned int>(viennacl::traits::stride(y1)), - viennacl::cuda_arg(y2), - static_cast<unsigned 
int>(vi
<TRUNCATED>
