http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp deleted file mode 100644 index 198ac31..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp +++ /dev/null @@ -1,858 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ -#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/cuda/fft_operations.hpp - @brief Implementations of Fast Furier Transformation using cuda -*/ -#include <cmath> -#include <viennacl/matrix.hpp> -#include <viennacl/vector.hpp> - -#include "viennacl/forwards.h" -#include "viennacl/scalar.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/linalg/cuda/common.hpp" -#include "viennacl/linalg/host_based/vector_operations.hpp" -#include "viennacl/linalg/host_based/fft_operations.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ -namespace detail -{ - namespace fft - { - const vcl_size_t MAX_LOCAL_POINTS_NUM = 512; - - inline vcl_size_t num_bits(vcl_size_t size) - { - vcl_size_t bits_datasize = 0; - vcl_size_t ds = 1; - - while (ds < size) - { - ds = ds << 1; - 
bits_datasize++; - } - - return bits_datasize; - } - - inline vcl_size_t next_power_2(vcl_size_t n) - { - n = n - 1; - - vcl_size_t power = 1; - - while (power < sizeof(vcl_size_t) * 8) - { - n = n | (n >> power); - power *= 2; - } - - return n + 1; - } - - } //namespace fft -} //namespace detail - -// addition -inline __host__ __device__ float2 operator+(float2 a, float2 b) -{ - return make_float2(a.x + b.x, a.y + b.y); -} - -// subtract -inline __host__ __device__ float2 operator-(float2 a, float2 b) -{ - return make_float2(a.x - b.x, a.y - b.y); -} -// division -template<typename SCALARTYPE> -inline __device__ float2 operator/(float2 a,SCALARTYPE b) -{ - return make_float2(a.x/b, a.y/b); -} - -//multiplication -inline __device__ float2 operator*(float2 in1, float2 in2) -{ - return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); -} - -// addition -inline __host__ __device__ double2 operator+(double2 a, double2 b) -{ - return make_double2(a.x + b.x, a.y + b.y); -} - -// subtraction -inline __host__ __device__ double2 operator-(double2 a, double2 b) -{ - return make_double2(a.x - b.x, a.y - b.y); -} - -// division -template<typename SCALARTYPE> -inline __host__ __device__ double2 operator/(double2 a,SCALARTYPE b) -{ - return make_double2(a.x/b, a.y/b); -} - -//multiplication -inline __host__ __device__ double2 operator*(double2 in1, double2 in2) -{ - return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x); -} - -inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) -{ - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); - v = (v >> 16) | (v << 16); - v = v >> (32 - bit_size); - return v; -} - -template<typename Numeric2T, typename NumericT> -__global__ void fft_direct( - const Numeric2T * input, - Numeric2T * 
output, - unsigned int size, - unsigned int stride, - unsigned int batch_num, - NumericT sign, - bool is_row_major) -{ - - const NumericT NUM_PI(3.14159265358979323846); - - for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) - { - for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x) - { - Numeric2T f; - f.x = 0; - f.y = 0; - - for (unsigned int n = 0; n < size; n++) - { - Numeric2T in; - if (!is_row_major) - in = input[batch_id * stride + n]; //input index here - else - in = input[n * stride + batch_id];//input index here - - NumericT sn,cs; - NumericT arg = sign * 2 * NUM_PI * k / size * n; - sn = sin(arg); - cs = cos(arg); - - Numeric2T ex; - ex.x = cs; - ex.y = sn; - Numeric2T tmp; - tmp.x = in.x * ex.x - in.y * ex.y; - tmp.y = in.x * ex.y + in.y * ex.x; - f = f + tmp; - } - - if (!is_row_major) - output[batch_id * stride + k] = f; // output index here - else - output[k * stride + batch_id] = f;// output index here - } - } -} - -/** - * @brief Direct 1D algorithm for computing Fourier transformation. - * - * Works on any sizes of data. 
- * Serial implementation has o(n^2) complexity - */ -template<typename NumericT, unsigned int AlignmentV> -void direct(viennacl::vector<NumericT, AlignmentV> const & in, - viennacl::vector<NumericT, AlignmentV> & out, - vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, - NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)), - reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - sign, - bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct"); -} - -/** - * @brief Direct 2D algorithm for computing Fourier transformation. - * - * Works on any sizes of data. 
- * Serial implementation has o(n^2) complexity - */ -template<typename NumericT, unsigned int AlignmentV> -void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in, - viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out, - vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, - NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)), - reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(out)), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - sign, - bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct"); -} - -template<typename NumericT> -__global__ void fft_reorder(NumericT * input, - unsigned int bit_size, - unsigned int size, - unsigned int stride, - unsigned int batch_num, - bool is_row_major) -{ - - unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int glb_sz = gridDim.x * blockDim.x; - - for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) - { - for (unsigned int i = glb_id; i < size; i += glb_sz) - { - unsigned int v = get_reorder_num(i, bit_size); - - if (i < v) - { - if (!is_row_major) - { - NumericT tmp = input[batch_id * stride + i]; // index - input[batch_id * stride + i] = input[batch_id * stride + v];//index - input[batch_id * stride + v] = tmp;//index - } - else - { - NumericT tmp = input[i * stride + batch_id]; - input[i * stride + batch_id] = input[v * stride + batch_id]; - input[v * stride + batch_id] = tmp; - } - } - } - } -} - -/*** - * This function performs reorder of input data. 
Indexes are sorted in bit-reversal order. - * Such reordering should be done before in-place FFT. - */ -template<typename NumericT, unsigned int AlignmentV> -void reorder(viennacl::vector<NumericT, AlignmentV> & in, - vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(bits_datasize), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - static_cast<bool>(data_order)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder"); -} - -template<typename Numeric2T, typename NumericT> -__global__ void fft_radix2_local(Numeric2T * input, - unsigned int bit_size, - unsigned int size, - unsigned int stride, - unsigned int batch_num, - NumericT sign, - bool is_row_major) -{ - __shared__ Numeric2T lcl_input[1024]; - unsigned int grp_id = blockIdx.x; - unsigned int grp_num = gridDim.x; - - unsigned int lcl_sz = blockDim.x; - unsigned int lcl_id = threadIdx.x; - const NumericT NUM_PI(3.14159265358979323846); - - for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) - { - for (unsigned int p = lcl_id; p < size; p += lcl_sz) - { - unsigned int v = get_reorder_num(p, bit_size); - if (!is_row_major) - lcl_input[v] = input[batch_id * stride + p]; - else - lcl_input[v] = input[p * stride + batch_id]; - } - - __syncthreads(); - - //performs Cooley-Tukey FFT on local arrayfft - for (unsigned int s = 0; s < bit_size; s++) - { - unsigned int ss = 1 << s; - NumericT cs, sn; - for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz) - { - unsigned int group = (tid & (ss - 1)); - unsigned int pos = ((tid >> s) << (s + 
1)) + group; - - Numeric2T in1 = lcl_input[pos]; - Numeric2T in2 = lcl_input[pos + ss]; - - NumericT arg = group * sign * NUM_PI / ss; - - sn = sin(arg); - cs = cos(arg); - Numeric2T ex; - ex.x = cs; - ex.y = sn; - - Numeric2T tmp; - tmp.x = in2.x * ex.x - in2.y * ex.y; - tmp.y = in2.x * ex.y + in2.y * ex.x; - - lcl_input[pos + ss] = in1 - tmp; - lcl_input[pos] = in1 + tmp; - } - __syncthreads(); - } - - //copy local array back to global memory - for (unsigned int p = lcl_id; p < size; p += lcl_sz) - { - if (!is_row_major) - input[batch_id * stride + p] = lcl_input[p]; //index - else - input[p * stride + batch_id] = lcl_input[p]; - } - - } -} - -template<typename Numeric2T, typename NumericT> -__global__ void fft_radix2(Numeric2T * input, - unsigned int s, - unsigned int bit_size, - unsigned int size, - unsigned int stride, - unsigned int batch_num, - NumericT sign, - bool is_row_major) -{ - - unsigned int ss = 1 << s; - unsigned int half_size = size >> 1; - - NumericT cs, sn; - const NumericT NUM_PI(3.14159265358979323846); - - unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int glb_sz = gridDim.x * blockDim.x; - - for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++) - { - for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz) - { - unsigned int group = (tid & (ss - 1)); - unsigned int pos = ((tid >> s) << (s + 1)) + group; - Numeric2T in1; - Numeric2T in2; - unsigned int offset; - if (!is_row_major) - { - offset = batch_id * stride + pos; - in1 = input[offset]; //index - in2 = input[offset + ss];//index - } - else - { - offset = pos * stride + batch_id; - in1 = input[offset]; //index - in2 = input[offset + ss * stride];//index - } - - NumericT arg = group * sign * NUM_PI / ss; - - sn = sin(arg); - cs = cos(arg); - - Numeric2T ex; - ex.x = cs; - ex.y = sn; - - Numeric2T tmp; - tmp.x = in2.x * ex.x - in2.y * ex.y; - tmp.y = in2.x * ex.y + in2.y * ex.x; - - if (!is_row_major) - input[offset + ss] = in1 - tmp; //index - 
else - input[offset + ss * stride] = in1 - tmp; //index - input[offset] = in1 + tmp; //index - } - } -} - -/** - * @brief Radix-2 1D algorithm for computing Fourier transformation. - * - * Works only on power-of-two sizes of data. - * Serial implementation has o(n * lg n) complexity. - * This is a Cooley-Tukey algorithm - */ -template<typename NumericT, unsigned int AlignmentV> -void radix2(viennacl::vector<NumericT, AlignmentV> & in, - vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size); - - if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM) - { - fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(bit_size), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - static_cast<NumericT>(sign), - static_cast<bool>(data_order)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local"); - } - else - { - fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(bit_size), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - static_cast<bool>(data_order)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder"); - - for (vcl_size_t step = 0; step < bit_size; step++) - { - fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(step), - static_cast<unsigned int>(bit_size), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - sign, - 
static_cast<bool>(data_order)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2"); - } - } -} - -/** - * @brief Radix-2 2D algorithm for computing Fourier transformation. - * - * Works only on power-of-two sizes of data. - * Serial implementation has o(n * lg n) complexity. - * This is a Cooley-Tukey algorithm - */ -template<typename NumericT, unsigned int AlignmentV> -void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, - vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size); - - if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM) - { - fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(bit_size), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - sign, - static_cast<bool>(data_order)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local"); - } - else - { - fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(bit_size), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - static_cast<bool>(data_order)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder"); - for (vcl_size_t step = 0; step < bit_size; step++) - { - fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - static_cast<unsigned int>(step), - static_cast<unsigned int>(bit_size), - static_cast<unsigned int>(size), - static_cast<unsigned int>(stride), - static_cast<unsigned int>(batch_num), - sign, - static_cast<bool>(data_order)); - 
VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2"); - } - } -} - -template<typename Numeric2T, typename NumericT> -__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign) -{ - unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int glb_sz =gridDim.x * blockDim.x; - - unsigned int double_size = size << 1; - NumericT sn_a, cs_a; - const NumericT NUM_PI(3.14159265358979323846); - - for (unsigned int i = glb_id; i < size; i += glb_sz) - { - unsigned int rm = i * i % (double_size); - NumericT angle = (NumericT)rm / size * (-NUM_PI); - - sn_a = sin(angle); - cs_a= cos(angle); - - Numeric2T b_i; - b_i.x = cs_a; - b_i.y = sn_a; - out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y; - out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x; - } -} - -template<typename Numeric2T, typename NumericT> -__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B, - unsigned int size, unsigned int ext_size, NumericT sign) -{ - unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int glb_sz = gridDim.x * blockDim.x; - - unsigned int double_size = size << 1; - - NumericT sn_a, cs_a; - const NumericT NUM_PI(3.14159265358979323846); - - for (unsigned int i = glb_id; i < size; i += glb_sz) - { - unsigned int rm = i * i % (double_size); - NumericT angle = (NumericT)rm / size * NUM_PI; - - sn_a = sin(-angle); - cs_a= cos(-angle); - - Numeric2T a_i; - a_i.x =cs_a; - a_i.y =sn_a; - - Numeric2T b_i; - b_i.x =cs_a; - b_i.y =-sn_a; - - A[i].x = input[i].x * a_i.x - input[i].y * a_i.y; - A[i].y = input[i].x * a_i.y + input[i].y * a_i.x; - B[i] = b_i; - - // very bad instruction, to be fixed - if (i) - B[ext_size - i] = b_i; - } -} - -template<typename NumericT> -__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size) -{ - for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) - { - input1[i].x = 0; - input1[i].y = 0; - - input2[i].x = 0; - input2[i].y = 0; 
- } -} - -/** - * @brief Bluestein's algorithm for computing Fourier transformation. - * - * Currently, Works only for sizes of input data which less than 2^16. - * Uses a lot of additional memory, but should be fast for any size of data. - * Serial implementation has something about o(n * lg n) complexity - */ -template<typename NumericT, unsigned int AlignmentV> -void bluestein(viennacl::vector<NumericT, AlignmentV> & in, - viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - vcl_size_t size = in.size() >> 1; - vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1); - - viennacl::vector<NumericT, AlignmentV> A(ext_size << 1); - viennacl::vector<NumericT, AlignmentV> B(ext_size << 1); - viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1); - - zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)), - reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)), - static_cast<unsigned int>(ext_size)); - VIENNACL_CUDA_LAST_ERROR_CHECK("zero2"); - - bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)), - reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)), - reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)), - static_cast<unsigned int>(size), - static_cast<unsigned int>(ext_size), - NumericT(1)); - VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre"); - - viennacl::linalg::convolve_i(A, B, Z); - - bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)), - reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)), - static_cast<unsigned int>(size), - NumericT(1)); - VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post"); -} - -template<typename NumericT> -__global__ void fft_mult_vec(const NumericT * input1, - const NumericT * input2, - NumericT * output, - unsigned int size) -{ - for (unsigned int i = blockIdx.x * blockDim.x + 
threadIdx.x; i < size; i += gridDim.x * blockDim.x) - { - NumericT in1 = input1[i]; - NumericT in2 = input2[i]; - output[i] = in1 * in2; - } -} - -/** - * @brief Mutiply two complex vectors and store result in output - */ -template<typename NumericT, unsigned int AlignmentV> -void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1, - viennacl::vector<NumericT, AlignmentV> const & input2, - viennacl::vector<NumericT, AlignmentV> & output) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - vcl_size_t size = input1.size() / 2; - - fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)), - reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)), - reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)), - static_cast<unsigned int>(size)); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec"); -} - -template<typename Numeric2T, typename NumericT> -__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor) -{ - for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x) - input1[i] = input1[i]/factor; -} - -/** - * @brief Normalize vector on with his own size - */ -template<typename NumericT, unsigned int AlignmentV> -void normalize(viennacl::vector<NumericT, AlignmentV> & input) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - vcl_size_t size = input.size() >> 1; - NumericT norm_factor = static_cast<NumericT>(size); - fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)), - static_cast<unsigned int>(size), - norm_factor); - VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar"); -} - -template<typename NumericT> -__global__ void transpose(const NumericT * input, - NumericT * output, - unsigned int row_num, - unsigned int col_num) -{ - unsigned int size = row_num * col_num; - for (unsigned 
int i =blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x) - { - unsigned int row = i / col_num; - unsigned int col = i - row*col_num; - unsigned int new_pos = col * row_num + row; - output[new_pos] = input[i]; - } -} - -/** - * @brief Transpose matrix - */ -template<typename NumericT, unsigned int AlignmentV> -void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input, - viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)), - reinterpret_cast< numeric2_type *>(viennacl::cuda_arg(output)), - static_cast<unsigned int>(input.internal_size1()>>1), - static_cast<unsigned int>(input.internal_size2()>>1)); - VIENNACL_CUDA_LAST_ERROR_CHECK("transpose"); - -} - -template<typename NumericT> -__global__ void transpose_inplace( - NumericT * input, - unsigned int row_num, - unsigned int col_num) -{ - unsigned int size = row_num * col_num; - for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i+= gridDim.x * blockDim.x) - { - unsigned int row = i / col_num; - unsigned int col = i - row*col_num; - unsigned int new_pos = col * row_num + row; - if (i < new_pos) - { - NumericT val = input[i]; - input[i] = input[new_pos]; - input[new_pos] = val; - } - } -} - -/** - * @brief Inplace_transpose matrix - */ -template<typename NumericT, unsigned int AlignmentV> -void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)), - static_cast<unsigned int>(input.internal_size1()>>1), - static_cast<unsigned int>(input.internal_size2() >> 1)); - VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace"); - -} 
- -template<typename RealT,typename ComplexT> -__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size) -{ - for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) - { - ComplexT val; - val.x = in[i]; - val.y = 0; - out[i] = val; - } -} - -/** - * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) - */ -template<typename NumericT> -void real_to_complex(viennacl::vector_base<NumericT> const & in, - viennacl::vector_base<NumericT> & out, vcl_size_t size) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - real_to_complex<<<128,128>>>(viennacl::cuda_arg(in), - reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)), - static_cast<unsigned int>(size)); - VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex"); -} - -template<typename ComplexT,typename RealT> -__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size) -{ - for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) - out[i] = in[i].x; -} - -/** - * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) - */ -template<typename NumericT> -void complex_to_real(viennacl::vector_base<NumericT> const & in, - viennacl::vector_base<NumericT>& out, vcl_size_t size) -{ - typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type; - - complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)), - viennacl::cuda_arg(out), - static_cast<unsigned int>(size)); - VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real"); - -} - -template<typename NumericT> -__global__ void reverse_inplace(NumericT * vec, unsigned int size) -{ - for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i+=gridDim.x * blockDim.x) - { - NumericT val1 = 
vec[i]; - NumericT val2 = vec[size - i - 1]; - vec[i] = val2; - vec[size - i - 1] = val1; - } -} - -/** - * @brief Reverse vector to oposite order and save it in input vector - */ -template<typename NumericT> -void reverse(viennacl::vector_base<NumericT>& in) -{ - vcl_size_t size = in.size(); - reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size)); - VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace"); -} - -} //namespace cuda -} //namespace linalg -} //namespace viennacl - -#endif /* FFT_OPERATIONS_HPP_ */
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp deleted file mode 100644 index 302a73c..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp +++ /dev/null @@ -1,666 +0,0 @@ -#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_ -#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the PDF manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/cuda/ilu_operations.hpp - @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using CUDA -*/ - -#include <cmath> -#include <algorithm> //for std::max and std::min - -#include "viennacl/forwards.h" -#include "viennacl/scalar.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/meta/predicate.hpp" -#include "viennacl/meta/enable_if.hpp" -#include "viennacl/traits/size.hpp" -#include "viennacl/traits/start.hpp" -#include "viennacl/linalg/vector_operations.hpp" -#include "viennacl/traits/stride.hpp" - - -namespace viennacl -{ -namespace linalg -{ -namespace cuda -{ - -template<typename IndexT> // to control external linkage -__global__ void extract_L_kernel_1( - const IndexT * 
A_row_indices,
  const IndexT * A_col_indices,
  unsigned int A_size1,
  unsigned int * L_row_indices)
{
  // Grid-stride loop: one CSR row per thread. Counts the entries of A with
  // column index <= row, i.e. the number of nonzeros of each row of L.
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    unsigned int num_entries_L = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      if (col <= row)
        ++num_entries_L;
    }

    L_row_indices[row] = num_entries_L;
  }
}

/** @brief CUDA kernel writing the lower-triangular entries (col <= row) of A into L.
*
* L_row_indices must already hold the exclusive scan of the per-row counts
* produced by extract_L_kernel_1, so that index_L starts at each row's offset.
*/
template<typename NumericT>
__global__ void extract_L_kernel_2(
  unsigned int const *A_row_indices,
  unsigned int const *A_col_indices,
  NumericT     const *A_elements,
  unsigned int A_size1,

  unsigned int const *L_row_indices,
  unsigned int       *L_col_indices,
  NumericT           *L_elements)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    unsigned int index_L = L_row_indices[row];
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      NumericT value   = A_elements[j];

      if (col <= row)
      {
        L_col_indices[index_L] = col;
        L_elements[index_L]    = value;
        ++index_L;
      }
    }
  }
}

/** @brief Extracts the lower-triangular part L (including the diagonal) of A into L. */
template<typename NumericT>
void extract_L(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L)
{
  //
  // Step 1: Count elements in L:
  //
  extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle1())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");

  //
  // Step 2: Exclusive scan on row_buffers (per-row counts -> CSR row offsets):
  //
  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
  L.reserve(wrapped_L_row_buffer[L.size1()], false);

  //
  // Step 3: Write entries
  //
  extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   viennacl::cuda_arg<NumericT>(A.handle()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
                                   viennacl::cuda_arg<NumericT>(L.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2");

  L.generate_row_block_information();

} // extract_L

///////////////////////////////////////////////


/** @brief CUDA kernel extracting D such that D_ii = 1 / sqrt(|A_ii|) (used to scale A to unit diagonal). */
template<typename NumericT>
__global__ void ilu_scale_kernel_1(
  unsigned int const *A_row_indices,
  unsigned int const *A_col_indices,
  NumericT     const *A_elements,
  unsigned int A_size1,

  NumericT *D_elements)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      if (row == col)
      {
        D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j]));
        break; // at most one diagonal entry per row
      }
    }
  }
}

/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */
template<typename NumericT>
__global__ void ilu_scale_kernel_2(
  unsigned int const *R_row_indices,
  unsigned int const *R_col_indices,
  NumericT           *R_elements,
  unsigned int R_size1,

  NumericT *D_elements)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < R_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = R_row_indices[row];
    unsigned int row_end   = R_row_indices[row+1];

    NumericT D_row = D_elements[row];

    for (unsigned int j = row_begin; j < row_end; ++j)
      R_elements[j] *= D_row * D_elements[R_col_indices[j]];
  }
}



/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates the values extracted into L accordingly. */
template<typename NumericT>
void icc_scale(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L)
{
  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));

  // fill D:
  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   viennacl::cuda_arg<NumericT>(A.handle()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg(D)
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");

  // scale L:
  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
                                   viennacl::cuda_arg<NumericT>(L.handle()),
                                   static_cast<unsigned int>(L.size1()),
                                   viennacl::cuda_arg(D)
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2"); // fixed: was mislabeled "ilu_scale_kernel_1"
}

/////////////////////////////////////

/** @brief CUDA kernel for one Chow-Patel-ICC sweep.
*
* Reads the factor values from the snapshot L_backup and writes updated values
* to L_elements, so all rows are updated from a consistent previous iterate.
* The diagonal element of each row of L is assumed to be the last entry in that row.
*/
template<typename NumericT>
__global__ void icc_chow_patel_sweep_kernel(
  unsigned int const *L_row_indices,
  unsigned int const *L_col_indices,
  NumericT           *L_elements,
  NumericT     const *L_backup,
  unsigned int L_size1,
  NumericT     const *aij_L)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < L_size1;
                    row += gridDim.x * blockDim.x)
  {
    //
    // update L:
    //
    unsigned int row_Li_start = L_row_indices[row];
    unsigned int row_Li_end   = L_row_indices[row + 1];

    for (unsigned int i = row_Li_start; i < row_Li_end; ++i)
    {
      unsigned int col = L_col_indices[i];

      unsigned int row_Lj_start = L_row_indices[col];
      unsigned int row_Lj_end   = L_row_indices[col + 1];

      // compute \sum_{k=1}^{j-1} l_ik u_kj
      unsigned int index_Lj = row_Lj_start;
      unsigned int col_Lj   = L_col_indices[index_Lj];
      NumericT s = aij_L[i];
      for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li)
      {
        unsigned int col_Li = L_col_indices[index_Li];

        // find element in U (merge-style walk over the sorted column indices)
        while (col_Lj < col_Li)
        {
          ++index_Lj;
          col_Lj = L_col_indices[index_Lj];
        }

        if (col_Lj == col_Li)
          s -= L_backup[index_Li] * L_backup[index_Lj];
      }

      // update l_ij:
      L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); // diagonal element is last entry in U
    }

  }
}


/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using CUDA (cf. Algorithm 2 in paper) */
template<typename NumericT>
void icc_chow_patel_sweep(compressed_matrix<NumericT>       & L,
                          vector<NumericT>            const & aij_L)
{
  // snapshot the current values of L so the sweep reads a consistent state:
  viennacl::backend::mem_handle L_backup;
  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());

  icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
                                            viennacl::cuda_arg<NumericT>(L.handle()),
                                            viennacl::cuda_arg<NumericT>(L_backup),
                                            static_cast<unsigned int>(L.size1()),

                                            viennacl::cuda_arg<NumericT>(aij_L.handle())
                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel");

}


////////////////////////////// ILU ///////////////////////////

/** @brief CUDA kernel counting, per row of A, the entries belonging to L (col <= row) and U (col >= row). */
template<typename IndexT> // to control external linkage
__global__ void extract_LU_kernel_1(
  const IndexT * A_row_indices,
  const IndexT * A_col_indices,
  unsigned int A_size1,

  unsigned int * L_row_indices,

  unsigned int * U_row_indices)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    unsigned int num_entries_L = 0;
    unsigned int num_entries_U = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      if (col <= row)
        ++num_entries_L;
      if (col >= row)
        ++num_entries_U; // diagonal entries are counted for both factors
    }

    L_row_indices[row] = num_entries_L;
    U_row_indices[row] = num_entries_U;
  }
}

/** @brief CUDA kernel writing the entries of A into L (col <= row) and U (col >= row).
*
* L_row_indices and U_row_indices must already hold the exclusive scans of the
* per-row counts produced by extract_LU_kernel_1.
*/
template<typename NumericT>
__global__ void extract_LU_kernel_2(
  unsigned int const *A_row_indices,
  unsigned int const *A_col_indices,
  NumericT     const *A_elements,
  unsigned int A_size1,

  unsigned int const *L_row_indices,
  unsigned int       *L_col_indices,
  NumericT           *L_elements,

  unsigned int const *U_row_indices,
  unsigned int       *U_col_indices,
  NumericT           *U_elements)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    unsigned int index_L = L_row_indices[row];
    unsigned int index_U = U_row_indices[row];
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col = A_col_indices[j];
      NumericT value   = A_elements[j];

      if (col <= row)
      {
        L_col_indices[index_L] = col;
        L_elements[index_L]    = value;
        ++index_L;
      }

      if (col >= row)
      {
        U_col_indices[index_U] = col;
        U_elements[index_U]    = value;
        ++index_U;
      }
    }
  }
}

/** @brief Extracts the lower factor L and upper factor U (both including the diagonal) from A. */
template<typename NumericT>
void extract_LU(compressed_matrix<NumericT> const & A,
                compressed_matrix<NumericT>       & L,
                compressed_matrix<NumericT>       & U)
{
  //
  // Step 1: Count elements in L and U:
  //
  extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
                                    static_cast<unsigned int>(A.size1()),
                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
                                    viennacl::cuda_arg<unsigned int>(U.handle1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1");

  //
  // Step 2: Exclusive scan on row_buffers:
  //
  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
  viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer);
  L.reserve(wrapped_L_row_buffer[L.size1()], false);

  viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1);
  viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer);
  U.reserve(wrapped_U_row_buffer[U.size1()], false);

  //
  // Step 3: Write entries
  //
  extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                    viennacl::cuda_arg<unsigned int>(A.handle2()),
                                    viennacl::cuda_arg<NumericT>(A.handle()),
                                    static_cast<unsigned int>(A.size1()),
                                    viennacl::cuda_arg<unsigned int>(L.handle1()),
                                    viennacl::cuda_arg<unsigned int>(L.handle2()),
                                    viennacl::cuda_arg<NumericT>(L.handle()),
                                    viennacl::cuda_arg<unsigned int>(U.handle1()),
                                    viennacl::cuda_arg<unsigned int>(U.handle2()),
                                    viennacl::cuda_arg<NumericT>(U.handle())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2");

  L.generate_row_block_information();
  // Note: block information for U will be generated after transposition

} // extract_LU

///////////////////////////////////////////////

/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */
template<typename NumericT>
void ilu_scale(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L,
               compressed_matrix<NumericT>       & U)
{
  viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A));

  // fill D:
  ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   viennacl::cuda_arg<NumericT>(A.handle()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<NumericT>(D.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1");

  // scale L:
  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle2()),
                                   viennacl::cuda_arg<NumericT>(L.handle()),
                                   static_cast<unsigned int>(L.size1()),
                                   viennacl::cuda_arg<NumericT>(D.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");

  // scale U:
  ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()),
                                   viennacl::cuda_arg<unsigned int>(U.handle2()),
                                   viennacl::cuda_arg<NumericT>(U.handle()),
                                   static_cast<unsigned int>(U.size1()),
                                   viennacl::cuda_arg<NumericT>(D.handle())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2");
}

/////////////////////////////////////

/** @brief CUDA kernel for one Chow-Patel-ILU sweep.
*
* U is passed in transposed (CSC-like) form so both factors are walked row-wise.
* Values are read from the snapshots L_backup / U_trans_backup and written to
* L_elements / U_trans_elements. The diagonal of U is assumed to be the last
* entry of each row of U_trans.
*/
template<typename NumericT>
__global__ void ilu_chow_patel_sweep_kernel(
  unsigned int const *L_row_indices,
  unsigned int const *L_col_indices,
  NumericT           *L_elements,
  NumericT     const *L_backup,
  unsigned int L_size1,

  NumericT     const *aij_L,

  unsigned int const *U_trans_row_indices,
  unsigned int const *U_trans_col_indices,
  NumericT           *U_trans_elements,
  NumericT     const *U_trans_backup,

  NumericT     const *aij_U_trans)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < L_size1;
                    row += gridDim.x * blockDim.x)
  {
    //
    // update L:
    //
    unsigned int row_L_start = L_row_indices[row];
    unsigned int row_L_end   = L_row_indices[row + 1];

    for (unsigned int j = row_L_start; j < row_L_end; ++j)
    {
      unsigned int col = L_col_indices[j];

      if (col == row) // unit diagonal of L is implicit; skip
        continue;

      unsigned int row_U_start = U_trans_row_indices[col];
      unsigned int row_U_end   = U_trans_row_indices[col + 1];

      // compute \sum_{k=1}^{j-1} l_ik u_kj
      unsigned int index_U = row_U_start;
      unsigned int col_U   = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1;
      NumericT sum = 0;
      for (unsigned int k = row_L_start; k < j; ++k)
      {
        unsigned int col_L = L_col_indices[k];

        // find element in U
        while (col_U < col_L)
        {
          ++index_U;
          col_U = U_trans_col_indices[index_U];
        }

        if (col_U == col_L)
          sum += L_backup[k] * U_trans_backup[index_U];
      }

      // update l_ij:
      L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; // diagonal element is last entry in U
    }


    //
    // update U:
    //
    unsigned int row_U_start = U_trans_row_indices[row];
    unsigned int row_U_end   = U_trans_row_indices[row + 1];
    for (unsigned int j = row_U_start; j < row_U_end; ++j)
    {
      unsigned int col = U_trans_col_indices[j];

      row_L_start = L_row_indices[col];
      row_L_end   = L_row_indices[col + 1];

      // compute \sum_{k=1}^{j-1} l_ik u_kj
      unsigned int index_L = row_L_start;
      unsigned int col_L   = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1;
      NumericT sum = 0;
      for (unsigned int k = row_U_start; k < j; ++k)
      {
        unsigned int col_U = U_trans_col_indices[k];

        // find element in L
        while (col_L < col_U)
        {
          ++index_L;
          col_L = L_col_indices[index_L];
        }

        if (col_U == col_L)
          sum += L_backup[index_L] * U_trans_backup[k];
      }

      // update u_ij:
      U_trans_elements[j] = aij_U_trans[j] - sum;
    }
  }
}


/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using CUDA (cf. Algorithm 2 in paper) */
template<typename NumericT>
void ilu_chow_patel_sweep(compressed_matrix<NumericT>       & L,
                          vector<NumericT>            const & aij_L,
                          compressed_matrix<NumericT>       & U_trans,
                          vector<NumericT>            const & aij_U_trans)
{
  // snapshot L and U_trans so the sweep reads a consistent previous iterate:
  viennacl::backend::mem_handle L_backup;
  viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L));
  viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size());

  viennacl::backend::mem_handle U_backup;
  viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans));
  viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size());

  ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()),
                                            viennacl::cuda_arg<unsigned int>(L.handle2()),
                                            viennacl::cuda_arg<NumericT>(L.handle()),
                                            viennacl::cuda_arg<NumericT>(L_backup),
                                            static_cast<unsigned int>(L.size1()),

                                            viennacl::cuda_arg<NumericT>(aij_L.handle()),

                                            viennacl::cuda_arg<unsigned int>(U_trans.handle1()),
                                            viennacl::cuda_arg<unsigned int>(U_trans.handle2()),
                                            viennacl::cuda_arg<NumericT>(U_trans.handle()),
                                            viennacl::cuda_arg<NumericT>(U_backup),

                                            viennacl::cuda_arg<NumericT>(aij_U_trans.handle())
                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel");

}

//////////////////////////////////////

/** @brief CUDA kernel extracting the diagonal D of R and transforming R in-place to -D^{-1} * (R - D), i.e. the off-diagonal part of the Neumann-series matrix.
*
* NOTE(review): a row without a diagonal entry leaves diag == 0 and the scaling
* loop divides by zero — assumes R carries a full diagonal; confirm at call sites.
*/
template<typename NumericT>
__global__ void ilu_form_neumann_matrix_kernel(
  unsigned int const *R_row_indices,
  unsigned int const *R_col_indices,
  NumericT           *R_elements,
  unsigned int R_size1,

  NumericT *D_elements)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < R_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int row_begin = R_row_indices[row];
    unsigned int row_end   = R_row_indices[row+1];

    // part 1: extract diagonal entry
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col = R_col_indices[j];
      if (col == row)
      {
        diag = R_elements[j];
        R_elements[j] = 0; // (I - D^{-1}R)
        break;
      }
    }
    D_elements[row] = diag;

    // part2: scale
    for (unsigned int j = row_begin; j < row_end; ++j)
      R_elements[j] /= -diag;
  }
}



/** @brief Extracts the diagonal of R into diag_R and rewrites R as the Neumann-series matrix -D^{-1}(R - D). */
template<typename NumericT>
void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R,
                             vector<NumericT> & diag_R)
{
  ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()),
                                               viennacl::cuda_arg<unsigned int>(R.handle2()),
                                               viennacl::cuda_arg<NumericT>(R.handle()),
                                               static_cast<unsigned int>(R.size1()),
                                               viennacl::cuda_arg<NumericT>(diag_R.handle())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel");
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl


#endif
