// ---- native-viennaCL/src/main/cpp/viennacl/linalg/cuda/fft_operations.hpp ----
#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/fft_operations.hpp
    @brief Implementations of Fast Fourier Transformation using CUDA
*/
#include <cmath>
#include <viennacl/matrix.hpp>
#include <viennacl/vector.hpp>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/host_based/vector_operations.hpp"
#include "viennacl/linalg/host_based/fft_operations.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace detail
{
  namespace fft
  {
    /** @brief Largest number of complex points handled by the shared-memory radix-2 kernel (fft_radix2_local). */
    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;

    /** @brief Returns the number of bits needed to address 'size' entries, i.e. ceil(log2(size)). */
    inline vcl_size_t num_bits(vcl_size_t size)
    {
      vcl_size_t bits_datasize = 0;
      vcl_size_t ds = 1;

      while (ds < size)
      {
        ds = ds << 1;
        bits_datasize++;
      }

      return bits_datasize;
    }

    /** @brief Returns the smallest power of two greater than or equal to n. */
    inline vcl_size_t next_power_2(vcl_size_t n)
    {
      n = n - 1;

      vcl_size_t power = 1;

      // Smear the highest set bit of (n-1) into all lower positions, then add one.
      while (power < sizeof(vcl_size_t) * 8)
      {
        n = n | (n >> power);
        power *= 2;
      }

      return n + 1;
    }

  } //namespace fft
} //namespace detail

/** @brief Complex addition for interleaved single-precision values. */
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
  return make_float2(a.x + b.x, a.y + b.y);
}

/** @brief Complex subtraction for interleaved single-precision values. */
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
  return make_float2(a.x - b.x, a.y - b.y);
}

/** @brief Division of a complex value by a real scalar (device only, matching the original linkage). */
template<typename SCALARTYPE>
inline __device__ float2 operator/(float2 a, SCALARTYPE b)
{
  return make_float2(a.x/b, a.y/b);
}

/** @brief Complex multiplication: (a+bi)(c+di) = (ac-bd) + (ad+bc)i. */
inline __device__ float2 operator*(float2 in1, float2 in2)
{
  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}

/** @brief Complex addition for interleaved double-precision values. */
inline __host__ __device__ double2 operator+(double2 a, double2 b)
{
  return make_double2(a.x + b.x, a.y + b.y);
}

/** @brief Complex subtraction for interleaved double-precision values. */
inline __host__ __device__ double2 operator-(double2 a, double2 b)
{
  return make_double2(a.x - b.x, a.y - b.y);
}

/** @brief Division of a complex value by a real scalar. */
template<typename SCALARTYPE>
inline __host__ __device__ double2 operator/(double2 a, SCALARTYPE b)
{
  return make_double2(a.x/b, a.y/b);
}

/** @brief Complex multiplication: (a+bi)(c+di) = (ac-bd) + (ad+bc)i. */
inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
{
  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}

/** @brief Reverses the lowest 'bit_size' bits of v (bit-reversal permutation index for radix-2 FFT). */
inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
{
  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
  v = (v >> 16) | (v << 16);
  v = v >> (32 - bit_size);
  return v;
}

/** @brief Naive O(n^2) DFT kernel operating on interleaved complex data.
  *
  * NOTE(review): the wrappers set 'is_row_major' to (data_order != ROW_MAJOR), so despite its
  * name the flag is false for row-major data ordering; false selects the contiguous
  * batch_id*stride + n indexing, true the strided n*stride + batch_id indexing.
  */
template<typename Numeric2T, typename NumericT>
__global__ void fft_direct(
    const Numeric2T * input,
    Numeric2T * output,
    unsigned int size,
    unsigned int stride,
    unsigned int batch_num,
    NumericT sign,
    bool is_row_major)
{
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    // grid-stride loop over output frequencies
    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
    {
      Numeric2T f;
      f.x = 0;
      f.y = 0;

      for (unsigned int n = 0; n < size; n++)
      {
        Numeric2T in;
        if (!is_row_major)
          in = input[batch_id * stride + n];   // contiguous per batch
        else
          in = input[n * stride + batch_id];   // strided per batch

        // twiddle factor exp(sign * 2*pi*i * k*n / size)
        NumericT sn, cs;
        NumericT arg = sign * 2 * NUM_PI * k / size * n;
        sn = sin(arg);
        cs = cos(arg);

        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;
        Numeric2T tmp;
        tmp.x = in.x * ex.x - in.y * ex.y;
        tmp.y = in.x * ex.y + in.y * ex.x;
        f = f + tmp;
      }

      if (!is_row_major)
        output[batch_id * stride + k] = f;
      else
        output[k * stride + batch_id] = f;
    }
  }
}

/**
 * @brief Direct 1D algorithm for computing Fourier transformation.
 *
 * Works on any size of data.
 * Serial implementation has O(n^2) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::vector<NumericT, AlignmentV> const & in,
            viennacl::vector<NumericT, AlignmentV> & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}

/**
 * @brief Direct 2D algorithm for computing Fourier transformation.
 *
 * Works on any size of data.
 * Serial implementation has O(n^2) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in,
            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                          reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}

/** @brief In-place bit-reversal reordering kernel; swaps element i with its bit-reversed index v (only once, guarded by i < v). */
template<typename NumericT>
__global__ void fft_reorder(NumericT * input,
                            unsigned int bit_size,
                            unsigned int size,
                            unsigned int stride,
                            unsigned int batch_num,
                            bool is_row_major)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int i = glb_id; i < size; i += glb_sz)
    {
      unsigned int v = get_reorder_num(i, bit_size);

      if (i < v)   // swap each pair exactly once
      {
        if (!is_row_major)
        {
          NumericT tmp = input[batch_id * stride + i];
          input[batch_id * stride + i] = input[batch_id * stride + v];
          input[batch_id * stride + v] = tmp;
        }
        else
        {
          NumericT tmp = input[i * stride + batch_id];
          input[i * stride + batch_id] = input[v * stride + batch_id];
          input[v * stride + batch_id] = tmp;
        }
      }
    }
  }
}

/***
 * This function performs reorder of input data. Indexes are sorted in bit-reversal order.
 * Such reordering should be done before in-place FFT.
 */
template<typename NumericT, unsigned int AlignmentV>
void reorder(viennacl::vector<NumericT, AlignmentV> & in,
             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  // Flag computed explicitly (instead of static_cast<bool>(data_order)) for consistency with direct();
  // behavior is unchanged.
  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                           static_cast<unsigned int>(bits_datasize),
                           static_cast<unsigned int>(size),
                           static_cast<unsigned int>(stride),
                           static_cast<unsigned int>(batch_num),
                           bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
}

/** @brief Radix-2 Cooley-Tukey FFT kernel working entirely in shared memory.
  *
  * One block handles one batch at a time; requires size <= MAX_LOCAL_POINTS_NUM
  * so the signal fits into the fixed shared array.
  */
template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2_local(Numeric2T * input,
                                 unsigned int bit_size,
                                 unsigned int size,
                                 unsigned int stride,
                                 unsigned int batch_num,
                                 NumericT sign,
                                 bool is_row_major)
{
  __shared__ Numeric2T lcl_input[1024];
  unsigned int grp_id  = blockIdx.x;
  unsigned int grp_num = gridDim.x;

  unsigned int lcl_sz = blockDim.x;
  unsigned int lcl_id = threadIdx.x;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
  {
    // Load into shared memory in bit-reversed order.
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      unsigned int v = get_reorder_num(p, bit_size);
      if (!is_row_major)
        lcl_input[v] = input[batch_id * stride + p];
      else
        lcl_input[v] = input[p * stride + batch_id];
    }

    __syncthreads();

    // Cooley-Tukey butterflies on the shared-memory copy.
    for (unsigned int s = 0; s < bit_size; s++)
    {
      unsigned int ss = 1 << s;
      NumericT cs, sn;
      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
      {
        unsigned int group = (tid & (ss - 1));
        unsigned int pos = ((tid >> s) << (s + 1)) + group;

        Numeric2T in1 = lcl_input[pos];
        Numeric2T in2 = lcl_input[pos + ss];

        NumericT arg = group * sign * NUM_PI / ss;

        sn = sin(arg);
        cs = cos(arg);
        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;

        Numeric2T tmp;
        tmp.x = in2.x * ex.x - in2.y * ex.y;
        tmp.y = in2.x * ex.y + in2.y * ex.x;

        lcl_input[pos + ss] = in1 - tmp;
        lcl_input[pos]      = in1 + tmp;
      }
      __syncthreads();
    }

    // Write the transformed batch back to global memory.
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      if (!is_row_major)
        input[batch_id * stride + p] = lcl_input[p];
      else
        input[p * stride + batch_id] = lcl_input[p];
    }
  }
}

/** @brief One Cooley-Tukey stage (stage index 's') performed in global memory; input must already be bit-reverse reordered. */
template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2(Numeric2T * input,
                           unsigned int s,
                           unsigned int bit_size,
                           unsigned int size,
                           unsigned int stride,
                           unsigned int batch_num,
                           NumericT sign,
                           bool is_row_major)
{
  unsigned int ss = 1 << s;
  unsigned int half_size = size >> 1;

  NumericT cs, sn;
  const NumericT NUM_PI(3.14159265358979323846);

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
    {
      unsigned int group = (tid & (ss - 1));
      unsigned int pos = ((tid >> s) << (s + 1)) + group;
      Numeric2T in1;
      Numeric2T in2;
      unsigned int offset;
      if (!is_row_major)
      {
        offset = batch_id * stride + pos;
        in1 = input[offset];
        in2 = input[offset + ss];
      }
      else
      {
        offset = pos * stride + batch_id;
        in1 = input[offset];
        in2 = input[offset + ss * stride];
      }

      NumericT arg = group * sign * NUM_PI / ss;

      sn = sin(arg);
      cs = cos(arg);

      Numeric2T ex;
      ex.x = cs;
      ex.y = sn;

      Numeric2T tmp;
      tmp.x = in2.x * ex.x - in2.y * ex.y;
      tmp.y = in2.x * ex.y + in2.y * ex.x;

      if (!is_row_major)
        input[offset + ss] = in1 - tmp;
      else
        input[offset + ss * stride] = in1 - tmp;
      input[offset] = in1 + tmp;
    }
  }
}

/**
 * @brief Radix-2 1D algorithm for computing Fourier transformation.
 *
 * Works only on power-of-two sizes of data.
 * Serial implementation has O(n * lg n) complexity.
 * This is a Cooley-Tukey algorithm.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::vector<NumericT, AlignmentV> & in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  // Ordering flag computed explicitly for consistency with direct(); behavior unchanged.
  bool is_row_major = bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    // Small signals: single fused kernel using shared memory.
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  sign,
                                  is_row_major);
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    // Large signals: bit-reversal pass followed by one global-memory kernel per stage.
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             is_row_major);
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");

    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              is_row_major);
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}

/**
 * @brief Radix-2 2D algorithm for computing Fourier transformation.
 *
 * Works only on power-of-two sizes of data.
 * Serial implementation has O(n * lg n) complexity.
 * This is a Cooley-Tukey algorithm.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  // Ordering flag computed explicitly for consistency with direct(); behavior unchanged.
  bool is_row_major = bool(data_order != viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  sign,
                                  is_row_major);
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             is_row_major);
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              is_row_major);
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}

/** @brief Post-processing step of Bluestein's algorithm: multiplies Z element-wise by the conjugate chirp. */
template<typename Numeric2T, typename NumericT>
__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;
  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    // NOTE(review): i*i is evaluated in unsigned int and overflows for i >= 2^16,
    // consistent with the documented size limit of the bluestein() wrapper.
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * (-NUM_PI);

    sn_a = sin(angle);
    cs_a = cos(angle);

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = sn_a;
    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
  }
}

/** @brief Pre-processing step of Bluestein's algorithm: builds the chirp-modulated input A and the chirp sequence B. */
template<typename Numeric2T, typename NumericT>
__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
                              unsigned int size, unsigned int ext_size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;

  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);   // see overflow note in bluestein_post
    NumericT angle = (NumericT)rm / size * NUM_PI;

    sn_a = sin(-angle);
    cs_a = cos(-angle);

    Numeric2T a_i;
    a_i.x = cs_a;
    a_i.y = sn_a;

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = -sn_a;

    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
    B[i] = b_i;

    // Mirror the chirp into the tail of B (scattered write; divergence-heavy but correct).
    if (i)
      B[ext_size - i] = b_i;
  }
}

/** @brief Sets both interleaved complex buffers to zero (component-wise). */
template<typename NumericT>
__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    input1[i].x = 0;
    input1[i].y = 0;

    input2[i].x = 0;
    input2[i].y = 0;
  }
}

/**
 * @brief Bluestein's algorithm for computing Fourier transformation.
 *
 * Currently works only for sizes of input data which are less than 2^16.
 * Uses a lot of additional memory, but should be fast for any size of data.
 * Serial implementation has something about O(n * lg n) complexity.
 */
template<typename NumericT, unsigned int AlignmentV>
void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  vcl_size_t size = in.size() >> 1;   // number of complex points (interleaved storage)
  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);

  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);

  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
                     reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
                     static_cast<unsigned int>(ext_size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");

  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(in)),
                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(A)),
                             reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(B)),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(ext_size),
                             NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");

  // Convolution of the chirp-modulated input with the chirp sequence.
  viennacl::linalg::convolve_i(A, B, Z);

  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(Z)),
                              reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
                              static_cast<unsigned int>(size),
                              NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
}

/** @brief Element-wise complex multiplication kernel (uses the complex operator* defined above). */
template<typename NumericT>
__global__ void fft_mult_vec(const NumericT * input1,
                             const NumericT * input2,
                             NumericT * output,
                             unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT in1 = input1[i];
    NumericT in2 = input2[i];
    output[i] = in1 * in2;
  }
}

/**
 * @brief Multiply two complex vectors and store result in output
 */
template<typename NumericT, unsigned int AlignmentV>
void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
                      viennacl::vector<NumericT, AlignmentV> const & input2,
                      viennacl::vector<NumericT, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  vcl_size_t size = input1.size() / 2;   // complex element count (interleaved storage)

  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input1)),
                            reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input2)),
                            reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
                            static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
}

/** @brief Divides each complex element by a real factor (uses the complex operator/ defined above). */
template<typename Numeric2T, typename NumericT>
__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x*blockDim.x)
    input1[i] = input1[i]/factor;
}

/**
 * @brief Normalize vector by its own size
 */
template<typename NumericT, unsigned int AlignmentV>
void normalize(viennacl::vector<NumericT, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  vcl_size_t size = input.size() >> 1;   // complex element count
  NumericT norm_factor = static_cast<NumericT>(size);
  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
                                  static_cast<unsigned int>(size),
                                  norm_factor);
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
}

/** @brief Out-of-place transpose kernel: element (row, col) is written to (col, row). */
template<typename NumericT>
__global__ void transpose(const NumericT * input,
                          NumericT * output,
                          unsigned int row_num,
                          unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row*col_num;
    unsigned int new_pos = col * row_num + row;
    output[new_pos] = input[i];
  }
}

/**
 * @brief Transpose matrix
 */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input,
               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  // internal sizes are halved: two real entries form one complex element
  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(input)),
                         reinterpret_cast<      numeric2_type *>(viennacl::cuda_arg(output)),
                         static_cast<unsigned int>(input.internal_size1()>>1),
                         static_cast<unsigned int>(input.internal_size2()>>1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
}

/** @brief In-place transpose kernel; swaps (row, col) with (col, row) once, guarded by i < new_pos. */
template<typename NumericT>
__global__ void transpose_inplace(
    NumericT * input,
    unsigned int row_num,
    unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row*col_num;
    unsigned int new_pos = col * row_num + row;
    if (i < new_pos)
    {
      NumericT val = input[i];
      input[i] = input[new_pos];
      input[new_pos] = val;
    }
  }
}

/**
 * @brief In-place transpose of matrix
 */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(input)),
                                 static_cast<unsigned int>(input.internal_size1()>>1),
                                 static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
}

/** @brief Expands a real array into interleaved complex form (imaginary parts set to zero). */
template<typename RealT,typename ComplexT>
__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    ComplexT val;
    val.x = in[i];
    val.y = 0;
    out[i] = val;
  }
}

/**
 * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
 */
template<typename NumericT>
void real_to_complex(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  real_to_complex<<<128,128>>>(viennacl::cuda_arg(in),
                               reinterpret_cast<numeric2_type *>(viennacl::cuda_arg(out)),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
}

/** @brief Extracts the real parts of an interleaved complex array. */
template<typename ComplexT,typename RealT>
__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    out[i] = in[i].x;
}

/**
 * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part)
 */
template<typename NumericT>
void complex_to_real(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT>& out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(viennacl::cuda_arg(in)),
                               viennacl::cuda_arg(out),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
}

/** @brief Reverses the element order of a vector in place (pairs swapped from both ends). */
template<typename NumericT>
__global__ void reverse_inplace(NumericT * vec, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i += gridDim.x * blockDim.x)
  {
    NumericT val1 = vec[i];
    NumericT val2 = vec[size - i - 1];
    vec[i] = val2;
    vec[size - i - 1] = val1;
  }
}

/**
 * @brief Reverse vector to opposite order and save it in input vector
 */
template<typename NumericT>
void reverse(viennacl::vector_base<NumericT>& in)
{
  vcl_size_t size = in.size();
  reverse_inplace<<<128,128>>>(viennacl::cuda_arg(in), static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif /* VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ */
// ---- native-viennaCL/src/main/cpp/viennacl/linalg/cuda/ilu_operations.hpp ----
#ifndef VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_ILU_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   [email protected]

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/ilu_operations.hpp
    @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using CUDA
*/

#include <cmath>
#include <algorithm> //for std::max and std::min

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/linalg/vector_operations.hpp"
#include "viennacl/traits/stride.hpp"


namespace viennacl
{
namespace linalg
{
namespace cuda
{

/** @brief Counts, per row of the CSR matrix A, how many entries lie in the lower-triangular part
  * (column index <= row index) and stores the count in L_row_indices[row].
  * The counts are turned into row offsets by a subsequent exclusive scan in extract_L().
  */
template<typename IndexT> // to control external linkage
__global__ void extract_L_kernel_1(
    const IndexT * A_row_indices,
    const IndexT * A_col_indices,
    unsigned int A_size1,
    unsigned int * L_row_indices)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int a_begin = A_row_indices[row];
    unsigned int a_end   = A_row_indices[row+1];

    unsigned int lower_count = 0;
    for (unsigned int nnz_idx = a_begin; nnz_idx < a_end; ++nnz_idx)
    {
      if (A_col_indices[nnz_idx] <= row)
        ++lower_count;
    }

    L_row_indices[row] = lower_count;
  }
}

/** @brief Copies the lower-triangular entries (column <= row) of CSR matrix A into L.
  * L_row_indices must already hold valid row offsets (produced by the scan in extract_L()).
  */
template<typename NumericT>
__global__ void extract_L_kernel_2(
    unsigned int const *A_row_indices,
    unsigned int const *A_col_indices,
    NumericT     const *A_elements,
    unsigned int        A_size1,

    unsigned int const *L_row_indices,
    unsigned int       *L_col_indices,
    NumericT           *L_elements)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < A_size1;
                    row += gridDim.x * blockDim.x)
  {
    unsigned int a_begin = A_row_indices[row];
    unsigned int a_end   = A_row_indices[row+1];

    // write cursor into L's row, starting at this row's offset
    unsigned int write_pos = L_row_indices[row];
    for (unsigned int nnz_idx = a_begin; nnz_idx < a_end; ++nnz_idx)
    {
      unsigned int col = A_col_indices[nnz_idx];

      if (col <= row)
      {
        L_col_indices[write_pos] = col;
        L_elements[write_pos]    = A_elements[nnz_idx];
        ++write_pos;
      }
    }
  }
}

/** @brief Extracts the lower-triangular factor pattern L (including the diagonal) from A for the Chow-Patel ILU setup. */
template<typename NumericT>
void extract_L(compressed_matrix<NumericT> const & A,
               compressed_matrix<NumericT>       & L)
{
  //
  // Step 1: Count elements in L and U:
  //
  extract_L_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()),
                                   viennacl::cuda_arg<unsigned int>(A.handle2()),
                                   static_cast<unsigned int>(A.size1()),
                                   viennacl::cuda_arg<unsigned int>(L.handle1())
                                  );
  VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_1");

  //
  // Step 2: Exclusive scan on row_buffers:
  //
  viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1().cuda_handle()), viennacl::CUDA_MEMORY, A.size1() + 1);
viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer); + L.reserve(wrapped_L_row_buffer[L.size1()], false); + + // + // Step 3: Write entries + // + extract_L_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(L.handle2()), + viennacl::cuda_arg<NumericT>(L.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("extract_L_kernel_2"); + + L.generate_row_block_information(); + +} // extract_L + +/////////////////////////////////////////////// + + +template<typename NumericT> +__global__ void ilu_scale_kernel_1( + unsigned int const *A_row_indices, + unsigned int const *A_col_indices, + NumericT const *A_elements, + unsigned int A_size1, + + NumericT *D_elements) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < A_size1; + row += gridDim.x * blockDim.x) + { + unsigned int row_begin = A_row_indices[row]; + unsigned int row_end = A_row_indices[row+1]; + + for (unsigned int j = row_begin; j < row_end; ++j) + { + unsigned int col = A_col_indices[j]; + if (row == col) + { + D_elements[row] = NumericT(1) / sqrt(fabs(A_elements[j])); + break; + } + } + } +} + +/** @brief Scales values in a matrix such that output = D * input * D, where D is a diagonal matrix (only the diagonal is provided) */ +template<typename NumericT> +__global__ void ilu_scale_kernel_2( + unsigned int const *R_row_indices, + unsigned int const *R_col_indices, + NumericT *R_elements, + unsigned int R_size1, + + NumericT *D_elements) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < R_size1; + row += gridDim.x * blockDim.x) + { + unsigned int row_begin = R_row_indices[row]; + unsigned int row_end = R_row_indices[row+1]; + + NumericT D_row = D_elements[row]; + + for (unsigned int j = row_begin; j < row_end; 
++j) + R_elements[j] *= D_row * D_elements[R_col_indices[j]]; + } +} + + + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */ +template<typename NumericT> +void icc_scale(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L) +{ + viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A)); + + // fill D: + ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg(D) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1"); + + // scale L: + ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(L.handle2()), + viennacl::cuda_arg<NumericT>(L.handle()), + static_cast<unsigned int>(L.size1()), + viennacl::cuda_arg(D) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1"); +} + +///////////////////////////////////// + +/** @brief CUDA kernel for one Chow-Patel-ICC sweep */ +template<typename NumericT> +__global__ void icc_chow_patel_sweep_kernel( + unsigned int const *L_row_indices, + unsigned int const *L_col_indices, + NumericT *L_elements, + NumericT const *L_backup, + unsigned int L_size1, + NumericT const *aij_L) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < L_size1; + row += gridDim.x * blockDim.x) + { + // + // update L: + // + unsigned int row_Li_start = L_row_indices[row]; + unsigned int row_Li_end = L_row_indices[row + 1]; + + for (unsigned int i = row_Li_start; i < row_Li_end; ++i) + { + unsigned int col = L_col_indices[i]; + + unsigned int row_Lj_start = L_row_indices[col]; + unsigned int row_Lj_end = L_row_indices[col + 1]; + + // compute \sum_{k=1}^{j-1} l_ik u_kj + unsigned int index_Lj = row_Lj_start; + unsigned int col_Lj = L_col_indices[index_Lj]; + NumericT s = aij_L[i]; 
+ for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) + { + unsigned int col_Li = L_col_indices[index_Li]; + + // find element in U + while (col_Lj < col_Li) + { + ++index_Lj; + col_Lj = L_col_indices[index_Lj]; + } + + if (col_Lj == col_Li) + s -= L_backup[index_Li] * L_backup[index_Lj]; + } + + // update l_ij: + L_elements[i] = (row == col) ? sqrt(s) : (s / L_backup[row_Lj_end - 1]); // diagonal element is last entry in U + } + + } +} + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */ +template<typename NumericT> +void icc_chow_patel_sweep(compressed_matrix<NumericT> & L, + vector<NumericT> const & aij_L) +{ + viennacl::backend::mem_handle L_backup; + viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); + viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); + + icc_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(L.handle2()), + viennacl::cuda_arg<NumericT>(L.handle()), + viennacl::cuda_arg<NumericT>(L_backup), + static_cast<unsigned int>(L.size1()), + + viennacl::cuda_arg<NumericT>(aij_L.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("icc_chow_patel_sweep_kernel"); + +} + + +////////////////////////////// ILU /////////////////////////// + +template<typename IndexT> // to control external linkage +__global__ void extract_LU_kernel_1( + const IndexT * A_row_indices, + const IndexT * A_col_indices, + unsigned int A_size1, + + unsigned int * L_row_indices, + + unsigned int * U_row_indices) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < A_size1; + row += gridDim.x * blockDim.x) + { + unsigned int row_begin = A_row_indices[row]; + unsigned int row_end = A_row_indices[row+1]; + + unsigned int num_entries_L = 0; + unsigned int num_entries_U = 0; + for (unsigned int j=row_begin; j<row_end; ++j) + { + unsigned int 
col = A_col_indices[j]; + if (col <= row) + ++num_entries_L; + if (col >= row) + ++num_entries_U; + } + + L_row_indices[row] = num_entries_L; + U_row_indices[row] = num_entries_U; + } +} + +template<typename NumericT> +__global__ void extract_LU_kernel_2( + unsigned int const *A_row_indices, + unsigned int const *A_col_indices, + NumericT const *A_elements, + unsigned int A_size1, + + unsigned int const *L_row_indices, + unsigned int *L_col_indices, + NumericT *L_elements, + + unsigned int const *U_row_indices, + unsigned int *U_col_indices, + NumericT *U_elements) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < A_size1; + row += gridDim.x * blockDim.x) + { + unsigned int row_begin = A_row_indices[row]; + unsigned int row_end = A_row_indices[row+1]; + + unsigned int index_L = L_row_indices[row]; + unsigned int index_U = U_row_indices[row]; + for (unsigned int j = row_begin; j < row_end; ++j) + { + unsigned int col = A_col_indices[j]; + NumericT value = A_elements[j]; + + if (col <= row) + { + L_col_indices[index_L] = col; + L_elements[index_L] = value; + ++index_L; + } + + if (col >= row) + { + U_col_indices[index_U] = col; + U_elements[index_U] = value; + ++index_U; + } + } + } +} + +template<typename NumericT> +void extract_LU(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L, + compressed_matrix<NumericT> & U) +{ + // + // Step 1: Count elements in L and U: + // + extract_LU_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(U.handle1()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_1"); + + // + // Step 2: Exclusive scan on row_buffers: + // + viennacl::vector<unsigned int> wrapped_L_row_buffer(viennacl::cuda_arg<unsigned int>(L.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1); + 
viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer); + L.reserve(wrapped_L_row_buffer[L.size1()], false); + + viennacl::vector<unsigned int> wrapped_U_row_buffer(viennacl::cuda_arg<unsigned int>(U.handle1()), viennacl::CUDA_MEMORY, A.size1() + 1); + viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer); + U.reserve(wrapped_U_row_buffer[U.size1()], false); + + // + // Step 3: Write entries + // + extract_LU_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(L.handle2()), + viennacl::cuda_arg<NumericT>(L.handle()), + viennacl::cuda_arg<unsigned int>(U.handle1()), + viennacl::cuda_arg<unsigned int>(U.handle2()), + viennacl::cuda_arg<NumericT>(U.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("extract_LU_kernel_2"); + + L.generate_row_block_information(); + // Note: block information for U will be generated after transposition + +} // extract_LU + +/////////////////////////////////////////////// + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. 
*/ +template<typename NumericT> +void ilu_scale(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L, + compressed_matrix<NumericT> & U) +{ + viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A)); + + // fill D: + ilu_scale_kernel_1<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1()), + viennacl::cuda_arg<unsigned int>(A.handle2()), + viennacl::cuda_arg<NumericT>(A.handle()), + static_cast<unsigned int>(A.size1()), + viennacl::cuda_arg<NumericT>(D.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_1"); + + // scale L: + ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(L.handle2()), + viennacl::cuda_arg<NumericT>(L.handle()), + static_cast<unsigned int>(L.size1()), + viennacl::cuda_arg<NumericT>(D.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2"); + + // scale U: + ilu_scale_kernel_2<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(U.handle1()), + viennacl::cuda_arg<unsigned int>(U.handle2()), + viennacl::cuda_arg<NumericT>(U.handle()), + static_cast<unsigned int>(U.size1()), + viennacl::cuda_arg<NumericT>(D.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_scale_kernel_2"); +} + +///////////////////////////////////// + +/** @brief CUDA kernel for one Chow-Patel-ILU sweep */ +template<typename NumericT> +__global__ void ilu_chow_patel_sweep_kernel( + unsigned int const *L_row_indices, + unsigned int const *L_col_indices, + NumericT *L_elements, + NumericT const *L_backup, + unsigned int L_size1, + + NumericT const *aij_L, + + unsigned int const *U_trans_row_indices, + unsigned int const *U_trans_col_indices, + NumericT *U_trans_elements, + NumericT const *U_trans_backup, + + NumericT const *aij_U_trans) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < L_size1; + row += gridDim.x * blockDim.x) + { + // + // update L: + // + unsigned int row_L_start = L_row_indices[row]; + unsigned int row_L_end = 
L_row_indices[row + 1]; + + for (unsigned int j = row_L_start; j < row_L_end; ++j) + { + unsigned int col = L_col_indices[j]; + + if (col == row) + continue; + + unsigned int row_U_start = U_trans_row_indices[col]; + unsigned int row_U_end = U_trans_row_indices[col + 1]; + + // compute \sum_{k=1}^{j-1} l_ik u_kj + unsigned int index_U = row_U_start; + unsigned int col_U = (index_U < row_U_end) ? U_trans_col_indices[index_U] : L_size1; + NumericT sum = 0; + for (unsigned int k = row_L_start; k < j; ++k) + { + unsigned int col_L = L_col_indices[k]; + + // find element in U + while (col_U < col_L) + { + ++index_U; + col_U = U_trans_col_indices[index_U]; + } + + if (col_U == col_L) + sum += L_backup[k] * U_trans_backup[index_U]; + } + + // update l_ij: + L_elements[j] = (aij_L[j] - sum) / U_trans_backup[row_U_end - 1]; // diagonal element is last entry in U + } + + + // + // update U: + // + unsigned int row_U_start = U_trans_row_indices[row]; + unsigned int row_U_end = U_trans_row_indices[row + 1]; + for (unsigned int j = row_U_start; j < row_U_end; ++j) + { + unsigned int col = U_trans_col_indices[j]; + + row_L_start = L_row_indices[col]; + row_L_end = L_row_indices[col + 1]; + + // compute \sum_{k=1}^{j-1} l_ik u_kj + unsigned int index_L = row_L_start; + unsigned int col_L = (index_L < row_L_end) ? L_col_indices[index_L] : L_size1; + NumericT sum = 0; + for (unsigned int k = row_U_start; k < j; ++k) + { + unsigned int col_U = U_trans_col_indices[k]; + + // find element in L + while (col_L < col_U) + { + ++index_L; + col_L = L_col_indices[index_L]; + } + + if (col_U == col_L) + sum += L_backup[index_L] * U_trans_backup[k]; + } + + // update u_ij: + U_trans_elements[j] = aij_U_trans[j] - sum; + } + } +} + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. 
Algorithm 2 in paper) */ +template<typename NumericT> +void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L, + vector<NumericT> const & aij_L, + compressed_matrix<NumericT> & U_trans, + vector<NumericT> const & aij_U_trans) +{ + viennacl::backend::mem_handle L_backup; + viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); + viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); + + viennacl::backend::mem_handle U_backup; + viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans)); + viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size()); + + ilu_chow_patel_sweep_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(L.handle1()), + viennacl::cuda_arg<unsigned int>(L.handle2()), + viennacl::cuda_arg<NumericT>(L.handle()), + viennacl::cuda_arg<NumericT>(L_backup), + static_cast<unsigned int>(L.size1()), + + viennacl::cuda_arg<NumericT>(aij_L.handle()), + + viennacl::cuda_arg<unsigned int>(U_trans.handle1()), + viennacl::cuda_arg<unsigned int>(U_trans.handle2()), + viennacl::cuda_arg<NumericT>(U_trans.handle()), + viennacl::cuda_arg<NumericT>(U_backup), + + viennacl::cuda_arg<NumericT>(aij_U_trans.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_chow_patel_sweep_kernel"); + +} + +////////////////////////////////////// + +template<typename NumericT> +__global__ void ilu_form_neumann_matrix_kernel( + unsigned int const *R_row_indices, + unsigned int const *R_col_indices, + NumericT *R_elements, + unsigned int R_size1, + + NumericT *D_elements) +{ + for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; + row < R_size1; + row += gridDim.x * blockDim.x) + { + unsigned int row_begin = R_row_indices[row]; + unsigned int row_end = R_row_indices[row+1]; + + // part 1: extract diagonal entry + NumericT diag = 0; + for (unsigned int j = row_begin; j < row_end; ++j) + { + unsigned int col = 
R_col_indices[j]; + if (col == row) + { + diag = R_elements[j]; + R_elements[j] = 0; // (I - D^{-1}R) + break; + } + } + D_elements[row] = diag; + + // part2: scale + for (unsigned int j = row_begin; j < row_end; ++j) + R_elements[j] /= -diag; + } +} + + + +template<typename NumericT> +void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R, + vector<NumericT> & diag_R) +{ + ilu_form_neumann_matrix_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(R.handle1()), + viennacl::cuda_arg<unsigned int>(R.handle2()), + viennacl::cuda_arg<NumericT>(R.handle()), + static_cast<unsigned int>(R.size1()), + viennacl::cuda_arg<NumericT>(diag_R.handle()) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("ilu_form_neumann_matrix_kernel"); +} + +} //namespace host_based +} //namespace linalg +} //namespace viennacl + + +#endif
