http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp new file mode 100644 index 0000000..b350fe0 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp @@ -0,0 +1,945 @@ +#ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_ +#define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/opencl/iterative_operations.hpp + @brief Implementations of specialized kernels for fast iterative solvers using OpenCL +*/ + +#include <cmath> + +#include "viennacl/forwards.h" +#include "viennacl/detail/vector_def.hpp" +#include "viennacl/ocl/device.hpp" +#include "viennacl/ocl/handle.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/linalg/opencl/common.hpp" +#include "viennacl/linalg/opencl/kernels/iterative.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/traits/handle.hpp" +#include 
"viennacl/traits/stride.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ + +template<typename NumericT> +void pipelined_cg_vector_update(vector_base<NumericT> & result, + NumericT alpha, + vector_base<NumericT> & p, + vector_base<NumericT> & r, + vector_base<NumericT> const & Ap, + NumericT beta, + vector_base<NumericT> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_vector_update"); + cl_uint vec_size = cl_uint(viennacl::traits::size(result)); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)))); +} + +template<typename NumericT> +void pipelined_cg_prod(compressed_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0)); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? 
"cg_csr_blocked_prod" : "cg_csr_prod"); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + if (use_nvidia_blocked) + { + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), + p, + Ap, + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + )); + } + else + { + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()), + p, + Ap, + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(1024 * sizeof(NumericT)) + )); + } + +} + +template<typename NumericT> +void pipelined_cg_prod(coordinate_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + + Ap.clear(); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_coo_prod"); + unsigned int thread_num = 256; //k.local_work_size(0); + + k.local_work_size(0, thread_num); + + 
k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases + + viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(), + p, + Ap, + vec_size, + viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num), + viennacl::ocl::local_mem(sizeof(NumericT)*thread_num), + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + )); +} + +template<typename NumericT> +void pipelined_cg_prod(ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_ell_prod"); + + unsigned int thread_num = 128; + unsigned int group_num = 256; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), + A.handle().opencl_handle(), + cl_uint(A.internal_size1()), + cl_uint(A.maxnnz()), + cl_uint(A.internal_maxnnz()), + viennacl::traits::opencl_handle(p), + viennacl::traits::opencl_handle(Ap), + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + 
+template<typename NumericT> +void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_sliced_ell_prod"); + + vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128)); + unsigned int group_num = 256; + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + thread_num = 256; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), + A.handle2().opencl_handle(), + A.handle3().opencl_handle(), + A.handle().opencl_handle(), + viennacl::traits::opencl_handle(p), + viennacl::traits::opencl_handle(Ap), + vec_size, + cl_uint(A.rows_per_block()), + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + + +template<typename NumericT> +void pipelined_cg_prod(hyb_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + + viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_hyb_prod"); + + unsigned int thread_num = 128; + unsigned int group_num = 128; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), + A.handle().opencl_handle(), + A.handle3().opencl_handle(), + A.handle4().opencl_handle(), + A.handle5().opencl_handle(), + cl_uint(A.internal_size1()), + cl_uint(A.ell_nnz()), + cl_uint(A.internal_ellnnz()), + viennacl::traits::opencl_handle(p), + viennacl::traits::opencl_handle(Ap), + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + + +//////////////////////////// BiCGStab //////////////////////// + +template<typename NumericT> +void pipelined_bicgstab_update_s(vector_base<NumericT> & s, + vector_base<NumericT> & r, + vector_base<NumericT> const & Ap, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_update_s"); + cl_uint vec_size = cl_uint(viennacl::traits::size(s)); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + 
viennacl::ocl::enqueue(k(s, r, Ap, + inner_prod_buffer, chunk_size, chunk_offset, vec_size, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)))); +} + +template<typename NumericT> +void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s, + vector_base<NumericT> & residual, vector_base<NumericT> const & As, + NumericT beta, vector_base<NumericT> const & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size) +{ + (void)buffer_chunk_size; + + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_vector_update"); + cl_uint vec_size = cl_uint(viennacl::traits::size(result)); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + viennacl::ocl::enqueue(k(result, alpha, p, omega, s, + residual, As, + beta, Ap, + r0star, + inner_prod_buffer, + vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + +template<typename NumericT> +void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + 
viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0)); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "bicgstab_csr_blocked_prod" : "bicgstab_csr_prod"); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + if (use_nvidia_blocked) + { + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), + p, + Ap, + r0star, + vec_size, + inner_prod_buffer, chunk_size, chunk_offset, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + )); + } + else + { + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()), + p, + Ap, + r0star, + vec_size, + inner_prod_buffer, chunk_size, chunk_offset, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + )); + } + +} + + +template<typename NumericT> +void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t 
buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + + Ap.clear(); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_coo_prod"); + unsigned int thread_num = 256; //k.local_work_size(0); + + k.local_work_size(0, thread_num); + + k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases + + viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(), + p, + Ap, + r0star, + vec_size, + viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num), + viennacl::ocl::local_mem(sizeof(NumericT)*thread_num), + inner_prod_buffer, chunk_size, chunk_offset, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + )); +} + +template<typename NumericT> +void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + + viennacl::ocl::kernel & k = 
ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_ell_prod"); + + unsigned int thread_num = 128; + unsigned int group_num = 128; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), + A.handle().opencl_handle(), + cl_uint(A.internal_size1()), + cl_uint(A.maxnnz()), + cl_uint(A.internal_maxnnz()), + viennacl::traits::opencl_handle(p), + viennacl::traits::opencl_handle(Ap), + r0star, + vec_size, + inner_prod_buffer, chunk_size, chunk_offset, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + +template<typename NumericT> +void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_sliced_ell_prod"); + + vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128)); + unsigned int group_num = 256; + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + thread_num = 256; + + k.local_work_size(0, 
thread_num); + k.global_work_size(0, thread_num * group_num); + + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), + A.handle2().opencl_handle(), + A.handle3().opencl_handle(), + A.handle().opencl_handle(), + viennacl::traits::opencl_handle(p), + viennacl::traits::opencl_handle(Ap), + r0star, + vec_size, + cl_uint(A.rows_per_block()), + inner_prod_buffer, chunk_size, chunk_offset, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + + +template<typename NumericT> +void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_hyb_prod"); + + unsigned int thread_num = 256; + unsigned int group_num = 128; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*256); + } + + viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), + A.handle().opencl_handle(), + A.handle3().opencl_handle(), + A.handle4().opencl_handle(), + A.handle5().opencl_handle(), + cl_uint(A.internal_size1()), + cl_uint(A.ell_nnz()), + cl_uint(A.internal_ellnnz()), + 
viennacl::traits::opencl_handle(p), + viennacl::traits::opencl_handle(Ap), + r0star, + vec_size, + inner_prod_buffer, chunk_size, chunk_offset, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) + ) + ); +} + +/////////////////////////////////// + +/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm. + * + * This routines computes for vectors 'r', 'v_k': + * Second reduction step for ||v_k|| + * v_k /= ||v_k|| + * First reduction step for <r, v_k> + */ +template <typename T> +void pipelined_gmres_normalize_vk(vector_base<T> & v_k, + vector_base<T> const & residual, + vector_base<T> & R_buffer, + vcl_size_t offset_in_R, + vector_base<T> const & inner_prod_buffer, + vector_base<T> & r_dot_vk_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_normalize_vk"); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + cl_uint size_vk = cl_uint(v_k.size()); + cl_uint vk_offset = cl_uint(viennacl::traits::start(v_k)); + cl_uint R_offset = cl_uint(offset_in_R); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint chunk_offset = cl_uint(buffer_chunk_offset); + viennacl::ocl::enqueue(k(v_k, vk_offset, + residual, + R_buffer, R_offset, + inner_prod_buffer, chunk_size, + r_dot_vk_buffer, chunk_offset, + size_vk, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) + )); +} + +template <typename T> +void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + 
vcl_size_t param_k, + vector_base<T> & vi_in_vk_buffer, + vcl_size_t buffer_chunk_size) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_1"); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + cl_uint size_vk = cl_uint(v_k_size); + cl_uint internal_size_vk = cl_uint(v_k_internal_size); + cl_uint ocl_k = cl_uint(param_k); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k, + vi_in_vk_buffer, chunk_size + )); +} + +template <typename T> +void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vcl_size_t param_k, + vector_base<T> const & vi_in_vk_buffer, + vector_base<T> & R_buffer, + vcl_size_t krylov_dim, + vector_base<T> & inner_prod_buffer, + vcl_size_t buffer_chunk_size) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_2"); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + cl_uint size_vk = cl_uint(v_k_size); + cl_uint internal_size_vk = cl_uint(v_k_internal_size); + cl_uint ocl_k = cl_uint(param_k); + cl_uint chunk_size = cl_uint(buffer_chunk_size); + cl_uint ocl_krylov_dim = cl_uint(krylov_dim); + viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k, + vi_in_vk_buffer, chunk_size, + R_buffer, ocl_krylov_dim, + inner_prod_buffer, + viennacl::ocl::local_mem(7 * k.local_work_size() * 
sizeof(T)) + )); +} + +template <typename T> +void pipelined_gmres_update_result(vector_base<T> & result, + vector_base<T> const & residual, + vector_base<T> const & krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vector_base<T> const & coefficients, + vcl_size_t param_k) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_update_result"); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + cl_uint size_vk = cl_uint(v_k_size); + cl_uint internal_size_vk = cl_uint(v_k_internal_size); + cl_uint ocl_k = cl_uint(param_k); + viennacl::ocl::enqueue(k(result, + residual, + krylov_basis, size_vk, internal_size_vk, + coefficients, ocl_k + )); +} + + +template <typename T> +void pipelined_gmres_prod(compressed_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0)); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), use_nvidia_blocked ? 
"gmres_csr_blocked_prod" : "gmres_csr_prod"); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + cl_uint start_p = cl_uint(viennacl::traits::start(p)); + cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); + + k.local_work_size(0, 128); + k.global_work_size(0, 128*128); + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + { + k.local_work_size(0, 256); + k.global_work_size(0, 256*128); + } + + if (use_nvidia_blocked) + { + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), + p, start_p, + Ap, start_Ap, + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) + )); + } + else + { + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()), + p, start_p, + Ap, start_Ap, + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(1024 * sizeof(T)) + )); + } +} + +template <typename T> +void pipelined_gmres_prod(coordinate_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + cl_uint start_p = cl_uint(viennacl::traits::start(p)); + cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); + + Ap.clear(); + inner_prod_buffer.clear(); + + 
viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_coo_prod"); + unsigned int thread_num = 128; //k.local_work_size(0); + + k.local_work_size(0, thread_num); + + k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases + + viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(), + p, start_p, + Ap, start_Ap, + vec_size, + viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num), + viennacl::ocl::local_mem(sizeof(T)*thread_num), + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) + )); +} +
+// NOTE(review): pipelined GMRES sparse matrix-vector product for ELL-format matrices.
+// Launches the "gmres_ell_prod" kernel, which presumably computes Ap = A*p and folds
+// partial inner products into inner_prod_buffer (laid out as three sub-buffers, hence
+// the division by 3 below) -- the kernel body lives in kernels/iterative.hpp; confirm there.
+template <typename T> +void pipelined_gmres_prod(ell_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + cl_uint start_p = cl_uint(viennacl::traits::start(p)); + cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_ell_prod"); + + unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 
256 : 128; + unsigned int group_num = 128; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), + A.handle().opencl_handle(), + cl_uint(A.internal_size1()), + cl_uint(A.maxnnz()), + cl_uint(A.internal_maxnnz()), + viennacl::traits::opencl_handle(p), start_p, + viennacl::traits::opencl_handle(Ap), start_Ap, + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) + ) + ); +} +
+// NOTE(review): same pipelined product for the sliced-ELL layout. The work-group size is
+// max(rows_per_block, 128), overridden to a flat 256 on NVIDIA devices -- note the NVIDIA
+// override discards any larger rows_per_block; presumably rows_per_block <= 256 there, verify.
+template <typename T> +void pipelined_gmres_prod(sliced_ell_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + cl_uint start_p = cl_uint(viennacl::traits::start(p)); + cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_sliced_ell_prod"); + + vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128)); + unsigned int group_num = 128; + + if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) + thread_num = 256; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), + A.handle2().opencl_handle(), + A.handle3().opencl_handle(), + A.handle().opencl_handle(), + viennacl::traits::opencl_handle(p), start_p, + viennacl::traits::opencl_handle(Ap), start_Ap, + vec_size, + cl_uint(A.rows_per_block()), + inner_prod_buffer, + buffer_size_per_vector, + 
viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) + ) + ); +} + +
+// NOTE(review): pipelined product for the hybrid (hyb) layout; handles 3-5 presumably carry
+// the CSR remainder alongside the ELL part (handle2/handle) -- confirm against hyb_matrix.
+template <typename T> +void pipelined_gmres_prod(hyb_matrix<T> const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); + + cl_uint vec_size = cl_uint(viennacl::traits::size(p)); + cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); + cl_uint start_p = cl_uint(viennacl::traits::start(p)); + cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_hyb_prod"); + + unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128; + unsigned int group_num = 128; + + k.local_work_size(0, thread_num); + k.global_work_size(0, thread_num * group_num); + + + viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), + A.handle().opencl_handle(), + A.handle3().opencl_handle(), + A.handle4().opencl_handle(), + A.handle5().opencl_handle(), + cl_uint(A.internal_size1()), + cl_uint(A.ell_nnz()), + cl_uint(A.internal_ellnnz()), + viennacl::traits::opencl_handle(p), start_p, + viennacl::traits::opencl_handle(Ap), start_Ap, + vec_size, + inner_prod_buffer, + buffer_size_per_vector, + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), + viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) + ) + ); +} + + +} //namespace opencl +} //namespace linalg +} //namespace viennacl + + +#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp new file mode 100644 index 0000000..b0252d7 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp @@ -0,0 +1,393 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP +#define VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +#include "viennacl/linalg/opencl/common.hpp" + +/** @file viennacl/linalg/opencl/kernels/amg.hpp + * @brief OpenCL kernel file for operations related to algebraic multigrid */ +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + +
+// NOTE(review): "trivial" influence measure -- every structural neighbor influences a point,
+// so a row's influence value is simply its nonzero count (A_row_indices[i+1] - A_row_indices[i])
+// and the influence ids are just the column indices.
+template<typename StringT> +void generate_amg_influence_trivial(StringT & source) +{ + + source.append("__kernel void amg_influence_trivial( \n"); + source.append(" __global const unsigned int * A_row_indices, \n"); + source.append(" __global const unsigned int * A_col_indices, \n"); + source.append(" unsigned int A_size1, \n"); + source.append(" unsigned int A_nnz, \n"); + source.append(" __global unsigned int * influences_row, \n"); + source.append(" __global unsigned int * influences_id, \n"); + source.append(" __global unsigned int * influences_values) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" unsigned int tmp = A_row_indices[i]; \n"); + source.append(" influences_row[i] = tmp; \n"); + source.append(" influences_values[i] = A_row_indices[i+1] - tmp; \n"); + source.append(" } \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < A_nnz; i 
+= get_global_size(0)) \n"); + source.append(" influences_id[i] = A_col_indices[i]; \n"); + + source.append(" if (get_global_id(0) == 0) \n"); + source.append(" influences_row[A_size1] = A_row_indices[A_size1]; \n"); + source.append("} \n"); + +} + +
+// NOTE(review): initializes the PMIS-2 work arrays. Point types are remapped so that COARSE (2)
+// dominates UNDECIDED (1) dominates FINE (0) in the triple-max of amg_pmis2_max_neighborhood;
+// random_weights and the point index serve as tie-breakers.
+template<typename StringT> +void generate_amg_pmis2_init_workdata(StringT & source) +{ + + source.append("__kernel void amg_pmis2_init_workdata( \n"); + source.append(" __global unsigned int *work_state, \n"); + source.append(" __global unsigned int *work_random, \n"); + source.append(" __global unsigned int *work_index, \n"); + source.append(" __global unsigned int const *point_types, \n"); + source.append(" __global unsigned int const *random_weights, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); + source.append(" switch (point_types[i]) { \n"); + source.append(" case 0: work_state[i] = 1; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED + source.append(" case 1: work_state[i] = 2; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE + source.append(" case 2: work_state[i] = 0; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE + + source.append(" default: break; // do nothing \n"); + source.append(" } \n"); + + source.append(" work_random[i] = random_weights[i]; \n"); + source.append(" work_index[i] = i; \n"); + source.append(" } \n"); + source.append("} \n"); +} + + + +template<typename StringT> +void generate_amg_pmis2_max_neighborhood(StringT & source) +{ + + source.append("__kernel void amg_pmis2_max_neighborhood( \n"); + source.append(" __global unsigned int *work_state, \n"); + source.append(" __global unsigned int *work_random, \n"); + source.append(" __global unsigned int *work_index, \n"); + source.append(" __global unsigned int *work_state2, \n"); + source.append(" __global unsigned int *work_random2, 
\n"); + source.append(" __global unsigned int *work_index2, \n"); + source.append(" __global unsigned int const *influences_row, \n"); + source.append(" __global unsigned int const *influences_id, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); + + // load + source.append(" unsigned int state = work_state[i]; \n"); + source.append(" unsigned int random = work_random[i]; \n"); + source.append(" unsigned int index = work_index[i]; \n"); + + // max + source.append(" unsigned int j_stop = influences_row[i + 1]; \n"); + source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n"); + source.append(" unsigned int influenced_point_id = influences_id[j]; \n"); + + // lexicographical triple-max over (state, random, index) (not particularly pretty, but does the job): + source.append(" if (state < work_state[influenced_point_id]) { \n"); + source.append(" state = work_state[influenced_point_id]; \n"); + source.append(" random = work_random[influenced_point_id]; \n"); + source.append(" index = work_index[influenced_point_id]; \n"); + source.append(" } else if (state == work_state[influenced_point_id]) { \n"); + source.append(" if (random < work_random[influenced_point_id]) { \n"); + source.append(" state = work_state[influenced_point_id]; \n"); + source.append(" random = work_random[influenced_point_id]; \n"); + source.append(" index = work_index[influenced_point_id]; \n"); + source.append(" } else if (random == work_random[influenced_point_id]) { \n"); + source.append(" if (index < work_index[influenced_point_id]) { \n"); + source.append(" state = work_state[influenced_point_id]; \n"); + source.append(" random = work_random[influenced_point_id]; \n"); + source.append(" index = work_index[influenced_point_id]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + + source.append(" }\n"); //for + + // store (double-buffered: results go to the *2 arrays so reads above stay race-free)
 source.append(" work_state2[i] = state; 
\n"); + source.append(" work_random2[i] = random; \n"); + source.append(" work_index2[i] = index; \n"); + source.append(" } \n"); + source.append("} \n"); +} + + + +template<typename StringT> +void generate_amg_pmis2_mark_mis_nodes(StringT & source) +{ + + source.append("__kernel void amg_pmis2_mark_mis_nodes( \n"); + source.append(" __global unsigned int const *work_state, \n"); + source.append(" __global unsigned int const *work_index, \n"); + source.append(" __global unsigned int *point_types, \n"); + source.append(" __global unsigned int *undecided_buffer, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" unsigned int num_undecided = 0; \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); + source.append(" unsigned int max_state = work_state[i]; \n"); + source.append(" unsigned int max_index = work_index[i]; \n"); + + source.append(" if (point_types[i] == 0) { \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED + source.append(" if (i == max_index) point_types[i] = 1; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE + source.append(" else if (max_state == 2) point_types[i] = 2; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE + source.append(" else num_undecided += 1; \n"); + source.append(" } \n"); + source.append(" } \n"); + + // reduction in shared memory:
 // NOTE(review): fixed 256-entry scratch buffer and the halving loop assume the host
 // launches this kernel with a power-of-two local size <= 256 -- confirm at the call site.
 source.append(" __local unsigned int shared_buffer[256]; \n"); + source.append(" shared_buffer[get_local_id(0)] = num_undecided; \n"); + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) shared_buffer[get_local_id(0)] += shared_buffer[get_local_id(0)+stride]; \n"); + source.append(" } \n"); + + source.append(" if (get_local_id(0) == 0) \n"); + source.append(" undecided_buffer[get_group_id(0)] = 
shared_buffer[0]; \n"); + + source.append("} \n"); +} + + +template<typename StringT> +void generate_amg_pmis2_reset_state(StringT & source) +{ + + source.append("__kernel void amg_pmis2_reset_state( \n"); + source.append(" __global unsigned int *point_types, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); + source.append(" if (point_types[i] != 1) point_types[i] = 0;\n"); // mind mapping of POINT_TYPE_COARSE and POINT_TYPE_UNDECIDED + source.append(" } \n"); + + source.append("} \n"); +} + + + +////////////// + + + +template<typename StringT> +void generate_amg_agg_propagate_coarse_indices(StringT & source) +{ + + source.append(" __kernel void amg_agg_propagate_coarse_indices( \n"); + source.append(" __global unsigned int *point_types, \n"); + source.append(" __global unsigned int *coarse_ids, \n"); + source.append(" __global unsigned int const *influences_row, \n"); + source.append(" __global unsigned int const *influences_id, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" if (point_types[i] == 1) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE + source.append(" unsigned int coarse_index = coarse_ids[i]; \n"); + + source.append(" unsigned int j_stop = influences_row[i + 1]; \n"); + source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n"); + source.append(" unsigned int influenced_point_id = influences_id[j]; \n"); + source.append(" coarse_ids[influenced_point_id] = coarse_index; \n"); + source.append(" if (influenced_point_id != i) point_types[influenced_point_id] = 2; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + +} + + + 
+// NOTE(review): an undecided point adopts the coarse id of its first decided neighbor;
+// points with no decided neighbor are left untouched here and later demoted to FINE by
+// amg_agg_merge_undecided_2.
+template<typename StringT> +void generate_amg_agg_merge_undecided(StringT & source) +{ + + source.append(" __kernel void amg_agg_merge_undecided( \n"); + source.append(" __global unsigned int *point_types, \n"); + source.append(" __global unsigned int *coarse_ids, \n"); + source.append(" __global unsigned int const *influences_row, \n"); + source.append(" __global unsigned int const *influences_id, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" if (point_types[i] == 0) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED + + source.append(" unsigned int j_stop = influences_row[i + 1]; \n"); + source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n"); + source.append(" unsigned int influenced_point_id = influences_id[j]; \n"); + source.append(" if (point_types[influenced_point_id] != 0) { \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED + source.append(" coarse_ids[i] = coarse_ids[influenced_point_id]; \n"); + source.append(" break; \n"); + source.append(" } \n"); + source.append(" } \n"); + + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + +} + + +template<typename StringT> +void generate_amg_agg_merge_undecided_2(StringT & source) +{ + + source.append(" __kernel void amg_agg_merge_undecided_2( \n"); + source.append(" __global unsigned int *point_types, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); + source.append(" if (point_types[i] == 0) point_types[i] = 2; \n"); // POINT_TYPE_UNDECIDED to POINT_TYPE_FINE + + source.append("} \n"); +} + +////////////////////// + +template<typename StringT> +void generate_amg_interpol_ag(StringT & source, std::string const & numeric_string) +{ + + source.append(" 
__kernel void amg_interpol_ag( \n"); + source.append(" __global unsigned int * P_row_indices, \n"); + source.append(" __global unsigned int * P_column_indices, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * P_elements, \n"); + source.append(" __global const unsigned int * coarse_agg_ids, \n"); + source.append(" unsigned int size) { \n"); + + source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" P_row_indices[i] = i; \n"); + source.append(" P_column_indices[i] = coarse_agg_ids[i]; \n"); + source.append(" P_elements[i] = 1; \n"); + source.append(" } \n"); + source.append(" if (get_global_id(0) == 0) P_row_indices[size] = size; \n"); + source.append(" } \n"); + +} + 
+// NOTE(review): builds the weighted-Jacobi smoother matrix (1 - omega) on the diagonal and
+// -omega * A_ij / diag off-diagonal, used for smoothed-aggregation interpolation.
+template<typename StringT> +void generate_amg_interpol_sa(StringT & source, std::string const & numeric_string) +{ + + source.append("__kernel void amg_interpol_sa( \n"); + source.append(" __global unsigned int const *A_row_indices, \n"); + source.append(" __global unsigned int const *A_col_indices, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n"); + source.append(" unsigned int A_size1, \n"); + source.append(" unsigned int A_nnz, \n"); + source.append(" __global unsigned int *Jacobi_row_indices, \n"); + source.append(" __global unsigned int *Jacobi_col_indices, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" *Jacobi_elements, \n"); + source.append(" "); source.append(numeric_string); source.append(" omega) { \n"); + + source.append(" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" unsigned int row_begin = A_row_indices[row]; \n"); + source.append(" unsigned int row_end = A_row_indices[row+1]; \n"); + + source.append(" Jacobi_row_indices[row] = row_begin; \n"); + + // Step 1: Extract diagonal: + 
 source.append(" "); source.append(numeric_string); source.append(" diag = 0; \n"); + source.append(" for (unsigned int j = row_begin; j < row_end; ++j) { \n"); + source.append(" if (A_col_indices[j] == row) { \n"); + source.append(" diag = A_elements[j]; \n"); + source.append(" break; \n"); + source.append(" } \n"); + source.append(" } \n"); + 
 // Step 2: Write entries:
 // NOTE(review): if a row has no stored diagonal entry, diag stays 0 and the division
 // below yields inf/NaN -- presumably callers guarantee a fully populated diagonal; verify.
 source.append(" for (unsigned int j = row_begin; j < row_end; ++j) { \n"); + source.append(" unsigned int col_index = A_col_indices[j]; \n"); + source.append(" Jacobi_col_indices[j] = col_index; \n"); + source.append(" Jacobi_elements[j] = (col_index == row) ? (1 - omega) : (-omega * A_elements[j] / diag); \n"); + source.append(" } \n"); + + source.append(" } \n"); + source.append(" if (get_global_id(0) == 0) Jacobi_row_indices[A_size1] = A_nnz; \n"); + source.append("} \n"); + +} +//////////////////////////// Part 2: Main kernel class //////////////////////////////////// + +// main kernel class +/** @brief Main kernel class for generating OpenCL kernels for compressed_matrix. 
 */ 
+// NOTE(review): init() generates and registers all AMG kernels for NumericT in the given
+// context, memoized per cl_context via the static map. The map is not synchronized, so a
+// concurrent first-time init from multiple threads would race -- confirm threading contract.
+template<typename NumericT> +struct amg +{ + static std::string program_name() + { + return viennacl::ocl::type_to_string<NumericT>::apply() + "_amg"; + } + + static void init(viennacl::ocl::context & ctx) + { + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); + std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); + + std::string source; + source.reserve(2048); + + viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); + + generate_amg_influence_trivial(source); + generate_amg_pmis2_init_workdata(source); + generate_amg_pmis2_max_neighborhood(source); + generate_amg_pmis2_mark_mis_nodes(source); + generate_amg_pmis2_reset_state(source); + generate_amg_agg_propagate_coarse_indices(source); + generate_amg_agg_merge_undecided(source); + generate_amg_agg_merge_undecided_2(source); + + generate_amg_interpol_ag(source, numeric_string); + generate_amg_interpol_sa(source, numeric_string); + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init +}; + +} // namespace kernels +} // namespace opencl +} // namespace linalg +} // namespace viennacl +#endif +
