http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp deleted file mode 100644 index b350fe0..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/iterative_operations.hpp +++ /dev/null @@ -1,945 +0,0 @@ -#ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_ -#define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/opencl/iterative_operations.hpp - @brief Implementations of specialized kernels for fast iterative solvers using OpenCL -*/ - -#include <cmath> - -#include "viennacl/forwards.h" -#include "viennacl/detail/vector_def.hpp" -#include "viennacl/ocl/device.hpp" -#include "viennacl/ocl/handle.hpp" -#include "viennacl/ocl/kernel.hpp" -#include "viennacl/scalar.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/linalg/opencl/common.hpp" -#include "viennacl/linalg/opencl/kernels/iterative.hpp" -#include "viennacl/meta/predicate.hpp" -#include "viennacl/meta/enable_if.hpp" -#include "viennacl/traits/size.hpp" -#include "viennacl/traits/start.hpp" -#include "viennacl/traits/handle.hpp" -#include "viennacl/traits/stride.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace opencl -{ - -template<typename NumericT> -void pipelined_cg_vector_update(vector_base<NumericT> & result, - NumericT alpha, - vector_base<NumericT> & p, - vector_base<NumericT> & r, - vector_base<NumericT> const & Ap, - NumericT beta, - vector_base<NumericT> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_vector_update"); - cl_uint vec_size = cl_uint(viennacl::traits::size(result)); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)))); -} - -template<typename NumericT> -void pipelined_cg_prod(compressed_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0)); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "cg_csr_blocked_prod" : "cg_csr_prod"); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - if (use_nvidia_blocked) - { - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), - p, - Ap, - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - )); - } - else - { - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()), - p, - Ap, - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(1024 * sizeof(NumericT)) - )); - } - -} - -template<typename NumericT> -void pipelined_cg_prod(coordinate_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - - Ap.clear(); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_coo_prod"); - unsigned int thread_num = 256; //k.local_work_size(0); - - k.local_work_size(0, thread_num); - - k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases - - viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(), - p, - Ap, - vec_size, - viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num), - viennacl::ocl::local_mem(sizeof(NumericT)*thread_num), - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - )); -} - -template<typename NumericT> -void pipelined_cg_prod(ell_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_ell_prod"); - - unsigned int thread_num = 128; - unsigned int group_num = 256; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), - A.handle().opencl_handle(), - cl_uint(A.internal_size1()), - cl_uint(A.maxnnz()), - cl_uint(A.internal_maxnnz()), - viennacl::traits::opencl_handle(p), - viennacl::traits::opencl_handle(Ap), - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - -template<typename NumericT> -void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_sliced_ell_prod"); - - vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128)); - unsigned int group_num = 256; - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - thread_num = 256; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), - A.handle2().opencl_handle(), - A.handle3().opencl_handle(), - A.handle().opencl_handle(), - viennacl::traits::opencl_handle(p), - viennacl::traits::opencl_handle(Ap), - vec_size, - cl_uint(A.rows_per_block()), - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - - -template<typename NumericT> -void pipelined_cg_prod(hyb_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "cg_hyb_prod"); - - unsigned int thread_num = 128; - unsigned int group_num = 128; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), - A.handle().opencl_handle(), - A.handle3().opencl_handle(), - A.handle4().opencl_handle(), - A.handle5().opencl_handle(), - cl_uint(A.internal_size1()), - cl_uint(A.ell_nnz()), - cl_uint(A.internal_ellnnz()), - viennacl::traits::opencl_handle(p), - viennacl::traits::opencl_handle(Ap), - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - - -//////////////////////////// BiCGStab //////////////////////// - -template<typename NumericT> -void pipelined_bicgstab_update_s(vector_base<NumericT> & s, - vector_base<NumericT> & r, - vector_base<NumericT> const & Ap, - vector_base<NumericT> & inner_prod_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_update_s"); - cl_uint vec_size = cl_uint(viennacl::traits::size(s)); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - viennacl::ocl::enqueue(k(s, r, Ap, - inner_prod_buffer, chunk_size, chunk_offset, vec_size, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)))); -} - -template<typename NumericT> -void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s, - vector_base<NumericT> & residual, vector_base<NumericT> const & As, - NumericT beta, vector_base<NumericT> const & Ap, - vector_base<NumericT> const & r0star, - vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size) -{ - (void)buffer_chunk_size; - - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_vector_update"); - cl_uint vec_size = cl_uint(viennacl::traits::size(result)); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - viennacl::ocl::enqueue(k(result, alpha, p, omega, s, - residual, As, - beta, Ap, - r0star, - inner_prod_buffer, - vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - -template<typename NumericT> -void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> const & r0star, - vector_base<NumericT> & inner_prod_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0)); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), use_nvidia_blocked ? "bicgstab_csr_blocked_prod" : "bicgstab_csr_prod"); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - if (use_nvidia_blocked) - { - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), - p, - Ap, - r0star, - vec_size, - inner_prod_buffer, chunk_size, chunk_offset, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - )); - } - else - { - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()), - p, - Ap, - r0star, - vec_size, - inner_prod_buffer, chunk_size, chunk_offset, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - )); - } - -} - - -template<typename NumericT> -void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> const & r0star, - vector_base<NumericT> & inner_prod_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - - Ap.clear(); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_coo_prod"); - unsigned int thread_num = 256; //k.local_work_size(0); - - k.local_work_size(0, thread_num); - - k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases - - viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(), - p, - Ap, - r0star, - vec_size, - viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num), - viennacl::ocl::local_mem(sizeof(NumericT)*thread_num), - inner_prod_buffer, chunk_size, chunk_offset, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - )); -} - -template<typename NumericT> -void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> const & r0star, - vector_base<NumericT> & inner_prod_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_ell_prod"); - - unsigned int thread_num = 128; - unsigned int group_num = 128; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), - A.handle().opencl_handle(), - cl_uint(A.internal_size1()), - cl_uint(A.maxnnz()), - cl_uint(A.internal_maxnnz()), - viennacl::traits::opencl_handle(p), - viennacl::traits::opencl_handle(Ap), - r0star, - vec_size, - inner_prod_buffer, chunk_size, chunk_offset, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - -template<typename NumericT> -void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> const & r0star, - vector_base<NumericT> & inner_prod_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_sliced_ell_prod"); - - vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128)); - unsigned int group_num = 256; - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - thread_num = 256; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), - A.handle2().opencl_handle(), - A.handle3().opencl_handle(), - A.handle().opencl_handle(), - viennacl::traits::opencl_handle(p), - viennacl::traits::opencl_handle(Ap), - r0star, - vec_size, - cl_uint(A.rows_per_block()), - inner_prod_buffer, chunk_size, chunk_offset, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - - -template<typename NumericT> -void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A, - vector_base<NumericT> const & p, - vector_base<NumericT> & Ap, - vector_base<NumericT> const & r0star, - vector_base<NumericT> & inner_prod_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<NumericT>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<NumericT>::program_name(), "bicgstab_hyb_prod"); - - unsigned int thread_num = 256; - unsigned int group_num = 128; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*256); - } - - viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), - A.handle().opencl_handle(), - A.handle3().opencl_handle(), - A.handle4().opencl_handle(), - A.handle5().opencl_handle(), - cl_uint(A.internal_size1()), - cl_uint(A.ell_nnz()), - cl_uint(A.internal_ellnnz()), - viennacl::traits::opencl_handle(p), - viennacl::traits::opencl_handle(Ap), - r0star, - vec_size, - inner_prod_buffer, chunk_size, chunk_offset, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)) - ) - ); -} - -/////////////////////////////////// - -/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm. - * - * This routines computes for vectors 'r', 'v_k': - * Second reduction step for ||v_k|| - * v_k /= ||v_k|| - * First reduction step for <r, v_k> - */ -template <typename T> -void pipelined_gmres_normalize_vk(vector_base<T> & v_k, - vector_base<T> const & residual, - vector_base<T> & R_buffer, - vcl_size_t offset_in_R, - vector_base<T> const & inner_prod_buffer, - vector_base<T> & r_dot_vk_buffer, - vcl_size_t buffer_chunk_size, - vcl_size_t buffer_chunk_offset) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_normalize_vk"); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - cl_uint size_vk = cl_uint(v_k.size()); - cl_uint vk_offset = cl_uint(viennacl::traits::start(v_k)); - cl_uint R_offset = cl_uint(offset_in_R); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint chunk_offset = cl_uint(buffer_chunk_offset); - viennacl::ocl::enqueue(k(v_k, vk_offset, - residual, - R_buffer, R_offset, - inner_prod_buffer, chunk_size, - r_dot_vk_buffer, chunk_offset, - size_vk, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) - )); -} - -template <typename T> -void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis, - vcl_size_t v_k_size, - vcl_size_t v_k_internal_size, - vcl_size_t param_k, - vector_base<T> & vi_in_vk_buffer, - vcl_size_t buffer_chunk_size) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_1"); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - cl_uint size_vk = cl_uint(v_k_size); - cl_uint internal_size_vk = cl_uint(v_k_internal_size); - cl_uint ocl_k = cl_uint(param_k); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k, - vi_in_vk_buffer, chunk_size - )); -} - -template <typename T> -void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis, - vcl_size_t v_k_size, - vcl_size_t v_k_internal_size, - vcl_size_t param_k, - vector_base<T> const & vi_in_vk_buffer, - vector_base<T> & R_buffer, - vcl_size_t krylov_dim, - vector_base<T> & inner_prod_buffer, - vcl_size_t buffer_chunk_size) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_gram_schmidt_2"); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - cl_uint size_vk = cl_uint(v_k_size); - cl_uint internal_size_vk = cl_uint(v_k_internal_size); - cl_uint ocl_k = cl_uint(param_k); - cl_uint chunk_size = cl_uint(buffer_chunk_size); - cl_uint ocl_krylov_dim = cl_uint(krylov_dim); - viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k, - vi_in_vk_buffer, chunk_size, - R_buffer, ocl_krylov_dim, - inner_prod_buffer, - viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T)) - )); -} - -template <typename T> -void pipelined_gmres_update_result(vector_base<T> & result, - vector_base<T> const & residual, - vector_base<T> const & krylov_basis, - vcl_size_t v_k_size, - vcl_size_t v_k_internal_size, - vector_base<T> const & coefficients, - vcl_size_t param_k) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_update_result"); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - cl_uint size_vk = cl_uint(v_k_size); - cl_uint internal_size_vk = cl_uint(v_k_internal_size); - cl_uint ocl_k = cl_uint(param_k); - viennacl::ocl::enqueue(k(result, - residual, - krylov_basis, size_vk, internal_size_vk, - coefficients, ocl_k - )); -} - - -template <typename T> -void pipelined_gmres_prod(compressed_matrix<T> const & A, - vector_base<T> const & p, - vector_base<T> & Ap, - vector_base<T> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - bool use_nvidia_blocked = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id && (double(A.nnz()) / double(A.size1()) > 12.0)); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), use_nvidia_blocked ? "gmres_csr_blocked_prod" : "gmres_csr_prod"); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - cl_uint start_p = cl_uint(viennacl::traits::start(p)); - cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); - - k.local_work_size(0, 128); - k.global_work_size(0, 128*128); - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - { - k.local_work_size(0, 256); - k.global_work_size(0, 256*128); - } - - if (use_nvidia_blocked) - { - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), - p, start_p, - Ap, start_Ap, - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) - )); - } - else - { - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()), - p, start_p, - Ap, start_Ap, - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(1024 * sizeof(T)) - )); - } -} - -template <typename T> -void pipelined_gmres_prod(coordinate_matrix<T> const & A, - vector_base<T> const & p, - vector_base<T> & Ap, - vector_base<T> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - cl_uint start_p = cl_uint(viennacl::traits::start(p)); - cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); - - Ap.clear(); - inner_prod_buffer.clear(); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_coo_prod"); - unsigned int thread_num = 128; //k.local_work_size(0); - - k.local_work_size(0, thread_num); - - k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases - - viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(), - p, start_p, - Ap, start_Ap, - vec_size, - viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num), - viennacl::ocl::local_mem(sizeof(T)*thread_num), - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) - )); -} - -template <typename T> -void pipelined_gmres_prod(ell_matrix<T> const & A, - vector_base<T> const & p, - vector_base<T> & Ap, - vector_base<T> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - cl_uint start_p = cl_uint(viennacl::traits::start(p)); - cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_ell_prod"); - - unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128; - unsigned int group_num = 128; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), - A.handle().opencl_handle(), - cl_uint(A.internal_size1()), - cl_uint(A.maxnnz()), - cl_uint(A.internal_maxnnz()), - viennacl::traits::opencl_handle(p), start_p, - viennacl::traits::opencl_handle(Ap), start_Ap, - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) - ) - ); -} - -template <typename T> -void pipelined_gmres_prod(sliced_ell_matrix<T> const & A, - vector_base<T> const & p, - vector_base<T> & Ap, - vector_base<T> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - cl_uint start_p = cl_uint(viennacl::traits::start(p)); - cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_sliced_ell_prod"); - - vcl_size_t thread_num = std::max(A.rows_per_block(), static_cast<vcl_size_t>(128)); - unsigned int group_num = 128; - - if (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) - thread_num = 256; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), - A.handle2().opencl_handle(), - A.handle3().opencl_handle(), - A.handle().opencl_handle(), - viennacl::traits::opencl_handle(p), start_p, - viennacl::traits::opencl_handle(Ap), start_Ap, - vec_size, - cl_uint(A.rows_per_block()), - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) - ) - ); -} - - -template <typename T> -void pipelined_gmres_prod(hyb_matrix<T> const & A, - vector_base<T> const & p, - vector_base<T> & Ap, - vector_base<T> & inner_prod_buffer) -{ - viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); - viennacl::linalg::opencl::kernels::iterative<T>::init(ctx); - - cl_uint vec_size = cl_uint(viennacl::traits::size(p)); - cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3); - cl_uint start_p = cl_uint(viennacl::traits::start(p)); - cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap)); - - viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::iterative<T>::program_name(), "gmres_hyb_prod"); - - unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128; - unsigned int group_num = 128; - - k.local_work_size(0, thread_num); - k.global_work_size(0, thread_num * group_num); - - - viennacl::ocl::enqueue(k(A.handle2().opencl_handle(), - A.handle().opencl_handle(), - A.handle3().opencl_handle(), - A.handle4().opencl_handle(), - A.handle5().opencl_handle(), - cl_uint(A.internal_size1()), - cl_uint(A.ell_nnz()), - cl_uint(A.internal_ellnnz()), - viennacl::traits::opencl_handle(p), start_p, - viennacl::traits::opencl_handle(Ap), start_Ap, - vec_size, - inner_prod_buffer, - buffer_size_per_vector, - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)), - viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)) - ) - ); -} - - -} //namespace opencl -} //namespace linalg -} //namespace viennacl - - -#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp deleted file mode 100644 index b0252d7..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/amg.hpp +++ /dev/null @@ -1,393 +0,0 @@ -#ifndef VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP -#define VIENNACL_LINALG_OPENCL_KERNELS_AMG_HPP - -#include "viennacl/tools/tools.hpp" -#include "viennacl/ocl/kernel.hpp" -#include "viennacl/ocl/platform.hpp" -#include "viennacl/ocl/utils.hpp" - -#include "viennacl/linalg/opencl/common.hpp" - -/** @file viennacl/linalg/opencl/kernels/amg.hpp - * @brief OpenCL kernel file for operations related to algebraic multigrid */ -namespace viennacl -{ -namespace linalg -{ -namespace opencl -{ -namespace kernels -{ - - -template<typename StringT> -void generate_amg_influence_trivial(StringT & source) -{ - - source.append("__kernel void amg_influence_trivial( \n"); - source.append(" __global const unsigned int * A_row_indices, \n"); - source.append(" __global const unsigned int * A_col_indices, \n"); - source.append(" unsigned int A_size1, \n"); - source.append(" unsigned int A_nnz, \n"); - source.append(" __global unsigned int * influences_row, \n"); - source.append(" __global unsigned int * influences_id, \n"); - source.append(" __global unsigned int * influences_values) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < A_size1; i += get_global_size(0)) \n"); - source.append(" { \n"); - source.append(" unsigned int tmp = A_row_indices[i]; \n"); - source.append(" influences_row[i] = tmp; \n"); - source.append(" influences_values[i] = A_row_indices[i+1] - tmp; \n"); - source.append(" } \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < A_nnz; i += get_global_size(0)) \n"); - source.append(" influences_id[i] = A_col_indices[i]; \n"); - - source.append(" if (get_global_id(0) == 0) \n"); - source.append(" influences_row[A_size1] = A_row_indices[A_size1]; \n"); - source.append("} \n"); - -} - - -template<typename StringT> -void generate_amg_pmis2_init_workdata(StringT & source) -{ - - source.append("__kernel void amg_pmis2_init_workdata( \n"); - source.append(" __global unsigned int *work_state, \n"); - source.append(" __global unsigned int *work_random, \n"); - source.append(" __global unsigned int *work_index, \n"); - source.append(" __global unsigned int const *point_types, \n"); - source.append(" __global unsigned int const *random_weights, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); - source.append(" switch (point_types[i]) { \n"); - source.append(" case 0: work_state[i] = 1; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED - source.append(" case 1: work_state[i] = 2; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE - source.append(" case 2: work_state[i] = 0; break; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE - - source.append(" default: break; // do nothing \n"); - source.append(" } \n"); - - source.append(" work_random[i] = random_weights[i]; \n"); - source.append(" work_index[i] = i; \n"); - source.append(" } \n"); - source.append("} \n"); -} - - - -template<typename StringT> -void generate_amg_pmis2_max_neighborhood(StringT & source) -{ - - source.append("__kernel void amg_pmis2_max_neighborhood( \n"); - source.append(" __global unsigned int *work_state, \n"); - source.append(" __global unsigned int *work_random, \n"); - source.append(" __global unsigned int *work_index, \n"); - source.append(" __global unsigned int *work_state2, \n"); - source.append(" __global unsigned int *work_random2, \n"); - source.append(" __global unsigned int *work_index2, \n"); - source.append(" __global unsigned int const *influences_row, \n"); - source.append(" __global unsigned int const *influences_id, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); - - // load - source.append(" unsigned int state = work_state[i]; \n"); - source.append(" unsigned int random = work_random[i]; \n"); - source.append(" unsigned int index = work_index[i]; \n"); - - // max - source.append(" unsigned int j_stop = influences_row[i + 1]; \n"); - source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n"); - source.append(" unsigned int influenced_point_id = influences_id[j]; \n"); - - // lexigraphical triple-max (not particularly pretty, but does the job): - source.append(" if (state < work_state[influenced_point_id]) { \n"); - source.append(" state = work_state[influenced_point_id]; \n"); - source.append(" random = work_random[influenced_point_id]; \n"); - source.append(" index = work_index[influenced_point_id]; \n"); - source.append(" } else if (state == work_state[influenced_point_id]) { \n"); - source.append(" if (random < work_random[influenced_point_id]) { \n"); - source.append(" state = work_state[influenced_point_id]; \n"); - source.append(" random = work_random[influenced_point_id]; \n"); - source.append(" index = work_index[influenced_point_id]; \n"); - source.append(" } else if (random == work_random[influenced_point_id]) { \n"); - source.append(" if (index < work_index[influenced_point_id]) { \n"); - source.append(" state = work_state[influenced_point_id]; \n"); - source.append(" random = work_random[influenced_point_id]; \n"); - source.append(" index = work_index[influenced_point_id]; \n"); - source.append(" } \n"); - source.append(" } \n"); - source.append(" } \n"); - - source.append(" }\n"); //for - - // store - source.append(" work_state2[i] = state; \n"); - source.append(" work_random2[i] = random; \n"); - source.append(" work_index2[i] = index; \n"); - source.append(" } \n"); - source.append("} \n"); -} - - - -template<typename StringT> -void generate_amg_pmis2_mark_mis_nodes(StringT & source) -{ - - source.append("__kernel void amg_pmis2_mark_mis_nodes( \n"); - source.append(" __global unsigned int const *work_state, \n"); - source.append(" __global unsigned int const *work_index, \n"); - source.append(" __global unsigned int *point_types, \n"); - source.append(" __global unsigned int *undecided_buffer, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" unsigned int num_undecided = 0; \n"); - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); - source.append(" unsigned int max_state = work_state[i]; \n"); - source.append(" unsigned int max_index = work_index[i]; \n"); - - source.append(" if (point_types[i] == 0) { \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED - source.append(" if (i == max_index) point_types[i] = 1; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE - source.append(" else if (max_state == 2) point_types[i] = 2; \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE - source.append(" else num_undecided += 1; \n"); - source.append(" } \n"); - source.append(" } \n"); - - // reduction in shared memory: - source.append(" __local unsigned int shared_buffer[256]; \n"); - source.append(" shared_buffer[get_local_id(0)] = num_undecided; \n"); - source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); - source.append(" if (get_local_id(0) < stride) shared_buffer[get_local_id(0)] += shared_buffer[get_local_id(0)+stride]; \n"); - source.append(" } \n"); - - source.append(" if (get_local_id(0) == 0) \n"); - source.append(" undecided_buffer[get_group_id(0)] = shared_buffer[0]; \n"); - - source.append("} \n"); -} - - -template<typename StringT> -void generate_amg_pmis2_reset_state(StringT & source) -{ - - source.append("__kernel void amg_pmis2_reset_state( \n"); - source.append(" __global unsigned int *point_types, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n"); - source.append(" if (point_types[i] != 1) point_types[i] = 0;\n"); // mind mapping of POINT_TYPE_COARSE and POINT_TYPE_UNDECIDED - source.append(" } \n"); - - source.append("} \n"); -} - - - -////////////// - - - -template<typename StringT> -void generate_amg_agg_propagate_coarse_indices(StringT & source) -{ - - source.append(" __kernel void amg_agg_propagate_coarse_indices( \n"); - source.append(" __global unsigned int *point_types, \n"); - source.append(" __global unsigned int *coarse_ids, \n"); - source.append(" __global unsigned int const *influences_row, \n"); - source.append(" __global unsigned int const *influences_id, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); - source.append(" { \n"); - source.append(" if (point_types[i] == 1) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE - source.append(" unsigned int coarse_index = coarse_ids[i]; \n"); - - source.append(" unsigned int j_stop = influences_row[i + 1]; \n"); - source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n"); - source.append(" unsigned int influenced_point_id = influences_id[j]; \n"); - source.append(" coarse_ids[influenced_point_id] = coarse_index; \n"); - source.append(" if (influenced_point_id != i) point_types[influenced_point_id] = 2; \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE - source.append(" } \n"); - source.append(" } \n"); - source.append(" } \n"); - source.append("} \n"); - -} - - - -template<typename StringT> -void generate_amg_agg_merge_undecided(StringT & source) -{ - - source.append(" __kernel void amg_agg_merge_undecided( \n"); - source.append(" __global unsigned int *point_types, \n"); - source.append(" __global unsigned int *coarse_ids, \n"); - source.append(" __global unsigned int const *influences_row, \n"); - source.append(" __global unsigned int const *influences_id, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); - source.append(" { \n"); - source.append(" if (point_types[i] == 0) { \n"); //viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED - - source.append(" unsigned int j_stop = influences_row[i + 1]; \n"); - source.append(" for (unsigned int j = influences_row[i]; j < j_stop; ++j) { \n"); - source.append(" unsigned int influenced_point_id = influences_id[j]; \n"); - source.append(" if (point_types[influenced_point_id] != 0) { \n"); // viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED - source.append(" coarse_ids[i] = coarse_ids[influenced_point_id]; \n"); - source.append(" break; \n"); - source.append(" } \n"); - source.append(" } \n"); - - source.append(" } \n"); - source.append(" } \n"); - source.append("} \n"); - -} - - -template<typename StringT> -void generate_amg_agg_merge_undecided_2(StringT & source) -{ - - source.append(" __kernel void amg_agg_merge_undecided_2( \n"); - source.append(" __global unsigned int *point_types, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); - source.append(" if (point_types[i] == 0) point_types[i] = 2; \n"); // POINT_TYPE_UNDECIDED to POINT_TYPE_FINE - - source.append("} \n"); -} - -////////////////////// - -template<typename StringT> -void generate_amg_interpol_ag(StringT & source, std::string const & numeric_string) -{ - - source.append(" __kernel void amg_interpol_ag( \n"); - source.append(" __global unsigned int * P_row_indices, \n"); - source.append(" __global unsigned int * P_column_indices, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" * P_elements, \n"); - source.append(" __global const unsigned int * coarse_agg_ids, \n"); - source.append(" unsigned int size) { \n"); - - source.append(" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) \n"); - source.append(" { \n"); - source.append(" P_row_indices[i] = i; \n"); - source.append(" P_column_indices[i] = coarse_agg_ids[i]; \n"); - source.append(" P_elements[i] = 1; \n"); - source.append(" } \n"); - source.append(" if (get_global_id(0) == 0) P_row_indices[size] = size; \n"); - source.append(" } \n"); - -} - -template<typename StringT> -void generate_amg_interpol_sa(StringT & source, std::string const & numeric_string) -{ - - source.append("__kernel void amg_interpol_sa( \n"); - source.append(" __global unsigned int const *A_row_indices, \n"); - source.append(" __global unsigned int const *A_col_indices, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" const *A_elements, \n"); - source.append(" unsigned int A_size1, \n"); - source.append(" unsigned int A_nnz, \n"); - source.append(" __global unsigned int *Jacobi_row_indices, \n"); - source.append(" __global unsigned int *Jacobi_col_indices, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *Jacobi_elements, \n"); - source.append(" "); source.append(numeric_string); source.append(" omega) { \n"); - - source.append(" for (unsigned int row = get_global_id(0); row < A_size1; row += get_global_size(0)) \n"); - source.append(" { \n"); - source.append(" unsigned int row_begin = A_row_indices[row]; \n"); - source.append(" unsigned int row_end = A_row_indices[row+1]; \n"); - - source.append(" Jacobi_row_indices[row] = row_begin; \n"); - - // Step 1: Extract diagonal: - source.append(" "); source.append(numeric_string); source.append(" diag = 0; \n"); - source.append(" for (unsigned int j = row_begin; j < row_end; ++j) { \n"); - source.append(" if (A_col_indices[j] == row) { \n"); - source.append(" diag = A_elements[j]; \n"); - source.append(" break; \n"); - source.append(" } \n"); - source.append(" } \n"); - - // Step 2: Write entries: - source.append(" for (unsigned int j = row_begin; j < row_end; ++j) { \n"); - source.append(" unsigned int col_index = A_col_indices[j]; \n"); - source.append(" Jacobi_col_indices[j] = col_index; \n"); - source.append(" Jacobi_elements[j] = (col_index == row) ? (1 - omega) : (-omega * A_elements[j] / diag); \n"); - source.append(" } \n"); - - source.append(" } \n"); - source.append(" if (get_global_id(0) == 0) Jacobi_row_indices[A_size1] = A_nnz; \n"); - source.append("} \n"); - -} -//////////////////////////// Part 2: Main kernel class //////////////////////////////////// - -// main kernel class -/** @brief Main kernel class for generating OpenCL kernels for compressed_matrix. */ -template<typename NumericT> -struct amg -{ - static std::string program_name() - { - return viennacl::ocl::type_to_string<NumericT>::apply() + "_amg"; - } - - static void init(viennacl::ocl::context & ctx) - { - static std::map<cl_context, bool> init_done; - if (!init_done[ctx.handle().get()]) - { - viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); - std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); - - std::string source; - source.reserve(2048); - - viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); - - generate_amg_influence_trivial(source); - generate_amg_pmis2_init_workdata(source); - generate_amg_pmis2_max_neighborhood(source); - generate_amg_pmis2_mark_mis_nodes(source); - generate_amg_pmis2_reset_state(source); - generate_amg_agg_propagate_coarse_indices(source); - generate_amg_agg_merge_undecided(source); - generate_amg_agg_merge_undecided_2(source); - - generate_amg_interpol_ag(source, numeric_string); - generate_amg_interpol_sa(source, numeric_string); - - std::string prog_name = program_name(); - #ifdef VIENNACL_BUILD_INFO - std::cout << "Creating program " << prog_name << std::endl; - #endif - ctx.add_program(source, prog_name); - init_done[ctx.handle().get()] = true; - } //if - } //init -}; - -} // namespace kernels -} // namespace opencl -} // namespace linalg -} // namespace viennacl -#endif -
