http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp new file mode 100644 index 0000000..959bbd8 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/norm_inf.hpp @@ -0,0 +1,108 @@ +#ifndef VIENNACL_LINALG_NORM_INF_HPP_ +#define VIENNACL_LINALG_NORM_INF_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file norm_inf.hpp + @brief Generic interface for the l^infty-norm. See viennacl/linalg/vector_operations.hpp for implementations. +*/ + +#include <cmath> +#include "viennacl/forwards.h" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/meta/tag_of.hpp" + +namespace viennacl +{ + // + // generic norm_inf function + // uses tag dispatch to identify which algorithm + // should be called + // + namespace linalg + { + + #ifdef VIENNACL_WITH_UBLAS + // ---------------------------------------------------- + // UBLAS + // + template< typename VectorT > + typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT >::type >::value, + typename VectorT::value_type + >::type + norm_inf(VectorT const& v1) + { + return boost::numeric::ublas::norm_inf(v1); + } + #endif + + + // ---------------------------------------------------- + // STL + // + template< typename T, typename A > + T norm_inf(std::vector<T, A> const & v1) + { + //std::cout << "stl .. " << std::endl; + T result = 0; + for (typename std::vector<T, A>::size_type i=0; i<v1.size(); ++i) + { + if (std::fabs(v1[i]) > result) + result = std::fabs(v1[i]); + } + + return result; + } + + // ---------------------------------------------------- + // VIENNACL + // + template< typename ScalarType> + viennacl::scalar_expression< const viennacl::vector_base<ScalarType>, + const viennacl::vector_base<ScalarType>, + viennacl::op_norm_inf > + norm_inf(viennacl::vector_base<ScalarType> const & v1) + { + //std::cout << "viennacl .. " << std::endl; + return viennacl::scalar_expression< const viennacl::vector_base<ScalarType>, + const viennacl::vector_base<ScalarType>, + viennacl::op_norm_inf >(v1, v1); + } + + // with vector expression: + template<typename LHS, typename RHS, typename OP> + viennacl::scalar_expression<const viennacl::vector_expression<const LHS, const RHS, OP>, + const viennacl::vector_expression<const LHS, const RHS, OP>, + viennacl::op_norm_inf> + norm_inf(viennacl::vector_expression<const LHS, const RHS, OP> const & vector) + { + return viennacl::scalar_expression< const viennacl::vector_expression<const LHS, const RHS, OP>, + const viennacl::vector_expression<const LHS, const RHS, OP>, + viennacl::op_norm_inf >(vector, vector); + } + + + } // end namespace linalg +} // end namespace viennacl +#endif + + + + +
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp new file mode 100644 index 0000000..7cdcf89 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/amg_operations.hpp @@ -0,0 +1,458 @@ +#ifndef VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP +#define VIENNACL_LINALG_OPENCL_AMG_OPERATIONS_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file opencl/amg_operations.hpp + @brief Implementations of routines for AMG in OpenCL. +*/ + +#include <cstdlib> +#include <cmath> +#include <map> + +#include "viennacl/linalg/detail/amg/amg_base.hpp" +#include "viennacl/linalg/opencl/common.hpp" +#include "viennacl/linalg/opencl/kernels/amg.hpp" + + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace amg +{ + + +/////////////////////////////////////////// + +/** @brief Routine for taking all connections in the matrix as strong */ +template<typename NumericT> +void amg_influence_trivial(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx); + viennacl::ocl::kernel & influence_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_influence_trivial"); + + viennacl::ocl::enqueue(influence_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), + cl_uint(A.size1()), + cl_uint(A.nnz()), + viennacl::traits::opencl_handle(amg_context.influence_jumper_), + viennacl::traits::opencl_handle(amg_context.influence_ids_), + viennacl::traits::opencl_handle(amg_context.influence_values_) + ) + ); +} + + +/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */ +template<typename NumericT> +void amg_influence_advanced(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)A; (void)amg_context; (void)tag; + throw std::runtime_error("amg_influence_advanced() not implemented for OpenCL yet"); +} + + +/** @brief Dispatcher for influence processing */ +template<typename NumericT> +void amg_influence(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + // TODO: dispatch based on influence tolerance provided + amg_influence_trivial(A, amg_context, tag); +} + + + +/** @brief Assign IDs to coarse points. +* +* TODO: Use exclusive_scan on GPU for this. +*/ +inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context) +{ + viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size()); + viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(), amg_context.coarse_id_.size()); + viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get()); + viennacl::backend::memory_read(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get()); + + unsigned int coarse_id = 0; + for (std::size_t i=0; i<amg_context.point_types_.size(); ++i) + { + coarse_ids.set(i, coarse_id); + if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + ++coarse_id; + } + + amg_context.num_coarse_ = coarse_id; + + viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get()); +} + + +////////////////////////////////////// + + + +/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1 +* +* @param A Operator matrix on all levels +* @param amg_context AMG hierarchy datastructures +* @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx); + + viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY)); + unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle()); + for (std::size_t i=0; i<random_weights.size(); ++i) + random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1()); + random_weights.switch_memory_context(viennacl::traits::context(A)); + + // work vectors: + viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A)); + + viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A)); + + unsigned int num_undecided = static_cast<unsigned int>(A.size1()); + viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A)); + viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size()); + + viennacl::ocl::kernel & init_workdata_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_init_workdata"); + viennacl::ocl::kernel & max_neighborhood_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_max_neighborhood"); + viennacl::ocl::kernel & mark_mis_nodes_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_mark_mis_nodes"); + viennacl::ocl::kernel & reset_state_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_pmis2_reset_state"); + + unsigned int pmis_iters = 0; + while (num_undecided > 0) + { + ++pmis_iters; + + // + // init temporary work data: + // + viennacl::ocl::enqueue(init_workdata_kernel(work_state, work_random, work_index, + amg_context.point_types_, + random_weights, + cl_uint(A.size1()) + ) + ); + + // + // Propagate maximum tuple twice + // + for (unsigned int r = 0; r < 2; ++r) + { + // max operation + viennacl::ocl::enqueue(max_neighborhood_kernel(work_state, work_random, work_index, + work_state2, work_random2, work_index2, + amg_context.influence_jumper_, amg_context.influence_ids_, + cl_uint(A.size1()) + ) + ); + + // copy work array (can be fused into a single kernel if needed. Previous kernel is in most cases sufficiently heavy) + work_state = work_state2; + work_random = work_random2; + work_index = work_index2; + } + + // + // mark MIS and non-MIS nodes: + // + viennacl::ocl::enqueue(mark_mis_nodes_kernel(work_state, work_index, + amg_context.point_types_, + undecided_buffer, + cl_uint(A.size1()) + ) + ); + + // get number of undecided points on host: + viennacl::backend::memory_read(undecided_buffer.handle(), 0, undecided_buffer_host.raw_size(), undecided_buffer_host.get()); + num_undecided = 0; + for (std::size_t i=0; i<undecided_buffer.size(); ++i) + num_undecided += undecided_buffer_host[i]; + + } //while + + viennacl::ocl::enqueue(reset_state_kernel(amg_context.point_types_, cl_uint(amg_context.point_types_.size()) ) ); +} + + + +/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG) +* +* @param A Operator matrix +* @param amg_context AMG hierarchy datastructures +* @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_coarse_ag(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx); + + amg_influence_trivial(A, amg_context, tag); + + // + // Stage 1: Build aggregates: + // + if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION) + amg_coarse_ag_stage1_mis2(A, amg_context, tag); + else + throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with OpenCL backend!"); + + viennacl::linalg::opencl::amg::enumerate_coarse_points(amg_context); + + // + // Stage 2: Propagate coarse aggregate indices to neighbors: + // + viennacl::ocl::kernel & propagate_coarse_indices = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_propagate_coarse_indices"); + viennacl::ocl::enqueue(propagate_coarse_indices(amg_context.point_types_, + amg_context.coarse_id_, + amg_context.influence_jumper_, + amg_context.influence_ids_, + cl_uint(A.size1()) + ) + ); + + // + // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy + // + viennacl::ocl::kernel & merge_undecided = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided"); + viennacl::ocl::enqueue(merge_undecided(amg_context.point_types_, + amg_context.coarse_id_, + amg_context.influence_jumper_, + amg_context.influence_ids_, + cl_uint(A.size1()) + ) + ); + + // + // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3) + // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution). + // + viennacl::ocl::kernel & merge_undecided_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_agg_merge_undecided_2"); + viennacl::ocl::enqueue(merge_undecided_2(amg_context.point_types_, cl_uint(A.size1()) ) ); + +} + + + + +/** @brief Calls the right coarsening procedure +* +* @param A Operator matrix on all levels +* @param amg_context AMG hierarchy datastructures +* @param tag AMG preconditioner tag +*/ +template<typename InternalT1> +void amg_coarse(InternalT1 & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + switch (tag.get_coarsening_method()) + { + case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break; + default: throw std::runtime_error("not implemented yet"); + } +} + + + + +////////////////////////////////////// Interpolation ///////////////////////////// + + +/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_SA) + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG configuration tag +*/ +template<typename NumericT> +void amg_interpol_ag(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx); + + (void)tag; + P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A)); + + // build matrix here + viennacl::ocl::kernel & interpolate_ag = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_ag"); + viennacl::ocl::enqueue(interpolate_ag(P.handle1().opencl_handle(), + P.handle2().opencl_handle(), + P.handle().opencl_handle(), + amg_context.coarse_id_, + cl_uint(A.size1()) + ) + ); + + P.generate_row_block_information(); +} + +/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA) + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG configuration tag +*/ +template<typename NumericT> +void amg_interpol_sa(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::amg<NumericT>::init(ctx); + + (void)tag; + viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A)); + + // form tentative operator: + amg_interpol_ag(A, P_tentative, amg_context, tag); + + viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A)); + + viennacl::ocl::kernel & interpol_sa = ctx.get_kernel(viennacl::linalg::opencl::kernels::amg<NumericT>::program_name(), "amg_interpol_sa"); + viennacl::ocl::enqueue(interpol_sa(A.handle1().opencl_handle(), + A.handle2().opencl_handle(), + A.handle().opencl_handle(), + cl_uint(A.size1()), + cl_uint(A.nnz()), + Jacobi.handle1().opencl_handle(), + Jacobi.handle2().opencl_handle(), + Jacobi.handle().opencl_handle(), + NumericT(tag.get_jacobi_weight()) + ) + ); + + P = viennacl::linalg::prod(Jacobi, P_tentative); + + P.generate_row_block_information(); +} + +/** @brief Dispatcher for building the interpolation matrix + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG configuration tag +*/ +template<typename MatrixT> +void amg_interpol(MatrixT const & A, + MatrixT & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + switch (tag.get_interpolation_method()) + { + case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break; + case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break; + default: throw std::runtime_error("Not implemented yet!"); + } +} + +/** Assign sparse matrix A to dense matrix B */ +template<typename NumericT, unsigned int AlignmentV> +void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, + viennacl::matrix_base<NumericT> & B) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx); + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), + "assign_to_dense"); + + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), + viennacl::traits::opencl_handle(B), + cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)), + cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)), + cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)), + cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)) )); + +} + +/** @brief Jacobi Smoother (OpenCL version) +* +* @param iterations Number of smoother iterations +* @param A Operator matrix for the smoothing +* @param x The vector smoothing is applied to +* @param x_backup (Different) Vector holding the same values as x +* @param rhs_smooth The right hand side of the equation for the smoother +* @param weight Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration +*/ +template<typename NumericT> +void smooth_jacobi(unsigned int iterations, + compressed_matrix<NumericT> const & A, + vector<NumericT> & x, + vector<NumericT> & x_backup, + vector<NumericT> const & rhs_smooth, + NumericT weight) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context()); + viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::init(ctx); + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::compressed_matrix<NumericT>::program_name(), "jacobi"); + + for (unsigned int i=0; i<iterations; ++i) + { + x_backup = x; + + viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), + static_cast<NumericT>(weight), + viennacl::traits::opencl_handle(x_backup), + viennacl::traits::opencl_handle(x), + viennacl::traits::opencl_handle(rhs_smooth), + static_cast<cl_uint>(rhs_smooth.size()))); + + } +} + + +} //namespace amg +} //namespace host_based +} //namespace linalg +} //namespace viennacl + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp new file mode 100644 index 0000000..2fcd6fa --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/bisect_kernel_calls.hpp @@ -0,0 +1,177 @@ +#ifndef VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_ +#define VIENNACL_LINALG_OPENCL_BISECT_KERNEL_CALLS_HPP_ + + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/opencl/bisect_kernel_calls.hpp + @brief OpenCL kernel calls for the bisection algorithm + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." +*/ + +// includes, project +#include "viennacl/linalg/opencl/kernels/bisect.hpp" +#include "viennacl/linalg/detail/bisect/structs.hpp" +#include "viennacl/linalg/detail/bisect/config.hpp" +#include "viennacl/linalg/detail/bisect/util.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +const std::string BISECT_KERNEL_SMALL = "bisectKernelSmall"; +const std::string BISECT_KERNEL_LARGE = "bisectKernelLarge"; +const std::string BISECT_KERNEL_LARGE_ONE_INTERVALS = "bisectKernelLarge_OneIntervals"; +const std::string BISECT_KERNEL_LARGE_MULT_INTERVALS = "bisectKernelLarge_MultIntervals"; + +template<typename NumericT> +void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, + viennacl::linalg::detail::ResultDataSmall<NumericT> &result, + const unsigned int mat_size, + const NumericT lg, const NumericT ug, + const NumericT precision) + { + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context()); + viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx); + + viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_SMALL); + kernel.global_work_size(0, 1 * VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX); + kernel.local_work_size(0, VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX); + + viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a), + viennacl::traits::opencl_handle(input.g_b), + static_cast<cl_uint>(mat_size), + viennacl::traits::opencl_handle(result.vcl_g_left), + viennacl::traits::opencl_handle(result.vcl_g_right), + viennacl::traits::opencl_handle(result.vcl_g_left_count), + viennacl::traits::opencl_handle(result.vcl_g_right_count), + static_cast<NumericT>(lg), + static_cast<NumericT>(ug), + static_cast<cl_uint>(0), + static_cast<cl_uint>(mat_size), + static_cast<NumericT>(precision) + )); + + } + +template<typename NumericT> +void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, + viennacl::linalg::detail::ResultDataLarge<NumericT> &result, + const unsigned int mat_size, + const NumericT lg, const NumericT ug, + const NumericT precision) + { + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context()); + viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx); + + viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE); + kernel.global_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); // Use only 128 threads for 256 < n <= 512, this + kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); // is reasoned + + viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a), + viennacl::traits::opencl_handle(input.g_b), + static_cast<cl_uint>(mat_size), + static_cast<NumericT>(lg), + static_cast<NumericT>(ug), + static_cast<cl_uint>(0), + static_cast<cl_uint>(mat_size), + static_cast<NumericT>(precision), + viennacl::traits::opencl_handle(result.g_num_one), + viennacl::traits::opencl_handle(result.g_num_blocks_mult), + viennacl::traits::opencl_handle(result.g_left_one), + viennacl::traits::opencl_handle(result.g_right_one), + viennacl::traits::opencl_handle(result.g_pos_one), + viennacl::traits::opencl_handle(result.g_left_mult), + viennacl::traits::opencl_handle(result.g_right_mult), + viennacl::traits::opencl_handle(result.g_left_count_mult), + viennacl::traits::opencl_handle(result.g_right_count_mult), + viennacl::traits::opencl_handle(result.g_blocks_mult), + viennacl::traits::opencl_handle(result.g_blocks_mult_sum) + )); + + } + +template<typename NumericT> +void bisectLargeOneIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, + viennacl::linalg::detail::ResultDataLarge<NumericT> &result, + const unsigned int mat_size, + const NumericT precision) + { + unsigned int num_one_intervals = result.g_num_one; + unsigned int num_blocks = viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals, + mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK: VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); + + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context()); + viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx); + + viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_ONE_INTERVALS); + kernel.global_work_size(0, num_blocks * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2)); + kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); + + viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a), + viennacl::traits::opencl_handle(input.g_b), + static_cast<cl_uint>(mat_size), + static_cast<cl_uint>(num_one_intervals), + viennacl::traits::opencl_handle(result.g_left_one), + viennacl::traits::opencl_handle(result.g_right_one), + viennacl::traits::opencl_handle(result.g_pos_one), + static_cast<NumericT>(precision) + )); + } + + +template<typename NumericT> +void bisectLargeMultIntervals(const viennacl::linalg::detail::InputData<NumericT> &input, + viennacl::linalg::detail::ResultDataLarge<NumericT> &result, + const unsigned int mat_size, + const NumericT precision) + { + unsigned int num_blocks_mult = result.g_num_blocks_mult; + + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input.g_a).context()); + viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::init(ctx); + + viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::bisect_kernel<NumericT>::program_name(), BISECT_KERNEL_LARGE_MULT_INTERVALS); + kernel.global_work_size(0, num_blocks_mult * (mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2)); + kernel.local_work_size(0, mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2); + + viennacl::ocl::enqueue(kernel(viennacl::traits::opencl_handle(input.g_a), + viennacl::traits::opencl_handle(input.g_b), + static_cast<cl_uint>(mat_size), + viennacl::traits::opencl_handle(result.g_blocks_mult), + viennacl::traits::opencl_handle(result.g_blocks_mult_sum), + viennacl::traits::opencl_handle(result.g_left_mult), + viennacl::traits::opencl_handle(result.g_right_mult), + viennacl::traits::opencl_handle(result.g_left_count_mult), + viennacl::traits::opencl_handle(result.g_right_count_mult), + viennacl::traits::opencl_handle(result.g_lambda_mult), + viennacl::traits::opencl_handle(result.g_pos_mult), + static_cast<NumericT>(precision) + )); + } +} // namespace opencl +} // namespace linalg +} // namespace viennacl + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp new file mode 100644 index 0000000..d6a288b --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/common.hpp @@ -0,0 +1,102 @@ +#ifndef VIENNACL_LINALG_OPENCL_COMMON_HPP_ +#define VIENNACL_LINALG_OPENCL_COMMON_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/opencl/common.hpp + @brief Common implementations shared by OpenCL-based operations +*/ + +#include <cmath> + +#include "viennacl/forwards.h" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/traits/handle.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace detail +{ + + + +inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign) +{ + return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) ); +} + + +/** @brief Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices. */ +inline std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major) +{ + if (B_transposed) + { + if (B_row_major && C_row_major) + return "trans_mat_mult_row_row"; + if (B_row_major && !C_row_major) + return "trans_mat_mult_row_col"; + if (!B_row_major && C_row_major) + return "trans_mat_mult_col_row"; + + return "trans_mat_mult_col_col"; + } + + if (B_row_major && C_row_major) + return "mat_mult_row_row"; + if (B_row_major && !C_row_major) + return "mat_mult_row_col"; + if (!B_row_major && C_row_major) + return "mat_mult_col_row"; + + return "mat_mult_col_col"; +} + + + +template<typename SomeT> +ocl::device const & current_device(SomeT const & obj) { return traits::opencl_handle(obj).context().current_device(); } + +inline std::string op_to_string(op_abs) { return "abs"; } +inline std::string op_to_string(op_acos) { return "acos"; } +inline std::string op_to_string(op_asin) { return "asin"; } +inline std::string op_to_string(op_atan) { return "atan"; } +inline std::string op_to_string(op_ceil) { return "ceil"; } +inline std::string op_to_string(op_cos) { return "cos"; } +inline std::string op_to_string(op_cosh) { return "cosh"; } +inline std::string op_to_string(op_exp) { return "exp"; } +inline std::string op_to_string(op_fabs) { return "fabs"; } +inline std::string op_to_string(op_floor) { return "floor"; } +inline std::string op_to_string(op_log) { return "log"; } +inline std::string op_to_string(op_log10) { return "log10"; } +inline std::string op_to_string(op_sin) { return "sin"; } +inline std::string op_to_string(op_sinh) { return "sinh"; } +inline std::string op_to_string(op_sqrt) { return "sqrt"; } +inline std::string op_to_string(op_tan) { return "tan"; } +inline std::string op_to_string(op_tanh) { return "tanh"; } + +} //namespace detail +} //namespace opencl +} //namespace linalg +} //namespace viennacl + + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp new file mode 100644 index 0000000..76874b1 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/direct_solve.hpp @@ -0,0 +1,153 @@ +#ifndef VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP +#define VIENNACL_LINALG_OPENCL_DIRECT_SOLVE_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/opencl/direct_solve.hpp + @brief Implementations of dense direct solvers are found here. +*/ + +#include "viennacl/vector.hpp" +#include "viennacl/matrix.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/device.hpp" +#include "viennacl/ocl/handle.hpp" +#include "viennacl/linalg/opencl/kernels/matrix_solve.hpp" +#include "viennacl/linalg/opencl/kernels/matrix.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ + +namespace detail +{ + inline cl_uint get_option_for_solver_tag(viennacl::linalg::upper_tag) { return 0; } + inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_upper_tag) { return (1 << 0); } + inline cl_uint get_option_for_solver_tag(viennacl::linalg::lower_tag) { return (1 << 2); } + inline cl_uint get_option_for_solver_tag(viennacl::linalg::unit_lower_tag) { return (1 << 2) | (1 << 0); } + + template<typename MatrixT1, typename MatrixT2, typename KernelT> + void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, KernelT & k) + { + viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A), + cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)), + cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)), + cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)), + cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)), + viennacl::traits::opencl_handle(B), + cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)), + cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)), + cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)), + cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)) + ) + ); + } +} + + +// +// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here. +// + +////////////////// upper triangular solver (upper_tag) ////////////////////////////////////// +/** @brief Direct inplace solver for dense triangular systems. Matlab notation: A \ B +* +* @param A The system matrix +* @param B The matrix of row vectors, where the solution is directly written to +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(matrix_base<NumericT> const & A, + matrix_base<NumericT> & B, + SolverTagT) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + + std::string program_name; + if (A.row_major() && B.row_major()) + { + typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, row_major> KernelClass; + KernelClass::init(ctx); + program_name = KernelClass::program_name(); + } + else if (A.row_major() && !B.row_major()) + { + typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, row_major, column_major> KernelClass; + KernelClass::init(ctx); + program_name = KernelClass::program_name(); + } + else if (!A.row_major() && B.row_major()) + { + typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, row_major> KernelClass; + KernelClass::init(ctx); + program_name = KernelClass::program_name(); + } + else + { + typedef viennacl::linalg::opencl::kernels::matrix_solve<NumericT, column_major, column_major> KernelClass; + KernelClass::init(ctx); + program_name = KernelClass::program_name(); + } + + std::stringstream ss; + ss << SolverTagT::name(); + ss << "_solve"; + + viennacl::ocl::kernel & k = ctx.get_kernel(program_name, ss.str()); + + k.global_work_size(0, B.size2() * k.local_work_size()); + detail::inplace_solve_impl(A, B, k); +} + + + +// +// Solve on vector +// + +template<typename NumericT, typename SOLVERTAG> +void inplace_solve(matrix_base<NumericT> const & A, + vector_base<NumericT> & x, + SOLVERTAG) +{ + cl_uint options = detail::get_option_for_solver_tag(SOLVERTAG()); + + viennacl::ocl::kernel & k = detail::legacy_kernel_for_matrix(A, "triangular_substitute_inplace"); + + k.global_work_size(0, k.local_work_size()); + viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A), + cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)), + cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)), + cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)), + cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)), + viennacl::traits::opencl_handle(x), + cl_uint(viennacl::traits::start(x)), + cl_uint(viennacl::traits::stride(x)), + cl_uint(viennacl::traits::size(x)), + options + ) + ); +} + +} //namespace opencl +} //namespace linalg +} //namespace viennacl + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp new file mode 100644 index 0000000..a7b12b3 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/fft_operations.hpp @@ -0,0 +1,350 @@ +#ifndef VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_ +#define VIENNACL_LINALG_OPENCL_FFT_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory + ============================================================================= */ + +/** @file viennacl/linalg/opencl/fft_operations.hpp + @brief Implementations of Fast Furier Transformation using OpenCL + */ + +#include "viennacl/forwards.h" +#include "viennacl/ocl/device.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/traits/handle.hpp" +#include "viennacl/traits/stride.hpp" +#include "viennacl/linalg/host_based/fft_operations.hpp" +#include "viennacl/linalg/opencl/kernels/fft.hpp" +#include "viennacl/linalg/opencl/kernels/matrix.hpp" + +#include <viennacl/vector.hpp> +#include <viennacl/matrix.hpp> + +#include <cmath> +#include <stdexcept> + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace fft +{ + + const vcl_size_t MAX_LOCAL_POINTS_NUM = 512; + + /** + * @brief Get number of bits + */ + inline vcl_size_t num_bits(vcl_size_t size) + { + vcl_size_t bits_datasize = 0; + vcl_size_t ds = 1; + + while (ds < size) + { + ds = ds << 1; + bits_datasize++; + } + + return bits_datasize; + } + + /** + * @brief Find next power of two + */ + inline vcl_size_t next_power_2(vcl_size_t n) + { + n = n - 1; + + vcl_size_t power = 1; + + while (power < sizeof(vcl_size_t) * 8) + { + n = n | (n >> power); + power *= 2; + } + + return n + 1; + } + +} //namespce fft +} //namespace detail + +namespace opencl +{ + +/** + * @brief Direct algorithm for computing Fourier transformation. + * + * Works on any sizes of data. + * Serial implementation has o(n^2) complexity + */ +template<typename NumericT> +void direct(viennacl::ocl::handle<cl_mem> const & in, + viennacl::ocl::handle<cl_mem> const & out, + vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name(); + if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR) + { + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx); + program_string = + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name(); + } else + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_direct"); + viennacl::ocl::enqueue(k(in, out, + static_cast<cl_uint>(size), + static_cast<cl_uint>(stride), + static_cast<cl_uint>(batch_num), + sign) + ); +} + +/* + * This function performs reorder of input data. Indexes are sorted in bit-reversal order. + * Such reordering should be done before in-place FFT. + */ +template<typename NumericT> +void reorder(viennacl::ocl::handle<cl_mem> const & in, + vcl_size_t size, vcl_size_t stride, + vcl_size_t bits_datasize, vcl_size_t batch_num, + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name(); + if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR) + { + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx); + program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name(); + } else + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx); + + viennacl::ocl::kernel& k = ctx.get_kernel(program_string, "fft_reorder"); + viennacl::ocl::enqueue(k(in, + static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size), + static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num)) + ); +} + +/** + * @brief Radix-2 algorithm for computing Fourier transformation. + * + * Works only on power-of-two sizes of data. + * Serial implementation has o(n * lg n) complexity. + * This is a Cooley-Tukey algorithm + */ +template<typename NumericT> +void radix2(viennacl::ocl::handle<cl_mem> const & in, + vcl_size_t size, vcl_size_t stride, + vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(in.context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + assert(batch_num != 0 && bool("batch_num must be larger than 0")); + + std::string program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::program_name(); + if (data_order == viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::COL_MAJOR) + { + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::init(ctx); + program_string = viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, column_major>::program_name(); + } else + viennacl::linalg::opencl::kernels::matrix_legacy<NumericT, row_major>::init(ctx); + + vcl_size_t bits_datasize = viennacl::linalg::detail::fft::num_bits(size); + if (size <= viennacl::linalg::detail::fft::MAX_LOCAL_POINTS_NUM) + { + viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2_local"); + viennacl::ocl::enqueue(k(in, + viennacl::ocl::local_mem((size * 4) * sizeof(NumericT)), + static_cast<cl_uint>(bits_datasize), static_cast<cl_uint>(size), + static_cast<cl_uint>(stride), static_cast<cl_uint>(batch_num), sign)); + + } + else + { + viennacl::linalg::opencl::reorder<NumericT>(in, size, stride, bits_datasize, batch_num); + + for (vcl_size_t step = 0; step < bits_datasize; step++) + { + viennacl::ocl::kernel & k = ctx.get_kernel(program_string, "fft_radix2"); + viennacl::ocl::enqueue(k(in, + static_cast<cl_uint>(step), static_cast<cl_uint>(bits_datasize), + static_cast<cl_uint>(size), static_cast<cl_uint>(stride), + static_cast<cl_uint>(batch_num), sign)); + } + } +} + +/** + * @brief Bluestein's algorithm for computing Fourier transformation. + * + * Currently, Works only for sizes of input data which less than 2^16. + * Uses a lot of additional memory, but should be fast for any size of data. + * Serial implementation has something about o(n * lg n) complexity + */ +template<typename NumericT, unsigned int AlignmentV> +void bluestein(viennacl::vector<NumericT, AlignmentV>& in, + viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + vcl_size_t size = in.size() >> 1; + vcl_size_t ext_size = viennacl::linalg::detail::fft::next_power_2(2 * size - 1); + + viennacl::vector<NumericT, AlignmentV> A(ext_size << 1); + viennacl::vector<NumericT, AlignmentV> B(ext_size << 1); + viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1); + + { + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "zero2"); + viennacl::ocl::enqueue(k(A, B, static_cast<cl_uint>(ext_size))); + } + { + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_pre"); + viennacl::ocl::enqueue(k(in, A, B, static_cast<cl_uint>(size), static_cast<cl_uint>(ext_size))); + } + + viennacl::linalg::convolve_i(A, B, Z); + + { + viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "bluestein_post"); + viennacl::ocl::enqueue(k(Z, out, static_cast<cl_uint>(size))); + } +} + +/** + * @brief Mutiply two complex vectors and store result in output + */ +template<typename NumericT, unsigned int AlignmentV> +void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1, + viennacl::vector<NumericT, AlignmentV> const & input2, + viennacl::vector<NumericT, AlignmentV> & output) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input1).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + vcl_size_t size = input1.size() >> 1; + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_mult_vec"); + viennacl::ocl::enqueue(k(input1, input2, output, static_cast<cl_uint>(size))); +} + +/** + * @brief Normalize vector on with his own size + */ +template<typename NumericT, unsigned int AlignmentV> +void normalize(viennacl::vector<NumericT, AlignmentV> & input) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "fft_div_vec_scalar"); + + vcl_size_t size = input.size() >> 1; + NumericT norm_factor = static_cast<NumericT>(size); + viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(size), norm_factor)); +} + +/** + * @brief Inplace_transpose matrix + */ +template<typename NumericT, unsigned int AlignmentV> +void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose_inplace"); + viennacl::ocl::enqueue(k(input, static_cast<cl_uint>(input.internal_size1() >> 1), + static_cast<cl_uint>(input.internal_size2()) >> 1)); +} + +/** + * @brief Transpose matrix + */ +template<typename NumericT, unsigned int AlignmentV> +void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input, + viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(input).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "transpose"); + viennacl::ocl::enqueue(k(input, output, static_cast<cl_uint>(input.internal_size1() >> 1), + static_cast<cl_uint>(input.internal_size2() >> 1))); +} + +/** + * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) + */ +template<typename NumericT> +void real_to_complex(viennacl::vector_base<NumericT> const & in, + viennacl::vector_base<NumericT> & out, vcl_size_t size) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "real_to_complex"); + viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size))); +} + +/** + * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) + */ +template<typename NumericT> +void complex_to_real(viennacl::vector_base<NumericT> const & in, + viennacl::vector_base<NumericT> & out, vcl_size_t size) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "complex_to_real"); + viennacl::ocl::enqueue(k(in, out, static_cast<cl_uint>(size))); +} + +/** + * @brief Reverse vector to oposite order and save it in input vector + */ +template<typename NumericT> +void reverse(viennacl::vector_base<NumericT>& in) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(in).context()); + viennacl::linalg::opencl::kernels::fft<NumericT>::init(ctx); + + vcl_size_t size = in.size(); + + viennacl::ocl::kernel& k = ctx.get_kernel(viennacl::linalg::opencl::kernels::fft<NumericT>::program_name(), "reverse_inplace"); + viennacl::ocl::enqueue(k(in, static_cast<cl_uint>(size))); +} + +} //namespace opencl +} //namespace linalg +} //namespace viennacl + +#endif /* FFT_OPERATIONS_HPP_ */ + http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp new file mode 100644 index 0000000..248a88a --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/ilu_operations.hpp @@ -0,0 +1,260 @@ +#ifndef VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_ +#define VIENNACL_LINALG_OPENCL_ILU_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/opencl/ilu_operations.hpp + @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using OpenCL +*/ + +#include <cmath> +#include <algorithm> //for std::max and std::min + +#include "viennacl/forwards.h" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/linalg/opencl/common.hpp" +#include "viennacl/linalg/opencl/kernels/ilu.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/traits/stride.hpp" +#include "viennacl/linalg/vector_operations.hpp" + + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ + +/////////////////////// ICC ///////////////////// + +template<typename NumericT> +void extract_L(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + // + // Step 1: Count elements in L: + // + viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_1"); + + viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()), + L.handle1().opencl_handle()) + ); + + // + // Step 2: Exclusive scan on row_buffers: + // + viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1); + viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer); + L.reserve(wrapped_L_row_buffer[L.size1()], false); + + + // + // Step 3: Write entries + // + viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_L_2"); + + viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), + L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle()) + ); + + L.generate_row_block_information(); + +} // extract_LU + +/////////////////////////////////////////////// + + + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */ +template<typename NumericT> +void icc_scale(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L) +{ + viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A)); + + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + // fill D: + viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1"); + viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) ); + + // scale L: + viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2"); + viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) ); + +} + +///////////////////////////////////// + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */ +template<typename NumericT> +void icc_chow_patel_sweep(compressed_matrix<NumericT> & L, + vector<NumericT> const & aij_L) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + viennacl::backend::mem_handle L_backup; + viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); + viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "icc_chow_patel_sweep_kernel"); + viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()), + aij_L) + ); + +} + + +/////////////////////// ILU ///////////////////// + +template<typename NumericT> +void extract_LU(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L, + compressed_matrix<NumericT> & U) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + // + // Step 1: Count elements in L and U: + // + viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_1"); + + viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), cl_uint(A.size1()), + L.handle1().opencl_handle(), + U.handle1().opencl_handle()) + ); + + // + // Step 2: Exclusive scan on row_buffers: + // + viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), A.size1() + 1, 0, 1); + viennacl::linalg::exclusive_scan(wrapped_L_row_buffer, wrapped_L_row_buffer); + L.reserve(wrapped_L_row_buffer[L.size1()], false); + + viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), A.size1() + 1, 0, 1); + viennacl::linalg::exclusive_scan(wrapped_U_row_buffer, wrapped_U_row_buffer); + U.reserve(wrapped_U_row_buffer[U.size1()], false); + + // + // Step 3: Write entries + // + viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "extract_LU_2"); + + viennacl::ocl::enqueue(k2(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), + L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), + U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle()) + ); + + L.generate_row_block_information(); + // Note: block information for U will be generated after transposition + +} // extract_LU + +/////////////////////////////////////////////// + + + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */ +template<typename NumericT> +void ilu_scale(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L, + compressed_matrix<NumericT> & U) +{ + viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A)); + + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + // fill D: + viennacl::ocl::kernel & k1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_1"); + viennacl::ocl::enqueue(k1(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.size1()), D) ); + + // scale L: + viennacl::ocl::kernel & k2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_scale_kernel_2"); + viennacl::ocl::enqueue(k2(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), cl_uint(A.size1()), D) ); + + // scale U: + viennacl::ocl::enqueue(k2(U.handle1().opencl_handle(), U.handle2().opencl_handle(), U.handle().opencl_handle(), cl_uint(A.size1()), D) ); + +} + +///////////////////////////////////// + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenCL (cf. Algorithm 2 in paper) */ +template<typename NumericT> +void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L, + vector<NumericT> const & aij_L, + compressed_matrix<NumericT> & U_trans, + vector<NumericT> const & aij_U_trans) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(L).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + viennacl::backend::mem_handle L_backup; + viennacl::backend::memory_create(L_backup, L.handle().raw_size(), viennacl::traits::context(L)); + viennacl::backend::memory_copy(L.handle(), L_backup, 0, 0, L.handle().raw_size()); + + viennacl::backend::mem_handle U_backup; + viennacl::backend::memory_create(U_backup, U_trans.handle().raw_size(), viennacl::traits::context(U_trans)); + viennacl::backend::memory_copy(U_trans.handle(), U_backup, 0, 0, U_trans.handle().raw_size()); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_chow_patel_sweep_kernel"); + viennacl::ocl::enqueue(k(L.handle1().opencl_handle(), L.handle2().opencl_handle(), L.handle().opencl_handle(), L_backup.opencl_handle(), cl_uint(L.size1()), + aij_L, + U_trans.handle1().opencl_handle(), U_trans.handle2().opencl_handle(), U_trans.handle().opencl_handle(), U_backup.opencl_handle(), + aij_U_trans) + ); + +} + +////////////////////////////////////// + + + +template<typename NumericT> +void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R, + vector<NumericT> & diag_R) +{ + viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(R).context()); + viennacl::linalg::opencl::kernels::ilu<NumericT>::init(ctx); + + viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::ilu<NumericT>::program_name(), "ilu_form_neumann_matrix_kernel"); + viennacl::ocl::enqueue(k(R.handle1().opencl_handle(), R.handle2().opencl_handle(), R.handle().opencl_handle(), cl_uint(R.size1()), + diag_R) + ); +} + +} //namespace opencl +} //namespace linalg +} //namespace viennacl + + +#endif
