http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp new file mode 100644 index 0000000..fb89742 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp @@ -0,0 +1,738 @@ +#ifndef VIENNACL_GMRES_HPP_ +#define VIENNACL_GMRES_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/gmres.hpp + @brief Implementations of the generalized minimum residual method are in this file. +*/ + +#include <vector> +#include <cmath> +#include <limits> +#include "viennacl/forwards.h" +#include "viennacl/tools/tools.hpp" +#include "viennacl/linalg/norm_2.hpp" +#include "viennacl/linalg/prod.hpp" +#include "viennacl/linalg/inner_prod.hpp" +#include "viennacl/traits/clear.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/context.hpp" +#include "viennacl/meta/result_of.hpp" + +#include "viennacl/linalg/iterative_operations.hpp" +#include "viennacl/vector_proxy.hpp" + + +namespace viennacl +{ +namespace linalg +{ + +/** @brief A tag for the solver GMRES. 
Used for supplying solver parameters and for dispatching the solve() function +*/ +class gmres_tag //generalized minimum residual +{ +public: + /** @brief The constructor + * + * @param tol Relative tolerance for the residual (solver quits if ||r|| < tol * ||r_initial||) + * @param max_iterations The maximum number of iterations (including restarts + * @param krylov_dim The maximum dimension of the Krylov space before restart (number of restarts is found by max_iterations / krylov_dim) + */ + gmres_tag(double tol = 1e-10, unsigned int max_iterations = 300, unsigned int krylov_dim = 20) + : tol_(tol), abs_tol_(0), iterations_(max_iterations), krylov_dim_(krylov_dim), iters_taken_(0) {} + + /** @brief Returns the relative tolerance */ + double tolerance() const { return tol_; } + + /** @brief Returns the absolute tolerance */ + double abs_tolerance() const { return abs_tol_; } + /** @brief Sets the absolute tolerance */ + void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; } + + /** @brief Returns the maximum number of iterations */ + unsigned int max_iterations() const { return iterations_; } + /** @brief Returns the maximum dimension of the Krylov space before restart */ + unsigned int krylov_dim() const { return krylov_dim_; } + /** @brief Returns the maximum number of GMRES restarts */ + unsigned int max_restarts() const + { + unsigned int ret = iterations_ / krylov_dim_; + if (ret > 0 && (ret * krylov_dim_ == iterations_) ) + return ret - 1; + return ret; + } + + /** @brief Return the number of solver iterations: */ + unsigned int iters() const { return iters_taken_; } + /** @brief Set the number of solver iterations (should only be modified by the solver) */ + void iters(unsigned int i) const { iters_taken_ = i; } + + /** @brief Returns the estimated relative error at the end of the solver run */ + double error() const { return last_error_; } + /** @brief Sets the estimated relative error at the end of the solver run */ + void 
error(double e) const { last_error_ = e; } + +private: + double tol_; + double abs_tol_; + unsigned int iterations_; + unsigned int krylov_dim_; + + //return values from solver + mutable unsigned int iters_taken_; + mutable double last_error_; +}; + +namespace detail +{ + + template<typename SrcVectorT, typename DestVectorT> + void gmres_copy_helper(SrcVectorT const & src, DestVectorT & dest, vcl_size_t len, vcl_size_t start = 0) + { + for (vcl_size_t i=0; i<len; ++i) + dest[start+i] = src[start+i]; + } + + template<typename NumericT, typename DestVectorT> + void gmres_copy_helper(viennacl::vector<NumericT> const & src, DestVectorT & dest, vcl_size_t len, vcl_size_t start = 0) + { + typedef typename viennacl::vector<NumericT>::difference_type difference_type; + viennacl::copy( src.begin() + static_cast<difference_type>(start), + src.begin() + static_cast<difference_type>(start + len), + dest.begin() + static_cast<difference_type>(start)); + } + + /** @brief Computes the householder vector 'hh_vec' which rotates 'input_vec' such that all entries below the j-th entry of 'v' become zero. 
  *
  * @param input_vec    The input vector
  * @param hh_vec       The householder vector defining the reflection (I - beta * hh_vec * hh_vec^T)
  * @param beta         The coefficient beta in (I - beta * hh_vec * hh_vec^T)
  * @param mu           The norm of the input vector part relevant for the reflection: norm_2(input_vec[j:size])
  * @param j            Index of the last nonzero index in 'input_vec' after applying the reflection
  */
  template<typename VectorT, typename NumericT>
  void gmres_setup_householder_vector(VectorT const & input_vec, VectorT & hh_vec, NumericT & beta, NumericT & mu, vcl_size_t j)
  {
    NumericT input_j = input_vec(j);

    // copy entries below index j from input vector to householder vector:
    detail::gmres_copy_helper(input_vec, hh_vec, viennacl::traits::size(hh_vec) - (j+1), j+1);

    // sigma = squared norm of the sub-diagonal part to be annihilated
    NumericT sigma = viennacl::linalg::norm_2(hh_vec);
    sigma *= sigma;

    if (sigma <= 0) // all entries below j are already zero: reflection degenerates to identity
    {
      beta = 0;
      mu = input_j;
    }
    else
    {
      mu = std::sqrt(sigma + input_j*input_j);

      // sign choice for the pivot entry: the second branch avoids cancellation
      // when input_j > 0 (standard Householder construction, cf. Golub/Van Loan)
      NumericT hh_vec_0 = (input_j <= 0) ? (input_j - mu) : (-sigma / (input_j + mu));

      beta = NumericT(2) * hh_vec_0 * hh_vec_0 / (sigma + hh_vec_0 * hh_vec_0);

      //divide hh_vec by its diagonal element hh_vec_0 so that hh_vec[j] becomes 1
      hh_vec /= hh_vec_0;
      hh_vec[j] = NumericT(1);
    }
  }

  // Apply (I - beta h h^T) to x (Householder reflection with Householder vector h)
  template<typename VectorT, typename NumericT>
  void gmres_householder_reflect(VectorT & x, VectorT const & h, NumericT beta)
  {
    NumericT hT_in_x = viennacl::linalg::inner_prod(h, x);
    x -= (beta * hT_in_x) * h;
  }


  /** @brief Implementation of a pipelined GMRES solver without preconditioner
  *
  * Following algorithm 2.1 proposed by Walker in "A Simpler GMRES", but uses classical Gram-Schmidt instead of modified Gram-Schmidt for better parallelization.
  * Uses some pipelining techniques for minimizing host-device transfer
  *
  * @param A            The system matrix
  * @param rhs          The load vector
  * @param tag          Solver configuration tag
  * @param monitor      A callback routine which is called at each GMRES restart
  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
  *
  * @return The result vector
  */
  template <typename MatrixType, typename ScalarType>
  viennacl::vector<ScalarType> pipelined_solve(MatrixType const & A,
                                               viennacl::vector<ScalarType> const & rhs,
                                               gmres_tag const & tag,
                                               viennacl::linalg::no_precond,
                                               bool (*monitor)(viennacl::vector<ScalarType> const &, ScalarType, void*) = NULL,
                                               void *monitor_data = NULL)
  {
    viennacl::vector<ScalarType> residual(rhs);
    viennacl::vector<ScalarType> result = viennacl::zero_vector<ScalarType>(rhs.size(), viennacl::traits::context(rhs));

    viennacl::vector<ScalarType> device_krylov_basis(rhs.internal_size() * tag.krylov_dim(), viennacl::traits::context(rhs)); // not using viennacl::matrix here because of spurious padding in column number
    viennacl::vector<ScalarType> device_buffer_R(tag.krylov_dim()*tag.krylov_dim(), viennacl::traits::context(rhs));
    std::vector<ScalarType>      host_buffer_R(device_buffer_R.size());

    vcl_size_t buffer_size_per_vector = 128;
    vcl_size_t num_buffer_chunks      = 3;
    viennacl::vector<ScalarType> device_inner_prod_buffer = viennacl::zero_vector<ScalarType>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
    viennacl::vector<ScalarType> device_r_dot_vk_buffer   = viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), viennacl::traits::context(rhs)); // holds result of first reduction stage for <r, v_k> on device
    viennacl::vector<ScalarType> device_vi_in_vk_buffer   = viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), viennacl::traits::context(rhs)); // holds <v_i, v_k> for i=0..k-1 on device
    viennacl::vector<ScalarType> device_values_xi_k       = viennacl::zero_vector<ScalarType>(tag.krylov_dim(), viennacl::traits::context(rhs)); // holds values \xi_k = <r, v_k> on device
    std::vector<ScalarType> host_r_dot_vk_buffer(device_r_dot_vk_buffer.size());
    std::vector<ScalarType> host_values_xi_k(tag.krylov_dim());
    std::vector<ScalarType> host_values_eta_k_buffer(tag.krylov_dim());
    std::vector<ScalarType> host_update_coefficients(tag.krylov_dim());

    ScalarType norm_rhs = viennacl::linalg::norm_2(residual);
    ScalarType rho_0 = norm_rhs;
    ScalarType rho = ScalarType(1);

    tag.iters(0);

    for (unsigned int restart_count = 0; restart_count <= tag.max_restarts(); ++restart_count)
    {
      //
      // prepare restart:
      //
      if (restart_count > 0)
      {
        // compute new residual without introducing a temporary for A*x:
        residual = viennacl::linalg::prod(A, result);
        residual = rhs - residual;

        rho_0 = viennacl::linalg::norm_2(residual);
      }

      if (rho_0 <= ScalarType(tag.abs_tolerance())) // trivial right hand side?
        break;

      residual /= rho_0;
      rho = ScalarType(1);

      // check for convergence:
      if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance())
        break;

      //
      // minimize in Krylov basis:
      //
      vcl_size_t k = 0;
      for (k = 0; k < static_cast<vcl_size_t>(tag.krylov_dim()); ++k)
      {
        if (k == 0)
        {
          // compute v0 = A*r and perform first reduction stage for ||v0||
          viennacl::vector_range<viennacl::vector<ScalarType> > v0(device_krylov_basis, viennacl::range(0, rhs.size()));
          viennacl::linalg::pipelined_gmres_prod(A, residual, v0, device_inner_prod_buffer);

          // Normalize v_1 and compute first reduction stage for <r, v_0> in device_r_dot_vk_buffer:
          viennacl::linalg::pipelined_gmres_normalize_vk(v0, residual,
                                                         device_buffer_R, k*tag.krylov_dim() + k,
                                                         device_inner_prod_buffer, device_r_dot_vk_buffer,
                                                         buffer_size_per_vector, k*buffer_size_per_vector);
        }
        else
        {
          // compute v_k = A*v_{k-1} and perform first reduction stage for ||v_k||
          viennacl::vector_range<viennacl::vector<ScalarType> > vk        (device_krylov_basis, viennacl::range( k *rhs.internal_size(),  k *rhs.internal_size() + rhs.size()));
          viennacl::vector_range<viennacl::vector<ScalarType> > vk_minus_1(device_krylov_basis, viennacl::range((k-1)*rhs.internal_size(), (k-1)*rhs.internal_size() + rhs.size()));
          viennacl::linalg::pipelined_gmres_prod(A, vk_minus_1, vk, device_inner_prod_buffer);

          //
          // Gram-Schmidt, stage 1: compute first reduction stage of <v_i, v_k>
          //
          viennacl::linalg::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, rhs.size(), rhs.internal_size(), k, device_vi_in_vk_buffer, buffer_size_per_vector);

          //
          // Gram-Schmidt, stage 2: compute second reduction stage of <v_i, v_k> and use that to compute v_k -= sum_i <v_i, v_k> v_i.
          // Store <v_i, v_k> in R-matrix and compute first reduction stage for ||v_k||
          //
          viennacl::linalg::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, rhs.size(), rhs.internal_size(), k,
                                                                device_vi_in_vk_buffer,
                                                                device_buffer_R, tag.krylov_dim(),
                                                                device_inner_prod_buffer, buffer_size_per_vector);

          //
          // Normalize v_k and compute first reduction stage for <r, v_k> in device_r_dot_vk_buffer:
          //
          viennacl::linalg::pipelined_gmres_normalize_vk(vk, residual,
                                                         device_buffer_R, k*tag.krylov_dim() + k,
                                                         device_inner_prod_buffer, device_r_dot_vk_buffer,
                                                         buffer_size_per_vector, k*buffer_size_per_vector);
        }
      }

      //
      // Run reduction to obtain the values \xi_k = <r, v_k>.
      // Note that unlike Algorithm 2.1 in Walker: "A Simpler GMRES", we do not update the residual
      //
      viennacl::fast_copy(device_r_dot_vk_buffer.begin(), device_r_dot_vk_buffer.end(), host_r_dot_vk_buffer.begin());
      for (std::size_t i=0; i<k; ++i)
      {
        host_values_xi_k[i] = ScalarType(0);
        for (std::size_t j=0; j<buffer_size_per_vector; ++j)
          host_values_xi_k[i] += host_r_dot_vk_buffer[i*buffer_size_per_vector + j];
      }

      //
      // Bring values in R back to host:
      //
      viennacl::fast_copy(device_buffer_R.begin(), device_buffer_R.end(), host_buffer_R.begin());

      //
      // Check for premature convergence: If the diagonal element drops too far below the first norm, we're done and restrict the Krylov size accordingly.
      //
      vcl_size_t full_krylov_dim = k; //needed for proper access to R
      for (std::size_t i=0; i<k; ++i)
      {
        if (std::fabs(host_buffer_R[i + i*k]) < tag.tolerance() * host_buffer_R[0])
        {
          k = i;
          break;
        }
      }


      // Compute error estimator:
      for (std::size_t i=0; i<k; ++i)
      {
        tag.iters( tag.iters() + 1 ); //increase iteration counter

        // check for accumulation of round-off errors for poorly conditioned systems
        if (host_values_xi_k[i] >= rho || host_values_xi_k[i] <= -rho)
        {
          k = i;
          break; // restrict Krylov space at this point. No gain from using additional basis vectors, since orthogonality is lost.
        }

        // update error estimator
        rho *= std::sin( std::acos(host_values_xi_k[i] / rho) );
      }

      //
      // Solve minimization problem (upper-triangular back substitution):
      //
      host_values_eta_k_buffer = host_values_xi_k;

      for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
      {
        vcl_size_t i = static_cast<vcl_size_t>(i2);
        for (vcl_size_t j=static_cast<vcl_size_t>(i)+1; j<k; ++j)
          host_values_eta_k_buffer[i] -= host_buffer_R[i + j*full_krylov_dim] * host_values_eta_k_buffer[j];

        host_values_eta_k_buffer[i] /= host_buffer_R[i + i*full_krylov_dim];
      }

      //
      // Update x += rho * z with z = \eta_0 * residual + sum_{i=0}^{k-1} \eta_{i+1} v_i
      // Note that we have not updated the residual yet, hence this slightly modified as compared to the form given in Algorithm 2.1 in Walker: "A Simpler GMRES"
      //
      for (vcl_size_t i=0; i<k; ++i)
        host_update_coefficients[i] = rho_0 * host_values_eta_k_buffer[i];

      viennacl::fast_copy(host_update_coefficients.begin(), host_update_coefficients.end(), device_values_xi_k.begin()); //reuse device_values_xi_k_buffer here for simplicity

      viennacl::linalg::pipelined_gmres_update_result(result, residual,
                                                      device_krylov_basis, rhs.size(), rhs.internal_size(),
                                                      device_values_xi_k, k);

      tag.error( std::fabs(rho*rho_0 / norm_rhs) );

      if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), monitor_data))
        break;
    }

    return result;
  }

  /** @brief Overload of the pipelined GMRES implementation for the ViennaCL compressed_matrix (CSR) type */
  template<typename NumericT>
  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> const & A,
                                        viennacl::vector<NumericT> const & rhs,
                                        gmres_tag const & tag,
                                        viennacl::linalg::no_precond,
                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
                                        void *monitor_data = NULL)
  {
    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
  }


  /** @brief
      Overload of the pipelined GMRES implementation for the ViennaCL coordinate_matrix (COO) type */
  template<typename NumericT>
  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> const & A,
                                        viennacl::vector<NumericT> const & rhs,
                                        gmres_tag const & tag,
                                        viennacl::linalg::no_precond,
                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
                                        void *monitor_data = NULL)
  {
    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
  }



  /** @brief Overload of the pipelined GMRES implementation for the ViennaCL ell_matrix type */
  template<typename NumericT>
  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & A,
                                        viennacl::vector<NumericT> const & rhs,
                                        gmres_tag const & tag,
                                        viennacl::linalg::no_precond,
                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
                                        void *monitor_data = NULL)
  {
    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
  }



  /** @brief Overload of the pipelined GMRES implementation for the ViennaCL sliced_ell_matrix type */
  template<typename NumericT>
  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> const & A,
                                        viennacl::vector<NumericT> const & rhs,
                                        gmres_tag const & tag,
                                        viennacl::linalg::no_precond,
                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
                                        void *monitor_data = NULL)
  {
    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
  }


  /** @brief Overload of the pipelined GMRES implementation for the ViennaCL hyb_matrix type */
  template<typename NumericT>
  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & A,
                                        viennacl::vector<NumericT> const & rhs,
                                        gmres_tag const & tag,
                                        viennacl::linalg::no_precond,
                                        bool (*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
                                        void *monitor_data = NULL)
  {
    return detail::pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), monitor, monitor_data);
  }


  /** @brief Implementation of the GMRES solver.
  *
  * Following the algorithm proposed by Walker in "A Simpler GMRES"
  *
  * @param matrix       The system matrix
  * @param rhs          The load vector
  * @param tag          Solver configuration tag
  * @param precond      A preconditioner. Precondition operation is done via member function apply()
  * @param monitor      A callback routine which is called at each GMRES restart
  * @param monitor_data Data pointer to be passed to the callback routine to pass on user-specific data
  *
  * @return The result vector
  */
  template<typename MatrixT, typename VectorT, typename PreconditionerT>
  VectorT solve_impl(MatrixT const & matrix,
                     VectorT const & rhs,
                     gmres_tag const & tag,
                     PreconditionerT const & precond,
                     bool (*monitor)(VectorT const &, typename viennacl::result_of::cpu_value_type<typename viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
                     void *monitor_data = NULL)
  {
    typedef typename viennacl::result_of::value_type<VectorT>::type           NumericType;
    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type   CPU_NumericType;

    unsigned int problem_size = static_cast<unsigned int>(viennacl::traits::size(rhs));
    VectorT result = rhs;
    viennacl::traits::clear(result);

    vcl_size_t krylov_dim = static_cast<vcl_size_t>(tag.krylov_dim());
    if (problem_size < krylov_dim)
      krylov_dim = problem_size; //A Krylov space larger than the matrix would lead to seg-faults (mathematically, error is certain to be zero already)

    VectorT res = rhs;
    VectorT v_k_tilde = rhs;
    VectorT v_k_tilde_temp = rhs;

    // R is stored transposed: R[k] holds the k-th *column* of the triangular factor
    std::vector< std::vector<CPU_NumericType> > R(krylov_dim, std::vector<CPU_NumericType>(tag.krylov_dim()));
    std::vector<CPU_NumericType> projection_rhs(krylov_dim);

    std::vector<VectorT>          householder_reflectors(krylov_dim, rhs);
    std::vector<CPU_NumericType>  betas(krylov_dim);

    CPU_NumericType norm_rhs = viennacl::linalg::norm_2(rhs);

    if (norm_rhs <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
      return result;

    tag.iters(0);

    for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
    {
      //
      // (Re-)Initialize residual: r = b - A*x (without temporary for the result of A*x)
      //
      res = viennacl::linalg::prod(matrix, result);  //initial guess zero
      res = rhs - res;
      precond.apply(res);

      CPU_NumericType rho_0 = viennacl::linalg::norm_2(res);

      //
      // Check for premature convergence
      //
      if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance()) // norm_rhs is known to be nonzero here
      {
        tag.error(rho_0 / norm_rhs);
        return result;
      }

      //
      // Normalize residual and set 'rho' to 1 as requested in 'A Simpler GMRES' by Walker and Zhou.
      //
      res /= rho_0;
      CPU_NumericType rho = static_cast<CPU_NumericType>(1.0);


      //
      // Iterate up until maximal Krylove space dimension is reached:
      //
      vcl_size_t k = 0;
      for (k = 0; k < krylov_dim; ++k)
      {
        tag.iters( tag.iters() + 1 ); //increase iteration counter

        // prepare storage:
        viennacl::traits::clear(R[k]);
        viennacl::traits::clear(householder_reflectors[k]);

        //compute v_k = A * v_{k-1} via Householder matrices
        if (k == 0)
        {
          v_k_tilde = viennacl::linalg::prod(matrix, res);
          precond.apply(v_k_tilde);
        }
        else
        {
          viennacl::traits::clear(v_k_tilde);
          v_k_tilde[k-1] = CPU_NumericType(1);

          //Householder rotations, part 1: Compute P_1 * P_2 * ... * P_{k-1} * e_{k-1}
          for (int i = static_cast<int>(k)-1; i > -1; --i)
            detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);

          v_k_tilde_temp = viennacl::linalg::prod(matrix, v_k_tilde);
          precond.apply(v_k_tilde_temp);
          v_k_tilde = v_k_tilde_temp;

          //Householder rotations, part 2: Compute P_{k-1} * ... * P_{1} * v_k_tilde
          for (vcl_size_t i = 0; i < k; ++i)
            detail::gmres_householder_reflect(v_k_tilde, householder_reflectors[i], betas[i]);
        }

        //
        // Compute Householder reflection for v_k_tilde such that all entries below k-th entry are zero:
        //
        CPU_NumericType rho_k_k = 0;
        detail::gmres_setup_householder_vector(v_k_tilde, householder_reflectors[k], betas[k], rho_k_k, k);

        //
        // copy first k entries from v_k_tilde to R[k] in order to fill k-th column with result of
        // P_k * v_k_tilde = (v[0], ... , v[k-1], norm(v), 0, 0, ...) =: (rho_{1,k}, rho_{2,k}, ..., rho_{k,k}, 0, ..., 0);
        //
        detail::gmres_copy_helper(v_k_tilde, R[k], k);
        R[k][k] = rho_k_k;

        //
        // Update residual: r = P_k r
        // Set zeta_k = r[k] including machine precision considerations: mathematically we have |r[k]| <= rho
        // Set rho *= sin(acos(r[k] / rho))
        //
        detail::gmres_householder_reflect(res, householder_reflectors[k], betas[k]);

        if (res[k] > rho) //machine precision reached
          res[k] = rho;
        if (res[k] < -rho) //machine precision reached
          res[k] = -rho;
        projection_rhs[k] = res[k];

        rho *= std::sin( std::acos(projection_rhs[k] / rho) );

        if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance())  // Residual is sufficiently reduced, stop here
        {
          tag.error( std::fabs(rho*rho_0 / norm_rhs) );
          ++k;
          break;
        }
      } // for k

      //
      // Triangular solver stage (back substitution):
      //

      for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
      {
        vcl_size_t i = static_cast<vcl_size_t>(i2);
        for (vcl_size_t j=i+1; j<k; ++j)
          projection_rhs[i] -= R[j][i] * projection_rhs[j];     //R is transposed

        projection_rhs[i] /= R[i][i];
      }

      //
      // Note: 'projection_rhs' now holds the solution (eta_1, ..., eta_k)
      //

      res *= projection_rhs[0];

      if (k > 0)
      {
        for (unsigned int i = 0; i < k-1; ++i)
          res[i] += projection_rhs[i+1];
      }

      //
      // Form z inplace in 'res' by applying P_1 * ... * P_{k}
      //
      for (int i=static_cast<int>(k)-1; i>=0; --i)
        detail::gmres_householder_reflect(res, householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);

      res *= rho_0;
      result += res;  // x += rho_0 * z in the paper

      //
      // Check for convergence:
      //
      tag.error(std::fabs(rho*rho_0 / norm_rhs));

      if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), monitor_data))
        break;

      if ( tag.error() < tag.tolerance() )
        return result;
    }

    return result;
  }

}

/** @brief Entry point for the preconditioned GMRES method. Dispatches to the respective implementation. */
template<typename MatrixT, typename VectorT, typename PreconditionerT>
VectorT solve(MatrixT const & matrix, VectorT const & rhs, gmres_tag const & tag, PreconditionerT const & precond)
{
  return detail::solve_impl(matrix, rhs, tag, precond);
}

/** @brief Convenience overload for calling the preconditioned GMRES solver using types from the C++ STL.
  *
  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite element assembly.
  * It is not the fastest option for setting up a system, but often it is fast enough - particularly for just trying things out.
  */
template<typename IndexT, typename NumericT, typename PreconditionerT>
std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & A, std::vector<NumericT> const & rhs, gmres_tag const & tag, PreconditionerT const & precond)
{
  viennacl::compressed_matrix<NumericT> vcl_A;
  viennacl::copy(A, vcl_A);

  viennacl::vector<NumericT> vcl_rhs(rhs.size());
  viennacl::copy(rhs, vcl_rhs);

  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);

  std::vector<NumericT> result(vcl_result.size());
  viennacl::copy(vcl_result, result);
  return result;
}

/** @brief Entry point for the unpreconditioned GMRES method.
 *
 * @param A    The system matrix
 * @param rhs  Right hand side vector (load vector)
 * @param tag  A GMRES tag providing relative tolerances, etc.
 */

template<typename MatrixT, typename VectorT>
VectorT solve(MatrixT const & A, VectorT const & rhs, gmres_tag const & tag)
{
  return solve(A, rhs, tag, no_precond());
}



/** @brief Convenience class for calling the GMRES solver with optional initial guess and iteration monitor. */
template<typename VectorT>
class gmres_solver
{
public:
  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   numeric_type;

  gmres_solver(gmres_tag const & tag) : tag_(tag), monitor_callback_(NULL), user_data_(NULL) {}

  /** @brief Runs the solver on A*x = b with the provided preconditioner, honoring any initial guess set before. */
  template<typename MatrixT, typename PreconditionerT>
  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT const & precond) const
  {
    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into account
    {
      // solve A*y = b - A*x_0 and shift the solution back by x_0 afterwards
      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
      mod_rhs = b - mod_rhs;
      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, monitor_callback_, user_data_);
      return init_guess_ + y;
    }
    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, user_data_);
  }


  /** @brief Runs the solver on A*x = b without preconditioner. */
  template<typename MatrixT>
  VectorT operator()(MatrixT const & A, VectorT const & b) const
  {
    return operator()(A, b, viennacl::linalg::no_precond());
  }

  /** @brief Specifies an initial guess for the iterative solver.
  *
  * An iterative solver for Ax = b with initial guess x_0 is equivalent to an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
  */
  void set_initial_guess(VectorT const & x) { init_guess_ = x; }

  /** @brief Sets a monitor function pointer to be called in each iteration. Set to NULL to run without monitor.
  *
  * The monitor function is called with the current guess for the result as first argument and the current relative residual estimate as second argument.
  * The third argument is a pointer to user-defined data, through which additional information can be passed.
  * This pointer needs to be set with set_monitor_data. If not set, NULL is passed.
  * If the monitor function returns true, the solver terminates (either convergence or divergence).
  */
  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), void *user_data)
  {
    monitor_callback_ = monitor_fun;
    user_data_ = user_data;
  }

  /** @brief Returns the solver tag containing basic configuration such as tolerances, etc. */
  gmres_tag const & tag() const { return tag_; }

private:
  gmres_tag   tag_;
  VectorT     init_guess_;
  bool        (*monitor_callback_)(VectorT const &, numeric_type, void *);
  void        *user_data_;
};


}
}

#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp new file mode 100644 index 0000000..43ca928 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp @@ -0,0 +1,66 @@ +#ifndef VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_ +#define VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/hankel_matrix_operations.hpp + @brief Implementations of operations using hankel_matrix. Experimental. 
*/

#include "viennacl/forwards.h"
#include "viennacl/ocl/backend.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/fft.hpp"
#include "viennacl/linalg/toeplitz_matrix_operations.hpp"

namespace viennacl
{
namespace linalg
{

// A * x

/** @brief Carries out matrix-vector multiplication with a hankel_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param A      The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(viennacl::hankel_matrix<NumericT, AlignmentV> const & A,
               viennacl::vector_base<NumericT> const & vec,
               viennacl::vector_base<NumericT> & result)
{
  assert(A.size1() == result.size() && bool("Dimension mismatch"));
  assert(A.size2() == vec.size() && bool("Dimension mismatch"));

  // Delegates to the Toeplitz product on A.elements() and then reverses the result
  // (presumably A.elements() is the underlying Toeplitz representation of the
  // Hankel matrix — TODO confirm against hankel_matrix definition)
  prod_impl(A.elements(), vec, result);
  viennacl::linalg::reverse(result);
}

} //namespace linalg


} //namespace viennacl


#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
new file mode 100644
index 0000000..78bd150
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
@@ -0,0 +1,1123 @@
#ifndef VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
#define VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file host_based/amg_operations.hpp + @brief Implementations of routines for AMG using the CPU on the host (with OpenMP if enabled). +*/ + +#include <cstdlib> +#include <cmath> +#include "viennacl/linalg/detail/amg/amg_base.hpp" + +#include <map> +#include <set> +#include <functional> +#ifdef VIENNACL_WITH_OPENMP +#include <omp.h> +#endif + +namespace viennacl +{ +namespace linalg +{ +namespace host_based +{ +namespace amg +{ + + +/////////////////////////////////////////// + +/** @brief Routine for taking all connections in the matrix as strong */ +template<typename NumericT> +void amg_influence_trivial(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + unsigned int *influences_values_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_values_.handle()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + vcl_size_t i = vcl_size_t(i2); + 
influences_row_ptr[i] = A_row_buffer[i]; + influences_values_ptr[i] = A_row_buffer[i+1] - A_row_buffer[i]; + } + influences_row_ptr[A.size1()] = A_row_buffer[A.size1()]; + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i=0; i<long(A.nnz()); ++i) + influences_id_ptr[i] = A_col_buffer[i]; +} + + +/** @brief Routine for extracting strongly connected points considering a user-provided threshold value */ +template<typename NumericT> +void amg_influence_advanced(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + + // + // Step 1: Scan influences in order to allocate the necessary memory + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + vcl_size_t i = vcl_size_t(i2); + unsigned int row_start = A_row_buffer[i]; + unsigned int row_stop = A_row_buffer[i+1]; + NumericT diag = 0; + NumericT largest_positive = 0; + NumericT largest_negative = 0; + unsigned int num_influences = 0; + + // obtain diagonal element as well as maximum positive and negative off-diagonal entries + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + { + unsigned int col = A_col_buffer[nnz_index]; + NumericT value = 
A_elements[nnz_index]; + + if (col == i) + diag = value; + else if (value > largest_positive) + largest_positive = value; + else if (value < largest_negative) + largest_negative = value; + } + + if (largest_positive <= 0 && largest_negative >= 0) // no offdiagonal entries + { + influences_row_ptr[i] = 0; + continue; + } + + // Find all points that strongly influence current point (Yang, p.5) + //std::cout << "Looking for strongly influencing points for point " << i << std::endl; + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + { + unsigned int col = A_col_buffer[nnz_index]; + + if (i == col) + continue; + + NumericT value = A_elements[nnz_index]; + + if ( (diag > 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_negative) + || (diag < 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_positive)) + { + ++num_influences; + } + } + + influences_row_ptr[i] = num_influences; + } + + // + // Step 2: Exclusive scan on number of influences to obtain CSR-like datastructure + // + unsigned int current_entry = 0; + for (std::size_t i=0; i<A.size1(); ++i) + { + unsigned int tmp = influences_row_ptr[i]; + influences_row_ptr[i] = current_entry; + current_entry += tmp; + } + influences_row_ptr[A.size1()] = current_entry; + + + // + // Step 3: Write actual influences + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + unsigned int row_start = A_row_buffer[i]; + unsigned int row_stop = A_row_buffer[i+1]; + NumericT diag = 0; + NumericT largest_positive = 0; + NumericT largest_negative = 0; + + // obtain diagonal element as well as maximum positive and negative off-diagonal entries + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + { + unsigned int col = A_col_buffer[nnz_index]; + NumericT value = A_elements[nnz_index]; + + if (col == i) + diag = 
value; + else if (value > largest_positive) + largest_positive = value; + else if (value < largest_negative) + largest_negative = value; + } + + if (largest_positive <= 0 && largest_negative >= 0) // no offdiagonal entries + continue; + + // Find all points that strongly influence current point (Yang, p.5) + //std::cout << "Looking for strongly influencing points for point " << i << std::endl; + unsigned int *influences_id_write_ptr = influences_id_ptr + influences_row_ptr[i]; + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + { + unsigned int col = A_col_buffer[nnz_index]; + + if (i == col) + continue; + + NumericT value = A_elements[nnz_index]; + + if ( (diag > 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_negative) + || (diag < 0 && diag * value <= tag.get_strong_connection_threshold() * diag * largest_positive)) + { + //std::cout << " - Adding influence from point " << col << std::endl; + *influences_id_write_ptr = col; + ++influences_id_write_ptr; + } + } + } + +} + + +/** @brief Dispatcher for influence processing */ +template<typename NumericT> +void amg_influence(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + // TODO: dispatch based on influence tolerance provided + amg_influence_trivial(A, amg_context, tag); +} + + + +/** @brief Assign IDs to coarse points */ +inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context) +{ + unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle()); + unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle()); + + unsigned int coarse_id = 0; + for (vcl_size_t i=0; i<amg_context.coarse_id_.size(); ++i) + { + //assert(point_types_ptr[i] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED && bool("Logic error in enumerate_coarse_points(): Undecided points detected!")); + + if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + coarse_id_ptr[i] = coarse_id++; + } + + //std::cout << "Coarse nodes after enumerate_coarse_points(): " << coarse_id << std::endl; + amg_context.num_coarse_ = coarse_id; +} + + + + +////////////////////////////////////// + + +/** @brief Helper struct for sequential classical one-pass coarsening */ +struct amg_id_influence +{ + amg_id_influence(std::size_t id2, std::size_t influences2) : id(static_cast<unsigned int>(id2)), influences(static_cast<unsigned int>(influences2)) {} + + unsigned int id; + unsigned int influences; +}; + +inline bool operator>(amg_id_influence const & a, amg_id_influence const & b) +{ + if (a.influences > b.influences) + return true; + if (a.influences == b.influences) + return a.id > b.id; + return false; +} + +/** @brief Classical (RS) one-pass coarsening. Single-Threaded! 
(VIENNACL_AMG_COARSE_CLASSIC_ONEPASS) +* +* @param A Operator matrix for the respective level +* @param amg_context AMG datastructure object for the grid hierarchy +* @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_coarse_classic_onepass(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle()); + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + unsigned int *influences_values_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_values_.handle()); + + std::set<amg_id_influence, std::greater<amg_id_influence> > points_by_influences; + + amg_influence_advanced(A, amg_context, tag); + + for (std::size_t i=0; i<A.size1(); ++i) + points_by_influences.insert(amg_id_influence(i, influences_values_ptr[i])); + + //std::cout << "Starting coarsening process..." << std::endl; + + while (!points_by_influences.empty()) + { + amg_id_influence point = *(points_by_influences.begin()); + + // remove point from queue: + points_by_influences.erase(points_by_influences.begin()); + + //std::cout << "Working on point " << point.id << std::endl; + + // point is already coarse or fine point, continue; + if (point_types_ptr[point.id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + continue; + + //std::cout << " Setting point " << point.id << " to a coarse point." 
<< std::endl; + // make this a coarse point: + point_types_ptr[point.id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE; + + // Set strongly influenced points to fine points: + unsigned int j_stop = influences_row_ptr[point.id + 1]; + for (unsigned int j = influences_row_ptr[point.id]; j < j_stop; ++j) + { + unsigned int influenced_point_id = influences_id_ptr[j]; + + //std::cout << "Checking point " << influenced_point_id << std::endl; + if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + continue; + + //std::cout << " Setting point " << influenced_point_id << " to a fine point." << std::endl; + point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; + + // add one to influence measure for all undecided points strongly influencing this fine point. + unsigned int k_stop = influences_row_ptr[influenced_point_id + 1]; + for (unsigned int k = influences_row_ptr[influenced_point_id]; k < k_stop; ++k) + { + unsigned int influenced_influenced_point_id = influences_id_ptr[k]; + if (point_types_ptr[influenced_influenced_point_id] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + { + // grab and remove from set, increase influence counter, store back: + amg_id_influence point_to_find(influenced_influenced_point_id, influences_values_ptr[influenced_influenced_point_id]); + points_by_influences.erase(point_to_find); + + point_to_find.influences += 1; + influences_values_ptr[influenced_influenced_point_id] += 1; // for consistency + + points_by_influences.insert(point_to_find); + } + } //for + } // for + + } // while + + viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context); +} + + +////////////////////////// + + +/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1 +* +* @param A Operator matrix for the respective level +* @param amg_context AMG datastructure object 
for the grid hierarchy +* @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_coarse_ag_stage1_sequential(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle()); + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + + for (unsigned int i=0; i<static_cast<unsigned int>(A.size1()); ++i) + { + // check if node has no aggregates next to it (MIS-2) + bool is_new_coarse_node = true; + + // Set strongly influenced points to fine points: + unsigned int j_stop = influences_row_ptr[i + 1]; + for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j) + { + unsigned int influenced_point_id = influences_id_ptr[j]; + if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point + { + is_new_coarse_node = false; + break; + } + } + + if (is_new_coarse_node) + { + // make all strongly influenced neighbors fine points: + for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j) + { + unsigned int influenced_point_id = influences_id_ptr[j]; + point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; + } + + //std::cout << "Setting new coarse node: " << i << std::endl; + // Note: influences may include diagonal element, so it's important to *first* set fine points above before setting the coarse information here + point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE; + } + } +} + + + +/** @brief AG (aggregation 
based) coarsening, multi-threaded version of stage 1 using parallel maximum independent sets +* +* @param A Operator matrix for the respective level +* @param amg_context AMG datastructure object for the grid hierarchy +* @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle()); + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + + std::vector<unsigned int> random_weights(A.size1()); + for (std::size_t i=0; i<random_weights.size(); ++i) + random_weights[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1()); + + std::size_t num_threads = 1; +#ifdef VIENNACL_WITH_OPENMP + num_threads = omp_get_max_threads(); +#endif + + viennacl::vector<unsigned int> work_state(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_index(A.size1(), viennacl::traits::context(A)); + + unsigned int *work_state_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_state.handle()); + unsigned int *work_random_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_random.handle()); + unsigned int *work_index_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_index.handle()); + + viennacl::vector<unsigned int> work_state2(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> 
work_random2(A.size1(), viennacl::traits::context(A)); + viennacl::vector<unsigned int> work_index2(A.size1(), viennacl::traits::context(A)); + + unsigned int *work_state2_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_state2.handle()); + unsigned int *work_random2_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_random2.handle()); + unsigned int *work_index2_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(work_index2.handle()); + + + unsigned int num_undecided = static_cast<unsigned int>(A.size1()); + unsigned int pmis_iters = 0; + while (num_undecided > 0) + { + ++pmis_iters; + + // + // init temporary work data: + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + switch (point_types_ptr[i]) + { + case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: work_state_ptr[i] = 1; break; + case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE: work_state_ptr[i] = 0; break; + case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE: work_state_ptr[i] = 2; break; + default: + throw std::runtime_error("Unexpected state encountered in MIS2 setup for AMG."); + } + + work_random_ptr[i] = random_weights[i]; + work_index_ptr[i] = i; + } + + + // + // Propagate maximum tuple twice + // + for (unsigned int r = 0; r < 2; ++r) + { + // max operation +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + // load + unsigned int state = work_state_ptr[i]; + unsigned int random = work_random_ptr[i]; + unsigned int index = work_index_ptr[i]; + + // max + unsigned int j_stop = influences_row_ptr[i + 1]; + for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j) + { + unsigned int 
influenced_point_id = influences_id_ptr[j]; + + // lexigraphical triple-max (not particularly pretty, but does the job): + if (state < work_state_ptr[influenced_point_id]) + { + state = work_state_ptr[influenced_point_id]; + random = work_random_ptr[influenced_point_id]; + index = work_index_ptr[influenced_point_id]; + } + else if (state == work_state_ptr[influenced_point_id]) + { + if (random < work_random_ptr[influenced_point_id]) + { + state = work_state_ptr[influenced_point_id]; + random = work_random_ptr[influenced_point_id]; + index = work_index_ptr[influenced_point_id]; + } + else if (random == work_random_ptr[influenced_point_id]) + { + if (index < work_index_ptr[influenced_point_id]) + { + state = work_state_ptr[influenced_point_id]; + random = work_random_ptr[influenced_point_id]; + index = work_index_ptr[influenced_point_id]; + } + } // max(random) + } // max(state) + } // for + + // store + work_state2_ptr[i] = state; + work_random2_ptr[i] = random; + work_index2_ptr[i] = index; + } + + // copy work array +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + work_state_ptr[i] = work_state2_ptr[i]; + work_random_ptr[i] = work_random2_ptr[i]; + work_index_ptr[i] = work_index2_ptr[i]; + } + } + + // + // mark MIS and non-MIS nodes: + // + std::vector<unsigned int> thread_buffer(num_threads); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + unsigned int max_state = work_state_ptr[i]; + unsigned int max_index = work_index_ptr[i]; + + if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + { + if (i == max_index) // make this a MIS node + point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE; + else if (max_state == 2) // mind the 
mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above! + point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; + else +#ifdef VIENNACL_WITH_OPENMP + thread_buffer[omp_get_thread_num()] += 1; +#else + thread_buffer[0] += 1; +#endif + } + } + + num_undecided = 0; + for (std::size_t i=0; i<thread_buffer.size(); ++i) + num_undecided += thread_buffer[i]; + } // while + + // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i=0; i<static_cast<long>(A.size1()); ++i) + if (point_types_ptr[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED; + +} + + + +/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG) +* +* @param A Operator matrix for the respective level +* @param amg_context AMG datastructure object for the grid hierarchy +* @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_coarse_ag(compressed_matrix<NumericT> const & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle()); + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle()); + + amg_influence_trivial(A, amg_context, tag); + + // + // Stage 1: Build 
aggregates: + // + if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION) amg_coarse_ag_stage1_sequential(A, amg_context, tag); + if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION) amg_coarse_ag_stage1_mis2(A, amg_context, tag); + + viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context); + + // + // Stage 2: Propagate coarse aggregate indices to neighbors: + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + { + unsigned int coarse_index = coarse_id_ptr[i]; + + unsigned int j_stop = influences_row_ptr[i + 1]; + for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j) + { + unsigned int influenced_point_id = influences_id_ptr[j]; + coarse_id_ptr[influenced_point_id] = coarse_index; // Set aggregate index for fine point + + if (influenced_point_id != i) // Note: Any write races between threads are harmless here + point_types_ptr[influenced_point_id] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; + } + } + } + + + // + // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2=0; i2<static_cast<long>(A.size1()); ++i2) + { + unsigned int i = static_cast<unsigned int>(i2); + if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + { + unsigned int j_stop = influences_row_ptr[i + 1]; + for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j) + { + unsigned int influenced_point_id = influences_id_ptr[j]; + if (point_types_ptr[influenced_point_id] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // either coarse or fine point + 
{ + //std::cout << "Setting fine node " << i << " to be aggregated with node " << *influence_iter << "/" << pointvector.get_coarse_index(*influence_iter) << std::endl; + coarse_id_ptr[i] = coarse_id_ptr[influenced_point_id]; + break; + } + } + } + } + + // + // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3) + // Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution). + // +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i=0; i<static_cast<long>(A.size1()); ++i) + if (point_types_ptr[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) + point_types_ptr[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE; + +} + + + + +/** @brief Entry point and dispatcher for coarsening procedures +* +* @param A Operator matrix for the respective level +* @param amg_context AMG datastructure object for the grid hierarchy +* @param tag AMG preconditioner tag +*/ +template<typename MatrixT> +void amg_coarse(MatrixT & A, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + switch (tag.get_coarsening_method()) + { + case viennacl::linalg::AMG_COARSENING_METHOD_ONEPASS: amg_coarse_classic_onepass(A, amg_context, tag); break; + case viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION: + case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break; + //default: throw std::runtime_error("not implemented yet"); + } +} + + + + +////////////////////////////////////// Interpolation ///////////////////////////// + + +/** @brief Direct interpolation. Multi-threaded! 
(VIENNACL_AMG_INTERPOL_DIRECT) + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG preconditioner tag +*/ +template<typename NumericT> +void amg_interpol_direct(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + + unsigned int *point_types_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.point_types_.handle()); + unsigned int *influences_row_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_jumper_.handle()); + unsigned int *influences_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.influence_ids_.handle()); + unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle()); + + P.resize(A.size1(), amg_context.num_coarse_, false); + + std::vector<std::map<unsigned int, NumericT> > P_setup(A.size1()); + + // Iterate over all points to build the interpolation matrix row-by-row + // Interpolation for coarse points is immediate using '1'. + // Interpolation for fine points is set up via corresponding row weights (cf. Yang paper, p. 
14) +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long row2=0; row2<static_cast<long>(A.size1()); ++row2) + { + unsigned int row = static_cast<unsigned int>(row2); + std::map<unsigned int, NumericT> & P_setup_row = P_setup[row]; + //std::cout << "Row " << row << ": " << std::endl; + if (point_types_ptr[row] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + { + //std::cout << " Setting value 1.0 at " << coarse_id_ptr[row] << std::endl; + P_setup_row[coarse_id_ptr[row]] = NumericT(1); + } + else if (point_types_ptr[row] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE) + { + //std::cout << "Building interpolant for fine point " << row << std::endl; + + NumericT row_sum = 0; + NumericT row_coarse_sum = 0; + NumericT diag = 0; + + // Row sum of coefficients (without diagonal) and sum of influencing coarse point coefficients has to be computed + unsigned int row_A_start = A_row_buffer[row]; + unsigned int row_A_end = A_row_buffer[row + 1]; + unsigned int const *influence_iter = influences_id_ptr + influences_row_ptr[row]; + unsigned int const *influence_end = influences_id_ptr + influences_row_ptr[row + 1]; + for (unsigned int index = row_A_start; index < row_A_end; ++index) + { + unsigned int col = A_col_buffer[index]; + NumericT value = A_elements[index]; + + if (col == row) + { + diag = value; + continue; + } + else if (point_types_ptr[col] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + { + // Note: One increment is sufficient, because influence_iter traverses an ordered subset of the column indices in this row + while (influence_iter != influence_end && *influence_iter < col) + ++influence_iter; + + if (influence_iter != influence_end && *influence_iter == col) + row_coarse_sum += value; + } + + row_sum += value; + } + + NumericT temp_res = -row_sum/(row_coarse_sum*diag); + //std::cout << "row_sum: " << row_sum << ", row_coarse_sum: " << row_coarse_sum << ", diag: " << 
diag << std::endl; + + if (std::fabs(temp_res) > 1e-2 * std::fabs(diag)) + { + // Iterate over all strongly influencing points to build the interpolant + influence_iter = influences_id_ptr + influences_row_ptr[row]; + for (unsigned int index = row_A_start; index < row_A_end; ++index) + { + unsigned int col = A_col_buffer[index]; + if (point_types_ptr[col] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE) + continue; + NumericT value = A_elements[index]; + + // Advance to correct influence metric: + while (influence_iter != influence_end && *influence_iter < col) + ++influence_iter; + + if (influence_iter != influence_end && *influence_iter == col) + { + //std::cout << " Setting entry " << temp_res * value << " at " << coarse_id_ptr[col] << " for point " << col << std::endl; + P_setup_row[coarse_id_ptr[col]] = temp_res * value; + } + } + } + + // TODO truncate interpolation if specified by the user. + (void)tag; + } + else + throw std::runtime_error("Logic error in direct interpolation: Point is neither coarse-point nor fine-point!"); + } + + // TODO: P_setup can be avoided without sacrificing parallelism. + viennacl::tools::sparse_matrix_adapter<NumericT> P_adapter(P_setup, P.size1(), P.size2()); + viennacl::copy(P_adapter, P); +} + + +/** @brief AG (aggregation based) interpolation. Multi-Threaded! 
(VIENNACL_INTERPOL_AG) + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG configuration tag +*/ +template<typename NumericT> +void amg_interpol_ag(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A)); + + NumericT * P_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(P.handle()); + unsigned int * P_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(P.handle1()); + unsigned int * P_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(P.handle2()); + + unsigned int *coarse_id_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(amg_context.coarse_id_.handle()); + + // Build interpolation matrix: +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long row2 = 0; row2 < long(A.size1()); ++row2) + { + unsigned int row = static_cast<unsigned int>(row2); + P_elements[row] = NumericT(1); + P_row_buffer[row] = row; + P_col_buffer[row] = coarse_id_ptr[row]; + } + P_row_buffer[A.size1()] = static_cast<unsigned int>(A.size1()); // don't forget finalizer + + P.generate_row_block_information(); +} + + +/** @brief Smoothed aggregation interpolation. Multi-Threaded! 
(VIENNACL_INTERPOL_SA) + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG configuration tag +*/ +template<typename NumericT> +void amg_interpol_sa(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + (void)tag; + viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A)); + + // form tentative operator: + amg_interpol_ag(A, P_tentative, amg_context, tag); + + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); + + viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A)); + unsigned int * Jacobi_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(Jacobi.handle1()); + unsigned int * Jacobi_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(Jacobi.handle2()); + NumericT * Jacobi_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(Jacobi.handle()); + + + // Build Jacobi matrix: +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long row2=0; row2<static_cast<long>(A.size1()); ++row2) + { + unsigned int row = static_cast<unsigned int>(row2); + unsigned int row_begin = A_row_buffer[row]; + unsigned int row_end = A_row_buffer[row+1]; + + Jacobi_row_buffer[row] = row_begin; + + // Step 1: Extract diagonal: + NumericT diag = 0; + for (unsigned int j = row_begin; j < row_end; ++j) + { + if (A_col_buffer[j] == row) + { + diag = 
A_elements[j]; + break; + } + } + + // Step 2: Write entries: + for (unsigned int j = row_begin; j < row_end; ++j) + { + unsigned int col_index = A_col_buffer[j]; + Jacobi_col_buffer[j] = col_index; + + if (col_index == row) + Jacobi_elements[j] = NumericT(1) - NumericT(tag.get_jacobi_weight()); + else + Jacobi_elements[j] = - NumericT(tag.get_jacobi_weight()) * A_elements[j] / diag; + } + } + Jacobi_row_buffer[A.size1()] = static_cast<unsigned int>(Jacobi.nnz()); // don't forget finalizer + + P = viennacl::linalg::prod(Jacobi, P_tentative); + + P.generate_row_block_information(); +} + + +/** @brief Dispatcher for building the interpolation matrix + * + * @param A Operator matrix + * @param P Prolongation matrix + * @param amg_context AMG hierarchy datastructures + * @param tag AMG configuration tag +*/ +template<typename MatrixT> +void amg_interpol(MatrixT const & A, + MatrixT & P, + viennacl::linalg::detail::amg::amg_level_context & amg_context, + viennacl::linalg::amg_tag & tag) +{ + switch (tag.get_interpolation_method()) + { + case viennacl::linalg::AMG_INTERPOLATION_METHOD_DIRECT: amg_interpol_direct (A, P, amg_context, tag); break; + case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION: amg_interpol_ag (A, P, amg_context, tag); break; + case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa (A, P, amg_context, tag); break; + default: throw std::runtime_error("Not implemented yet!"); + } +} + + +/** @brief Computes B = trans(A). + * + * To be replaced by native functionality in ViennaCL. 
+ */ +template<typename NumericT> +void amg_transpose(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & B) +{ + NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + + // initialize datastructures for B: + B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A)); + + NumericT * B_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle()); + unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1()); + unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2()); + + // prepare uninitialized B_row_buffer: + for (std::size_t i = 0; i < B.size1(); ++i) + B_row_buffer[i] = 0; + + // + // Stage 1: Compute pattern for B + // + for (std::size_t row = 0; row < A.size1(); ++row) + { + unsigned int row_start = A_row_buffer[row]; + unsigned int row_stop = A_row_buffer[row+1]; + + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + B_row_buffer[A_col_buffer[nnz_index]] += 1; + } + + // Bring row-start array in place using inclusive-scan: + unsigned int offset = B_row_buffer[0]; + B_row_buffer[0] = 0; + for (std::size_t row = 1; row < B.size1(); ++row) + { + unsigned int tmp = B_row_buffer[row]; + B_row_buffer[row] = offset; + offset += tmp; + } + B_row_buffer[B.size1()] = offset; + + // + // Stage 2: Fill with data + // + + std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row + + for (std::size_t row = 0; row < A.size1(); ++row) + { + //std::cout << "Row " << row << ": "; + unsigned int row_start = 
A_row_buffer[row]; + unsigned int row_stop = A_row_buffer[row+1]; + + for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) + { + unsigned int col_in_A = A_col_buffer[nnz_index]; + unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A]; + B_col_buffer[B_nnz_index] = static_cast<unsigned int>(row); + B_elements[B_nnz_index] = A_elements[nnz_index]; + ++B_row_offsets[col_in_A]; + //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index]; + } + } + + // Step 3: Make datastructure consistent (row blocks!) + B.generate_row_block_information(); +} + +/** Assign sparse matrix A to dense matrix B */ +template<typename NumericT, unsigned int AlignmentV> +void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, + viennacl::matrix_base<NumericT> & B) +{ + NumericT const * A_elements = detail::extract_raw_pointer<NumericT>(A.handle()); + unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2()); + + NumericT * B_elements = detail::extract_raw_pointer<NumericT>(B.handle()); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long row = 0; row < static_cast<long>(A.size1()); ++row) + { + unsigned int row_stop = A_row_buffer[row+1]; + + for (unsigned int nnz_index = A_row_buffer[row]; nnz_index < row_stop; ++nnz_index) + B_elements[static_cast<unsigned int>(row) * static_cast<unsigned int>(B.internal_size2()) + A_col_buffer[nnz_index]] = A_elements[nnz_index]; + } + +} + +/** @brief Damped Jacobi Smoother (CUDA version) +* +* @param iterations Number of smoother iterations +* @param A Operator matrix for the smoothing +* @param x The vector smoothing is applied to +* @param x_backup (Different) Vector holding the same values as x +* @param rhs_smooth The right hand side of the equation for the smoother +* @param weight Damping factor. 0: No effect of smoother. 
1: Undamped Jacobi iteration +*/ +template<typename NumericT> +void smooth_jacobi(unsigned int iterations, + compressed_matrix<NumericT> const & A, + vector<NumericT> & x, + vector<NumericT> & x_backup, + vector<NumericT> const & rhs_smooth, + NumericT weight) +{ + + NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); + unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); + unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); + NumericT const * rhs_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(rhs_smooth.handle()); + + NumericT * x_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x.handle()); + NumericT const * x_old_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x_backup.handle()); + + for (unsigned int i=0; i<iterations; ++i) + { + x_backup = x; + + #ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for + #endif + for (long row2 = 0; row2 < static_cast<long>(A.size1()); ++row2) + { + unsigned int row = static_cast<unsigned int>(row2); + unsigned int col_end = A_row_buffer[row+1]; + + NumericT sum = NumericT(0); + NumericT diag = NumericT(1); + for (unsigned int index = A_row_buffer[row]; index != col_end; ++index) + { + unsigned int col = A_col_buffer[index]; + if (col == row) + diag = A_elements[index]; + else + sum += A_elements[index] * x_old_elements[col]; + } + + x_elements[row] = weight * (rhs_elements[row] - sum) / diag + (NumericT(1) - weight) * x_old_elements[row]; + } + } +} + +} //namespace amg +} //namespace host_based +} //namespace linalg +} //namespace viennacl + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp 
---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp new file mode 100644 index 0000000..8ddb4c1 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp @@ -0,0 +1,149 @@ +#ifndef VIENNACL_LINALG_HOST_BASED_COMMON_HPP_ +#define VIENNACL_LINALG_HOST_BASED_COMMON_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/host_based/common.hpp + @brief Common routines for single-threaded or OpenMP-enabled execution on CPU +*/ + +#include "viennacl/traits/handle.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace host_based +{ +namespace detail +{ + +template<typename ResultT, typename VectorT> +ResultT * extract_raw_pointer(VectorT & vec) +{ + return reinterpret_cast<ResultT *>(viennacl::traits::ram_handle(vec).get()); +} + +template<typename ResultT, typename VectorT> +ResultT const * extract_raw_pointer(VectorT const & vec) +{ + return reinterpret_cast<ResultT const *>(viennacl::traits::ram_handle(vec).get()); +} + +/** @brief Helper class for accessing a strided subvector of a larger vector. 
*/ +template<typename NumericT> +class vector_array_wrapper +{ +public: + typedef NumericT value_type; + + vector_array_wrapper(value_type * A, + vcl_size_t start, + vcl_size_t inc) + : A_(A), + start_(start), + inc_(inc) {} + + value_type & operator()(vcl_size_t i) { return A_[i * inc_ + start_]; } + +private: + value_type * A_; + vcl_size_t start_; + vcl_size_t inc_; +}; + + +/** @brief Helper array for accessing a strided submatrix embedded in a larger matrix. */ +template<typename NumericT, typename LayoutT, bool is_transposed> +class matrix_array_wrapper +{ + public: + typedef NumericT value_type; + + matrix_array_wrapper(value_type * A, + vcl_size_t start1, vcl_size_t start2, + vcl_size_t inc1, vcl_size_t inc2, + vcl_size_t internal_size1, vcl_size_t internal_size2) + : A_(A), + start1_(start1), start2_(start2), + inc1_(inc1), inc2_(inc2), + internal_size1_(internal_size1), internal_size2_(internal_size2) {} + + value_type & operator()(vcl_size_t i, vcl_size_t j) + { + return A_[LayoutT::mem_index(i * inc1_ + start1_, + j * inc2_ + start2_, + internal_size1_, internal_size2_)]; + } + + // convenience overloads to address signed index types for OpenMP: + value_type & operator()(vcl_size_t i, long j) { return operator()(i, static_cast<vcl_size_t>(j)); } + value_type & operator()(long i, vcl_size_t j) { return operator()(static_cast<vcl_size_t>(i), j); } + value_type & operator()(long i, long j) { return operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); } + + private: + value_type * A_; + vcl_size_t start1_, start2_; + vcl_size_t inc1_, inc2_; + vcl_size_t internal_size1_, internal_size2_; +}; + +/** \cond */ +template<typename NumericT, typename LayoutT> +class matrix_array_wrapper<NumericT, LayoutT, true> +{ +public: + typedef NumericT value_type; + + matrix_array_wrapper(value_type * A, + vcl_size_t start1, vcl_size_t start2, + vcl_size_t inc1, vcl_size_t inc2, + vcl_size_t internal_size1, vcl_size_t internal_size2) + : A_(A), + 
start1_(start1), start2_(start2), + inc1_(inc1), inc2_(inc2), + internal_size1_(internal_size1), internal_size2_(internal_size2) {} + + value_type & operator()(vcl_size_t i, vcl_size_t j) + { + //swapping row and column indices here + return A_[LayoutT::mem_index(j * inc1_ + start1_, + i * inc2_ + start2_, + internal_size1_, internal_size2_)]; + } + + // convenience overloads to address signed index types for OpenMP: + value_type & operator()(vcl_size_t i, long j) { return operator()(i, static_cast<vcl_size_t>(j)); } + value_type & operator()(long i, vcl_size_t j) { return operator()(static_cast<vcl_size_t>(i), j); } + value_type & operator()(long i, long j) { return operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); } + +private: + value_type * A_; + vcl_size_t start1_, start2_; + vcl_size_t inc1_, inc2_; + vcl_size_t internal_size1_, internal_size2_; +}; +/** \endcond */ + +} //namespace detail +} //namespace host_based +} //namespace linalg +} //namespace viennacl + + +#endif
