http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
deleted file mode 100644
index fb89742..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/gmres.hpp
+++ /dev/null
@@ -1,738 +0,0 @@
-#ifndef VIENNACL_GMRES_HPP_
-#define VIENNACL_GMRES_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/gmres.hpp
-    @brief Implementations of the generalized minimum residual method are in 
this file.
-*/
-
-#include <vector>
-#include <cmath>
-#include <limits>
-#include "viennacl/forwards.h"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/linalg/norm_2.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/traits/clear.hpp"
-#include "viennacl/traits/size.hpp"
-#include "viennacl/traits/context.hpp"
-#include "viennacl/meta/result_of.hpp"
-
-#include "viennacl/linalg/iterative_operations.hpp"
-#include "viennacl/vector_proxy.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-
-/** @brief A tag for the solver GMRES. Used for supplying solver parameters 
and for dispatching the solve() function
-*/
-class gmres_tag       //generalized minimum residual
-{
-public:
-  /** @brief The constructor
-  *
-  * @param tol            Relative tolerance for the residual (solver quits if 
||r|| < tol * ||r_initial||)
-  * @param max_iterations The maximum number of iterations (including restarts
-  * @param krylov_dim     The maximum dimension of the Krylov space before 
restart (number of restarts is found by max_iterations / krylov_dim)
-  */
-  gmres_tag(double tol = 1e-10, unsigned int max_iterations = 300, unsigned 
int krylov_dim = 20)
-   : tol_(tol), abs_tol_(0), iterations_(max_iterations), 
krylov_dim_(krylov_dim), iters_taken_(0) {}
-
-  /** @brief Returns the relative tolerance */
-  double tolerance() const { return tol_; }
-
-  /** @brief Returns the absolute tolerance */
-  double abs_tolerance() const { return abs_tol_; }
-  /** @brief Sets the absolute tolerance */
-  void abs_tolerance(double new_tol) { if (new_tol >= 0) abs_tol_ = new_tol; }
-
-  /** @brief Returns the maximum number of iterations */
-  unsigned int max_iterations() const { return iterations_; }
-  /** @brief Returns the maximum dimension of the Krylov space before restart 
*/
-  unsigned int krylov_dim() const { return krylov_dim_; }
-  /** @brief Returns the maximum number of GMRES restarts */
-  unsigned int max_restarts() const
-  {
-    unsigned int ret = iterations_ / krylov_dim_;
-    if (ret > 0 && (ret * krylov_dim_ == iterations_) )
-      return ret - 1;
-    return ret;
-  }
-
-  /** @brief Return the number of solver iterations: */
-  unsigned int iters() const { return iters_taken_; }
-  /** @brief Set the number of solver iterations (should only be modified by 
the solver) */
-  void iters(unsigned int i) const { iters_taken_ = i; }
-
-  /** @brief Returns the estimated relative error at the end of the solver run 
*/
-  double error() const { return last_error_; }
-  /** @brief Sets the estimated relative error at the end of the solver run */
-  void error(double e) const { last_error_ = e; }
-
-private:
-  double tol_;
-  double abs_tol_;
-  unsigned int iterations_;
-  unsigned int krylov_dim_;
-
-  //return values from solver
-  mutable unsigned int iters_taken_;
-  mutable double last_error_;
-};
-
-namespace detail
-{
-
-  template<typename SrcVectorT, typename DestVectorT>
-  void gmres_copy_helper(SrcVectorT const & src, DestVectorT & dest, 
vcl_size_t len, vcl_size_t start = 0)
-  {
-    for (vcl_size_t i=0; i<len; ++i)
-      dest[start+i] = src[start+i];
-  }
-
-  template<typename NumericT, typename DestVectorT>
-  void gmres_copy_helper(viennacl::vector<NumericT> const & src, DestVectorT & 
dest, vcl_size_t len, vcl_size_t start = 0)
-  {
-    typedef typename viennacl::vector<NumericT>::difference_type   
difference_type;
-    viennacl::copy( src.begin() + static_cast<difference_type>(start),
-                    src.begin() + static_cast<difference_type>(start + len),
-                   dest.begin() + static_cast<difference_type>(start));
-  }
-
-  /** @brief Computes the householder vector 'hh_vec' which rotates 
'input_vec' such that all entries below the j-th entry of 'v' become zero.
-    *
-    * @param input_vec       The input vector
-    * @param hh_vec          The householder vector defining the relection (I 
- beta * hh_vec * hh_vec^T)
-    * @param beta            The coefficient beta in (I - beta  * hh_vec * 
hh_vec^T)
-    * @param mu              The norm of the input vector part relevant for 
the reflection: norm_2(input_vec[j:size])
-    * @param j               Index of the last nonzero index in 'input_vec' 
after applying the reflection
-  */
-  template<typename VectorT, typename NumericT>
-  void gmres_setup_householder_vector(VectorT const & input_vec, VectorT & 
hh_vec, NumericT & beta, NumericT & mu, vcl_size_t j)
-  {
-    NumericT input_j = input_vec(j);
-
-    // copy entries from input vector to householder vector:
-    detail::gmres_copy_helper(input_vec, hh_vec, 
viennacl::traits::size(hh_vec) - (j+1), j+1);
-
-    NumericT sigma = viennacl::linalg::norm_2(hh_vec);
-    sigma *= sigma;
-
-    if (sigma <= 0)
-    {
-      beta = 0;
-      mu = input_j;
-    }
-    else
-    {
-      mu = std::sqrt(sigma + input_j*input_j);
-
-      NumericT hh_vec_0 = (input_j <= 0) ? (input_j - mu) : (-sigma / (input_j 
+ mu));
-
-      beta = NumericT(2) * hh_vec_0 * hh_vec_0 / (sigma + hh_vec_0 * hh_vec_0);
-
-      //divide hh_vec by its diagonal element hh_vec_0
-      hh_vec /= hh_vec_0;
-      hh_vec[j] = NumericT(1);
-    }
-  }
-
-  // Apply (I - beta h h^T) to x (Householder reflection with Householder 
vector h)
-  template<typename VectorT, typename NumericT>
-  void gmres_householder_reflect(VectorT & x, VectorT const & h, NumericT beta)
-  {
-    NumericT hT_in_x = viennacl::linalg::inner_prod(h, x);
-    x -= (beta * hT_in_x) * h;
-  }
-
-
-  /** @brief Implementation of a pipelined GMRES solver without preconditioner
-  *
-  * Following algorithm 2.1 proposed by Walker in "A Simpler GMRES", but uses 
classical Gram-Schmidt instead of modified Gram-Schmidt for better 
parallelization.
-  * Uses some pipelining techniques for minimizing host-device transfer
-  *
-  * @param A            The system matrix
-  * @param rhs          The load vector
-  * @param tag          Solver configuration tag
-  * @param monitor      A callback routine which is called at each GMRES 
restart
-  * @param monitor_data Data pointer to be passed to the callback routine to 
pass on user-specific data
-  * @return The result vector
-  */
-  template <typename MatrixType, typename ScalarType>
-  viennacl::vector<ScalarType> pipelined_solve(MatrixType const & A,
-                                               viennacl::vector<ScalarType> 
const & rhs,
-                                               gmres_tag const & tag,
-                                               viennacl::linalg::no_precond,
-                                               bool 
(*monitor)(viennacl::vector<ScalarType> const &, ScalarType, void*) = NULL,
-                                               void *monitor_data = NULL)
-  {
-    viennacl::vector<ScalarType> residual(rhs);
-    viennacl::vector<ScalarType> result = 
viennacl::zero_vector<ScalarType>(rhs.size(), viennacl::traits::context(rhs));
-
-    viennacl::vector<ScalarType> device_krylov_basis(rhs.internal_size() * 
tag.krylov_dim(), viennacl::traits::context(rhs)); // not using 
viennacl::matrix here because of spurious padding in column number
-    viennacl::vector<ScalarType> 
device_buffer_R(tag.krylov_dim()*tag.krylov_dim(), 
viennacl::traits::context(rhs));
-    std::vector<ScalarType>      host_buffer_R(device_buffer_R.size());
-
-    vcl_size_t buffer_size_per_vector = 128;
-    vcl_size_t num_buffer_chunks      = 3;
-    viennacl::vector<ScalarType> device_inner_prod_buffer = 
viennacl::zero_vector<ScalarType>(num_buffer_chunks*buffer_size_per_vector, 
viennacl::traits::context(rhs)); // temporary buffer
-    viennacl::vector<ScalarType> device_r_dot_vk_buffer   = 
viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), 
viennacl::traits::context(rhs)); // holds result of first reduction stage for 
<r, v_k> on device
-    viennacl::vector<ScalarType> device_vi_in_vk_buffer   = 
viennacl::zero_vector<ScalarType>(buffer_size_per_vector * tag.krylov_dim(), 
viennacl::traits::context(rhs)); // holds <v_i, v_k> for i=0..k-1 on device
-    viennacl::vector<ScalarType> device_values_xi_k       = 
viennacl::zero_vector<ScalarType>(tag.krylov_dim(), 
viennacl::traits::context(rhs)); // holds values \xi_k = <r, v_k> on device
-    std::vector<ScalarType>      
host_r_dot_vk_buffer(device_r_dot_vk_buffer.size());
-    std::vector<ScalarType>      host_values_xi_k(tag.krylov_dim());
-    std::vector<ScalarType>      host_values_eta_k_buffer(tag.krylov_dim());
-    std::vector<ScalarType>      host_update_coefficients(tag.krylov_dim());
-
-    ScalarType norm_rhs = viennacl::linalg::norm_2(residual);
-    ScalarType rho_0 = norm_rhs;
-    ScalarType rho = ScalarType(1);
-
-    tag.iters(0);
-
-    for (unsigned int restart_count = 0; restart_count <= tag.max_restarts(); 
++restart_count)
-    {
-      //
-      // prepare restart:
-      //
-      if (restart_count > 0)
-      {
-        // compute new residual without introducing a temporary for A*x:
-        residual = viennacl::linalg::prod(A, result);
-        residual = rhs - residual;
-
-        rho_0 = viennacl::linalg::norm_2(residual);
-      }
-
-      if (rho_0 <= ScalarType(tag.abs_tolerance()))  // trivial right hand 
side?
-        break;
-
-      residual /= rho_0;
-      rho = ScalarType(1);
-
-      // check for convergence:
-      if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance())
-        break;
-
-      //
-      // minimize in Krylov basis:
-      //
-      vcl_size_t k = 0;
-      for (k = 0; k < static_cast<vcl_size_t>(tag.krylov_dim()); ++k)
-      {
-        if (k == 0)
-        {
-          // compute v0 = A*r and perform first reduction stage for ||v0||
-          viennacl::vector_range<viennacl::vector<ScalarType> > 
v0(device_krylov_basis, viennacl::range(0, rhs.size()));
-          viennacl::linalg::pipelined_gmres_prod(A, residual, v0, 
device_inner_prod_buffer);
-
-          // Normalize v_1 and compute first reduction stage for <r, v_0> in 
device_r_dot_vk_buffer:
-          viennacl::linalg::pipelined_gmres_normalize_vk(v0, residual,
-                                                         device_buffer_R, 
k*tag.krylov_dim() + k,
-                                                         
device_inner_prod_buffer, device_r_dot_vk_buffer,
-                                                         
buffer_size_per_vector, k*buffer_size_per_vector);
-        }
-        else
-        {
-          // compute v0 = A*r and perform first reduction stage for ||v0||
-          viennacl::vector_range<viennacl::vector<ScalarType> > vk        
(device_krylov_basis, viennacl::range( k   *rhs.internal_size(),  k   
*rhs.internal_size() + rhs.size()));
-          viennacl::vector_range<viennacl::vector<ScalarType> > 
vk_minus_1(device_krylov_basis, viennacl::range((k-1)*rhs.internal_size(), 
(k-1)*rhs.internal_size() + rhs.size()));
-          viennacl::linalg::pipelined_gmres_prod(A, vk_minus_1, vk, 
device_inner_prod_buffer);
-
-          //
-          // Gram-Schmidt, stage 1: compute first reduction stage of <v_i, v_k>
-          //
-          
viennacl::linalg::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, 
rhs.size(), rhs.internal_size(), k, device_vi_in_vk_buffer, 
buffer_size_per_vector);
-
-          //
-          // Gram-Schmidt, stage 2: compute second reduction stage of <v_i, 
v_k> and use that to compute v_k -= sum_i <v_i, v_k> v_i.
-          //                        Store <v_i, v_k> in R-matrix and compute 
first reduction stage for ||v_k||
-          //
-          
viennacl::linalg::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, 
rhs.size(), rhs.internal_size(), k,
-                                                                
device_vi_in_vk_buffer,
-                                                                
device_buffer_R, tag.krylov_dim(),
-                                                                
device_inner_prod_buffer, buffer_size_per_vector);
-
-          //
-          // Normalize v_k and compute first reduction stage for <r, v_k> in 
device_r_dot_vk_buffer:
-          //
-          viennacl::linalg::pipelined_gmres_normalize_vk(vk, residual,
-                                                         device_buffer_R, 
k*tag.krylov_dim() + k,
-                                                         
device_inner_prod_buffer, device_r_dot_vk_buffer,
-                                                         
buffer_size_per_vector, k*buffer_size_per_vector);
-        }
-      }
-
-      //
-      // Run reduction to obtain the values \xi_k = <r, v_k>.
-      // Note that unlike Algorithm 2.1 in Walker: "A Simpler GMRES", we do 
not update the residual
-      //
-      viennacl::fast_copy(device_r_dot_vk_buffer.begin(), 
device_r_dot_vk_buffer.end(), host_r_dot_vk_buffer.begin());
-      for (std::size_t i=0; i<k; ++i)
-      {
-        host_values_xi_k[i] = ScalarType(0);
-        for (std::size_t j=0; j<buffer_size_per_vector; ++j)
-          host_values_xi_k[i] += host_r_dot_vk_buffer[i*buffer_size_per_vector 
+ j];
-      }
-
-      //
-      // Bring values in R  back to host:
-      //
-      viennacl::fast_copy(device_buffer_R.begin(), device_buffer_R.end(), 
host_buffer_R.begin());
-
-      //
-      // Check for premature convergence: If the diagonal element drops too 
far below the first norm, we're done and restrict the Krylov size accordingly.
-      //
-      vcl_size_t full_krylov_dim = k; //needed for proper access to R
-      for (std::size_t i=0; i<k; ++i)
-      {
-        if (std::fabs(host_buffer_R[i + i*k]) < tag.tolerance() * 
host_buffer_R[0])
-        {
-          k = i;
-          break;
-        }
-      }
-
-
-      // Compute error estimator:
-      for (std::size_t i=0; i<k; ++i)
-      {
-        tag.iters( tag.iters() + 1 ); //increase iteration counter
-
-        // check for accumulation of round-off errors for poorly conditioned 
systems
-        if (host_values_xi_k[i] >= rho || host_values_xi_k[i] <= -rho)
-        {
-          k = i;
-          break;  // restrict Krylov space at this point. No gain from using 
additional basis vectors, since orthogonality is lost.
-        }
-
-        // update error estimator
-        rho *= std::sin( std::acos(host_values_xi_k[i] / rho) );
-      }
-
-      //
-      // Solve minimization problem:
-      //
-      host_values_eta_k_buffer = host_values_xi_k;
-
-      for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
-      {
-        vcl_size_t i = static_cast<vcl_size_t>(i2);
-        for (vcl_size_t j=static_cast<vcl_size_t>(i)+1; j<k; ++j)
-          host_values_eta_k_buffer[i] -= host_buffer_R[i + j*full_krylov_dim] 
* host_values_eta_k_buffer[j];
-
-        host_values_eta_k_buffer[i] /= host_buffer_R[i + i*full_krylov_dim];
-      }
-
-      //
-      // Update x += rho * z with z = \eta_0 * residual + sum_{i=0}^{k-1} 
\eta_{i+1} v_i
-      // Note that we have not updated the residual yet, hence this slightly 
modified as compared to the form given in Algorithm 2.1 in Walker: "A Simpler 
GMRES"
-      //
-      for (vcl_size_t i=0; i<k; ++i)
-        host_update_coefficients[i] = rho_0 * host_values_eta_k_buffer[i];
-
-      viennacl::fast_copy(host_update_coefficients.begin(), 
host_update_coefficients.end(), device_values_xi_k.begin()); //reuse 
device_values_xi_k_buffer here for simplicity
-
-      viennacl::linalg::pipelined_gmres_update_result(result, residual,
-                                                      device_krylov_basis, 
rhs.size(), rhs.internal_size(),
-                                                      device_values_xi_k, k);
-
-      tag.error( std::fabs(rho*rho_0 / norm_rhs) );
-
-      if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), 
monitor_data))
-        break;
-    }
-
-    return result;
-  }
-
-  /** @brief Overload for the pipelined CG implementation for the ViennaCL 
sparse matrix types */
-  template<typename NumericT>
-  viennacl::vector<NumericT> solve_impl(viennacl::compressed_matrix<NumericT> 
const & A,
-                                        viennacl::vector<NumericT> const & rhs,
-                                        gmres_tag const & tag,
-                                        viennacl::linalg::no_precond,
-                                        bool 
(*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
-                                        void *monitor_data = NULL)
-  {
-    return pipelined_solve(A, rhs, tag, viennacl::linalg::no_precond(), 
monitor, monitor_data);
-  }
-
-
-  /** @brief Overload for the pipelined CG implementation for the ViennaCL 
sparse matrix types */
-  template<typename NumericT>
-  viennacl::vector<NumericT> solve_impl(viennacl::coordinate_matrix<NumericT> 
const & A,
-                                        viennacl::vector<NumericT> const & rhs,
-                                        gmres_tag const & tag,
-                                        viennacl::linalg::no_precond,
-                                        bool 
(*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
-                                        void *monitor_data = NULL)
-  {
-    return detail::pipelined_solve(A, rhs, tag, 
viennacl::linalg::no_precond(), monitor, monitor_data);
-  }
-
-
-
-  /** @brief Overload for the pipelined CG implementation for the ViennaCL 
sparse matrix types */
-  template<typename NumericT>
-  viennacl::vector<NumericT> solve_impl(viennacl::ell_matrix<NumericT> const & 
A,
-                                        viennacl::vector<NumericT> const & rhs,
-                                        gmres_tag const & tag,
-                                        viennacl::linalg::no_precond,
-                                        bool 
(*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
-                                        void *monitor_data = NULL)
-  {
-    return detail::pipelined_solve(A, rhs, tag, 
viennacl::linalg::no_precond(), monitor, monitor_data);
-  }
-
-
-
-  /** @brief Overload for the pipelined CG implementation for the ViennaCL 
sparse matrix types */
-  template<typename NumericT>
-  viennacl::vector<NumericT> solve_impl(viennacl::sliced_ell_matrix<NumericT> 
const & A,
-                                        viennacl::vector<NumericT> const & rhs,
-                                        gmres_tag const & tag,
-                                        viennacl::linalg::no_precond,
-                                        bool 
(*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
-                                        void *monitor_data = NULL)
-  {
-    return detail::pipelined_solve(A, rhs, tag, 
viennacl::linalg::no_precond(), monitor, monitor_data);
-  }
-
-
-  /** @brief Overload for the pipelined CG implementation for the ViennaCL 
sparse matrix types */
-  template<typename NumericT>
-  viennacl::vector<NumericT> solve_impl(viennacl::hyb_matrix<NumericT> const & 
A,
-                                        viennacl::vector<NumericT> const & rhs,
-                                        gmres_tag const & tag,
-                                        viennacl::linalg::no_precond,
-                                        bool 
(*monitor)(viennacl::vector<NumericT> const &, NumericT, void*) = NULL,
-                                        void *monitor_data = NULL)
-  {
-    return detail::pipelined_solve(A, rhs, tag, 
viennacl::linalg::no_precond(), monitor, monitor_data);
-  }
-
-
-  /** @brief Implementation of the GMRES solver.
-  *
-  * Following the algorithm proposed by Walker in "A Simpler GMRES"
-  *
-  * @param matrix       The system matrix
-  * @param rhs          The load vector
-  * @param tag          Solver configuration tag
-  * @param precond      A preconditioner. Precondition operation is done via 
member function apply()
-  * @param monitor      A callback routine which is called at each GMRES 
restart
-  * @param monitor_data Data pointer to be passed to the callback routine to 
pass on user-specific data
-  *
-  * @return The result vector
-  */
-  template<typename MatrixT, typename VectorT, typename PreconditionerT>
-  VectorT solve_impl(MatrixT const & matrix,
-                     VectorT const & rhs,
-                     gmres_tag const & tag,
-                     PreconditionerT const & precond,
-                     bool (*monitor)(VectorT const &, typename 
viennacl::result_of::cpu_value_type<typename 
viennacl::result_of::value_type<VectorT>::type>::type, void*) = NULL,
-                     void *monitor_data = NULL)
-  {
-    typedef typename viennacl::result_of::value_type<VectorT>::type            
NumericType;
-    typedef typename viennacl::result_of::cpu_value_type<NumericType>::type    
CPU_NumericType;
-
-    unsigned int problem_size = static_cast<unsigned 
int>(viennacl::traits::size(rhs));
-    VectorT result = rhs;
-    viennacl::traits::clear(result);
-
-    vcl_size_t krylov_dim = static_cast<vcl_size_t>(tag.krylov_dim());
-    if (problem_size < krylov_dim)
-      krylov_dim = problem_size; //A Krylov space larger than the matrix would 
lead to seg-faults (mathematically, error is certain to be zero already)
-
-    VectorT res = rhs;
-    VectorT v_k_tilde = rhs;
-    VectorT v_k_tilde_temp = rhs;
-
-    std::vector< std::vector<CPU_NumericType> > R(krylov_dim, 
std::vector<CPU_NumericType>(tag.krylov_dim()));
-    std::vector<CPU_NumericType> projection_rhs(krylov_dim);
-
-    std::vector<VectorT>          householder_reflectors(krylov_dim, rhs);
-    std::vector<CPU_NumericType>  betas(krylov_dim);
-
-    CPU_NumericType norm_rhs = viennacl::linalg::norm_2(rhs);
-
-    if (norm_rhs <= tag.abs_tolerance()) //solution is zero if RHS norm is zero
-      return result;
-
-    tag.iters(0);
-
-    for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
-    {
-      //
-      // (Re-)Initialize residual: r = b - A*x (without temporary for the 
result of A*x)
-      //
-      res = viennacl::linalg::prod(matrix, result);  //initial guess zero
-      res = rhs - res;
-      precond.apply(res);
-
-      CPU_NumericType rho_0 = viennacl::linalg::norm_2(res);
-
-      //
-      // Check for premature convergence
-      //
-      if (rho_0 / norm_rhs < tag.tolerance() || rho_0 < tag.abs_tolerance()) 
// norm_rhs is known to be nonzero here
-      {
-        tag.error(rho_0 / norm_rhs);
-        return result;
-      }
-
-      //
-      // Normalize residual and set 'rho' to 1 as requested in 'A Simpler 
GMRES' by Walker and Zhou.
-      //
-      res /= rho_0;
-      CPU_NumericType rho = static_cast<CPU_NumericType>(1.0);
-
-
-      //
-      // Iterate up until maximal Krylove space dimension is reached:
-      //
-      vcl_size_t k = 0;
-      for (k = 0; k < krylov_dim; ++k)
-      {
-        tag.iters( tag.iters() + 1 ); //increase iteration counter
-
-        // prepare storage:
-        viennacl::traits::clear(R[k]);
-        viennacl::traits::clear(householder_reflectors[k]);
-
-        //compute v_k = A * v_{k-1} via Householder matrices
-        if (k == 0)
-        {
-          v_k_tilde = viennacl::linalg::prod(matrix, res);
-          precond.apply(v_k_tilde);
-        }
-        else
-        {
-          viennacl::traits::clear(v_k_tilde);
-          v_k_tilde[k-1] = CPU_NumericType(1);
-
-          //Householder rotations, part 1: Compute P_1 * P_2 * ... * P_{k-1} * 
e_{k-1}
-          for (int i = static_cast<int>(k)-1; i > -1; --i)
-            detail::gmres_householder_reflect(v_k_tilde, 
householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);
-
-          v_k_tilde_temp = viennacl::linalg::prod(matrix, v_k_tilde);
-          precond.apply(v_k_tilde_temp);
-          v_k_tilde = v_k_tilde_temp;
-
-          //Householder rotations, part 2: Compute P_{k-1} * ... * P_{1} * 
v_k_tilde
-          for (vcl_size_t i = 0; i < k; ++i)
-            detail::gmres_householder_reflect(v_k_tilde, 
householder_reflectors[i], betas[i]);
-        }
-
-        //
-        // Compute Householder reflection for v_k_tilde such that all entries 
below k-th entry are zero:
-        //
-        CPU_NumericType rho_k_k = 0;
-        detail::gmres_setup_householder_vector(v_k_tilde, 
householder_reflectors[k], betas[k], rho_k_k, k);
-
-        //
-        // copy first k entries from v_k_tilde to R[k] in order to fill k-th 
column with result of
-        // P_k * v_k_tilde = (v[0], ... , v[k-1], norm(v), 0, 0, ...) =: 
(rho_{1,k}, rho_{2,k}, ..., rho_{k,k}, 0, ..., 0);
-        //
-        detail::gmres_copy_helper(v_k_tilde, R[k], k);
-        R[k][k] = rho_k_k;
-
-        //
-        // Update residual: r = P_k r
-        // Set zeta_k = r[k] including machine precision considerations: 
mathematically we have |r[k]| <= rho
-        // Set rho *= sin(acos(r[k] / rho))
-        //
-        detail::gmres_householder_reflect(res, householder_reflectors[k], 
betas[k]);
-
-        if (res[k] > rho) //machine precision reached
-          res[k] = rho;
-        if (res[k] < -rho) //machine precision reached
-          res[k] = -rho;
-        projection_rhs[k] = res[k];
-
-        rho *= std::sin( std::acos(projection_rhs[k] / rho) );
-
-        if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance())  // Residual 
is sufficiently reduced, stop here
-        {
-          tag.error( std::fabs(rho*rho_0 / norm_rhs) );
-          ++k;
-          break;
-        }
-      } // for k
-
-      //
-      // Triangular solver stage:
-      //
-
-      for (int i2=static_cast<int>(k)-1; i2>-1; --i2)
-      {
-        vcl_size_t i = static_cast<vcl_size_t>(i2);
-        for (vcl_size_t j=i+1; j<k; ++j)
-          projection_rhs[i] -= R[j][i] * projection_rhs[j];     //R is 
transposed
-
-        projection_rhs[i] /= R[i][i];
-      }
-
-      //
-      // Note: 'projection_rhs' now holds the solution (eta_1, ..., eta_k)
-      //
-
-      res *= projection_rhs[0];
-
-      if (k > 0)
-      {
-        for (unsigned int i = 0; i < k-1; ++i)
-          res[i] += projection_rhs[i+1];
-      }
-
-      //
-      // Form z inplace in 'res' by applying P_1 * ... * P_{k}
-      //
-      for (int i=static_cast<int>(k)-1; i>=0; --i)
-        detail::gmres_householder_reflect(res, 
householder_reflectors[vcl_size_t(i)], betas[vcl_size_t(i)]);
-
-      res *= rho_0;
-      result += res;  // x += rho_0 * z    in the paper
-
-      //
-      // Check for convergence:
-      //
-      tag.error(std::fabs(rho*rho_0 / norm_rhs));
-
-      if (monitor && monitor(result, std::fabs(rho*rho_0 / norm_rhs), 
monitor_data))
-        break;
-
-      if ( tag.error() < tag.tolerance() )
-        return result;
-    }
-
-    return result;
-  }
-
-}
-
-template<typename MatrixT, typename VectorT, typename PreconditionerT>
-VectorT solve(MatrixT const & matrix, VectorT const & rhs, gmres_tag const & 
tag, PreconditionerT const & precond)
-{
-  return detail::solve_impl(matrix, rhs, tag, precond);
-}
-
-/** @brief Convenience overload for calling the preconditioned BiCGStab solver 
using types from the C++ STL.
-  *
-  * A std::vector<std::map<T, U> > matrix is convenient for e.g. finite 
element assembly.
-  * It is not the fastest option for setting up a system, but often it is fast 
enough - particularly for just trying things out.
-  */
-template<typename IndexT, typename NumericT, typename PreconditionerT>
-std::vector<NumericT> solve(std::vector< std::map<IndexT, NumericT> > const & 
A, std::vector<NumericT> const & rhs, gmres_tag const & tag, PreconditionerT 
const & precond)
-{
-  viennacl::compressed_matrix<NumericT> vcl_A;
-  viennacl::copy(A, vcl_A);
-
-  viennacl::vector<NumericT> vcl_rhs(rhs.size());
-  viennacl::copy(rhs, vcl_rhs);
-
-  viennacl::vector<NumericT> vcl_result = solve(vcl_A, vcl_rhs, tag, precond);
-
-  std::vector<NumericT> result(vcl_result.size());
-  viennacl::copy(vcl_result, result);
-  return result;
-}
-
-/** @brief Entry point for the unpreconditioned GMRES method.
- *
- *  @param A         The system matrix
- *  @param rhs       Right hand side vector (load vector)
- *  @param tag       A BiCGStab tag providing relative tolerances, etc.
- */
-
-template<typename MatrixT, typename VectorT>
-VectorT solve(MatrixT const & A, VectorT const & rhs, gmres_tag const & tag)
-{
-  return solve(A, rhs, tag, no_precond());
-}
-
-
-
-template<typename VectorT>
-class gmres_solver
-{
-public:
-  typedef typename viennacl::result_of::cpu_value_type<VectorT>::type   
numeric_type;
-
-  gmres_solver(gmres_tag const & tag) : tag_(tag), monitor_callback_(NULL), 
user_data_(NULL) {}
-
-  template<typename MatrixT, typename PreconditionerT>
-  VectorT operator()(MatrixT const & A, VectorT const & b, PreconditionerT 
const & precond) const
-  {
-    if (viennacl::traits::size(init_guess_) > 0) // take initial guess into 
account
-    {
-      VectorT mod_rhs = viennacl::linalg::prod(A, init_guess_);
-      mod_rhs = b - mod_rhs;
-      VectorT y = detail::solve_impl(A, mod_rhs, tag_, precond, 
monitor_callback_, user_data_);
-      return init_guess_ + y;
-    }
-    return detail::solve_impl(A, b, tag_, precond, monitor_callback_, 
user_data_);
-  }
-
-
-  template<typename MatrixT>
-  VectorT operator()(MatrixT const & A, VectorT const & b) const
-  {
-    return operator()(A, b, viennacl::linalg::no_precond());
-  }
-
-  /** @brief Specifies an initial guess for the iterative solver.
-    *
-    * An iterative solver for Ax = b with initial guess x_0 is equivalent to 
an iterative solver for Ay = b' := b - Ax_0, where x = x_0 + y.
-    */
-  void set_initial_guess(VectorT const & x) { init_guess_ = x; }
-
-  /** @brief Sets a monitor function pointer to be called in each iteration. 
Set to NULL to run without monitor.
-   *
-   *  The monitor function is called with the current guess for the result as 
first argument and the current relative residual estimate as second argument.
-   *  The third argument is a pointer to user-defined data, through which 
additional information can be passed.
-   *  This pointer needs to be set with set_monitor_data. If not set, NULL is 
passed.
-   *  If the montior function returns true, the solver terminates (either 
convergence or divergence).
-   */
-  void set_monitor(bool (*monitor_fun)(VectorT const &, numeric_type, void *), 
void *user_data)
-  {
-    monitor_callback_ = monitor_fun;
-    user_data_ = user_data;
-  }
-
-  /** @brief Returns the solver tag containing basic configuration such as 
tolerances, etc. */
-  gmres_tag const & tag() const { return tag_; }
-
-private:
-  gmres_tag  tag_;
-  VectorT    init_guess_;
-  bool       (*monitor_callback_)(VectorT const &, numeric_type, void *);
-  void       *user_data_;
-};
-
-
-}
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
deleted file mode 100644
index 43ca928..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/hankel_matrix_operations.hpp
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
-#define VIENNACL_LINALG_HANKEL_MATRIX_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/hankel_matrix_operations.hpp
-    @brief Implementations of operations using hankel_matrix. Experimental.
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/fft.hpp"
-#include "viennacl/linalg/toeplitz_matrix_operations.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-
-// A * x
-
-/** @brief Carries out matrix-vector multiplication with a hankel_matrix
-*
-* Implementation of the convenience expression result = prod(mat, vec);
-*
-* @param A      The matrix
-* @param vec    The vector
-* @param result The result vector
-*/
-template<typename NumericT, unsigned int AlignmentV>
-void prod_impl(viennacl::hankel_matrix<NumericT, AlignmentV> const & A,
-               viennacl::vector_base<NumericT> const & vec,
-               viennacl::vector_base<NumericT>       & result)
-{
-  assert(A.size1() == result.size() && bool("Dimension mismatch"));
-  assert(A.size2() == vec.size()    && bool("Dimension mismatch"));
-
-  prod_impl(A.elements(), vec, result);
-  viennacl::linalg::reverse(result);
-}
-
-} //namespace linalg
-
-
-} //namespace viennacl
-
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
deleted file mode 100644
index 78bd150..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/amg_operations.hpp
+++ /dev/null
@@ -1,1123 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
-#define VIENNACL_LINALG_HOST_BASED_AMG_OPERATIONS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file host_based/amg_operations.hpp
-    @brief Implementations of routines for AMG using the CPU on the host (with 
OpenMP if enabled).
-*/
-
-#include <cstdlib>
-#include <cmath>
-#include "viennacl/linalg/detail/amg/amg_base.hpp"
-
-#include <map>
-#include <set>
-#include <functional>
-#ifdef VIENNACL_WITH_OPENMP
-#include <omp.h>
-#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-namespace amg
-{
-
-
-///////////////////////////////////////////
-
-/** @brief Routine for taking all connections in the matrix as strong */
-template<typename NumericT>
-void amg_influence_trivial(compressed_matrix<NumericT> const & A,
-                           viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                           viennacl::linalg::amg_tag & tag)
-{
-  (void)tag;
-
-  unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-
-  unsigned int *influences_row_ptr = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr  = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-  unsigned int *influences_values_ptr  = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_values_.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    influences_row_ptr[i] = A_row_buffer[i];
-    influences_values_ptr[i] = A_row_buffer[i+1] - A_row_buffer[i];
-  }
-  influences_row_ptr[A.size1()] = A_row_buffer[A.size1()];
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i=0; i<long(A.nnz()); ++i)
-    influences_id_ptr[i] = A_col_buffer[i];
-}
-
-
-/** @brief Routine for extracting strongly connected points considering a 
user-provided threshold value */
-template<typename NumericT>
-void amg_influence_advanced(compressed_matrix<NumericT> const & A,
-                            viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                            viennacl::linalg::amg_tag & tag)
-{
-  NumericT     const * A_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-
-  unsigned int *influences_row_ptr = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr  = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-
-  //
-  // Step 1: Scan influences in order to allocate the necessary memory
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-  {
-    vcl_size_t i = vcl_size_t(i2);
-    unsigned int row_start = A_row_buffer[i];
-    unsigned int row_stop  = A_row_buffer[i+1];
-    NumericT diag = 0;
-    NumericT largest_positive = 0;
-    NumericT largest_negative = 0;
-    unsigned int num_influences = 0;
-
-    // obtain diagonal element as well as maximum positive and negative 
off-diagonal entries
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-    {
-      unsigned int col = A_col_buffer[nnz_index];
-      NumericT value   = A_elements[nnz_index];
-
-      if (col == i)
-        diag = value;
-      else if (value > largest_positive)
-        largest_positive = value;
-      else if (value < largest_negative)
-        largest_negative = value;
-    }
-
-    if (largest_positive <= 0 && largest_negative >= 0) // no offdiagonal 
entries
-    {
-      influences_row_ptr[i] = 0;
-      continue;
-    }
-
-    // Find all points that strongly influence current point (Yang, p.5)
-    //std::cout << "Looking for strongly influencing points for point " << i 
<< std::endl;
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-    {
-      unsigned int col = A_col_buffer[nnz_index];
-
-      if (i == col)
-        continue;
-
-      NumericT value   = A_elements[nnz_index];
-
-      if (   (diag > 0 && diag * value <= 
tag.get_strong_connection_threshold() * diag * largest_negative)
-          || (diag < 0 && diag * value <= 
tag.get_strong_connection_threshold() * diag * largest_positive))
-      {
-        ++num_influences;
-      }
-    }
-
-    influences_row_ptr[i] = num_influences;
-  }
-
-  //
-  // Step 2: Exclusive scan on number of influences to obtain CSR-like 
datastructure
-  //
-  unsigned int current_entry = 0;
-  for (std::size_t i=0; i<A.size1(); ++i)
-  {
-    unsigned int tmp = influences_row_ptr[i];
-    influences_row_ptr[i] = current_entry;
-    current_entry += tmp;
-  }
-  influences_row_ptr[A.size1()] = current_entry;
-
-
-  //
-  // Step 3: Write actual influences
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-  {
-    unsigned int i = static_cast<unsigned int>(i2);
-    unsigned int row_start = A_row_buffer[i];
-    unsigned int row_stop  = A_row_buffer[i+1];
-    NumericT diag = 0;
-    NumericT largest_positive = 0;
-    NumericT largest_negative = 0;
-
-    // obtain diagonal element as well as maximum positive and negative 
off-diagonal entries
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-    {
-      unsigned int col = A_col_buffer[nnz_index];
-      NumericT value   = A_elements[nnz_index];
-
-      if (col == i)
-        diag = value;
-      else if (value > largest_positive)
-        largest_positive = value;
-      else if (value < largest_negative)
-        largest_negative = value;
-    }
-
-    if (largest_positive <= 0 && largest_negative >= 0) // no offdiagonal 
entries
-      continue;
-
-    // Find all points that strongly influence current point (Yang, p.5)
-    //std::cout << "Looking for strongly influencing points for point " << i 
<< std::endl;
-    unsigned int *influences_id_write_ptr = influences_id_ptr + 
influences_row_ptr[i];
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-    {
-      unsigned int col = A_col_buffer[nnz_index];
-
-      if (i == col)
-        continue;
-
-      NumericT value   = A_elements[nnz_index];
-
-      if (   (diag > 0 && diag * value <= 
tag.get_strong_connection_threshold() * diag * largest_negative)
-          || (diag < 0 && diag * value <= 
tag.get_strong_connection_threshold() * diag * largest_positive))
-      {
-        //std::cout << " - Adding influence from point " << col << std::endl;
-        *influences_id_write_ptr = col;
-        ++influences_id_write_ptr;
-      }
-    }
-  }
-
-}
-
-
-/** @brief Dispatcher for influence processing */
-template<typename NumericT>
-void amg_influence(compressed_matrix<NumericT> const & A,
-                   viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                   viennacl::linalg::amg_tag & tag)
-{
-  // TODO: dispatch based on influence tolerance provided
-  amg_influence_trivial(A, amg_context, tag);
-}
-
-
-
-/** @brief Assign IDs to coarse points */
-inline void 
enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & 
amg_context)
-{
-  unsigned int *point_types_ptr  = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.point_types_.handle());
-  unsigned int *coarse_id_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.coarse_id_.handle());
-
-  unsigned int coarse_id = 0;
-  for (vcl_size_t i=0; i<amg_context.coarse_id_.size(); ++i)
-  {
-    //assert(point_types_ptr[i] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED && 
bool("Logic error in enumerate_coarse_points(): Undecided points detected!"));
-
-    if (point_types_ptr[i] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
-      coarse_id_ptr[i] = coarse_id++;
-  }
-
-  //std::cout << "Coarse nodes after enumerate_coarse_points(): " << coarse_id 
<< std::endl;
-  amg_context.num_coarse_ = coarse_id;
-}
-
-
-
-
-//////////////////////////////////////
-
-
-/** @brief Helper struct for sequential classical one-pass coarsening */
-struct amg_id_influence
-{
-  amg_id_influence(std::size_t id2, std::size_t influences2) : 
id(static_cast<unsigned int>(id2)), influences(static_cast<unsigned 
int>(influences2)) {}
-
-  unsigned int  id;
-  unsigned int  influences;
-};
-
-inline bool operator>(amg_id_influence const & a, amg_id_influence const & b)
-{
-  if (a.influences > b.influences)
-    return true;
-  if (a.influences == b.influences)
-    return a.id > b.id;
-  return false;
-}
-
-/** @brief Classical (RS) one-pass coarsening. Single-Threaded! 
(VIENNACL_AMG_COARSE_CLASSIC_ONEPASS)
-*
-* @param A             Operator matrix for the respective level
-* @param amg_context   AMG datastructure object for the grid hierarchy
-* @param tag           AMG preconditioner tag
-*/
-template<typename NumericT>
-void amg_coarse_classic_onepass(compressed_matrix<NumericT> const & A,
-                                
viennacl::linalg::detail::amg::amg_level_context & amg_context,
-                                viennacl::linalg::amg_tag & tag)
-{
-  unsigned int *point_types_ptr       = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.point_types_.handle());
-  unsigned int *influences_row_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-  unsigned int *influences_values_ptr = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_values_.handle());
-
-  std::set<amg_id_influence, std::greater<amg_id_influence> > 
points_by_influences;
-
-  amg_influence_advanced(A, amg_context, tag);
-
-  for (std::size_t i=0; i<A.size1(); ++i)
-    points_by_influences.insert(amg_id_influence(i, influences_values_ptr[i]));
-
-  //std::cout << "Starting coarsening process..." << std::endl;
-
-  while (!points_by_influences.empty())
-  {
-    amg_id_influence point = *(points_by_influences.begin());
-
-    // remove point from queue:
-    points_by_influences.erase(points_by_influences.begin());
-
-    //std::cout << "Working on point " << point.id << std::endl;
-
-    // point is already coarse or fine point, continue;
-    if (point_types_ptr[point.id] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
-      continue;
-
-    //std::cout << " Setting point " << point.id << " to a coarse point." << 
std::endl;
-    // make this a coarse point:
-    point_types_ptr[point.id] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
-
-    // Set strongly influenced points to fine points:
-    unsigned int j_stop = influences_row_ptr[point.id + 1];
-    for (unsigned int j = influences_row_ptr[point.id]; j < j_stop; ++j)
-    {
-      unsigned int influenced_point_id = influences_id_ptr[j];
-
-      //std::cout << "Checking point " << influenced_point_id << std::endl;
-      if (point_types_ptr[influenced_point_id] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
-        continue;
-
-      //std::cout << " Setting point " << influenced_point_id << " to a fine 
point." << std::endl;
-      point_types_ptr[influenced_point_id] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
-
-      // add one to influence measure for all undecided points strongly 
influencing this fine point.
-      unsigned int k_stop = influences_row_ptr[influenced_point_id + 1];
-      for (unsigned int k = influences_row_ptr[influenced_point_id]; k < 
k_stop; ++k)
-      {
-        unsigned int influenced_influenced_point_id = influences_id_ptr[k];
-        if (point_types_ptr[influenced_influenced_point_id] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
-        {
-          // grab and remove from set, increase influence counter, store back:
-          amg_id_influence point_to_find(influenced_influenced_point_id, 
influences_values_ptr[influenced_influenced_point_id]);
-          points_by_influences.erase(point_to_find);
-
-          point_to_find.influences += 1;
-          influences_values_ptr[influenced_influenced_point_id] += 1; // for 
consistency
-
-          points_by_influences.insert(point_to_find);
-        }
-      } //for
-    } // for
-
-  } // while
-
-  viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context);
-}
-
-
-//////////////////////////
-
-
-/** @brief AG (aggregation based) coarsening, single-threaded version of stage 
1
-*
-* @param A             Operator matrix for the respective level
-* @param amg_context   AMG datastructure object for the grid hierarchy
-* @param tag           AMG preconditioner tag
-*/
-template<typename NumericT>
-void amg_coarse_ag_stage1_sequential(compressed_matrix<NumericT> const & A,
-                                     
viennacl::linalg::detail::amg::amg_level_context & amg_context,
-                                     viennacl::linalg::amg_tag & tag)
-{
-  (void)tag;
-  unsigned int *point_types_ptr       = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.point_types_.handle());
-  unsigned int *influences_row_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-
-  for (unsigned int i=0; i<static_cast<unsigned int>(A.size1()); ++i)
-  {
-    // check if node has no aggregates next to it (MIS-2)
-    bool is_new_coarse_node = true;
-
-    // Set strongly influenced points to fine points:
-    unsigned int j_stop = influences_row_ptr[i + 1];
-    for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
-    {
-      unsigned int influenced_point_id = influences_id_ptr[j];
-      if (point_types_ptr[influenced_point_id] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // 
either coarse or fine point
-      {
-        is_new_coarse_node = false;
-        break;
-      }
-    }
-
-    if (is_new_coarse_node)
-    {
-      // make all strongly influenced neighbors fine points:
-      for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
-      {
-        unsigned int influenced_point_id = influences_id_ptr[j];
-        point_types_ptr[influenced_point_id] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
-      }
-
-      //std::cout << "Setting new coarse node: " << i << std::endl;
-      // Note: influences may include diagonal element, so it's important to 
*first* set fine points above before setting the coarse information here
-      point_types_ptr[i] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
-    }
-  }
-}
-
-
-
-/** @brief AG (aggregation based) coarsening, multi-threaded version of stage 
1 using parallel maximum independent sets
-*
-* @param A             Operator matrix for the respective level
-* @param amg_context   AMG datastructure object for the grid hierarchy
-* @param tag           AMG preconditioner tag
-*/
-template<typename NumericT>
-void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
-                               
viennacl::linalg::detail::amg::amg_level_context & amg_context,
-                               viennacl::linalg::amg_tag & tag)
-{
-  (void)tag;
-  unsigned int  *point_types_ptr       = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.point_types_.handle());
-  unsigned int *influences_row_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-
-  std::vector<unsigned int> random_weights(A.size1());
-  for (std::size_t i=0; i<random_weights.size(); ++i)
-    random_weights[i] = static_cast<unsigned int>(rand()) % 
static_cast<unsigned int>(A.size1());
-
-  std::size_t num_threads = 1;
-#ifdef VIENNACL_WITH_OPENMP
-  num_threads = omp_get_max_threads();
-#endif
-
-  viennacl::vector<unsigned int> work_state(A.size1(), 
viennacl::traits::context(A));
-  viennacl::vector<unsigned int> work_random(A.size1(), 
viennacl::traits::context(A));
-  viennacl::vector<unsigned int> work_index(A.size1(), 
viennacl::traits::context(A));
-
-  unsigned int *work_state_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(work_state.handle());
-  unsigned int *work_random_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(work_random.handle());
-  unsigned int *work_index_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(work_index.handle());
-
-  viennacl::vector<unsigned int> work_state2(A.size1(), 
viennacl::traits::context(A));
-  viennacl::vector<unsigned int> work_random2(A.size1(), 
viennacl::traits::context(A));
-  viennacl::vector<unsigned int> work_index2(A.size1(), 
viennacl::traits::context(A));
-
-  unsigned int *work_state2_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(work_state2.handle());
-  unsigned int *work_random2_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(work_random2.handle());
-  unsigned int *work_index2_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(work_index2.handle());
-
-
-  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
-  unsigned int pmis_iters = 0;
-  while (num_undecided > 0)
-  {
-    ++pmis_iters;
-
-    //
-    // init temporary work data:
-    //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-    for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-    {
-      unsigned int i = static_cast<unsigned int>(i2);
-      switch (point_types_ptr[i])
-      {
-      case 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED: 
work_state_ptr[i] = 1; break;
-      case viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE:  
    work_state_ptr[i] = 0; break;
-      case 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE:    
work_state_ptr[i] = 2; break;
-      default:
-        throw std::runtime_error("Unexpected state encountered in MIS2 setup 
for AMG.");
-      }
-
-      work_random_ptr[i] = random_weights[i];
-      work_index_ptr[i]  = i;
-    }
-
-
-    //
-    // Propagate maximum tuple twice
-    //
-    for (unsigned int r = 0; r < 2; ++r)
-    {
-      // max operation
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for
-#endif
-      for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-      {
-        unsigned int i = static_cast<unsigned int>(i2);
-        // load
-        unsigned int state  = work_state_ptr[i];
-        unsigned int random = work_random_ptr[i];
-        unsigned int index  = work_index_ptr[i];
-
-        // max
-        unsigned int j_stop = influences_row_ptr[i + 1];
-        for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
-        {
-          unsigned int influenced_point_id = influences_id_ptr[j];
-
-          // lexigraphical triple-max (not particularly pretty, but does the 
job):
-          if (state < work_state_ptr[influenced_point_id])
-          {
-            state  = work_state_ptr[influenced_point_id];
-            random = work_random_ptr[influenced_point_id];
-            index  = work_index_ptr[influenced_point_id];
-          }
-          else if (state == work_state_ptr[influenced_point_id])
-          {
-            if (random < work_random_ptr[influenced_point_id])
-            {
-              state  = work_state_ptr[influenced_point_id];
-              random = work_random_ptr[influenced_point_id];
-              index  = work_index_ptr[influenced_point_id];
-            }
-            else if (random == work_random_ptr[influenced_point_id])
-            {
-              if (index < work_index_ptr[influenced_point_id])
-              {
-                state  = work_state_ptr[influenced_point_id];
-                random = work_random_ptr[influenced_point_id];
-                index  = work_index_ptr[influenced_point_id];
-              }
-            } // max(random)
-          } // max(state)
-        } // for
-
-        // store
-        work_state2_ptr[i]  = state;
-        work_random2_ptr[i] = random;
-        work_index2_ptr[i]  = index;
-      }
-
-      // copy work array
-#ifdef VIENNACL_WITH_OPENMP
-      #pragma omp parallel for
-#endif
-      for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-      {
-        unsigned int i = static_cast<unsigned int>(i2);
-        work_state_ptr[i]  = work_state2_ptr[i];
-        work_random_ptr[i] = work_random2_ptr[i];
-        work_index_ptr[i]  = work_index2_ptr[i];
-      }
-    }
-
-    //
-    // mark MIS and non-MIS nodes:
-    //
-    std::vector<unsigned int> thread_buffer(num_threads);
-
-#ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-#endif
-    for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-    {
-      unsigned int i = static_cast<unsigned int>(i2);
-      unsigned int max_state  = work_state_ptr[i];
-      unsigned int max_index  = work_index_ptr[i];
-
-      if (point_types_ptr[i] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
-      {
-        if (i == max_index) // make this a MIS node
-          point_types_ptr[i] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
-        else if (max_state == 2) // mind the mapping of 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
-          point_types_ptr[i] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
-        else
-#ifdef VIENNACL_WITH_OPENMP
-          thread_buffer[omp_get_thread_num()] += 1;
-#else
-          thread_buffer[0] += 1;
-#endif
-      }
-    }
-
-    num_undecided = 0;
-    for (std::size_t i=0; i<thread_buffer.size(); ++i)
-      num_undecided += thread_buffer[i];
-  } // while
-
-  // consistency with sequential MIS: reset state for non-coarse points, so 
that coarse indices are correctly picked up later
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i=0; i<static_cast<long>(A.size1()); ++i)
-    if (point_types_ptr[i] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
-      point_types_ptr[i] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
-
-}
-
-
-
-/** @brief AG (aggregation based) coarsening. Partially single-threaded 
version (VIENNACL_AMG_COARSE_AG)
-*
-* @param A             Operator matrix for the respective level
-* @param amg_context   AMG datastructure object for the grid hierarchy
-* @param tag           AMG preconditioner tag
-*/
-template<typename NumericT>
-void amg_coarse_ag(compressed_matrix<NumericT> const & A,
-                   viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                   viennacl::linalg::amg_tag & tag)
-{
-  unsigned int *point_types_ptr       = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.point_types_.handle());
-  unsigned int *influences_row_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-  unsigned int *coarse_id_ptr         = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.coarse_id_.handle());
-
-  amg_influence_trivial(A, amg_context, tag);
-
-  //
-  // Stage 1: Build aggregates:
-  //
-  if (tag.get_coarsening_method() == 
viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION)      
amg_coarse_ag_stage1_sequential(A, amg_context, tag);
-  if (tag.get_coarsening_method() == 
viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION) 
amg_coarse_ag_stage1_mis2(A, amg_context, tag);
-
-  viennacl::linalg::host_based::amg::enumerate_coarse_points(amg_context);
-
-  //
-  // Stage 2: Propagate coarse aggregate indices to neighbors:
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-  {
-    unsigned int i = static_cast<unsigned int>(i2);
-    if (point_types_ptr[i] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
-    {
-      unsigned int coarse_index = coarse_id_ptr[i];
-
-      unsigned int j_stop = influences_row_ptr[i + 1];
-      for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
-      {
-        unsigned int influenced_point_id = influences_id_ptr[j];
-        coarse_id_ptr[influenced_point_id] = coarse_index; // Set aggregate 
index for fine point
-
-        if (influenced_point_id != i) // Note: Any write races between threads 
are harmless here
-          point_types_ptr[influenced_point_id] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
-      }
-    }
-  }
-
-
-  //
-  // Stage 3: Merge remaining undecided points (merging to first aggregate 
found when cycling over the hierarchy
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2=0; i2<static_cast<long>(A.size1()); ++i2)
-  {
-    unsigned int i = static_cast<unsigned int>(i2);
-    if (point_types_ptr[i] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
-    {
-      unsigned int j_stop = influences_row_ptr[i + 1];
-      for (unsigned int j = influences_row_ptr[i]; j < j_stop; ++j)
-      {
-        unsigned int influenced_point_id = influences_id_ptr[j];
-        if (point_types_ptr[influenced_point_id] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED) // 
either coarse or fine point
-        {
-          //std::cout << "Setting fine node " << i << " to be aggregated with 
node " << *influence_iter << "/" << 
pointvector.get_coarse_index(*influence_iter) << std::endl;
-          coarse_id_ptr[i] = coarse_id_ptr[influenced_point_id];
-          break;
-        }
-      }
-    }
-  }
-
-  //
-  // Stage 4: Set undecided points to fine points (coarse ID already set in 
Stage 3)
-  //          Note: Stage 3 and Stage 4 were initially fused, but are now 
split in order to avoid race conditions (or a fallback to sequential execution).
-  //
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i=0; i<static_cast<long>(A.size1()); ++i)
-    if (point_types_ptr[i] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
-      point_types_ptr[i] = 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
-
-}
-
-
-
-
-/** @brief Entry point and dispatcher for coarsening procedures
-*
-* @param A             Operator matrix for the respective level
-* @param amg_context   AMG datastructure object for the grid hierarchy
-* @param tag           AMG preconditioner tag
-*/
-template<typename MatrixT>
-void amg_coarse(MatrixT & A,
-                viennacl::linalg::detail::amg::amg_level_context & amg_context,
-                viennacl::linalg::amg_tag & tag)
-{
-  switch (tag.get_coarsening_method())
-  {
-  case viennacl::linalg::AMG_COARSENING_METHOD_ONEPASS: 
amg_coarse_classic_onepass(A, amg_context, tag); break;
-  case viennacl::linalg::AMG_COARSENING_METHOD_AGGREGATION:
-  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: 
amg_coarse_ag(A, amg_context, tag); break;
-  //default: throw std::runtime_error("not implemented yet");
-  }
-}
-
-
-
-
-////////////////////////////////////// Interpolation 
/////////////////////////////
-
-
-/** @brief Direct interpolation. Multi-threaded! (VIENNACL_AMG_INTERPOL_DIRECT)
- *
- * @param A            Operator matrix
- * @param P            Prolongation matrix
- * @param amg_context  AMG hierarchy datastructures
- * @param tag          AMG preconditioner tag
-*/
-template<typename NumericT>
-void amg_interpol_direct(compressed_matrix<NumericT> const & A,
-                         compressed_matrix<NumericT> & P,
-                         viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                         viennacl::linalg::amg_tag & tag)
-{
-  NumericT     const * A_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-
-  unsigned int *point_types_ptr       = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.point_types_.handle());
-  unsigned int *influences_row_ptr    = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_jumper_.handle());
-  unsigned int *influences_id_ptr     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.influence_ids_.handle());
-  unsigned int *coarse_id_ptr         = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.coarse_id_.handle());
-
-  P.resize(A.size1(), amg_context.num_coarse_, false);
-
-  std::vector<std::map<unsigned int, NumericT> > P_setup(A.size1());
-
-  // Iterate over all points to build the interpolation matrix row-by-row
-  // Interpolation for coarse points is immediate using '1'.
-  // Interpolation for fine points is set up via corresponding row weights 
(cf. Yang paper, p. 14)
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long row2=0; row2<static_cast<long>(A.size1()); ++row2)
-  {
-    unsigned int row = static_cast<unsigned int>(row2);
-    std::map<unsigned int, NumericT> & P_setup_row = P_setup[row];
-    //std::cout << "Row " << row << ": " << std::endl;
-    if (point_types_ptr[row] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
-    {
-      //std::cout << "  Setting value 1.0 at " << coarse_id_ptr[row] << 
std::endl;
-      P_setup_row[coarse_id_ptr[row]] = NumericT(1);
-    }
-    else if (point_types_ptr[row] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE)
-    {
-      //std::cout << "Building interpolant for fine point " << row << 
std::endl;
-
-      NumericT row_sum = 0;
-      NumericT row_coarse_sum = 0;
-      NumericT diag = 0;
-
-      // Row sum of coefficients (without diagonal) and sum of influencing 
coarse point coefficients has to be computed
-      unsigned int row_A_start = A_row_buffer[row];
-      unsigned int row_A_end   = A_row_buffer[row + 1];
-      unsigned int const *influence_iter = influences_id_ptr + 
influences_row_ptr[row];
-      unsigned int const *influence_end  = influences_id_ptr + 
influences_row_ptr[row + 1];
-      for (unsigned int index = row_A_start; index < row_A_end; ++index)
-      {
-        unsigned int col = A_col_buffer[index];
-        NumericT value = A_elements[index];
-
-        if (col == row)
-        {
-          diag = value;
-          continue;
-        }
-        else if (point_types_ptr[col] == 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
-        {
-          // Note: One increment is sufficient, because influence_iter 
traverses an ordered subset of the column indices in this row
-          while (influence_iter != influence_end && *influence_iter < col)
-            ++influence_iter;
-
-          if (influence_iter != influence_end && *influence_iter == col)
-            row_coarse_sum += value;
-        }
-
-        row_sum += value;
-      }
-
-      NumericT temp_res = -row_sum/(row_coarse_sum*diag);
-      //std::cout << "row_sum: " << row_sum << ", row_coarse_sum: " << 
row_coarse_sum << ", diag: " << diag << std::endl;
-
-      if (std::fabs(temp_res) > 1e-2 * std::fabs(diag))
-      {
-        // Iterate over all strongly influencing points to build the 
interpolant
-        influence_iter = influences_id_ptr + influences_row_ptr[row];
-        for (unsigned int index = row_A_start; index < row_A_end; ++index)
-        {
-          unsigned int col = A_col_buffer[index];
-          if (point_types_ptr[col] != 
viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
-            continue;
-          NumericT value = A_elements[index];
-
-          // Advance to correct influence metric:
-          while (influence_iter != influence_end && *influence_iter < col)
-            ++influence_iter;
-
-          if (influence_iter != influence_end && *influence_iter == col)
-          {
-            //std::cout << " Setting entry "  << temp_res * value << " at " << 
coarse_id_ptr[col] << " for point " << col << std::endl;
-            P_setup_row[coarse_id_ptr[col]] = temp_res * value;
-          }
-        }
-      }
-
-      // TODO truncate interpolation if specified by the user.
-      (void)tag;
-    }
-    else
-      throw std::runtime_error("Logic error in direct interpolation: Point is 
neither coarse-point nor fine-point!");
-  }
-
-  // TODO: P_setup can be avoided without sacrificing parallelism.
-  viennacl::tools::sparse_matrix_adapter<NumericT> P_adapter(P_setup, 
P.size1(), P.size2());
-  viennacl::copy(P_adapter, P);
-}
-
-
-/** @brief AG (aggregation based) interpolation. Multi-Threaded! 
(VIENNACL_INTERPOL_AG)
- *
- * @param A            Operator matrix
- * @param P            Prolongation matrix
- * @param amg_context  AMG hierarchy datastructures
- * @param tag          AMG configuration tag
-*/
-template<typename NumericT>
-void amg_interpol_ag(compressed_matrix<NumericT> const & A,
-                     compressed_matrix<NumericT> & P,
-                     viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                     viennacl::linalg::amg_tag & tag)
-{
-  (void)tag;
-  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, 
A.size1(), viennacl::traits::context(A));
-
-  NumericT     * P_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(P.handle());
-  unsigned int * P_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(P.handle1());
-  unsigned int * P_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(P.handle2());
-
-  unsigned int *coarse_id_ptr = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(amg_context.coarse_id_.handle());
-
-  // Build interpolation matrix:
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long row2 = 0; row2 < long(A.size1()); ++row2)
-  {
-    unsigned int row = static_cast<unsigned int>(row2);
-    P_elements[row]   = NumericT(1);
-    P_row_buffer[row] = row;
-    P_col_buffer[row] = coarse_id_ptr[row];
-  }
-  P_row_buffer[A.size1()] = static_cast<unsigned int>(A.size1()); // don't 
forget finalizer
-
-  P.generate_row_block_information();
-}
-
-
-/** @brief Smoothed aggregation interpolation. Multi-Threaded! 
(VIENNACL_INTERPOL_SA)
- *
- * @param A            Operator matrix
- * @param P            Prolongation matrix
- * @param amg_context  AMG hierarchy datastructures
- * @param tag          AMG configuration tag
-*/
-template<typename NumericT>
-void amg_interpol_sa(compressed_matrix<NumericT> const & A,
-                     compressed_matrix<NumericT> & P,
-                     viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                     viennacl::linalg::amg_tag & tag)
-{
-  (void)tag;
-  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), 
amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
-
-  // form tentative operator:
-  amg_interpol_ag(A, P_tentative, amg_context, tag);
-
-  unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-  NumericT     const * A_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-
-  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), 
viennacl::traits::context(A));
-  unsigned int * Jacobi_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(Jacobi.handle1());
-  unsigned int * Jacobi_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(Jacobi.handle2());
-  NumericT     * Jacobi_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(Jacobi.handle());
-
-
-  // Build Jacobi matrix:
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long row2=0; row2<static_cast<long>(A.size1()); ++row2)
-  {
-    unsigned int row = static_cast<unsigned int>(row2);
-    unsigned int row_begin = A_row_buffer[row];
-    unsigned int row_end   = A_row_buffer[row+1];
-
-    Jacobi_row_buffer[row] = row_begin;
-
-    // Step 1: Extract diagonal:
-    NumericT diag = 0;
-    for (unsigned int j = row_begin; j < row_end; ++j)
-    {
-      if (A_col_buffer[j] == row)
-      {
-        diag = A_elements[j];
-        break;
-      }
-    }
-
-    // Step 2: Write entries:
-    for (unsigned int j = row_begin; j < row_end; ++j)
-    {
-      unsigned int col_index = A_col_buffer[j];
-      Jacobi_col_buffer[j] = col_index;
-
-      if (col_index == row)
-        Jacobi_elements[j] = NumericT(1) - NumericT(tag.get_jacobi_weight());
-      else
-        Jacobi_elements[j] = - NumericT(tag.get_jacobi_weight()) * 
A_elements[j] / diag;
-    }
-  }
-  Jacobi_row_buffer[A.size1()] = static_cast<unsigned int>(Jacobi.nnz()); // 
don't forget finalizer
-
-  P = viennacl::linalg::prod(Jacobi, P_tentative);
-
-  P.generate_row_block_information();
-}
-
-
-/** @brief Dispatcher for building the interpolation matrix
- *
- * @param A            Operator matrix
- * @param P            Prolongation matrix
- * @param amg_context  AMG hierarchy datastructures
- * @param tag          AMG configuration tag
-*/
-template<typename MatrixT>
-void amg_interpol(MatrixT const & A,
-                  MatrixT & P,
-                  viennacl::linalg::detail::amg::amg_level_context & 
amg_context,
-                  viennacl::linalg::amg_tag & tag)
-{
-  switch (tag.get_interpolation_method())
-  {
-  case viennacl::linalg::AMG_INTERPOLATION_METHOD_DIRECT:               
amg_interpol_direct (A, P, amg_context, tag); break;
-  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          
amg_interpol_ag     (A, P, amg_context, tag); break;
-  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: 
amg_interpol_sa     (A, P, amg_context, tag); break;
-  default: throw std::runtime_error("Not implemented yet!");
-  }
-}
-
-
-/** @brief Computes B = trans(A).
-  *
-  * To be replaced by native functionality in ViennaCL.
-  */
-template<typename NumericT>
-void amg_transpose(compressed_matrix<NumericT> const & A,
-                   compressed_matrix<NumericT> & B)
-{
-  NumericT     const * A_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-
-  // initialize datastructures for B:
-  B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), 
viennacl::traits::context(A));
-
-  NumericT     * B_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle());
-  unsigned int * B_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(B.handle1());
-  unsigned int * B_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(B.handle2());
-
-  // prepare uninitialized B_row_buffer:
-  for (std::size_t i = 0; i < B.size1(); ++i)
-    B_row_buffer[i] = 0;
-
-  //
-  // Stage 1: Compute pattern for B
-  //
-  for (std::size_t row = 0; row < A.size1(); ++row)
-  {
-    unsigned int row_start = A_row_buffer[row];
-    unsigned int row_stop  = A_row_buffer[row+1];
-
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-      B_row_buffer[A_col_buffer[nnz_index]] += 1;
-  }
-
-  // Bring row-start array in place using inclusive-scan:
-  unsigned int offset = B_row_buffer[0];
-  B_row_buffer[0] = 0;
-  for (std::size_t row = 1; row < B.size1(); ++row)
-  {
-    unsigned int tmp = B_row_buffer[row];
-    B_row_buffer[row] = offset;
-    offset += tmp;
-  }
-  B_row_buffer[B.size1()] = offset;
-
-  //
-  // Stage 2: Fill with data
-  //
-
-  std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements 
already written per row
-
-  for (std::size_t row = 0; row < A.size1(); ++row)
-  {
-    //std::cout << "Row " << row << ": ";
-    unsigned int row_start = A_row_buffer[row];
-    unsigned int row_stop  = A_row_buffer[row+1];
-
-    for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index)
-    {
-      unsigned int col_in_A = A_col_buffer[nnz_index];
-      unsigned int B_nnz_index = B_row_buffer[col_in_A] + 
B_row_offsets[col_in_A];
-      B_col_buffer[B_nnz_index] = static_cast<unsigned int>(row);
-      B_elements[B_nnz_index] = A_elements[nnz_index];
-      ++B_row_offsets[col_in_A];
-      //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index];
-    }
-  }
-
-  // Step 3: Make datastructure consistent (row blocks!)
-  B.generate_row_block_information();
-}
-
-/** Assign sparse matrix A to dense matrix B */
-template<typename NumericT, unsigned int AlignmentV>
-void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & 
A,
-                     viennacl::matrix_base<NumericT> & B)
-{
-  NumericT     const * A_elements   = 
detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * A_row_buffer = detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-
-  NumericT           * B_elements   = 
detail::extract_raw_pointer<NumericT>(B.handle());
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long row = 0; row < static_cast<long>(A.size1()); ++row)
-  {
-    unsigned int row_stop  = A_row_buffer[row+1];
-
-    for (unsigned int nnz_index = A_row_buffer[row]; nnz_index < row_stop; 
++nnz_index)
-      B_elements[static_cast<unsigned int>(row) * static_cast<unsigned 
int>(B.internal_size2()) + A_col_buffer[nnz_index]] = A_elements[nnz_index];
-  }
-
-}
-
-/** @brief Damped Jacobi Smoother (CUDA version)
-*
-* @param iterations  Number of smoother iterations
-* @param A           Operator matrix for the smoothing
-* @param x           The vector smoothing is applied to
-* @param x_backup    (Different) Vector holding the same values as x
-* @param rhs_smooth  The right hand side of the equation for the smoother
-* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped 
Jacobi iteration
-*/
-template<typename NumericT>
-void smooth_jacobi(unsigned int iterations,
-                   compressed_matrix<NumericT> const & A,
-                   vector<NumericT> & x,
-                   vector<NumericT> & x_backup,
-                   vector<NumericT> const & rhs_smooth,
-                   NumericT weight)
-{
-
-  NumericT     const * A_elements   = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle());
-  unsigned int const * A_row_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle1());
-  unsigned int const * A_col_buffer = 
viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned 
int>(A.handle2());
-  NumericT     const * rhs_elements = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(rhs_smooth.handle());
-
-  NumericT           * x_elements     = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x.handle());
-  NumericT     const * x_old_elements = 
viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(x_backup.handle());
-
-  for (unsigned int i=0; i<iterations; ++i)
-  {
-    x_backup = x;
-
-    #ifdef VIENNACL_WITH_OPENMP
-    #pragma omp parallel for
-    #endif
-    for (long row2 = 0; row2 < static_cast<long>(A.size1()); ++row2)
-    {
-      unsigned int row = static_cast<unsigned int>(row2);
-      unsigned int col_end   = A_row_buffer[row+1];
-
-      NumericT sum  = NumericT(0);
-      NumericT diag = NumericT(1);
-      for (unsigned int index = A_row_buffer[row]; index != col_end; ++index)
-      {
-        unsigned int col = A_col_buffer[index];
-        if (col == row)
-          diag = A_elements[index];
-        else
-          sum += A_elements[index] * x_old_elements[col];
-      }
-
-      x_elements[row] = weight * (rhs_elements[row] - sum) / diag + 
(NumericT(1) - weight) * x_old_elements[row];
-    }
-  }
-}
-
-} //namespace amg
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
deleted file mode 100644
index 8ddb4c1..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/common.hpp
+++ /dev/null
@@ -1,149 +0,0 @@
-#ifndef VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
-#define VIENNACL_LINALG_HOST_BASED_COMMON_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/host_based/common.hpp
-    @brief Common routines for single-threaded or OpenMP-enabled execution on 
CPU
-*/
-
-#include "viennacl/traits/handle.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace host_based
-{
-namespace detail
-{
-
-template<typename ResultT, typename VectorT>
-ResultT * extract_raw_pointer(VectorT & vec)
-{
-  return reinterpret_cast<ResultT *>(viennacl::traits::ram_handle(vec).get());
-}
-
-template<typename ResultT, typename VectorT>
-ResultT const * extract_raw_pointer(VectorT const & vec)
-{
-  return reinterpret_cast<ResultT const 
*>(viennacl::traits::ram_handle(vec).get());
-}
-
-/** @brief Helper class for accessing a strided subvector of a larger vector. 
*/
-template<typename NumericT>
-class vector_array_wrapper
-{
-public:
-  typedef NumericT   value_type;
-
-  vector_array_wrapper(value_type * A,
-                       vcl_size_t start,
-                       vcl_size_t inc)
-   : A_(A),
-     start_(start),
-     inc_(inc) {}
-
-  value_type & operator()(vcl_size_t i) { return A_[i * inc_ + start_]; }
-
-private:
-  value_type * A_;
-  vcl_size_t start_;
-  vcl_size_t inc_;
-};
-
-
-/** @brief Helper array for accessing a strided submatrix embedded in a larger 
matrix. */
-template<typename NumericT, typename LayoutT, bool is_transposed>
-class matrix_array_wrapper
-{
-  public:
-    typedef NumericT   value_type;
-
-    matrix_array_wrapper(value_type * A,
-                         vcl_size_t start1, vcl_size_t start2,
-                         vcl_size_t inc1,   vcl_size_t inc2,
-                         vcl_size_t internal_size1, vcl_size_t internal_size2)
-     : A_(A),
-       start1_(start1), start2_(start2),
-       inc1_(inc1), inc2_(inc2),
-       internal_size1_(internal_size1), internal_size2_(internal_size2) {}
-
-    value_type & operator()(vcl_size_t i, vcl_size_t j)
-    {
-      return A_[LayoutT::mem_index(i * inc1_ + start1_,
-                                   j * inc2_ + start2_,
-                                   internal_size1_, internal_size2_)];
-    }
-
-    // convenience overloads to address signed index types for OpenMP:
-    value_type & operator()(vcl_size_t i, long j) { return operator()(i, 
static_cast<vcl_size_t>(j)); }
-    value_type & operator()(long i, vcl_size_t j) { return 
operator()(static_cast<vcl_size_t>(i), j); }
-    value_type & operator()(long i, long j)       { return 
operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); }
-
-  private:
-    value_type * A_;
-    vcl_size_t start1_, start2_;
-    vcl_size_t inc1_, inc2_;
-    vcl_size_t internal_size1_, internal_size2_;
-};
-
-/** \cond */
-template<typename NumericT, typename LayoutT>
-class matrix_array_wrapper<NumericT, LayoutT, true>
-{
-public:
-  typedef NumericT   value_type;
-
-  matrix_array_wrapper(value_type * A,
-                       vcl_size_t start1, vcl_size_t start2,
-                       vcl_size_t inc1,   vcl_size_t inc2,
-                       vcl_size_t internal_size1, vcl_size_t internal_size2)
-   : A_(A),
-     start1_(start1), start2_(start2),
-     inc1_(inc1), inc2_(inc2),
-     internal_size1_(internal_size1), internal_size2_(internal_size2) {}
-
-  value_type & operator()(vcl_size_t i, vcl_size_t j)
-  {
-    //swapping row and column indices here
-    return A_[LayoutT::mem_index(j * inc1_ + start1_,
-                                 i * inc2_ + start2_,
-                                 internal_size1_, internal_size2_)];
-  }
-
-  // convenience overloads to address signed index types for OpenMP:
-  value_type & operator()(vcl_size_t i, long j) { return operator()(i, 
static_cast<vcl_size_t>(j)); }
-  value_type & operator()(long i, vcl_size_t j) { return 
operator()(static_cast<vcl_size_t>(i), j); }
-  value_type & operator()(long i, long j) { return 
operator()(static_cast<vcl_size_t>(i), static_cast<vcl_size_t>(j)); }
-
-private:
-  value_type * A_;
-  vcl_size_t start1_, start2_;
-  vcl_size_t inc1_, inc2_;
-  vcl_size_t internal_size1_, internal_size2_;
-};
-/** \endcond */
-
-} //namespace detail
-} //namespace host_based
-} //namespace linalg
-} //namespace viennacl
-
-
-#endif

Reply via email to