mahout#241"

apalumbo Fri, 10 Jun 2016 09:53:10 -0700

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
deleted file mode 100644
index dba094b..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
+++ /dev/null
@@ -1,832 +0,0 @@
-#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
-#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/detail/spai/spai.hpp
-    @brief Main implementation of SPAI (not FSPAI). Experimental.
-*/
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <algorithm>
-#include <vector>
-#include <math.h>
-#include <map>
-
-//local includes
-#include "viennacl/linalg/detail/spai/spai_tag.hpp"
-#include "viennacl/linalg/qr.hpp"
-#include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
-#include "viennacl/linalg/detail/spai/spai-static.hpp"
-#include "viennacl/linalg/detail/spai/sparse_vector.hpp"
-#include "viennacl/linalg/detail/spai/block_matrix.hpp"
-#include "viennacl/linalg/detail/spai/block_vector.hpp"
-
-//boost includes
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/matrix_proxy.hpp"
-#include "boost/numeric/ublas/vector_proxy.hpp"
-#include "boost/numeric/ublas/storage.hpp"
-#include "boost/numeric/ublas/io.hpp"
-#include "boost/numeric/ublas/lu.hpp"
-#include "boost/numeric/ublas/triangular.hpp"
-#include "boost/numeric/ublas/matrix_expression.hpp"
-
-// ViennaCL includes
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/compressed_matrix.hpp"
-#include "viennacl/linalg/sparse_matrix_operations.hpp"
-#include "viennacl/linalg/matrix_operations.hpp"
-#include "viennacl/scalar.hpp"
-#include "viennacl/linalg/inner_prod.hpp"
-#include "viennacl/linalg/ilu.hpp"
-#include "viennacl/ocl/backend.hpp"
-#include "viennacl/linalg/opencl/kernels/spai.hpp"
-
-
-
-#define VIENNACL_SPAI_K_b 20
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace detail
-{
-namespace spai
-{
-
-//debug function for print
-template<typename SparseVectorT>
-void print_sparse_vector(SparseVectorT const & v)
-{
-  for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it!= 
v.end(); ++vec_it)
-    std::cout << "[ " << vec_it->first << " ]:" << vec_it->second << std::endl;
-}
-
-template<typename DenseMatrixT>
-void print_matrix(DenseMatrixT & m)
-{
-  for (int i = 0; i < m.size2(); ++i)
-  {
-    for (int j = 0; j < m.size1(); ++j)
-      std::cout<<m(j, i)<<" ";
-    std::cout<<std::endl;
-  }
-}
-
-/** @brief Add two sparse vectors res_v = b*v
- *
- * @param v      initial sparse vector
- * @param b      scalar
- * @param res_v  output vector
- */
-template<typename SparseVectorT, typename NumericT>
-void add_sparse_vectors(SparseVectorT const & v, NumericT b,  SparseVectorT & 
res_v)
-{
-  for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != 
v.end(); ++v_it)
-    res_v[v_it->first] += b*v_it->second;
-}
-
-//sparse-matrix - vector product
-/** @brief Computation of residual res = A*v - e
- *
- * @param A_v_c   column major vectorized input sparse matrix
- * @param v       sparse vector, in this case new column of preconditioner 
matrix
- * @param ind     index for current column
- * @param res     residual
- */
-template<typename SparseVectorT, typename NumericT>
-void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
-                           SparseVectorT const & v,
-                           unsigned int ind,
-                           SparseVectorT & res)
-{
-  for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != 
v.end(); ++v_it)
-    add_sparse_vectors(A_v_c[v_it->first], v_it->second, res);
-
-  res[ind] -= NumericT(1);
-}
-
-/** @brief Setting up index set of columns and rows for certain column
- *
- * @param A_v_c   column major vectorized initial sparse matrix
- * @param v       current column of preconditioner matrix
- * @param J       set of column indices
- * @param I       set of row indices
- */
-template<typename SparseVectorT>
-void build_index_set(std::vector<SparseVectorT> const & A_v_c,
-                     SparseVectorT const & v,
-                     std::vector<unsigned int> & J,
-                     std::vector<unsigned int> & I)
-{
-  buildColumnIndexSet(v, J);
-  projectRows(A_v_c, J, I);
-}
-
-/** @brief Initializes a dense matrix from a sparse one
- *
- * @param A_in    Oiginal sparse matrix
- * @param J       Set of column indices
- * @param I       Set of row indices
- * @param A_out   dense matrix output
- */
-template<typename SparseMatrixT, typename DenseMatrixT>
-void initProjectSubMatrix(SparseMatrixT const & A_in,
-                          std::vector<unsigned int> const & J,
-                          std::vector<unsigned int> & I,
-                          DenseMatrixT & A_out)
-{
-  A_out.resize(I.size(), J.size(), false);
-  for (vcl_size_t j = 0; j < J.size(); ++j)
-    for (vcl_size_t i = 0; i < I.size(); ++i)
-      A_out(i,j) = A_in(I[i],J[j]);
-}
-
-
-/************************************************** CPU BLOCK SET UP 
***************************************/
-
-/** @brief Setting up blocks and QR factorizing them on CPU
- *
- * @param A        initial sparse matrix
- * @param A_v_c    column major vectorized initial sparse matrix
- * @param M_v      initialized preconditioner
- * @param g_I      container of row indices
- * @param g_J      container of column indices
- * @param g_A_I_J  container of dense matrices -> R matrices after QR 
factorization
- * @param g_b_v    container of vectors beta, necessary for Q recovery
- */
-template<typename SparseMatrixT, typename DenseMatrixT, typename 
SparseVectorT, typename VectorT>
-void block_set_up(SparseMatrixT const & A,
-                  std::vector<SparseVectorT> const & A_v_c,
-                  std::vector<SparseVectorT> const & M_v,
-                  std::vector<std::vector<unsigned int> >& g_I,
-                  std::vector<std::vector<unsigned int> >& g_J,
-                  std::vector<DenseMatrixT>& g_A_I_J,
-                  std::vector<VectorT>& g_b_v)
-{
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
-  {
-    vcl_size_t i = static_cast<vcl_size_t>(i2);
-    build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
-    initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
-    //print_matrix(g_A_I_J[i]);
-    single_qr(g_A_I_J[i], g_b_v[i]);
-    //print_matrix(g_A_I_J[i]);
-  }
-}
-
-/** @brief Setting up index set of columns and rows for all columns
- *
- * @param A_v_c   column major vectorized initial sparse matrix
- * @param M_v     initialized preconditioner
- * @param g_J     container of column indices
- * @param g_I     container of row indices
- */
-template<typename SparseVectorT>
-void index_set_up(std::vector<SparseVectorT> const & A_v_c,
-                  std::vector<SparseVectorT> const & M_v,
-                  std::vector<std::vector<unsigned int> > & g_J,
-                  std::vector<std::vector<unsigned int> > & g_I)
-{
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
-  {
-    vcl_size_t i = static_cast<vcl_size_t>(i2);
-    build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
-  }
-}
-
-/************************************************** GPU BLOCK SET UP 
***************************************/
-
-/** @brief Setting up blocks and QR factorizing them on GPU
- *
- * @param A            initial sparse matrix
- * @param A_v_c        column major vectorized initial sparse matrix
- * @param M_v          initialized preconditioner
- * @param g_is_update  container that indicates which blocks are active
- * @param g_I          container of row indices
- * @param g_J          container of column indices
- * @param g_A_I_J      container of dense matrices -> R matrices after QR 
factorization
- * @param g_bv         container of vectors beta, necessary for Q recovery
- */
-template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
-void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
-                  std::vector<SparseVectorT> const & A_v_c,
-                  std::vector<SparseVectorT> const & M_v,
-                  std::vector<cl_uint> g_is_update,
-                  std::vector<std::vector<unsigned int> > & g_I,
-                  std::vector<std::vector<unsigned int> > & g_J,
-                  block_matrix & g_A_I_J,
-                  block_vector & g_bv)
-{
-  viennacl::context ctx = viennacl::traits::context(A);
-  bool is_empty_block;
-
-  //build index set
-  index_set_up(A_v_c, M_v, g_J, g_I);
-  block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
-  block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
-}
-
-
-/***************************************************************************************************/
-/******************************** SOLVING LS PROBLEMS ON GPU 
***************************************/
-/***************************************************************************************************/
-
-/** @brief Elicitation of sparse vector m for particular column from m_in - 
contigious vector for all columns
- *
- * @param m_in          contigious sparse vector for all columns
- * @param start_m_ind   start index of particular vector
- * @param J             column index set
- * @param m             sparse vector for particular column
- */
-template<typename NumericT, typename SparseVectorT>
-void custom_fan_out(std::vector<NumericT> const & m_in,
-                    unsigned int start_m_ind,
-                    std::vector<unsigned int> const & J,
-                    SparseVectorT & m)
-{
-  unsigned int  cnt = 0;
-  for (vcl_size_t i = 0; i < J.size(); ++i)
-    m[J[i]] = m_in[start_m_ind + cnt++];
-}
-
-
-
-//GPU based least square problem
-/** @brief Solution of Least square problem on GPU
- *
- * @param A_v_c        column-major vectorized initial sparse matrix
- * @param M_v          column-major vectorized sparse preconditioner matrix
- * @param g_I          container of row set indices
- * @param g_J          container of column set indices
- * @param g_A_I_J_vcl  contigious matrix that consists of blocks A(I_k, J_k)
- * @param g_bv_vcl     contigious vector that consists of betas, necessary for 
Q recovery
- * @param g_res        container of residuals
- * @param g_is_update  container with indicators which blocks are active
- * @param tag          spai tag
- * @param ctx          Optional context in which the auxiliary data is created 
(one out of multiple OpenCL contexts, CUDA, host)
- */
-template<typename SparseVectorT, typename NumericT>
-void least_square_solve(std::vector<SparseVectorT> & A_v_c,
-                        std::vector<SparseVectorT> & M_v,
-                        std::vector<std::vector<unsigned int> >& g_I,
-                        std::vector<std::vector<unsigned int> > & g_J,
-                        block_matrix & g_A_I_J_vcl,
-                        block_vector & g_bv_vcl,
-                        std::vector<SparseVectorT> & g_res,
-                        std::vector<cl_uint> & g_is_update,
-                        const spai_tag & tag,
-                        viennacl::context ctx)
-{
-  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context 
&>(ctx.opencl_context());
-  unsigned int y_sz, m_sz;
-  std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
-  std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));
-
-  get_size(g_I, y_sz);
-  init_start_inds(g_I, y_inds);
-  init_start_inds(g_J, m_inds);
-
-  //create y_v
-  std::vector<NumericT> y_v(y_sz, NumericT(0));
-  for (vcl_size_t i = 0; i < M_v.size(); ++i)
-  {
-    for (vcl_size_t j = 0; j < g_I[i].size(); ++j)
-    {
-      if (g_I[i][j] == i)
-        y_v[y_inds[i] + j] = NumericT(1.0);
-    }
-  }
-  //compute m_v
-  get_size(g_J, m_sz);
-  std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0));
-
-  block_vector y_v_vcl;
-  block_vector m_v_vcl;
-  //prepearing memory for least square problem on GPU
-  y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                              static_cast<unsigned 
int>(sizeof(NumericT)*y_v.size()),
-                                              &(y_v[0]));
-  m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                              static_cast<unsigned 
int>(sizeof(NumericT)*m_v.size()),
-                                              &(m_v[0]));
-  y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                               static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
-                                               &(y_inds[0]));
-  viennacl::ocl::handle<cl_mem> g_is_update_vcl = 
opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                                           
static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
-                                                                           
&(g_is_update[0]));
-  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
-  viennacl::ocl::kernel & ls_kernel = 
opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(),
 "block_least_squares");
-  ls_kernel.local_work_size(0, 1);
-  ls_kernel.global_work_size(0, 256);
-  viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), 
g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
-                                   y_v_vcl.handle(), y_v_vcl.handle1(),
-                                   g_A_I_J_vcl.handle1(), g_is_update_vcl,
-                                   
//viennacl::ocl::local_mem(static_cast<unsigned 
int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
-                                   static_cast<unsigned int>(M_v.size())));
-  //copy vector m_v back from GPU to CPU
-  cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
-                                       m_v_vcl.handle().get(), CL_TRUE, 0,
-                                       sizeof(NumericT)*(m_v.size()),
-                                       &(m_v[0]), 0, NULL, NULL);
-  VIENNACL_ERR_CHECK(vcl_err);
-
-  //fan out vector in parallel
-  //#pragma omp parallel for
-  for (long i = 0; i < static_cast<long>(M_v.size()); ++i)
-  {
-    if (g_is_update[static_cast<vcl_size_t>(i)])
-    {
-      //faned out onto sparse vector
-      custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], 
g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);
-      g_res[static_cast<vcl_size_t>(i)].clear();
-      compute_spai_residual<SparseVectorT, NumericT>(A_v_c,  
M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), 
g_res[static_cast<vcl_size_t>(i)]);
-      NumericT res_norm = 0;
-      //compute norm of res - just to make sure that this implementatino works 
correct
-      sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);
-      //std::cout<<"Residual norm of column #: "<<i<<std::endl;
-      //std::cout<<res_norm<<std::endl;
-      //std::cout<<"************************"<<std::endl;
-      g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > 
tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
-    }
-  }
-}
-
-//CPU based least square problems
-/** @brief Solution of Least square problem on CPU
- *
- * @param A_v_c        column-major vectorized initial sparse matrix
- * @param g_R          blocks for least square solution
- * @param g_b_v        vectors beta, necessary for Q recovery
- * @param g_I          container of row index set for all columns of matrix M
- * @param g_J          container of column index set for all columns of matrix 
M
- * @param g_res        container of residuals
- * @param g_is_update  container with indicators which blocks are active
- * @param M_v          column-major vectorized sparse matrix, final 
preconditioner
- * @param tag          spai tag
- */
-template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>
-void least_square_solve(std::vector<SparseVectorT> const & A_v_c,
-                        std::vector<DenseMatrixT> & g_R,
-                        std::vector<VectorT> & g_b_v,
-                        std::vector<std::vector<unsigned int> > & g_I,
-                        std::vector<std::vector<unsigned int> > & g_J,
-                        std::vector<SparseVectorT> & g_res,
-                        std::vector<bool> & g_is_update,
-                        std::vector<SparseVectorT> & M_v,
-                        spai_tag const & tag)
-{
-  typedef typename DenseMatrixT::value_type       NumericType;
-
-#ifdef VIENNACL_WITH_OPENMP
-  #pragma omp parallel for
-#endif
-  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
-  {
-    vcl_size_t i = static_cast<vcl_size_t>(i2);
-    if (g_is_update[i])
-    {
-      VectorT y = 
boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());
-
-      projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned 
int>(tag.getBegInd() + long(i)));
-      apply_q_trans_vec(g_R[i], g_b_v[i], y);
-
-      VectorT m_new =  
boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());
-      backwardSolve(g_R[i], y, m_new);
-      fanOutVector(m_new, g_J[i], M_v[i]);
-      g_res[i].clear();
-
-      compute_spai_residual<SparseVectorT, NumericType>(A_v_c,  M_v[i], 
static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);
-
-      NumericType res_norm = 0;
-      sparse_norm_2(g_res[i], res_norm);
-//                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
-//                    std::cout<<res_norm<<std::endl;
-//                    std::cout<<"************************"<<std::endl;
-      g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& 
(!tag.getIsStatic());
-    }
-  }
-}
-
-//************************************ UPDATE CHECK 
***************************************************//
-
-template<typename VectorType>
-bool is_all_update(VectorType& parallel_is_update)
-{
-  for (unsigned int i = 0; i < parallel_is_update.size(); ++i)
-  {
-    if (parallel_is_update[i])
-      return true;
-  }
-  return false;
-}
-
-//********************************** MATRIX VECTORIZATION 
***********************************************//
-
-//Matrix vectorization, column based approach
-/** @brief Solution of Least square problem on CPU
- *
- * @param M_in   input sparse, boost::numeric::ublas::compressed_matrix
- * @param M_v    array of sparse vectors
- */
-template<typename SparseMatrixT, typename SparseVectorT>
-void vectorize_column_matrix(SparseMatrixT const & M_in,
-                             std::vector<SparseVectorT> & M_v)
-{
-  for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); 
row_it!= M_in.end1(); ++row_it)
-    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); 
col_it != row_it.end(); ++col_it)
-        M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned 
int>(col_it.index1())] = *col_it;
-}
-
-//Matrix vectorization row based approach
-template<typename SparseMatrixT, typename SparseVectorT>
-void vectorize_row_matrix(SparseMatrixT const & M_in,
-                          std::vector<SparseVectorT> & M_v)
-{
-  for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); 
row_it!= M_in.end1(); ++row_it)
-    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); 
col_it != row_it.end(); ++col_it)
-      M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned 
int>(col_it.index2())] = *col_it;
-}
-
-//************************************* BLOCK ASSEMBLY CODE 
*********************************************//
-
-
-template<typename SizeT>
-void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,
-                        std::vector<cl_uint> & a)
-{
-  vcl_size_t cnt = 0;
-
-  for (vcl_size_t i = 0; i < ind_set.size(); ++i)
-    for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)
-      a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
-}
-
-
-
-//assembling blocks on GPU
-/** @brief Assembly of blocks on GPU by a gived set of row indices: g_I and 
column indices: g_J
- *
- * @param A               intial sparse matrix
- * @param g_J             container of column index set
- * @param g_I             container of row index set
- * @param g_A_I_J_vcl     contigious blocks A(I, J) using GPU memory
- * @param g_is_update     container with indicators which blocks are active
- * @param is_empty_block  parameter that indicates if no block were assembled
- */
-template<typename NumericT, unsigned int AlignmentV>
-void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & 
A,
-                    std::vector<std::vector<unsigned int> > const & g_J,
-                    std::vector<std::vector<unsigned int> > const & g_I,
-                    block_matrix & g_A_I_J_vcl,
-                    std::vector<cl_uint> & g_is_update,
-                    bool & is_empty_block)
-{
-  //computing start indices for index sets and start indices for block matrices
-  unsigned int sz_I, sz_J, sz_blocks;
-  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
-  std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-  std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
-  //
-  init_start_inds(g_J, j_ind);
-  init_start_inds(g_I, i_ind);
-  //
-  get_size(g_J, sz_J);
-  get_size(g_I, sz_I);
-  std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
-  //
-  std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
-
-  // computing size for blocks
-  // writing set to arrays
-  write_set_to_array(g_I, I_set);
-  write_set_to_array(g_J, J_set);
-
-  // if block for assembly does exist
-  if (I_set.size() > 0 && J_set.size() > 0)
-  {
-    viennacl::context ctx = viennacl::traits::context(A);
-    viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context 
&>(ctx.opencl_context());
-    compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
-    std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));
-
-    block_vector set_I_vcl, set_J_vcl;
-    //init memory on GPU
-    //contigious g_A_I_J
-    g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                    static_cast<unsigned 
int>(sizeof(NumericT)*(sz_blocks)),
-                                                    &(con_A_I_J[0]));
-    g_A_I_J_vcl.handle().context(opencl_ctx);
-
-    //matrix_dimensions
-    g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                     static_cast<unsigned 
int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
-                                                     &(matrix_dims[0]));
-    g_A_I_J_vcl.handle1().context(opencl_ctx);
-
-    //start_block inds
-    g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                     static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
-                                                     &(blocks_ind[0]));
-    g_A_I_J_vcl.handle2().context(opencl_ctx);
-
-    //set_I
-    set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                  static_cast<unsigned 
int>(sizeof(cl_uint)*sz_I),
-                                                  &(I_set[0]));
-    set_I_vcl.handle().context(opencl_ctx);
-
-    //set_J
-    set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                  static_cast<unsigned 
int>(sizeof(cl_uint)*sz_J),
-                                                  &(J_set[0]));
-    set_J_vcl.handle().context(opencl_ctx);
-
-    //i_ind
-    set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                   static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
-                                                   &(i_ind[0]));
-    set_I_vcl.handle().context(opencl_ctx);
-
-    //j_ind
-    set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                   static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
-                                                   &(j_ind[0]));
-    set_J_vcl.handle().context(opencl_ctx);
-
-    viennacl::ocl::handle<cl_mem> g_is_update_vcl = 
opencl_ctx.create_memory(CL_MEM_READ_WRITE,
-                                                                             
static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
-                                                                             
&(g_is_update[0]));
-
-    viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
-    viennacl::ocl::kernel& assembly_kernel = 
opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(),
 "assemble_blocks");
-    assembly_kernel.local_work_size(0, 1);
-    assembly_kernel.global_work_size(0, 256);
-    viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), 
A.handle2().opencl_handle(), A.handle().opencl_handle(),
-                                           set_I_vcl.handle(), 
set_J_vcl.handle(), set_I_vcl.handle1(),
-                                           set_J_vcl.handle1(),
-                                           g_A_I_J_vcl.handle2(), 
g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
-                                           g_is_update_vcl,
-                                           static_cast<unsigned 
int>(g_I.size())));
-    is_empty_block = false;
-  }
-  else
-    is_empty_block = true;
-}
-
-/************************************************************************************************************************/
-
-/** @brief Insertion of vectorized matrix column into original sparse matrix
- *
- * @param M_v       column-major vectorized matrix
- * @param M         original sparse matrix
- * @param is_right  indicates if matrix should be transposed in the output
- */
-template<typename SparseMatrixT, typename SparseVectorT>
-void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
-                           SparseMatrixT& M,
-                           bool is_right)
-{
-  if (is_right)
-  {
-    for (unsigned int i = 0; i < M_v.size(); ++i)
-      for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); 
vec_it!=M_v[i].end(); ++vec_it)
-        M(vec_it->first, i) = vec_it->second;
-  }
-  else  //transposed fill of M
-  {
-    for (unsigned int i = 0; i < M_v.size(); ++i)
-      for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); 
vec_it!=M_v[i].end(); ++vec_it)
-        M(i, vec_it->first) = vec_it->second;
-  }
-}
-
-/** @brief Transposition of sparse matrix
- *
- * @param A_in      intial sparse matrix
- * @param A output  transposed matrix
- */
-template<typename MatrixT>
-void sparse_transpose(MatrixT const & A_in, MatrixT & A)
-{
-  typedef typename MatrixT::value_type         NumericType;
-
-  std::vector<std::map<vcl_size_t, NumericType> >   temp_A(A_in.size2());
-  A.resize(A_in.size2(), A_in.size1(), false);
-
-  for (typename MatrixT::const_iterator1 row_it = A_in.begin1();
-       row_it != A_in.end1();
-       ++row_it)
-  {
-    for (typename MatrixT::const_iterator2 col_it = row_it.begin();
-         col_it != row_it.end();
-         ++col_it)
-    {
-      temp_A[col_it.index2()][col_it.index1()] = *col_it;
-    }
-  }
-
-  for (vcl_size_t i=0; i<temp_A.size(); ++i)
-  {
-    for (typename std::map<vcl_size_t, NumericType>::const_iterator it = 
temp_A[i].begin();
-         it != temp_A[i].end();
-         ++it)
-      A(i, it->first) = it->second;
-  }
-}
-
-
-
-
-//        template<typename SparseVectorType>
-//        void custom_copy(std::vector<SparseVectorType> & M_v, 
std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
-//            for (int i = 0; i < l_M_v.size(); ++i){
-//                l_M_v[i] = M_v[i + beg_ind];
-//            }
-//        }
-
-//CPU version
-/** @brief Construction of SPAI preconditioner on CPU
- *
- * @param A     initial sparse matrix
- * @param M     output preconditioner
- * @param tag   spai tag
- */
-template<typename MatrixT>
-void computeSPAI(MatrixT const & A,
-                 MatrixT & M,
-                 spai_tag & tag)
-{
-  typedef typename MatrixT::value_type                                       
NumericT;
-  typedef typename boost::numeric::ublas::vector<NumericT>                   
VectorType;
-  typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>   
SparseVectorType;
-  typedef typename boost::numeric::ublas::matrix<NumericT>                   
DenseMatrixType;
-
-  //sparse matrix transpose...
-  unsigned int cur_iter = 0;
-  tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
-  bool go_on = true;
-  std::vector<SparseVectorType> A_v_c(M.size2());
-  std::vector<SparseVectorType> M_v(M.size2());
-  vectorize_column_matrix(A, A_v_c);
-  vectorize_column_matrix(M, M_v);
-
-
-  while (go_on)
-  {
-    go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
-    cur_iter = 0;
-    unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - 
tag.getBegInd());
-    //std::vector<bool> g_is_update(M.size2(), true);
-    std::vector<bool> g_is_update(l_sz, true);
-
-    //init is update
-    //init_parallel_is_update(g_is_update);
-    //std::vector<SparseVectorType> A_v_c(K);
-    //std::vector<SparseVectorType> M_v(K);
-    //vectorization of marices
-    //print_matrix(M_v);
-
-    std::vector<SparseVectorType> l_M_v(l_sz);
-    //custom_copy(M_v, l_M_v, beg_ind);
-    std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), 
l_M_v.begin());
-
-    //print_matrix(l_M_v);
-    //std::vector<SparseVectorType> l_A_v_c(K);
-    //custom_copy(A_v_c, l_A_v_c, beg_ind);
-    //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, 
l_A_v_c.begin());
-    //print_matrix(l_A_v_c);
-    //vectorize_row_matrix(A, A_v_r);
-    //working blocks
-
-    std::vector<DenseMatrixType> g_A_I_J(l_sz);
-    std::vector<VectorType> g_b_v(l_sz);
-    std::vector<SparseVectorType> g_res(l_sz);
-    std::vector<std::vector<unsigned int> > g_I(l_sz);
-    std::vector<std::vector<unsigned int> > g_J(l_sz);
-
-    while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
-    {
-      // SET UP THE BLOCKS..
-      // PHASE ONE
-      if (cur_iter == 0)
-        block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
-      else
-        block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, 
tag);
-
-      //PHASE TWO, LEAST SQUARE SOLUTION
-      least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, 
l_M_v, tag);
-
-      if (tag.getIsStatic()) break;
-      cur_iter++;
-    }
-
-    std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
-    tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
-    tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + 
VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
-    //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
-  }
-
-  M.resize(M.size1(), M.size2(), false);
-  insert_sparse_columns(M_v, M, tag.getIsRight());
-}
-
-
-//GPU - based version
-/** @brief Construction of SPAI preconditioner on GPU
- *
- * @param A      initial sparse matrix
- * @param cpu_A  copy of initial matrix on CPU
- * @param cpu_M  output preconditioner on CPU
- * @param M      output preconditioner
- * @param tag    SPAI tag class with parameters
- */
-template<typename NumericT, unsigned int AlignmentV>
-void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, 
//input
-                 boost::numeric::ublas::compressed_matrix<NumericT> const & 
cpu_A,
-                 boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, 
//output
-                 viennacl::compressed_matrix<NumericT, AlignmentV> & M,
-                 spai_tag const & tag)
-{
-  typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>     
   SparseVectorType;
-
-  //typedef typename viennacl::compressed_matrix<ScalarType> 
GPUSparseMatrixType;
-  //sparse matrix transpose...
-  unsigned int cur_iter = 0;
-  std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
-  //init is update
-  //init_parallel_is_update(g_is_update);
-  std::vector<SparseVectorType> A_v_c(cpu_M.size2());
-  std::vector<SparseVectorType> M_v(cpu_M.size2());
-  vectorize_column_matrix(cpu_A, A_v_c);
-  vectorize_column_matrix(cpu_M, M_v);
-  std::vector<SparseVectorType> g_res(cpu_M.size2());
-  std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
-  std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
-
-  //OpenCL variables
-  block_matrix g_A_I_J_vcl;
-  block_vector g_bv_vcl;
-  while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
-  {
-    // SET UP THE BLOCKS..
-    // PHASE ONE..
-    //timer.start();
-    //index set up on CPU
-    if (cur_iter == 0)
-      block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, 
g_bv_vcl);
-    else
-      block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, 
g_bv_vcl, tag);
-    //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
-    //PERFORM LEAST SQUARE problems solution
-    //PHASE TWO
-    //timer.start();
-    least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, 
g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
-    //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
-    if (tag.getIsStatic())
-      break;
-    cur_iter++;
-  }
-
-  cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
-  insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
-  //copy back to GPU
-  M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned 
int>(cpu_M.size2()));
-  viennacl::copy(cpu_M, M);
-}
-
-}
-}
-}
-}
-#endif


http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
deleted file mode 100644
index d8c718c..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
-#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/detail/spai/spai_tag.hpp
-    @brief Implementation of the spai tag holding SPAI configuration 
parameters. Experimental.
-
-    SPAI code contributed by Nikolay Lukash
-*/
-
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <algorithm>
-#include <vector>
-#include <math.h>
-#include <cmath>
-#include <sstream>
-#include "viennacl/ocl/backend.hpp"
-#include "boost/numeric/ublas/vector.hpp"
-#include "boost/numeric/ublas/matrix.hpp"
-#include "boost/numeric/ublas/matrix_proxy.hpp"
-#include "boost/numeric/ublas/storage.hpp"
-#include "boost/numeric/ublas/io.hpp"
-#include "boost/numeric/ublas/matrix_expression.hpp"
-#include "boost/numeric/ublas/detail/matrix_assign.hpp"
-
-#include "viennacl/linalg/detail/spai/block_matrix.hpp"
-#include "viennacl/linalg/detail/spai/block_vector.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace detail
-{
-namespace spai
-{
-
-/** @brief A tag for SPAI
- *
- * Contains values for the algorithm.
- * Must be passed to spai_precond constructor
- */
-class spai_tag
-{
-  /** @brief Constructor
-   *
-   * @param residual_norm_threshold   Calculate until the norm of the residual 
falls below this threshold
-   * @param iteration_limit           maximum number of iterations
-   * @param residual_threshold        determines starting threshold in 
residual vector for including new indices into set J
-   * @param is_static                 determines if static version of SPAI 
should be used
-   * @param is_right                  determines if left or right 
preconditioner should be used
-   */
-public:
-  spai_tag(double residual_norm_threshold = 1e-3,
-           unsigned int iteration_limit = 5,
-           double residual_threshold = 1e-2,
-           bool is_static = false,
-           bool is_right = false)
-    : residual_norm_threshold_(residual_norm_threshold),
-      iteration_limit_(iteration_limit),
-      residual_threshold_(residual_threshold),
-      is_static_(is_static),
-      is_right_(is_right) {}
-
-  double getResidualNormThreshold() const { return residual_norm_threshold_; }
-
-  double getResidualThreshold() const { return residual_threshold_; }
-
-  unsigned int getIterationLimit () const { return iteration_limit_; }
-
-  bool getIsStatic() const { return is_static_; }
-
-  bool getIsRight() const { return is_right_; }
-
-  long getBegInd() const { return beg_ind_; }
-
-  long getEndInd() const { return end_ind_; }
-
-
-
-  void setResidualNormThreshold(double residual_norm_threshold)
-  {
-    if (residual_norm_threshold > 0)
-      residual_norm_threshold_ = residual_norm_threshold;
-  }
-
-  void setResidualThreshold(double residual_threshold)
-  {
-    if (residual_threshold > 0)
-      residual_threshold_ = residual_threshold;
-  }
-
-  void setIterationLimit(unsigned int iteration_limit)
-  {
-    if (iteration_limit > 0)
-      iteration_limit_ = iteration_limit;
-  }
-
-  void setIsRight(bool is_right) { is_right_ = is_right; }
-
-  void setIsStatic(bool is_static) { is_static_ = is_static; }
-
-  void setBegInd(long beg_ind) { beg_ind_ = beg_ind; }
-
-  void setEndInd(long end_ind){ end_ind_ = end_ind; }
-
-
-private:
-  double        residual_norm_threshold_;
-  unsigned int  iteration_limit_;
-  long          beg_ind_;
-  long          end_ind_;
-  double        residual_threshold_;
-  bool          is_static_;
-  bool          is_right_;
-};
-
-}
-}
-}
-}
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
deleted file mode 100644
index c99eda1..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
-#define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/detail/spai/sparse_vector.hpp
-    @brief Implementation of a helper sparse vector class for SPAI. 
Experimental.
-
-    SPAI code contributed by Nikolay Lukash
-*/
-
-#include <utility>
-#include <iostream>
-#include <fstream>
-#include <string>
-#include <algorithm>
-#include <vector>
-#include <math.h>
-#include <map>
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace detail
-{
-namespace spai
-{
-
-/**
- * @brief Represents a sparse vector based on std::map<unsigned int, NumericT>
- */
-template<typename NumericT>
-class sparse_vector
-{
-public:
-  typedef typename std::map<unsigned int, NumericT>::iterator        iterator;
-  typedef typename std::map<unsigned int, NumericT>::const_iterator  
const_iterator;
-
-  sparse_vector() {}
-
-  /** @brief Set the index of the vector in the original matrix
-   *
-   * May only be called once.
-   */
-  //getter
-  NumericT & operator[] (unsigned int ind) { return v_[ind]; }
-
-  void clear() { v_.clear(); }
-
-  const_iterator find(unsigned int var) const { return v_.find(var); }
-        iterator find(unsigned int var)       { return v_.find(var); }
-
-  const_iterator begin() const { return v_.begin(); }
-        iterator begin()       { return v_.begin(); }
-  const_iterator end() const { return v_.end(); }
-        iterator end()       { return v_.end(); }
-
-private:
-  unsigned int                      size_;
-  std::map<unsigned int, NumericT>  v_;
-};
-
-}
-}
-}
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
deleted file mode 100644
index a3340d7..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
+++ /dev/null
@@ -1,580 +0,0 @@
-#ifndef VIENNACL_LINALG_DIRECT_SOLVE_HPP_
-#define VIENNACL_LINALG_DIRECT_SOLVE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/direct_solve.hpp
-    @brief Implementations of dense direct solvers are found here.
-*/
-
-#include "viennacl/forwards.h"
-#include "viennacl/meta/enable_if.hpp"
-#include "viennacl/vector.hpp"
-#include "viennacl/vector_proxy.hpp"
-#include "viennacl/matrix.hpp"
-#include "viennacl/matrix_proxy.hpp"
-#include "viennacl/linalg/prod.hpp"
-#include "viennacl/linalg/host_based/direct_solve.hpp"
-
-#ifdef VIENNACL_WITH_OPENCL
-  #include "viennacl/linalg/opencl/direct_solve.hpp"
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  #include "viennacl/linalg/cuda/direct_solve.hpp"
-#endif
-
-#define VIENNACL_DIRECT_SOLVE_BLOCKSIZE 128
-
-namespace viennacl
-{
-namespace linalg
-{
-
-namespace detail
-{
-
-  //
-  // A \ B:
-  //
-
-  /** @brief Direct inplace solver for dense triangular systems using a single 
kernel launch. Matlab notation: A \ B
-  *
-  * @param A    The system matrix
-  * @param B    The matrix of row vectors, where the solution is directly 
written to
-  */
-  template<typename NumericT, typename SolverTagT>
-  void inplace_solve_kernel(const matrix_base<NumericT>  & A, const 
matrix_base<NumericT> & B, SolverTagT)
-  {
-    assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A)) && 
bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
-    assert( (viennacl::traits::size1(A) == viennacl::traits::size1(B)) && 
bool("Size check failed in inplace_solve(): size1(A) != size1(B)"));
-    switch (viennacl::traits::handle(A).get_active_handle_id())
-    {
-      case viennacl::MAIN_MEMORY:
-        viennacl::linalg::host_based::inplace_solve(A, 
const_cast<matrix_base<NumericT> &>(B), SolverTagT());
-        break;
-  #ifdef VIENNACL_WITH_OPENCL
-      case viennacl::OPENCL_MEMORY:
-        viennacl::linalg::opencl::inplace_solve(A, 
const_cast<matrix_base<NumericT> &>(B), SolverTagT());
-        break;
-  #endif
-  #ifdef VIENNACL_WITH_CUDA
-      case viennacl::CUDA_MEMORY:
-        viennacl::linalg::cuda::inplace_solve(A, 
const_cast<matrix_base<NumericT> &>(B), SolverTagT());
-        break;
-  #endif
-      case viennacl::MEMORY_NOT_INITIALIZED:
-        throw memory_exception("not initialised!");
-      default:
-        throw memory_exception("not implemented");
-    }
-  }
-
-
-  //
-  // A \ b
-  //
-
-  template<typename NumericT, typename SolverTagT>
-  void inplace_solve_vec_kernel(const matrix_base<NumericT> & mat,
-                                const vector_base<NumericT> & vec,
-                                SolverTagT)
-  {
-    assert( (mat.size1() == vec.size()) && bool("Size check failed in 
inplace_solve(): size1(A) != size(b)"));
-    assert( (mat.size2() == vec.size()) && bool("Size check failed in 
inplace_solve(): size2(A) != size(b)"));
-
-    switch (viennacl::traits::handle(mat).get_active_handle_id())
-    {
-      case viennacl::MAIN_MEMORY:
-        viennacl::linalg::host_based::inplace_solve(mat, 
const_cast<vector_base<NumericT> &>(vec), SolverTagT());
-        break;
-  #ifdef VIENNACL_WITH_OPENCL
-      case viennacl::OPENCL_MEMORY:
-        viennacl::linalg::opencl::inplace_solve(mat, 
const_cast<vector_base<NumericT> &>(vec), SolverTagT());
-        break;
-  #endif
-  #ifdef VIENNACL_WITH_CUDA
-      case viennacl::CUDA_MEMORY:
-        viennacl::linalg::cuda::inplace_solve(mat, 
const_cast<vector_base<NumericT> &>(vec), SolverTagT());
-        break;
-  #endif
-      case viennacl::MEMORY_NOT_INITIALIZED:
-        throw memory_exception("not initialised!");
-      default:
-        throw memory_exception("not implemented");
-    }
-  }
-
-
-  template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
-  void inplace_solve_lower_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
-  {
-    typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type  
NumericType;
-
-    vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
-    if (A.size1() <= blockSize)
-      inplace_solve_kernel(A, B, SolverTagT());
-    else
-    {
-      for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
-      {
-        vcl_size_t Apos1 = i;
-        vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
-        vcl_size_t Bpos = B.size2();
-        inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, 
Apos2), viennacl::range(Apos1, Apos2)),
-                             viennacl::project(B, viennacl::range(Apos1, 
Apos2), viennacl::range(0,     Bpos)),
-                             SolverTagT());
-        if (Apos2 < A.size1())
-        {
-          viennacl::matrix_range<MatrixT2> B_lower(B, viennacl::range(Apos2, 
B.size1()), viennacl::range(0, Bpos));
-          viennacl::linalg::prod_impl(viennacl::project(A, 
viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
-                                      viennacl::project(B, 
viennacl::range(Apos1, Apos2),     viennacl::range(0,     Bpos)),
-                                      B_lower,
-                                      NumericType(-1.0), NumericType(1.0));
-        }
-      }
-    }
-  }
-
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::lower_tag)
-  {
-    inplace_solve_lower_impl(A, B, viennacl::linalg::lower_tag());
-  }
-
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::unit_lower_tag)
-  {
-    inplace_solve_lower_impl(A, B, viennacl::linalg::unit_lower_tag());
-  }
-
-  template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
-  void inplace_solve_upper_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
-  {
-    typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type  
NumericType;
-
-    int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
-    if (static_cast<int>(A.size1()) <= blockSize)
-      inplace_solve_kernel(A, B, SolverTagT());
-    else
-    {
-      for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
-      {
-        vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
-        vcl_size_t Apos2 = vcl_size_t(i);
-        vcl_size_t Bpos = B.size2();
-        inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, 
Apos2), viennacl::range(Apos1, Apos2)),
-                             viennacl::project(B, viennacl::range(Apos1, 
Apos2), viennacl::range(0, Bpos)),
-                             SolverTagT());
-        if (Apos1 > 0)
-        {
-          viennacl::matrix_range<MatrixT2> B_upper(B, viennacl::range(0, 
Apos1), viennacl::range(0, Bpos));
-
-          viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(0,  
   Apos1), viennacl::range(Apos1, Apos2)),
-                                      viennacl::project(B, 
viennacl::range(Apos1, Apos2), viennacl::range(0,     Bpos)),
-                                      B_upper,
-                                      NumericType(-1.0), NumericType(1.0));
-        }
-      }
-    }
-  }
-
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::upper_tag)
-  {
-    inplace_solve_upper_impl(A, B, viennacl::linalg::upper_tag());
-  }
-
-  template<typename MatrixT1, typename MatrixT2>
-  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::unit_upper_tag)
-  {
-    inplace_solve_upper_impl(A, B, viennacl::linalg::unit_upper_tag());
-  }
-
-} // namespace detail
-
-/** @brief Direct inplace solver for triangular systems with multiple right 
hand sides, i.e. A \ B   (MATLAB notation)
-*
-* @param A      The system matrix
-* @param B      The matrix of row vectors, where the solution is directly 
written to
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(const matrix_base<NumericT> & A,
-                   matrix_base<NumericT> & B,
-                   SolverTagT)
-{
-  detail::inplace_solve_impl(A,B,SolverTagT());
-}
-
-/** @brief Direct inplace solver for triangular systems with multiple 
transposed right hand sides, i.e. A \ B^T   (MATLAB notation)
-*
-* @param A       The system matrix
-* @param proxy_B The proxy for the transposed matrix of row vectors, where the 
solution is directly written to
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(const matrix_base<NumericT> & A,
-                   matrix_expression<const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans> proxy_B,
-                   SolverTagT)
-{
-  typedef typename matrix_base<NumericT>::handle_type    handle_type;
-
-  matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
-                          proxy_B.lhs().size2(), proxy_B.lhs().start2(), 
proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
-                          proxy_B.lhs().size1(), proxy_B.lhs().start1(), 
proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
-                          !proxy_B.lhs().row_major());
-
-  detail::inplace_solve_impl(A,B,SolverTagT());
-}
-
-//upper triangular solver for transposed lower triangular matrices
-/** @brief Direct inplace solver for transposed triangular systems with 
multiple right hand sides, i.e. A^T \ B   (MATLAB notation)
-*
-* @param proxy_A  The transposed system matrix proxy
-* @param B        The matrix holding the load vectors, where the solution is 
directly written to
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(const matrix_expression< const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans>  & proxy_A,
-                   matrix_base<NumericT> & B,
-                   SolverTagT)
-{
-  typedef typename matrix_base<NumericT>::handle_type    handle_type;
-
-  matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
-                          proxy_A.lhs().size2(), proxy_A.lhs().start2(), 
proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
-                          proxy_A.lhs().size1(), proxy_A.lhs().start1(), 
proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
-                          !proxy_A.lhs().row_major());
-
-  detail::inplace_solve_impl(A,B,SolverTagT());
-}
-
-/** @brief Direct inplace solver for transposed triangular systems with 
multiple transposed right hand sides, i.e. A^T \ B^T   (MATLAB notation)
-*
-* @param proxy_A    The transposed system matrix proxy
-* @param proxy_B    The transposed matrix holding the load vectors, where the 
solution is directly written to
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(matrix_expression< const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans> const & proxy_A,
-                   matrix_expression< const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans>         proxy_B,
-                   SolverTagT)
-{
-  typedef typename matrix_base<NumericT>::handle_type    handle_type;
-
-  matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
-                          proxy_A.lhs().size2(), proxy_A.lhs().start2(), 
proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
-                          proxy_A.lhs().size1(), proxy_A.lhs().start1(), 
proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
-                          !proxy_A.lhs().row_major());
-
-  matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
-                          proxy_B.lhs().size2(), proxy_B.lhs().start2(), 
proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
-                          proxy_B.lhs().size1(), proxy_B.lhs().start1(), 
proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
-                          !proxy_B.lhs().row_major());
-
-  detail::inplace_solve_impl(A,B,SolverTagT());
-}
-
-
-/////////////////// general wrappers for non-inplace solution 
//////////////////////
-
-
-/** @brief Convenience functions for C = solve(A, B, some_tag()); Creates a 
temporary result matrix and forwards the request to inplace_solve()
-*
-* @param A    The system matrix
-* @param B    The matrix of load vectors
-* @param tag    Dispatch tag
-*/
-template<typename NumericT, typename SolverTagT>
-matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
-                            const matrix_base<NumericT> & B,
-                            SolverTagT tag)
-{
-  // do an inplace solve on the result vector:
-  matrix_base<NumericT> result(B);
-  inplace_solve(A, result, tag);
-  return result;
-}
-
-/** @brief Convenience functions for C = solve(A, B^T, some_tag()); Creates a 
temporary result matrix and forwards the request to inplace_solve()
-*
-* @param A    The system matrix
-* @param proxy  The transposed load vector
-* @param tag    Dispatch tag
-*/
-template<typename NumericT, typename SolverTagT>
-matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
-                            const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
-                            SolverTagT tag)
-{
-  // do an inplace solve on the result vector:
-  matrix_base<NumericT> result(proxy);
-  inplace_solve(A, result, tag);
-  return result;
-}
-
-/** @brief Convenience functions for result = solve(trans(mat), B, 
some_tag()); Creates a temporary result matrix and forwards the request to 
inplace_solve()
-*
-* @param proxy  The transposed system matrix proxy
-* @param B      The matrix of load vectors
-* @param tag    Dispatch tag
-*/
-template<typename NumericT, typename SolverTagT>
-matrix_base<NumericT> solve(const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
-                            const matrix_base<NumericT> & B,
-                            SolverTagT tag)
-{
-  // do an inplace solve on the result vector:
-  matrix_base<NumericT> result(B);
-  inplace_solve(proxy, result, tag);
-  return result;
-}
-
-/** @brief Convenience functions for result = solve(trans(mat), vec, 
some_tag()); Creates a temporary result vector and forwards the request to 
inplace_solve()
-*
-* @param proxy_A  The transposed system matrix proxy
-* @param proxy_B  The transposed matrix of load vectors, where the solution is 
directly written to
-* @param tag    Dispatch tag
-*/
-template<typename NumericT, typename SolverTagT>
-matrix_base<NumericT> solve(const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A,
-                            const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_B,
-                            SolverTagT tag)
-{
-  // run an inplace solve on the result vector:
-  matrix_base<NumericT> result(proxy_B);
-  inplace_solve(proxy_A, result, tag);
-  return result;
-}
-
-//
-/////////// solves with vector as right hand side ///////////////////
-//
-
-namespace detail
-{
-  template<typename MatrixT1, typename VectorT, typename SolverTagT>
-  void inplace_solve_lower_vec_impl(MatrixT1 const & A, VectorT & b, 
SolverTagT)
-  {
-    vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
-    if (A.size1() <= blockSize)
-      inplace_solve_vec_kernel(A, b, SolverTagT());
-    else
-    {
-      VectorT temp(b);
-      for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
-      {
-        vcl_size_t Apos1 = i;
-        vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
-        inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, 
Apos2), viennacl::range(Apos1, Apos2)),
-                                 viennacl::project(b, viennacl::range(Apos1, 
Apos2)),
-                                 SolverTagT());
-        if (Apos2 < A.size1())
-        {
-          viennacl::project(temp, viennacl::range(Apos2, A.size1())) = 
viennacl::linalg::prod(viennacl::project(A, viennacl::range(Apos2, A.size1()), 
viennacl::range(Apos1, Apos2)),
-                                                                               
               viennacl::project(b, viennacl::range(Apos1, Apos2)));
-          viennacl::project(b, viennacl::range(Apos2, A.size1())) -= 
viennacl::project(temp, viennacl::range(Apos2, A.size1()));
-        }
-      }
-    }
-  }
-
-  template<typename MatrixT1, typename VectorT>
-  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, 
viennacl::linalg::lower_tag)
-  {
-    inplace_solve_lower_vec_impl(A, B, viennacl::linalg::lower_tag());
-  }
-
-  template<typename MatrixT1, typename VectorT>
-  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, 
viennacl::linalg::unit_lower_tag)
-  {
-    inplace_solve_lower_vec_impl(A, B, viennacl::linalg::unit_lower_tag());
-  }
-
-  template<typename MatrixT1, typename VectorT, typename SolverTagT>
-  void inplace_solve_upper_vec_impl(MatrixT1 const & A, VectorT & b, 
SolverTagT)
-  {
-    int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
-    if (static_cast<int>(A.size1()) <= blockSize)
-      inplace_solve_vec_kernel(A, b, SolverTagT());
-    else
-    {
-      VectorT temp(b);
-      for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
-      {
-        vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
-        vcl_size_t Apos2 = vcl_size_t(i);
-        inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, 
Apos2), viennacl::range(Apos1, Apos2)),
-                                 viennacl::project(b, viennacl::range(Apos1, 
Apos2)),
-                                 SolverTagT());
-        if (Apos1 > 0)
-        {
-          viennacl::project(temp, viennacl::range(0, Apos1)) = 
viennacl::linalg::prod(viennacl::project(A, viennacl::range(0,     Apos1), 
viennacl::range(Apos1, Apos2)),
-                                                                               
       viennacl::project(b, viennacl::range(Apos1, Apos2)));
-          viennacl::project(b, viennacl::range(0, Apos1)) -= 
viennacl::project(temp, viennacl::range(0, Apos1));
-        }
-      }
-    }
-  }
-
-  template<typename MatrixT1, typename VectorT>
-  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, 
viennacl::linalg::upper_tag)
-  {
-    inplace_solve_upper_vec_impl(A, b, viennacl::linalg::upper_tag());
-  }
-
-  template<typename MatrixT1, typename VectorT>
-  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, 
viennacl::linalg::unit_upper_tag)
-  {
-    inplace_solve_upper_vec_impl(A, b, viennacl::linalg::unit_upper_tag());
-  }
-
-} // namespace detail
-
-/** @brief Inplace solution of a triangular system. Matlab notation A \ b.
-*
-* @param mat    The system matrix (a dense matrix for which only the 
respective triangular form is used)
-* @param vec    The right hand side vector
-* @param tag    The tag (either lower_tag, unit_lower_tag, upper_tag, or 
unit_upper_tag)
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(const matrix_base<NumericT> & mat,
-                   vector_base<NumericT> & vec,
-                   SolverTagT const & tag)
-{
-
-  detail::inplace_solve_vec_impl(mat, vec, tag);
-}
-
-/** @brief Inplace solution of a triangular system with transposed system 
matrix.. Matlab notation A' \ b.
-*
-* @param proxy  The transposed system matrix (a dense matrix for which only 
the respective triangular form is used)
-* @param vec    The right hand side vector
-* @param tag    The tag (either lower_tag, unit_lower_tag, upper_tag, or 
unit_upper_tag)
-*/
-template<typename NumericT, typename SolverTagT>
-void inplace_solve(matrix_expression<const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans> const & proxy,
-                   vector_base<NumericT> & vec,
-                   SolverTagT const & tag)
-{
-  typedef typename matrix_base<NumericT>::handle_type    handle_type;
-
-  // wrap existing matrix in a new matrix_base object (no data copy)
-  matrix_base<NumericT> mat(const_cast<handle_type &>(proxy.lhs().handle()),
-                            proxy.lhs().size2(), proxy.lhs().start2(), 
proxy.lhs().stride2(), proxy.lhs().internal_size2(),
-                            proxy.lhs().size1(), proxy.lhs().start1(), 
proxy.lhs().stride1(), proxy.lhs().internal_size1(),
-                            !proxy.lhs().row_major());
-  detail::inplace_solve_vec_impl(mat, vec, tag);
-}
-
-
-/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
an upper triangular solve.
-*
-* Creates a temporary result vector and forwards the request to inplace_solve()
-*
-* @param mat    The system matrix
-* @param vec    The load vector
-* @param tag    Dispatch tag
-*/
-template<typename NumericT>
-vector<NumericT> solve(const matrix_base<NumericT> & mat,
-                       const vector_base<NumericT> & vec,
-                       viennacl::linalg::upper_tag const & tag)
-{
-  // run an inplace solve on the result vector:
-  vector<NumericT> result(vec);
-  inplace_solve(mat, result, tag);
-  return result;
-}
-
-/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
an upper triangular solve with unit diagonal.
-*
-* Creates a temporary result vector and forwards the request to inplace_solve()
-*
-* @param mat    The system matrix
-* @param vec    The load vector
-* @param tag    Dispatch tag
-*/
-template<typename NumericT>
-vector<NumericT> solve(const matrix_base<NumericT> & mat,
-                       const vector_base<NumericT> & vec,
-                       viennacl::linalg::unit_upper_tag const & tag)
-{
-  // run an inplace solve on the result vector:
-  vector<NumericT> result(vec);
-  inplace_solve(mat, result, tag);
-  return result;
-}
-
-/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
a lower triangular solve.
-*
-* Creates a temporary result vector and forwards the request to inplace_solve()
-*
-* @param mat    The system matrix
-* @param vec    The load vector
-* @param tag    Dispatch tag
-*/
-template<typename NumericT>
-vector<NumericT> solve(const matrix_base<NumericT> & mat,
-                       const vector_base<NumericT> & vec,
-                       viennacl::linalg::lower_tag const & tag)
-{
-  // run an inplace solve on the result vector:
-  vector<NumericT> result(vec);
-  inplace_solve(mat, result, tag);
-  return result;
-}
-
-/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
a lower triangular solve with unit diagonal.
-*
-* Creates a temporary result vector and forwards the request to inplace_solve()
-*
-* @param mat    The system matrix
-* @param vec    The load vector
-* @param tag    Dispatch tag
-*/
-template<typename NumericT>
-vector<NumericT> solve(const matrix_base<NumericT> & mat,
-                       const vector_base<NumericT> & vec,
-                       viennacl::linalg::unit_lower_tag const & tag)
-{
-  // run an inplace solve on the result vector:
-  vector<NumericT> result(vec);
-  inplace_solve(mat, result, tag);
-  return result;
-}
-
-/** @brief Convenience functions for result = solve(trans(mat), vec, 
some_tag()); Creates a temporary result vector and forwards the request to 
inplace_solve()
-*
-* @param proxy  The transposed system matrix proxy
-* @param vec    The load vector, where the solution is directly written to
-* @param tag    Dispatch tag
-*/
-template<typename NumericT, typename SolverTagT>
-vector<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, 
const matrix_base<NumericT>, op_trans> & proxy,
-                       const vector_base<NumericT> & vec,
-                       SolverTagT const & tag)
-{
-  // run an inplace solve on the result vector:
-  vector<NumericT> result(vec);
-  inplace_solve(proxy, result, tag);
-  return result;
-}
-
-
-}
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
deleted file mode 100644
index 36be3b3..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef VIENNACL_LINALG_EIG_HPP_
-#define VIENNACL_LINALG_EIG_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file viennacl/linalg/eig.hpp
-*   @brief Convenience header file including all available eigenvalue 
algorithms
-*/
-
-#include "viennacl/linalg/bisect.hpp"
-#include "viennacl/linalg/lanczos.hpp"
-#include "viennacl/linalg/power_iter.hpp"
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
deleted file mode 100644
index ae9ade2..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
+++ /dev/null
@@ -1,481 +0,0 @@
-#ifndef VIENNACL_LINALG_FFT_OPERATIONS_HPP_
-#define VIENNACL_LINALG_FFT_OPERATIONS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-   Institute for Analysis and Scientific Computing,
-   TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-   -----------------
-   ViennaCL - The Vienna Computing Library
-   -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
- ============================================================================= 
*/
-
-/**  @file viennacl/linalg/fft_operations.hpp
- @brief Implementations of Fast Furier Transformation.
- */
-
-#include <viennacl/vector.hpp>
-#include <viennacl/matrix.hpp>
-
-#include "viennacl/linalg/host_based/fft_operations.hpp"
-
-#ifdef VIENNACL_WITH_OPENCL
-#include "viennacl/linalg/opencl/fft_operations.hpp"
-#include "viennacl/linalg/opencl/kernels/fft.hpp"
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-#include "viennacl/linalg/cuda/fft_operations.hpp"
-#endif
-
-namespace viennacl
-{
-namespace linalg
-{
-
-/**
- * @brief Direct 1D algorithm for computing Fourier transformation.
- *
- * Works on any sizes of data.
- * Serial implementation has o(n^2) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::vector<NumericT, AlignmentV> const & in,
-            viennacl::vector<NumericT, AlignmentV>       & out, vcl_size_t 
size, vcl_size_t stride,
-            vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, 
sign, data_order);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), 
viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::direct(in, out, size, stride, 
batch_num,sign,data_order);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-
-  }
-}
-
-/**
- * @brief Direct 2D algorithm for computing Fourier transformation.
- *
- * Works on any sizes of data.
- * Serial implementation has o(n^2) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const 
& in,
-            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& out, 
vcl_size_t size,
-            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = 
NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, 
sign, data_order);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), 
viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::direct(in, out, size, stride, 
batch_num,sign,data_order);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-
-  }
-}
-
-/*
- * This function performs reorder of input data. Indexes are sorted in 
bit-reversal order.
- * Such reordering should be done before in-place FFT.
- */
-template<typename NumericT, unsigned int AlignmentV>
-void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, 
vcl_size_t stride,
-             vcl_size_t bits_datasize, vcl_size_t batch_num,
-             
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::reorder(in, size, stride, bits_datasize, 
batch_num, data_order);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    
viennacl::linalg::opencl::reorder<NumericT>(viennacl::traits::opencl_handle(in),
 size, stride, bits_datasize, batch_num, data_order);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::reorder(in, size, stride, bits_datasize, 
batch_num, data_order);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-
-  }
-}
-
-/**
- * @brief Radix-2 1D algorithm for computing Fourier transformation.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has o(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & in, 
vcl_size_t size,
-            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = 
NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, 
data_order);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), 
size, stride, batch_num, sign,data_order);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign, 
data_order);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Radix-2 2D algorithm for computing Fourier transformation.
- *
- * Works only on power-of-two sizes of data.
- * Serial implementation has o(n * lg n) complexity.
- * This is a Cooley-Tukey algorithm
- */
-template<typename NumericT, unsigned int AlignmentV>
-void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, 
vcl_size_t stride,
-            vcl_size_t batch_num, NumericT sign = NumericT(-1),
-            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
-{
-
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, 
data_order);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), 
size, stride, batch_num, sign,data_order);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::radix2(in, size, stride, batch_num, 
sign,data_order);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Bluestein's algorithm for computing Fourier transformation.
- *
- * Currently,  Works only for sizes of input data which less than 2^16.
- * Uses a lot of additional memory, but should be fast for any size of data.
- * Serial implementation has something about o(n * lg n) complexity
- */
-template<typename NumericT, unsigned int AlignmentV>
-void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
-               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t 
/*batch_num*/)
-{
-
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::bluestein(in, out, 1);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::bluestein(in, out, 1);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::bluestein(in, out, 1);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Mutiply two complex vectors and store result in output
- */
-template<typename NumericT, unsigned int AlignmentV>
-void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
-                      viennacl::vector<NumericT, AlignmentV> const & input2,
-                      viennacl::vector<NumericT, AlignmentV>       & output)
-{
-  switch (viennacl::traits::handle(input1).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::multiply_complex(input1, input2, output);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::multiply_complex(input1, input2, output);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::multiply_complex(input1, input2, output);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Normalize vector on with his own size
- */
-template<typename NumericT, unsigned int AlignmentV>
-void normalize(viennacl::vector<NumericT, AlignmentV> & input)
-{
-  switch (viennacl::traits::handle(input).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::normalize(input);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::normalize(input);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::normalize(input);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Inplace_transpose matrix
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & 
input)
-{
-  switch (viennacl::traits::handle(input).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::transpose(input);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::transpose(input);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::transpose(input);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Transpose matrix
- */
-template<typename NumericT, unsigned int AlignmentV>
-void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> 
const & input,
-               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>     
  & output)
-{
-  switch (viennacl::traits::handle(input).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::transpose(input, output);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::transpose(input, output);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::transpose(input, output);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Create complex vector from real vector (even elements(2*k) = real 
part, odd elements(2*k+1) = imaginary part)
- */
-template<typename NumericT>
-void real_to_complex(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT>       & out, vcl_size_t 
size)
-{
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-    case viennacl::MAIN_MEMORY:
-      viennacl::linalg::host_based::real_to_complex(in, out, size);
-      break;
-#ifdef VIENNACL_WITH_OPENCL
-      case viennacl::OPENCL_MEMORY:
-      viennacl::linalg::opencl::real_to_complex(in,out,size);
-      break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-      case viennacl::CUDA_MEMORY:
-      viennacl::linalg::cuda::real_to_complex(in,out,size);
-      break;
-#endif
-
-    case viennacl::MEMORY_NOT_INITIALIZED:
-      throw memory_exception("not initialised!");
-    default:
-      throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Create real vector from complex vector (even elements(2*k) = real 
part, odd elements(2*k+1) = imaginary part)
- */
-template<typename NumericT>
-void complex_to_real(viennacl::vector_base<NumericT> const & in,
-                     viennacl::vector_base<NumericT>       & out, vcl_size_t 
size)
-{
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::complex_to_real(in, out, size);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::complex_to_real(in, out, size);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::complex_to_real(in, out, size);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-/**
- * @brief Reverse vector to oposite order and save it in input vector
- */
-template<typename NumericT>
-void reverse(viennacl::vector_base<NumericT> & in)
-{
-  switch (viennacl::traits::handle(in).get_active_handle_id())
-  {
-  case viennacl::MAIN_MEMORY:
-    viennacl::linalg::host_based::reverse(in);
-    break;
-#ifdef VIENNACL_WITH_OPENCL
-  case viennacl::OPENCL_MEMORY:
-    viennacl::linalg::opencl::reverse(in);
-    break;
-#endif
-
-#ifdef VIENNACL_WITH_CUDA
-  case viennacl::CUDA_MEMORY:
-    viennacl::linalg::cuda::reverse(in);
-    break;
-#endif
-
-  case viennacl::MEMORY_NOT_INITIALIZED:
-    throw memory_exception("not initialised!");
-  default:
-    throw memory_exception("not implemented");
-  }
-}
-
-}
-}
-
-#endif /* FFT_OPERATIONS_HPP_ */

[24/51] [partial] mahout git commit: Revert "(nojira) add native-viennaCL module to codebase. closes apache/mahout#241"

Reply via email to