http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
new file mode 100644
index 0000000..dba094b
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp
@@ -0,0 +1,832 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/detail/spai/spai.hpp
+    @brief Main implementation of SPAI (not FSPAI). Experimental.
+*/
+
#include <algorithm>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <map>
#include <math.h>
#include <string>
#include <utility>
#include <vector>
+
+//local includes
+#include "viennacl/linalg/detail/spai/spai_tag.hpp"
+#include "viennacl/linalg/qr.hpp"
+#include "viennacl/linalg/detail/spai/spai-dynamic.hpp"
+#include "viennacl/linalg/detail/spai/spai-static.hpp"
+#include "viennacl/linalg/detail/spai/sparse_vector.hpp"
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+
+//boost includes
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/vector_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/lu.hpp"
+#include "boost/numeric/ublas/triangular.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+
+// ViennaCL includes
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/compressed_matrix.hpp"
+#include "viennacl/linalg/sparse_matrix_operations.hpp"
+#include "viennacl/linalg/matrix_operations.hpp"
+#include "viennacl/scalar.hpp"
+#include "viennacl/linalg/inner_prod.hpp"
+#include "viennacl/linalg/ilu.hpp"
+#include "viennacl/ocl/backend.hpp"
+#include "viennacl/linalg/opencl/kernels/spai.hpp"
+
+
+
+#define VIENNACL_SPAI_K_b 20
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
/** @brief Debug helper: dumps every (index, value) entry of a sparse vector to stdout. */
template<typename SparseVectorT>
void print_sparse_vector(SparseVectorT const & v)
{
  typedef typename SparseVectorT::const_iterator ConstEntryIt;
  for (ConstEntryIt entry = v.begin(); entry != v.end(); ++entry)
    std::cout << "[ " << entry->first << " ]:" << entry->second << std::endl;
}
+
/** @brief Debug helper: prints a dense matrix column by column (each printed row is one matrix column).
 *
 * @param m  dense matrix providing size1()/size2() and element access m(row, col)
 */
template<typename DenseMatrixT>
void print_matrix(DenseMatrixT & m)
{
  // Hoist the sizes into signed 'long' (the file's convention for loop bounds)
  // to avoid the signed/unsigned comparison of the original 'int i < m.size2()'.
  long n_cols = static_cast<long>(m.size2());
  long n_rows = static_cast<long>(m.size1());
  for (long i = 0; i < n_cols; ++i)
  {
    for (long j = 0; j < n_rows; ++j)
      std::cout << m(j, i) << " ";
    std::cout << std::endl;
  }
}
+
/** @brief Accumulates a scaled sparse vector: res_v += b*v
 *
 * @param v      sparse vector to be scaled and accumulated
 * @param b      scalar factor
 * @param res_v  accumulator vector, updated in place
 */
template<typename SparseVectorT, typename NumericT>
void add_sparse_vectors(SparseVectorT const & v, NumericT b,  SparseVectorT & res_v)
{
  typedef typename SparseVectorT::const_iterator ConstEntryIt;
  for (ConstEntryIt entry = v.begin(); entry != v.end(); ++entry)
  {
    NumericT scaled = b * entry->second;
    res_v[entry->first] += scaled;
  }
}
+
//sparse-matrix - vector product
/** @brief Computation of the residual res = A*v - e_ind, where e_ind is the unit vector with a 1 at position 'ind'
 *
 * @param A_v_c   column major vectorized input sparse matrix
 * @param v       sparse vector, in this case the new column of the preconditioner matrix
 * @param ind     index of the current column
 * @param res     output residual
 */
template<typename SparseVectorT, typename NumericT>
void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c,
                           SparseVectorT const & v,
                           unsigned int ind,
                           SparseVectorT & res)
{
  typedef typename SparseVectorT::const_iterator ConstEntryIt;

  // A*v accumulated column-wise: each nonzero v[k] contributes v[k] * A(:,k)
  for (ConstEntryIt entry = v.begin(); entry != v.end(); ++entry)
    add_sparse_vectors(A_v_c[entry->first], entry->second, res);

  // subtract the unit vector e_ind
  res[ind] -= NumericT(1);
}
+
/** @brief Setting up the index sets of columns (J) and rows (I) for a certain column
 *
 * J is derived from the sparsity pattern of v (via buildColumnIndexSet, see
 * spai-static.hpp); I is then built from the rows of A touched by the columns
 * in J (via projectRows) — TODO(review): confirm against spai-static.hpp.
 *
 * @param A_v_c   column major vectorized initial sparse matrix
 * @param v       current column of preconditioner matrix
 * @param J       set of column indices (output)
 * @param I       set of row indices (output)
 */
template<typename SparseVectorT>
void build_index_set(std::vector<SparseVectorT> const & A_v_c,
                     SparseVectorT const & v,
                     std::vector<unsigned int> & J,
                     std::vector<unsigned int> & I)
{
  buildColumnIndexSet(v, J);
  projectRows(A_v_c, J, I);
}
+
/** @brief Initializes a dense matrix from a sparse one: A_out = A_in(I, J)
 *
 * @param A_in    Original sparse matrix (read only)
 * @param J       Set of column indices
 * @param I       Set of row indices (read only; the original passed it by
 *                non-const reference although it is never modified)
 * @param A_out   dense output matrix of dimension I.size() x J.size()
 */
template<typename SparseMatrixT, typename DenseMatrixT>
void initProjectSubMatrix(SparseMatrixT const & A_in,
                          std::vector<unsigned int> const & J,
                          std::vector<unsigned int> const & I,
                          DenseMatrixT & A_out)
{
  A_out.resize(I.size(), J.size(), false);   // 'false': no need to preserve old contents
  for (std::size_t j = 0; j < J.size(); ++j)
    for (std::size_t i = 0; i < I.size(); ++i)
      A_out(i,j) = A_in(I[i],J[j]);
}
+
+
+/************************************************** CPU BLOCK SET UP 
***************************************/
+
/** @brief Setting up blocks and QR factorizing them on CPU
 *
 * For every column i of the preconditioner: build the index sets (I_i, J_i),
 * extract the dense submatrix A(I_i, J_i), and QR-factorize it in place
 * (single_qr overwrites the block; betas for Q recovery go to g_b_v[i] —
 * see qr.hpp).
 *
 * @param A        initial sparse matrix
 * @param A_v_c    column major vectorized initial sparse matrix
 * @param M_v      initialized preconditioner
 * @param g_I      container of row indices (output, one set per column)
 * @param g_J      container of column indices (output, one set per column)
 * @param g_A_I_J  container of dense matrices -> R matrices after QR factorization
 * @param g_b_v    container of vectors beta, necessary for Q recovery
 */
template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT>
void block_set_up(SparseMatrixT const & A,
                  std::vector<SparseVectorT> const & A_v_c,
                  std::vector<SparseVectorT> const & M_v,
                  std::vector<std::vector<unsigned int> >& g_I,
                  std::vector<std::vector<unsigned int> >& g_J,
                  std::vector<DenseMatrixT>& g_A_I_J,
                  std::vector<VectorT>& g_b_v)
{
  // signed loop index: OpenMP requires a signed integral loop variable
#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
  {
    vcl_size_t i = static_cast<vcl_size_t>(i2);
    build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
    initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]);
    //print_matrix(g_A_I_J[i]);
    single_qr(g_A_I_J[i], g_b_v[i]);
    //print_matrix(g_A_I_J[i]);
  }
}
+
+/** @brief Setting up index set of columns and rows for all columns
+ *
+ * @param A_v_c   column major vectorized initial sparse matrix
+ * @param M_v     initialized preconditioner
+ * @param g_J     container of column indices
+ * @param g_I     container of row indices
+ */
+template<typename SparseVectorT>
+void index_set_up(std::vector<SparseVectorT> const & A_v_c,
+                  std::vector<SparseVectorT> const & M_v,
+                  std::vector<std::vector<unsigned int> > & g_J,
+                  std::vector<std::vector<unsigned int> > & g_I)
+{
+#ifdef VIENNACL_WITH_OPENMP
+  #pragma omp parallel for
+#endif
+  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
+  {
+    vcl_size_t i = static_cast<vcl_size_t>(i2);
+    build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]);
+  }
+}
+
+/************************************************** GPU BLOCK SET UP 
***************************************/
+
/** @brief Setting up blocks and QR factorizing them on GPU
 *
 * Builds the index sets on the CPU, assembles all blocks A(I_k, J_k) into one
 * contiguous GPU buffer, then runs the blocked QR factorization kernel.
 *
 * @param A            initial sparse matrix
 * @param A_v_c        column major vectorized initial sparse matrix
 * @param M_v          initialized preconditioner
 * @param g_is_update  container that indicates which blocks are active.
 *                     NOTE(review): passed by value, so any flag changes made
 *                     inside block_assembly/block_qr are not visible to the
 *                     caller — confirm this is intended.
 * @param g_I          container of row indices
 * @param g_J          container of column indices
 * @param g_A_I_J      container of dense matrices -> R matrices after QR factorization
 * @param g_bv         container of vectors beta, necessary for Q recovery
 */
template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT>
void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                  std::vector<SparseVectorT> const & A_v_c,
                  std::vector<SparseVectorT> const & M_v,
                  std::vector<cl_uint> g_is_update,
                  std::vector<std::vector<unsigned int> > & g_I,
                  std::vector<std::vector<unsigned int> > & g_J,
                  block_matrix & g_A_I_J,
                  block_vector & g_bv)
{
  viennacl::context ctx = viennacl::traits::context(A);
  bool is_empty_block;   // written by block_assembly; not inspected here

  //build index set
  index_set_up(A_v_c, M_v, g_J, g_I);
  block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block);
  block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx);
}
+
+
+/***************************************************************************************************/
+/******************************** SOLVING LS PROBLEMS ON GPU 
***************************************/
+/***************************************************************************************************/
+
/** @brief Elicitation of the sparse vector m for a particular column from m_in - a contiguous vector for all columns
 *
 * @param m_in          contiguous coefficient storage for all columns
 * @param start_m_ind   start index of this column's coefficients inside m_in
 * @param J             column index set
 * @param m             sparse vector for the particular column (output)
 */
template<typename NumericT, typename SparseVectorT>
void custom_fan_out(std::vector<NumericT> const & m_in,
                    unsigned int start_m_ind,
                    std::vector<unsigned int> const & J,
                    SparseVectorT & m)
{
  // The i-th entry of the packed segment belongs to global index J[i].
  // (The original kept a separate counter 'cnt' that always equaled i.)
  for (std::size_t i = 0; i < J.size(); ++i)
    m[J[i]] = m_in[start_m_ind + i];
}
+
+
+
//GPU based least square problem
/** @brief Solution of the least squares problems on the GPU
 *
 * Packs the right-hand sides (unit vectors) and result storage into contiguous
 * buffers, launches the "block_least_squares" kernel, reads the solution back,
 * and fans each column's coefficients out into the sparse preconditioner.
 *
 * @param A_v_c        column-major vectorized initial sparse matrix
 * @param M_v          column-major vectorized sparse preconditioner matrix (updated)
 * @param g_I          container of row index sets
 * @param g_J          container of column index sets
 * @param g_A_I_J_vcl  contiguous matrix that consists of the blocks A(I_k, J_k)
 * @param g_bv_vcl     contiguous vector of betas, necessary for Q recovery
 * @param g_res        container of residuals (updated)
 * @param g_is_update  container with indicators which blocks are active (updated)
 * @param tag          spai tag
 * @param ctx          Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host)
 */
template<typename SparseVectorT, typename NumericT>
void least_square_solve(std::vector<SparseVectorT> & A_v_c,
                        std::vector<SparseVectorT> & M_v,
                        std::vector<std::vector<unsigned int> >& g_I,
                        std::vector<std::vector<unsigned int> > & g_J,
                        block_matrix & g_A_I_J_vcl,
                        block_vector & g_bv_vcl,
                        std::vector<SparseVectorT> & g_res,
                        std::vector<cl_uint> & g_is_update,
                        const spai_tag & tag,
                        viennacl::context ctx)
{
  viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context());
  unsigned int y_sz, m_sz;
  // start offsets of each block's rows (y) and columns (m) in the packed buffers
  std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0));
  std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0));

  get_size(g_I, y_sz);
  init_start_inds(g_I, y_inds);
  init_start_inds(g_J, m_inds);

  //create y_v: packed right-hand sides, y_v restricted to block i is the
  //unit vector e_i projected onto the row set I_i
  std::vector<NumericT> y_v(y_sz, NumericT(0));
  for (vcl_size_t i = 0; i < M_v.size(); ++i)
  {
    for (vcl_size_t j = 0; j < g_I[i].size(); ++j)
    {
      if (g_I[i][j] == i)
        y_v[y_inds[i] + j] = NumericT(1.0);
    }
  }
  //compute m_v: packed solution storage for all columns
  get_size(g_J, m_sz);
  // NOTE(review): the fill value is static_cast<cl_uint>(0), converted to
  // NumericT — harmless for zero, but NumericT(0) was presumably intended
  std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0));

  block_vector y_v_vcl;
  block_vector m_v_vcl;
  //preparing memory for the least squares problem on the GPU
  y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                              static_cast<unsigned int>(sizeof(NumericT)*y_v.size()),
                                              &(y_v[0]));
  m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                              static_cast<unsigned int>(sizeof(NumericT)*m_v.size()),
                                              &(m_v[0]));
  y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                               static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)),
                                               &(y_inds[0]));
  viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
                                                                           static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())),
                                                                           &(g_is_update[0]));
  viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
  viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares");
  ls_kernel.local_work_size(0, 1);
  ls_kernel.global_work_size(0, 256);
  viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(),
                                   y_v_vcl.handle(), y_v_vcl.handle1(),
                                   g_A_I_J_vcl.handle1(), g_is_update_vcl,
                                   //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))),
                                   static_cast<unsigned int>(M_v.size())));
  //copy vector m_v back from GPU to CPU (blocking read: CL_TRUE)
  cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(),
                                       m_v_vcl.handle().get(), CL_TRUE, 0,
                                       sizeof(NumericT)*(m_v.size()),
                                       &(m_v[0]), 0, NULL, NULL);
  VIENNACL_ERR_CHECK(vcl_err);

  //fan out vector in parallel
  //#pragma omp parallel for
  for (long i = 0; i < static_cast<long>(M_v.size()); ++i)
  {
    if (g_is_update[static_cast<vcl_size_t>(i)])
    {
      //fanned out onto the sparse vector of column i
      custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]);
      g_res[static_cast<vcl_size_t>(i)].clear();
      compute_spai_residual<SparseVectorT, NumericT>(A_v_c,  M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]);
      NumericT res_norm = 0;
      //compute the norm of res - a column stays active only while its
      //residual norm exceeds the threshold (and the pattern is not static)
      sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm);
      //std::cout<<"Residual norm of column #: "<<i<<std::endl;
      //std::cout<<res_norm<<std::endl;
      //std::cout<<"************************"<<std::endl;
      g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0);
    }
  }
}
+
//CPU based least square problems
/** @brief Solution of the least squares problems on the CPU
 *
 * For each active column i: project the unit vector onto the row set, apply
 * Q^T (recovered from the QR data), back-substitute against R, fan the result
 * out into the sparse preconditioner column, and recompute the residual to
 * decide whether the column needs another update round.
 *
 * @param A_v_c        column-major vectorized initial sparse matrix
 * @param g_R          blocks for the least squares solution (R factors from single_qr)
 * @param g_b_v        vectors beta, necessary for Q recovery
 * @param g_I          container of row index sets for all columns of matrix M
 * @param g_J          container of column index sets for all columns of matrix M
 * @param g_res        container of residuals (updated)
 * @param g_is_update  container with indicators which blocks are active (updated)
 * @param M_v          column-major vectorized sparse matrix, final preconditioner (updated)
 * @param tag          spai tag
 */
template<typename SparseVectorT, typename DenseMatrixT, typename VectorT>
void least_square_solve(std::vector<SparseVectorT> const & A_v_c,
                        std::vector<DenseMatrixT> & g_R,
                        std::vector<VectorT> & g_b_v,
                        std::vector<std::vector<unsigned int> > & g_I,
                        std::vector<std::vector<unsigned int> > & g_J,
                        std::vector<SparseVectorT> & g_res,
                        std::vector<bool> & g_is_update,
                        std::vector<SparseVectorT> & M_v,
                        spai_tag const & tag)
{
  typedef typename DenseMatrixT::value_type       NumericType;

#ifdef VIENNACL_WITH_OPENMP
  #pragma omp parallel for
#endif
  for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2)
  {
    vcl_size_t i = static_cast<vcl_size_t>(i2);
    if (g_is_update[i])
    {
      // y = e_(beg_ind + i) restricted to the row set I_i
      VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size());

      projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i)));
      // y <- Q^T * y, using the Householder data stored in g_R[i]/g_b_v[i]
      apply_q_trans_vec(g_R[i], g_b_v[i], y);

      // solve R * m_new = y by back substitution
      VectorT m_new =  boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2());
      backwardSolve(g_R[i], y, m_new);
      fanOutVector(m_new, g_J[i], M_v[i]);
      g_res[i].clear();

      compute_spai_residual<SparseVectorT, NumericType>(A_v_c,  M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]);

      NumericType res_norm = 0;
      sparse_norm_2(g_res[i], res_norm);
//                    std::cout<<"Residual norm of column #: "<<i<<std::endl;
//                    std::cout<<res_norm<<std::endl;
//                    std::cout<<"************************"<<std::endl;
      // column stays active only while the residual norm exceeds the
      // threshold and the sparsity pattern is allowed to grow
      g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic());
    }
  }
}
+
+//************************************ UPDATE CHECK 
***************************************************//
+
/** @brief Returns true while at least one block is still flagged for update
 *
 * Note: despite the name, this checks whether ANY entry is set, not all of them.
 *
 * @param parallel_is_update  per-block update flags
 */
template<typename VectorType>
bool is_all_update(VectorType& parallel_is_update)
{
  unsigned int flag_count = static_cast<unsigned int>(parallel_is_update.size());
  for (unsigned int k = 0; k != flag_count; ++k)
  {
    if (parallel_is_update[k])
      return true;
  }
  return false;
}
+
+//********************************** MATRIX VECTORIZATION 
***********************************************//
+
//Matrix vectorization, column based approach
/** @brief Converts a sparse matrix into an array of sparse column vectors
 *
 * (The previous @brief, "Solution of Least square problem on CPU", was a
 * copy-paste leftover.)
 *
 * @param M_in   input sparse matrix, boost::numeric::ublas::compressed_matrix
 * @param M_v    output array of sparse vectors, one per column; must be pre-sized to M_in.size2()
 */
template<typename SparseMatrixT, typename SparseVectorT>
void vectorize_column_matrix(SparseMatrixT const & M_in,
                             std::vector<SparseVectorT> & M_v)
{
  // walk all nonzeros row by row and scatter each entry into the sparse
  // vector of its column: M_v[col][row] = value
  for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
        M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it;
}
+
//Matrix vectorization row based approach
/** @brief Converts a sparse matrix into an array of sparse row vectors
 *
 * @param M_in   input sparse matrix
 * @param M_v    output array of sparse vectors, one per row; must be pre-sized to M_in.size1()
 */
template<typename SparseMatrixT, typename SparseVectorT>
void vectorize_row_matrix(SparseMatrixT const & M_in,
                          std::vector<SparseVectorT> & M_v)
{
  // walk all nonzeros and scatter each entry into the sparse vector of its
  // row: M_v[row][col] = value
  for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it)
    for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it)
      M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it;
}
+
+//************************************* BLOCK ASSEMBLY CODE 
*********************************************//
+
+
+template<typename SizeT>
+void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set,
+                        std::vector<cl_uint> & a)
+{
+  vcl_size_t cnt = 0;
+
+  for (vcl_size_t i = 0; i < ind_set.size(); ++i)
+    for (vcl_size_t j = 0; j < ind_set[i].size(); ++j)
+      a[cnt++] = static_cast<cl_uint>(ind_set[i][j]);
+}
+
+
+
+//assembling blocks on GPU
+/** @brief Assembly of blocks on GPU by a gived set of row indices: g_I and 
column indices: g_J
+ *
+ * @param A               intial sparse matrix
+ * @param g_J             container of column index set
+ * @param g_I             container of row index set
+ * @param g_A_I_J_vcl     contigious blocks A(I, J) using GPU memory
+ * @param g_is_update     container with indicators which blocks are active
+ * @param is_empty_block  parameter that indicates if no block were assembled
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & 
A,
+                    std::vector<std::vector<unsigned int> > const & g_J,
+                    std::vector<std::vector<unsigned int> > const & g_I,
+                    block_matrix & g_A_I_J_vcl,
+                    std::vector<cl_uint> & g_is_update,
+                    bool & is_empty_block)
+{
+  //computing start indices for index sets and start indices for block matrices
+  unsigned int sz_I, sz_J, sz_blocks;
+  std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0));
+  std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0));
+  //
+  init_start_inds(g_J, j_ind);
+  init_start_inds(g_I, i_ind);
+  //
+  get_size(g_J, sz_J);
+  get_size(g_I, sz_I);
+  std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0));
+  //
+  std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0));
+
+  // computing size for blocks
+  // writing set to arrays
+  write_set_to_array(g_I, I_set);
+  write_set_to_array(g_J, J_set);
+
+  // if block for assembly does exist
+  if (I_set.size() > 0 && J_set.size() > 0)
+  {
+    viennacl::context ctx = viennacl::traits::context(A);
+    viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context 
&>(ctx.opencl_context());
+    compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims);
+    std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0));
+
+    block_vector set_I_vcl, set_J_vcl;
+    //init memory on GPU
+    //contigious g_A_I_J
+    g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                    static_cast<unsigned 
int>(sizeof(NumericT)*(sz_blocks)),
+                                                    &(con_A_I_J[0]));
+    g_A_I_J_vcl.handle().context(opencl_ctx);
+
+    //matrix_dimensions
+    g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned 
int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())),
+                                                     &(matrix_dims[0]));
+    g_A_I_J_vcl.handle1().context(opencl_ctx);
+
+    //start_block inds
+    g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                     static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                     &(blocks_ind[0]));
+    g_A_I_J_vcl.handle2().context(opencl_ctx);
+
+    //set_I
+    set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                  static_cast<unsigned 
int>(sizeof(cl_uint)*sz_I),
+                                                  &(I_set[0]));
+    set_I_vcl.handle().context(opencl_ctx);
+
+    //set_J
+    set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                  static_cast<unsigned 
int>(sizeof(cl_uint)*sz_J),
+                                                  &(J_set[0]));
+    set_J_vcl.handle().context(opencl_ctx);
+
+    //i_ind
+    set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                   static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                   &(i_ind[0]));
+    set_I_vcl.handle().context(opencl_ctx);
+
+    //j_ind
+    set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                   static_cast<unsigned 
int>(sizeof(cl_uint)*(g_I.size() + 1)),
+                                                   &(j_ind[0]));
+    set_J_vcl.handle().context(opencl_ctx);
+
+    viennacl::ocl::handle<cl_mem> g_is_update_vcl = 
opencl_ctx.create_memory(CL_MEM_READ_WRITE,
+                                                                             
static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()),
+                                                                             
&(g_is_update[0]));
+
+    viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx);
+    viennacl::ocl::kernel& assembly_kernel = 
opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(),
 "assemble_blocks");
+    assembly_kernel.local_work_size(0, 1);
+    assembly_kernel.global_work_size(0, 256);
+    viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), 
A.handle2().opencl_handle(), A.handle().opencl_handle(),
+                                           set_I_vcl.handle(), 
set_J_vcl.handle(), set_I_vcl.handle1(),
+                                           set_J_vcl.handle1(),
+                                           g_A_I_J_vcl.handle2(), 
g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(),
+                                           g_is_update_vcl,
+                                           static_cast<unsigned 
int>(g_I.size())));
+    is_empty_block = false;
+  }
+  else
+    is_empty_block = true;
+}
+
+/************************************************************************************************************************/
+
/** @brief Insertion of the vectorized matrix columns into the original sparse matrix
 *
 * @param M_v       column-major vectorized matrix
 * @param M         original sparse matrix (output)
 * @param is_right  if false, the entries are written transposed
 */
template<typename SparseMatrixT, typename SparseVectorT>
void insert_sparse_columns(std::vector<SparseVectorT> const & M_v,
                           SparseMatrixT& M,
                           bool is_right)
{
  typedef typename SparseVectorT::const_iterator ConstEntryIt;
  for (unsigned int col = 0; col < M_v.size(); ++col)
  {
    for (ConstEntryIt entry = M_v[col].begin(); entry != M_v[col].end(); ++entry)
    {
      if (is_right)
        M(entry->first, col) = entry->second;
      else  //transposed fill of M
        M(col, entry->first) = entry->second;
    }
  }
}
+
/** @brief Transposition of a sparse matrix: A = A_in^T
 *
 * @param A_in  initial sparse matrix
 * @param A     output transposed matrix, resized to size2 x size1
 */
template<typename MatrixT>
void sparse_transpose(MatrixT const & A_in, MatrixT & A)
{
  typedef typename MatrixT::value_type         NumericType;

  // temp_A[i] holds row i of the transposed matrix as an ordered (col -> value) map
  std::vector<std::map<vcl_size_t, NumericType> >   temp_A(A_in.size2());
  A.resize(A_in.size2(), A_in.size1(), false);

  // first pass: scatter every entry (r, c, v) of A_in into temp_A[c][r]
  for (typename MatrixT::const_iterator1 row_it = A_in.begin1();
       row_it != A_in.end1();
       ++row_it)
  {
    for (typename MatrixT::const_iterator2 col_it = row_it.begin();
         col_it != row_it.end();
         ++col_it)
    {
      temp_A[col_it.index2()][col_it.index1()] = *col_it;
    }
  }

  // second pass: write the collected entries into A; the std::map keeps each
  // row's entries ordered by column index
  for (vcl_size_t i=0; i<temp_A.size(); ++i)
  {
    for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin();
         it != temp_A[i].end();
         ++it)
      A(i, it->first) = it->second;
  }
}
+
+
+
+
+//        template<typename SparseVectorType>
+//        void custom_copy(std::vector<SparseVectorType> & M_v, 
std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){
+//            for (int i = 0; i < l_M_v.size(); ++i){
+//                l_M_v[i] = M_v[i + beg_ind];
+//            }
+//        }
+
//CPU version
/** @brief Construction of the SPAI preconditioner on the CPU
 *
 * Processes the columns of M in chunks of VIENNACL_SPAI_K_b. For each chunk,
 * the block setup / least-squares / (optional) pattern-update cycle is
 * iterated until every column's residual drops below the threshold or the
 * iteration limit is reached; the chunk's columns are then copied back and
 * the tag window is advanced.
 *
 * @param A     initial sparse matrix
 * @param M     output preconditioner (its sparsity pattern provides the initial guess)
 * @param tag   spai tag; its [BegInd, EndInd) window is mutated while chunking
 */
template<typename MatrixT>
void computeSPAI(MatrixT const & A,
                 MatrixT & M,
                 spai_tag & tag)
{
  typedef typename MatrixT::value_type                                       NumericT;
  typedef typename boost::numeric::ublas::vector<NumericT>                   VectorType;
  typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>   SparseVectorType;
  typedef typename boost::numeric::ublas::matrix<NumericT>                   DenseMatrixType;

  //sparse matrix transpose...
  unsigned int cur_iter = 0;
  // window [BegInd, EndInd) selects the chunk of columns processed per pass
  tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b);
  bool go_on = true;
  std::vector<SparseVectorType> A_v_c(M.size2());
  std::vector<SparseVectorType> M_v(M.size2());
  vectorize_column_matrix(A, A_v_c);
  vectorize_column_matrix(M, M_v);


  while (go_on)
  {
    // evaluated BEFORE advancing the window, so the final (possibly partial)
    // chunk is still processed in this iteration
    go_on = (tag.getEndInd() < static_cast<long>(M.size2()));
    cur_iter = 0;
    unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd());
    //std::vector<bool> g_is_update(M.size2(), true);
    std::vector<bool> g_is_update(l_sz, true);

    // local working copy of this chunk's preconditioner columns
    std::vector<SparseVectorType> l_M_v(l_sz);
    //custom_copy(M_v, l_M_v, beg_ind);
    std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin());

    //working blocks: QR factors, betas, residuals and index sets per column
    std::vector<DenseMatrixType> g_A_I_J(l_sz);
    std::vector<VectorType> g_b_v(l_sz);
    std::vector<SparseVectorType> g_res(l_sz);
    std::vector<std::vector<unsigned int> > g_I(l_sz);
    std::vector<std::vector<unsigned int> > g_J(l_sz);

    // iterate until residuals are small enough everywhere (is_all_update
    // returns true while ANY column is still active) or the limit is hit
    while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
    {
      // SET UP THE BLOCKS..
      // PHASE ONE: full setup on the first pass, incremental pattern update after
      if (cur_iter == 0)
        block_set_up(A, A_v_c, l_M_v,  g_I, g_J, g_A_I_J, g_b_v);
      else
        block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag);

      //PHASE TWO, LEAST SQUARE SOLUTION
      least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag);

      // static pattern: a single pass suffices, no pattern growth
      if (tag.getIsStatic()) break;
      cur_iter++;
    }

    // write the chunk back and slide the window forward
    std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd());
    tag.setBegInd(tag.getEndInd());//beg_ind = end_ind;
    tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2())));
  }

  M.resize(M.size1(), M.size2(), false);
  insert_sparse_columns(M_v, M, tag.getIsRight());
}
+
+
+//GPU - based version
+/** @brief Construction of SPAI preconditioner on GPU
+ *
+ * @param A      initial sparse matrix
+ * @param cpu_A  copy of initial matrix on CPU
+ * @param cpu_M  output preconditioner on CPU
+ * @param M      output preconditioner
+ * @param tag    SPAI tag class with parameters
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, 
//input
+                 boost::numeric::ublas::compressed_matrix<NumericT> const & 
cpu_A,
+                 boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, 
//output
+                 viennacl::compressed_matrix<NumericT, AlignmentV> & M,
+                 spai_tag const & tag)
+{
+  typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT>     
   SparseVectorType;
+
+  //typedef typename viennacl::compressed_matrix<ScalarType> 
GPUSparseMatrixType;
+  //sparse matrix transpose...
+  unsigned int cur_iter = 0;
+  std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1));
+  //init is update
+  //init_parallel_is_update(g_is_update);
+  std::vector<SparseVectorType> A_v_c(cpu_M.size2());
+  std::vector<SparseVectorType> M_v(cpu_M.size2());
+  vectorize_column_matrix(cpu_A, A_v_c);
+  vectorize_column_matrix(cpu_M, M_v);
+  std::vector<SparseVectorType> g_res(cpu_M.size2());
+  std::vector<std::vector<unsigned int> > g_I(cpu_M.size2());
+  std::vector<std::vector<unsigned int> > g_J(cpu_M.size2());
+
+  //OpenCL variables
+  block_matrix g_A_I_J_vcl;
+  block_vector g_bv_vcl;
+  while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update))
+  {
+    // SET UP THE BLOCKS..
+    // PHASE ONE..
+    //timer.start();
+    //index set up on CPU
+    if (cur_iter == 0)
+      block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, 
g_bv_vcl);
+    else
+      block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, 
g_bv_vcl, tag);
+    //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl;
+    //PERFORM LEAST SQUARE problems solution
+    //PHASE TWO
+    //timer.start();
+    least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, 
g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A));
+    //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl;
+    if (tag.getIsStatic())
+      break;
+    cur_iter++;
+  }
+
+  cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false);
+  insert_sparse_columns(M_v, cpu_M, tag.getIsRight());
+  //copy back to GPU
+  M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned 
int>(cpu_M.size2()));
+  viennacl::copy(cpu_M, M);
+}
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
new file mode 100644
index 0000000..d8c718c
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp
@@ -0,0 +1,143 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/detail/spai/spai_tag.hpp
+    @brief Implementation of the spai tag holding SPAI configuration 
parameters. Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <cmath>
+#include <sstream>
+#include "viennacl/ocl/backend.hpp"
+#include "boost/numeric/ublas/vector.hpp"
+#include "boost/numeric/ublas/matrix.hpp"
+#include "boost/numeric/ublas/matrix_proxy.hpp"
+#include "boost/numeric/ublas/storage.hpp"
+#include "boost/numeric/ublas/io.hpp"
+#include "boost/numeric/ublas/matrix_expression.hpp"
+#include "boost/numeric/ublas/detail/matrix_assign.hpp"
+
+#include "viennacl/linalg/detail/spai/block_matrix.hpp"
+#include "viennacl/linalg/detail/spai/block_vector.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
/** @brief A tag for SPAI
 *
 * Holds the configuration parameters for the SPAI preconditioner
 * (residual thresholds, iteration limit, static/right flags) as well as the
 * current column-block window [getBegInd(), getEndInd()) that the
 * implementation advances via setBegInd()/setEndInd().
 *
 * Must be passed to spai_precond constructor.
 */
class spai_tag
{
public:
  /** @brief Constructor
   *
   * @param residual_norm_threshold   Calculate until the norm of the residual falls below this threshold
   * @param iteration_limit           maximum number of iterations
   * @param residual_threshold        determines starting threshold in residual vector for including new indices into set J
   * @param is_static                 determines if static version of SPAI should be used
   * @param is_right                  determines if left or right preconditioner should be used
   */
  spai_tag(double residual_norm_threshold = 1e-3,
           unsigned int iteration_limit = 5,
           double residual_threshold = 1e-2,
           bool is_static = false,
           bool is_right = false)
    : residual_norm_threshold_(residual_norm_threshold),
      iteration_limit_(iteration_limit),
      beg_ind_(0),  // block window is assigned later via setBegInd()/setEndInd();
      end_ind_(0),  // zero-initialize so the getters never return indeterminate values
      residual_threshold_(residual_threshold),
      is_static_(is_static),
      is_right_(is_right) {}

  double getResidualNormThreshold() const { return residual_norm_threshold_; }

  double getResidualThreshold() const { return residual_threshold_; }

  unsigned int getIterationLimit () const { return iteration_limit_; }

  bool getIsStatic() const { return is_static_; }

  bool getIsRight() const { return is_right_; }

  long getBegInd() const { return beg_ind_; }

  long getEndInd() const { return end_ind_; }


  // Setters with validation: non-positive values are silently ignored so the
  // previously configured (or default) value remains in effect.
  void setResidualNormThreshold(double residual_norm_threshold)
  {
    if (residual_norm_threshold > 0)
      residual_norm_threshold_ = residual_norm_threshold;
  }

  void setResidualThreshold(double residual_threshold)
  {
    if (residual_threshold > 0)
      residual_threshold_ = residual_threshold;
  }

  void setIterationLimit(unsigned int iteration_limit)
  {
    if (iteration_limit > 0)
      iteration_limit_ = iteration_limit;
  }

  void setIsRight(bool is_right) { is_right_ = is_right; }

  void setIsStatic(bool is_static) { is_static_ = is_static; }

  void setBegInd(long beg_ind) { beg_ind_ = beg_ind; }

  void setEndInd(long end_ind){ end_ind_ = end_ind; }


private:
  double        residual_norm_threshold_;
  unsigned int  iteration_limit_;
  long          beg_ind_;                  // first column of the current block window
  long          end_ind_;                  // one-past-last column of the current block window
  double        residual_threshold_;
  bool          is_static_;
  bool          is_right_;
};
+
+}
+}
+}
+}
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
new file mode 100644
index 0000000..c99eda1
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp
@@ -0,0 +1,85 @@
+#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
+#define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/detail/spai/sparse_vector.hpp
+    @brief Implementation of a helper sparse vector class for SPAI. 
Experimental.
+
+    SPAI code contributed by Nikolay Lukash
+*/
+
+#include <utility>
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <algorithm>
+#include <vector>
+#include <math.h>
+#include <map>
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace detail
+{
+namespace spai
+{
+
/**
 * @brief Represents a sparse vector based on std::map<unsigned int, NumericT>
 *
 * Only nonzero entries are stored; indices absent from the map are implicit
 * zeros. operator[] follows std::map semantics and default-inserts a
 * zero-initialized entry when the index is not yet present.
 */
template<typename NumericT>
class sparse_vector
{
public:
  typedef typename std::map<unsigned int, NumericT>::iterator        iterator;
  typedef typename std::map<unsigned int, NumericT>::const_iterator  const_iterator;

  sparse_vector() {}

  /** @brief Read/write access to the entry at index @p ind (inserts a zero-initialized entry if absent). */
  NumericT & operator[] (unsigned int ind) { return v_[ind]; }

  /** @brief Removes all stored entries. */
  void clear() { v_.clear(); }

  const_iterator find(unsigned int var) const { return v_.find(var); }
        iterator find(unsigned int var)       { return v_.find(var); }

  const_iterator begin() const { return v_.begin(); }
        iterator begin()       { return v_.begin(); }
  const_iterator end() const { return v_.end(); }
        iterator end()       { return v_.end(); }

private:
  // index -> value map of the nonzero entries.
  // (An unused, never-initialized 'size_' member was removed.)
  std::map<unsigned int, NumericT>  v_;
};
+
+}
+}
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
new file mode 100644
index 0000000..a3340d7
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp
@@ -0,0 +1,580 @@
+#ifndef VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+#define VIENNACL_LINALG_DIRECT_SOLVE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/direct_solve.hpp
+    @brief Implementations of dense direct solvers are found here.
+*/
+
+#include "viennacl/forwards.h"
+#include "viennacl/meta/enable_if.hpp"
+#include "viennacl/vector.hpp"
+#include "viennacl/vector_proxy.hpp"
+#include "viennacl/matrix.hpp"
+#include "viennacl/matrix_proxy.hpp"
+#include "viennacl/linalg/prod.hpp"
+#include "viennacl/linalg/host_based/direct_solve.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+  #include "viennacl/linalg/opencl/direct_solve.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  #include "viennacl/linalg/cuda/direct_solve.hpp"
+#endif
+
+#define VIENNACL_DIRECT_SOLVE_BLOCKSIZE 128
+
+namespace viennacl
+{
+namespace linalg
+{
+
+namespace detail
+{
+
+  //
+  // A \ B:
+  //
+
+  /** @brief Direct inplace solver for dense triangular systems using a single 
kernel launch. Matlab notation: A \ B
+  *
+  * @param A    The system matrix
+  * @param B    The matrix of row vectors, where the solution is directly 
written to
+  */
+  template<typename NumericT, typename SolverTagT>
+  void inplace_solve_kernel(const matrix_base<NumericT>  & A, const 
matrix_base<NumericT> & B, SolverTagT)
+  {
+    assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A)) && 
bool("Size check failed in inplace_solve(): size1(A) != size2(A)"));
+    assert( (viennacl::traits::size1(A) == viennacl::traits::size1(B)) && 
bool("Size check failed in inplace_solve(): size1(A) != size1(B)"));
+    switch (viennacl::traits::handle(A).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::inplace_solve(A, 
const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+        break;
+  #ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::inplace_solve(A, 
const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+        break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::inplace_solve(A, 
const_cast<matrix_base<NumericT> &>(B), SolverTagT());
+        break;
+  #endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+  //
+  // A \ b
+  //
+
+  template<typename NumericT, typename SolverTagT>
+  void inplace_solve_vec_kernel(const matrix_base<NumericT> & mat,
+                                const vector_base<NumericT> & vec,
+                                SolverTagT)
+  {
+    assert( (mat.size1() == vec.size()) && bool("Size check failed in 
inplace_solve(): size1(A) != size(b)"));
+    assert( (mat.size2() == vec.size()) && bool("Size check failed in 
inplace_solve(): size2(A) != size(b)"));
+
+    switch (viennacl::traits::handle(mat).get_active_handle_id())
+    {
+      case viennacl::MAIN_MEMORY:
+        viennacl::linalg::host_based::inplace_solve(mat, 
const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+        break;
+  #ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+        viennacl::linalg::opencl::inplace_solve(mat, 
const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+        break;
+  #endif
+  #ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+        viennacl::linalg::cuda::inplace_solve(mat, 
const_cast<vector_base<NumericT> &>(vec), SolverTagT());
+        break;
+  #endif
+      case viennacl::MEMORY_NOT_INITIALIZED:
+        throw memory_exception("not initialised!");
+      default:
+        throw memory_exception("not implemented");
+    }
+  }
+
+
+  template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
+  void inplace_solve_lower_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
+  {
+    typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type  
NumericType;
+
+    vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+    if (A.size1() <= blockSize)
+      inplace_solve_kernel(A, B, SolverTagT());
+    else
+    {
+      for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
+      {
+        vcl_size_t Apos1 = i;
+        vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
+        vcl_size_t Bpos = B.size2();
+        inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, 
Apos2), viennacl::range(Apos1, Apos2)),
+                             viennacl::project(B, viennacl::range(Apos1, 
Apos2), viennacl::range(0,     Bpos)),
+                             SolverTagT());
+        if (Apos2 < A.size1())
+        {
+          viennacl::matrix_range<MatrixT2> B_lower(B, viennacl::range(Apos2, 
B.size1()), viennacl::range(0, Bpos));
+          viennacl::linalg::prod_impl(viennacl::project(A, 
viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
+                                      viennacl::project(B, 
viennacl::range(Apos1, Apos2),     viennacl::range(0,     Bpos)),
+                                      B_lower,
+                                      NumericType(-1.0), NumericType(1.0));
+        }
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::lower_tag)
+  {
+    inplace_solve_lower_impl(A, B, viennacl::linalg::lower_tag());
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::unit_lower_tag)
+  {
+    inplace_solve_lower_impl(A, B, viennacl::linalg::unit_lower_tag());
+  }
+
+  template<typename MatrixT1, typename MatrixT2, typename SolverTagT>
+  void inplace_solve_upper_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT)
+  {
+    typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type  
NumericType;
+
+    int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
+    if (static_cast<int>(A.size1()) <= blockSize)
+      inplace_solve_kernel(A, B, SolverTagT());
+    else
+    {
+      for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
+      {
+        vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
+        vcl_size_t Apos2 = vcl_size_t(i);
+        vcl_size_t Bpos = B.size2();
+        inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, 
Apos2), viennacl::range(Apos1, Apos2)),
+                             viennacl::project(B, viennacl::range(Apos1, 
Apos2), viennacl::range(0, Bpos)),
+                             SolverTagT());
+        if (Apos1 > 0)
+        {
+          viennacl::matrix_range<MatrixT2> B_upper(B, viennacl::range(0, 
Apos1), viennacl::range(0, Bpos));
+
+          viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(0,  
   Apos1), viennacl::range(Apos1, Apos2)),
+                                      viennacl::project(B, 
viennacl::range(Apos1, Apos2), viennacl::range(0,     Bpos)),
+                                      B_upper,
+                                      NumericType(-1.0), NumericType(1.0));
+        }
+      }
+    }
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::upper_tag)
+  {
+    inplace_solve_upper_impl(A, B, viennacl::linalg::upper_tag());
+  }
+
+  template<typename MatrixT1, typename MatrixT2>
+  void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, 
viennacl::linalg::unit_upper_tag)
+  {
+    inplace_solve_upper_impl(A, B, viennacl::linalg::unit_upper_tag());
+  }
+
+} // namespace detail
+
+/** @brief Direct inplace solver for triangular systems with multiple right 
hand sides, i.e. A \ B   (MATLAB notation)
+*
+* @param A      The system matrix
+* @param B      The matrix of row vectors, where the solution is directly 
written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+/** @brief Direct inplace solver for triangular systems with multiple 
transposed right hand sides, i.e. A \ B^T   (MATLAB notation)
+*
+* @param A       The system matrix
+* @param proxy_B The proxy for the transposed matrix of row vectors, where the 
solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & A,
+                   matrix_expression<const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans> proxy_B,
+                   SolverTagT)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
+                          proxy_B.lhs().size2(), proxy_B.lhs().start2(), 
proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
+                          proxy_B.lhs().size1(), proxy_B.lhs().start1(), 
proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
+                          !proxy_B.lhs().row_major());
+
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+//upper triangular solver for transposed lower triangular matrices
+/** @brief Direct inplace solver for transposed triangular systems with 
multiple right hand sides, i.e. A^T \ B   (MATLAB notation)
+*
+* @param proxy_A  The transposed system matrix proxy
+* @param B        The matrix holding the load vectors, where the solution is 
directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_expression< const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans>  & proxy_A,
+                   matrix_base<NumericT> & B,
+                   SolverTagT)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
+                          proxy_A.lhs().size2(), proxy_A.lhs().start2(), 
proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
+                          proxy_A.lhs().size1(), proxy_A.lhs().start1(), 
proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
+                          !proxy_A.lhs().row_major());
+
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+/** @brief Direct inplace solver for transposed triangular systems with 
multiple transposed right hand sides, i.e. A^T \ B^T   (MATLAB notation)
+*
+* @param proxy_A    The transposed system matrix proxy
+* @param proxy_B    The transposed matrix holding the load vectors, where the 
solution is directly written to
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_expression< const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans> const & proxy_A,
+                   matrix_expression< const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans>         proxy_B,
+                   SolverTagT)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()),
+                          proxy_A.lhs().size2(), proxy_A.lhs().start2(), 
proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(),
+                          proxy_A.lhs().size1(), proxy_A.lhs().start1(), 
proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(),
+                          !proxy_A.lhs().row_major());
+
+  matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()),
+                          proxy_B.lhs().size2(), proxy_B.lhs().start2(), 
proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(),
+                          proxy_B.lhs().size1(), proxy_B.lhs().start1(), 
proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(),
+                          !proxy_B.lhs().row_major());
+
+  detail::inplace_solve_impl(A,B,SolverTagT());
+}
+
+
+/////////////////// general wrappers for non-inplace solution 
//////////////////////
+
+
+/** @brief Convenience functions for C = solve(A, B, some_tag()); Creates a 
temporary result matrix and forwards the request to inplace_solve()
+*
+* @param A    The system matrix
+* @param B    The matrix of load vectors
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
+                            const matrix_base<NumericT> & B,
+                            SolverTagT tag)
+{
+  // do an inplace solve on the result vector:
+  matrix_base<NumericT> result(B);
+  inplace_solve(A, result, tag);
+  return result;
+}
+
+/** @brief Convenience functions for C = solve(A, B^T, some_tag()); Creates a 
temporary result matrix and forwards the request to inplace_solve()
+*
+* @param A    The system matrix
+* @param proxy  The transposed load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_base<NumericT> & A,
+                            const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+                            SolverTagT tag)
+{
+  // do an inplace solve on the result vector:
+  matrix_base<NumericT> result(proxy);
+  inplace_solve(A, result, tag);
+  return result;
+}
+
+/** @brief Convenience functions for result = solve(trans(mat), B, 
some_tag()); Creates a temporary result matrix and forwards the request to 
inplace_solve()
+*
+* @param proxy  The transposed system matrix proxy
+* @param B      The matrix of load vectors
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
+                            const matrix_base<NumericT> & B,
+                            SolverTagT tag)
+{
+  // do an inplace solve on the result vector:
+  matrix_base<NumericT> result(B);
+  inplace_solve(proxy, result, tag);
+  return result;
+}
+
+/** @brief Convenience functions for result = solve(trans(mat), vec, 
some_tag()); Creates a temporary result vector and forwards the request to 
inplace_solve()
+*
+* @param proxy_A  The transposed system matrix proxy
+* @param proxy_B  The transposed matrix of load vectors, where the solution is 
directly written to
+* @param tag    Dispatch tag
+*/
+template<typename NumericT, typename SolverTagT>
+matrix_base<NumericT> solve(const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A,
+                            const matrix_expression< const 
matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_B,
+                            SolverTagT tag)
+{
+  // run an inplace solve on the result vector:
+  matrix_base<NumericT> result(proxy_B);
+  inplace_solve(proxy_A, result, tag);
+  return result;
+}
+
+//
+/////////// solves with vector as right hand side ///////////////////
+//
+
+namespace detail
+{
  /** @brief Blocked forward substitution for a (unit_)lower triangular system with a single right-hand side, A \ b.
  *
  * Diagonal blocks of size VIENNACL_DIRECT_SOLVE_BLOCKSIZE are solved top-down;
  * after each block solve, the remaining entries of b are updated with
  * b_lower -= A_21 * x_1. The product is written into a scratch vector first
  * and then subtracted, because b appears on both sides of the update.
  *
  * @param A  The square lower-triangular system matrix
  * @param b  The right-hand side; overwritten with the solution
  */
  template<typename MatrixT1, typename VectorT, typename SolverTagT>
  void inplace_solve_lower_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT)
  {
    vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
    if (A.size1() <= blockSize)
      inplace_solve_vec_kernel(A, b, SolverTagT());
    else
    {
      // scratch buffer for the matrix-vector products (avoids aliasing with b)
      VectorT temp(b);
      for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize)
      {
        vcl_size_t Apos1 = i;
        vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize);
        // solve the current diagonal block against the matching entries of b
        inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
                                 viennacl::project(b, viennacl::range(Apos1, Apos2)),
                                 SolverTagT());
        if (Apos2 < A.size1())
        {
          // update of the remaining entries: b_lower -= A_21 * x_1
          viennacl::project(temp, viennacl::range(Apos2, A.size1())) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)),
                                                                                              viennacl::project(b, viennacl::range(Apos1, Apos2)));
          viennacl::project(b, viennacl::range(Apos2, A.size1())) -= viennacl::project(temp, viennacl::range(Apos2, A.size1()));
        }
      }
    }
  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, 
viennacl::linalg::lower_tag)
+  {
+    inplace_solve_lower_vec_impl(A, B, viennacl::linalg::lower_tag());
+  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, 
viennacl::linalg::unit_lower_tag)
+  {
+    inplace_solve_lower_vec_impl(A, B, viennacl::linalg::unit_lower_tag());
+  }
+
  /** @brief Blocked backward substitution for a (unit_)upper triangular system with a single right-hand side, A \ b.
  *
  * Diagonal blocks are solved bottom-up; after each block solve, the entries
  * above the block are updated with b_upper -= A_12 * x_2. The product goes
  * into a scratch vector first because b appears on both sides of the update.
  * The loop index is signed since it counts down and may step below zero.
  *
  * @param A  The square upper-triangular system matrix
  * @param b  The right-hand side; overwritten with the solution
  */
  template<typename MatrixT1, typename VectorT, typename SolverTagT>
  void inplace_solve_upper_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT)
  {
    int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE;
    if (static_cast<int>(A.size1()) <= blockSize)
      inplace_solve_vec_kernel(A, b, SolverTagT());
    else
    {
      // scratch buffer for the matrix-vector products (avoids aliasing with b)
      VectorT temp(b);
      for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize)
      {
        vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize));
        vcl_size_t Apos2 = vcl_size_t(i);
        // solve the current diagonal block against the matching entries of b
        inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)),
                                 viennacl::project(b, viennacl::range(Apos1, Apos2)),
                                 SolverTagT());
        if (Apos1 > 0)
        {
          // update of the entries above the block: b_upper -= A_12 * x_2
          viennacl::project(temp, viennacl::range(0, Apos1)) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(0,     Apos1), viennacl::range(Apos1, Apos2)),
                                                                                      viennacl::project(b, viennacl::range(Apos1, Apos2)));
          viennacl::project(b, viennacl::range(0, Apos1)) -= viennacl::project(temp, viennacl::range(0, Apos1));
        }
      }
    }
  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, 
viennacl::linalg::upper_tag)
+  {
+    inplace_solve_upper_vec_impl(A, b, viennacl::linalg::upper_tag());
+  }
+
+  template<typename MatrixT1, typename VectorT>
+  void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, 
viennacl::linalg::unit_upper_tag)
+  {
+    inplace_solve_upper_vec_impl(A, b, viennacl::linalg::unit_upper_tag());
+  }
+
+} // namespace detail
+
+/** @brief Inplace solution of a triangular system. Matlab notation A \ b.
+*
+* @param mat    The system matrix (a dense matrix for which only the 
respective triangular form is used)
+* @param vec    The right hand side vector
+* @param tag    The tag (either lower_tag, unit_lower_tag, upper_tag, or 
unit_upper_tag)
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(const matrix_base<NumericT> & mat,
+                   vector_base<NumericT> & vec,
+                   SolverTagT const & tag)
+{
+
+  detail::inplace_solve_vec_impl(mat, vec, tag);
+}
+
+/** @brief Inplace solution of a triangular system with transposed system 
matrix.. Matlab notation A' \ b.
+*
+* @param proxy  The transposed system matrix (a dense matrix for which only 
the respective triangular form is used)
+* @param vec    The right hand side vector
+* @param tag    The tag (either lower_tag, unit_lower_tag, upper_tag, or 
unit_upper_tag)
+*/
+template<typename NumericT, typename SolverTagT>
+void inplace_solve(matrix_expression<const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_trans> const & proxy,
+                   vector_base<NumericT> & vec,
+                   SolverTagT const & tag)
+{
+  typedef typename matrix_base<NumericT>::handle_type    handle_type;
+
+  // wrap existing matrix in a new matrix_base object (no data copy)
+  matrix_base<NumericT> mat(const_cast<handle_type &>(proxy.lhs().handle()),
+                            proxy.lhs().size2(), proxy.lhs().start2(), 
proxy.lhs().stride2(), proxy.lhs().internal_size2(),
+                            proxy.lhs().size1(), proxy.lhs().start1(), 
proxy.lhs().stride1(), proxy.lhs().internal_size1(),
+                            !proxy.lhs().row_major());
+  detail::inplace_solve_vec_impl(mat, vec, tag);
+}
+
+
+/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
an upper triangular solve.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::upper_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
an upper triangular solve with unit diagonal.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::unit_upper_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
a lower triangular solve.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::lower_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
+/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for 
a lower triangular solve with unit diagonal.
+*
+* Creates a temporary result vector and forwards the request to inplace_solve()
+*
+* @param mat    The system matrix
+* @param vec    The load vector
+* @param tag    Dispatch tag
+*/
+template<typename NumericT>
+vector<NumericT> solve(const matrix_base<NumericT> & mat,
+                       const vector_base<NumericT> & vec,
+                       viennacl::linalg::unit_lower_tag const & tag)
+{
+  // run an inplace solve on the result vector:
+  vector<NumericT> result(vec);
+  inplace_solve(mat, result, tag);
+  return result;
+}
+
/** @brief Convenience function for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve()
*
* @param proxy  The transposed system matrix proxy
* @param vec    The load vector (read-only; the solution is returned, not written back)
* @param tag    Dispatch tag
* @return       A new vector holding the solution of trans(mat) * result = vec
*/
template<typename NumericT, typename SolverTagT>
vector<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy,
                       const vector_base<NumericT> & vec,
                       SolverTagT const & tag)
{
  // run an inplace solve on the result vector:
  vector<NumericT> result(vec);
  inplace_solve(proxy, result, tag);
  return result;
}
+
+
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
new file mode 100644
index 0000000..36be3b3
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp
@@ -0,0 +1,29 @@
+#ifndef VIENNACL_LINALG_EIG_HPP_
+#define VIENNACL_LINALG_EIG_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file viennacl/linalg/eig.hpp
+*   @brief Convenience header file including all available eigenvalue 
algorithms
+*/
+
+#include "viennacl/linalg/bisect.hpp"
+#include "viennacl/linalg/lanczos.hpp"
+#include "viennacl/linalg/power_iter.hpp"
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
new file mode 100644
index 0000000..ae9ade2
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp
@@ -0,0 +1,481 @@
+#ifndef VIENNACL_LINALG_FFT_OPERATIONS_HPP_
+#define VIENNACL_LINALG_FFT_OPERATIONS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+   Institute for Analysis and Scientific Computing,
+   TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+   -----------------
+   ViennaCL - The Vienna Computing Library
+   -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+ ============================================================================= 
*/
+
+/**  @file viennacl/linalg/fft_operations.hpp
+ @brief Implementations of the Fast Fourier Transform.
+ */
+
+#include <viennacl/vector.hpp>
+#include <viennacl/matrix.hpp>
+
+#include "viennacl/linalg/host_based/fft_operations.hpp"
+
+#ifdef VIENNACL_WITH_OPENCL
+#include "viennacl/linalg/opencl/fft_operations.hpp"
+#include "viennacl/linalg/opencl/kernels/fft.hpp"
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+#include "viennacl/linalg/cuda/fft_operations.hpp"
+#endif
+
+namespace viennacl
+{
+namespace linalg
+{
+
+/**
+ * @brief Direct 1D algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::vector<NumericT, AlignmentV> const & in,
+            viennacl::vector<NumericT, AlignmentV>       & out, vcl_size_t 
size, vcl_size_t stride,
+            vcl_size_t batch_num, NumericT sign = NumericT(-1),
+            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, 
sign, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), 
viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::direct(in, out, size, stride, 
batch_num,sign,data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+
+  }
+}
+
+/**
+ * @brief Direct 2D algorithm for computing Fourier transformation.
+ *
+ * Works on any sizes of data.
+ * Serial implementation has o(n^2) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const 
& in,
+            viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& out, 
vcl_size_t size,
+            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = 
NumericT(-1),
+            
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, 
sign, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), 
viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::direct(in, out, size, stride, 
batch_num,sign,data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+
+  }
+}
+
+/*
+ * This function performs reorder of input data. Indexes are sorted in 
bit-reversal order.
+ * Such reordering should be done before in-place FFT.
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, 
vcl_size_t stride,
+             vcl_size_t bits_datasize, vcl_size_t batch_num,
+             
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER 
data_order = 
viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::reorder(in, size, stride, bits_datasize, 
batch_num, data_order);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    
viennacl::linalg::opencl::reorder<NumericT>(viennacl::traits::opencl_handle(in),
 size, stride, bits_datasize, batch_num, data_order);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::reorder(in, size, stride, bits_datasize, 
batch_num, data_order);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+
+  }
+}
+
/**
 * @brief Radix-2 2D algorithm for computing the Fourier transformation of a row-major matrix.
 *
 * Works only on power-of-two sizes of data.
 * Serial implementation has O(n log n) complexity.
 * This is a Cooley-Tukey algorithm.
 *
 * @param in         Matrix transformed in-place
 * @param size       Transform size
 * @param stride     Stride between consecutive elements
 * @param batch_num  Number of batched transforms
 * @param sign       Sign of the exponent (-1 for forward transform)
 * @param data_order Row- or column-major interpretation of batched data
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & in, vcl_size_t size,
            vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  // Dispatch on the active memory domain of the matrix.
  switch (viennacl::traits::handle(in).get_active_handle_id())
  {
  case viennacl::MAIN_MEMORY:
    viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order);
    break;
#ifdef VIENNACL_WITH_OPENCL
  case viennacl::OPENCL_MEMORY:
    viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign,data_order);
    break;
#endif

#ifdef VIENNACL_WITH_CUDA
  case viennacl::CUDA_MEMORY:
    viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign, data_order);
    break;
#endif

  case viennacl::MEMORY_NOT_INITIALIZED:
    throw memory_exception("not initialised!");
  default:
    throw memory_exception("not implemented");
  }
}
+
/**
 * @brief Radix-2 1D algorithm for computing the Fourier transformation of a vector.
 *
 * Works only on power-of-two sizes of data.
 * Serial implementation has O(n log n) complexity.
 * This is a Cooley-Tukey algorithm.
 *
 * @param in         Vector transformed in-place
 * @param size       Transform size
 * @param stride     Stride between consecutive elements
 * @param batch_num  Number of batched transforms
 * @param sign       Sign of the exponent (-1 for forward transform)
 * @param data_order Row- or column-major interpretation of batched data
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride,
            vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  // Dispatch on the active memory domain of the vector.
  switch (viennacl::traits::handle(in).get_active_handle_id())
  {
  case viennacl::MAIN_MEMORY:
    viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order);
    break;
#ifdef VIENNACL_WITH_OPENCL
  case viennacl::OPENCL_MEMORY:
    viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign,data_order);
    break;
#endif

#ifdef VIENNACL_WITH_CUDA
  case viennacl::CUDA_MEMORY:
    viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign,data_order);
    break;
#endif

  case viennacl::MEMORY_NOT_INITIALIZED:
    throw memory_exception("not initialised!");
  default:
    throw memory_exception("not implemented");
  }
}
+
+/**
+ * @brief Bluestein's algorithm for computing Fourier transformation.
+ *
+ * Currently,  Works only for sizes of input data which less than 2^16.
+ * Uses a lot of additional memory, but should be fast for any size of data.
+ * Serial implementation has something about o(n * lg n) complexity
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void bluestein(viennacl::vector<NumericT, AlignmentV> & in,
+               viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t 
/*batch_num*/)
+{
+
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::bluestein(in, out, 1);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::bluestein(in, out, 1);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::bluestein(in, out, 1);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Mutiply two complex vectors and store result in output
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
+                      viennacl::vector<NumericT, AlignmentV> const & input2,
+                      viennacl::vector<NumericT, AlignmentV>       & output)
+{
+  switch (viennacl::traits::handle(input1).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::multiply_complex(input1, input2, output);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::multiply_complex(input1, input2, output);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::multiply_complex(input1, input2, output);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Normalize vector on with his own size
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void normalize(viennacl::vector<NumericT, AlignmentV> & input)
+{
+  switch (viennacl::traits::handle(input).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::normalize(input);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::normalize(input);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::normalize(input);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Inplace_transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & 
input)
+{
+  switch (viennacl::traits::handle(input).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::transpose(input);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::transpose(input);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::transpose(input);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Transpose matrix
+ */
+template<typename NumericT, unsigned int AlignmentV>
+void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> 
const & input,
+               viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>     
  & output)
+{
+  switch (viennacl::traits::handle(input).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::transpose(input, output);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::transpose(input, output);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::transpose(input, output);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Create complex vector from real vector (even elements(2*k) = real 
part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void real_to_complex(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t 
size)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+    case viennacl::MAIN_MEMORY:
+      viennacl::linalg::host_based::real_to_complex(in, out, size);
+      break;
+#ifdef VIENNACL_WITH_OPENCL
+      case viennacl::OPENCL_MEMORY:
+      viennacl::linalg::opencl::real_to_complex(in,out,size);
+      break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+      case viennacl::CUDA_MEMORY:
+      viennacl::linalg::cuda::real_to_complex(in,out,size);
+      break;
+#endif
+
+    case viennacl::MEMORY_NOT_INITIALIZED:
+      throw memory_exception("not initialised!");
+    default:
+      throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Create real vector from complex vector (even elements(2*k) = real 
part, odd elements(2*k+1) = imaginary part)
+ */
+template<typename NumericT>
+void complex_to_real(viennacl::vector_base<NumericT> const & in,
+                     viennacl::vector_base<NumericT>       & out, vcl_size_t 
size)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::complex_to_real(in, out, size);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::complex_to_real(in, out, size);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::complex_to_real(in, out, size);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+/**
+ * @brief Reverse vector to oposite order and save it in input vector
+ */
+template<typename NumericT>
+void reverse(viennacl::vector_base<NumericT> & in)
+{
+  switch (viennacl::traits::handle(in).get_active_handle_id())
+  {
+  case viennacl::MAIN_MEMORY:
+    viennacl::linalg::host_based::reverse(in);
+    break;
+#ifdef VIENNACL_WITH_OPENCL
+  case viennacl::OPENCL_MEMORY:
+    viennacl::linalg::opencl::reverse(in);
+    break;
+#endif
+
+#ifdef VIENNACL_WITH_CUDA
+  case viennacl::CUDA_MEMORY:
+    viennacl::linalg::cuda::reverse(in);
+    break;
+#endif
+
+  case viennacl::MEMORY_NOT_INITIALIZED:
+    throw memory_exception("not initialised!");
+  default:
+    throw memory_exception("not implemented");
+  }
+}
+
+}
+}
+
+#endif /* FFT_OPERATIONS_HPP_ */

Reply via email to