http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp new file mode 100644 index 0000000..dba094b --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai.hpp @@ -0,0 +1,832 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/detail/spai/spai.hpp + @brief Main implementation of SPAI (not FSPAI). Experimental. +*/ + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <map> + +//local includes +#include "viennacl/linalg/detail/spai/spai_tag.hpp" +#include "viennacl/linalg/qr.hpp" +#include "viennacl/linalg/detail/spai/spai-dynamic.hpp" +#include "viennacl/linalg/detail/spai/spai-static.hpp" +#include "viennacl/linalg/detail/spai/sparse_vector.hpp" +#include "viennacl/linalg/detail/spai/block_matrix.hpp" +#include "viennacl/linalg/detail/spai/block_vector.hpp" + +//boost includes +#include "boost/numeric/ublas/vector.hpp" +#include "boost/numeric/ublas/matrix.hpp" +#include "boost/numeric/ublas/matrix_proxy.hpp" +#include "boost/numeric/ublas/vector_proxy.hpp" +#include "boost/numeric/ublas/storage.hpp" +#include "boost/numeric/ublas/io.hpp" +#include "boost/numeric/ublas/lu.hpp" +#include "boost/numeric/ublas/triangular.hpp" +#include "boost/numeric/ublas/matrix_expression.hpp" + +// ViennaCL includes +#include "viennacl/linalg/prod.hpp" +#include "viennacl/matrix.hpp" +#include "viennacl/compressed_matrix.hpp" +#include "viennacl/linalg/sparse_matrix_operations.hpp" +#include "viennacl/linalg/matrix_operations.hpp" +#include "viennacl/scalar.hpp" +#include "viennacl/linalg/inner_prod.hpp" +#include "viennacl/linalg/ilu.hpp" +#include "viennacl/ocl/backend.hpp" +#include "viennacl/linalg/opencl/kernels/spai.hpp" + + + +#define VIENNACL_SPAI_K_b 20 + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +//debug function for print +template<typename SparseVectorT> +void print_sparse_vector(SparseVectorT const & v) +{ + for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it!= v.end(); ++vec_it) + std::cout << "[ " << vec_it->first << " ]:" << vec_it->second << std::endl; +} + +template<typename DenseMatrixT> +void print_matrix(DenseMatrixT & m) +{ + for (int i = 0; i < m.size2(); ++i) + { + for (int j = 0; j < m.size1(); ++j) + std::cout<<m(j, i)<<" "; + std::cout<<std::endl; + } +} + +/** @brief Add two sparse vectors res_v = b*v + * + * @param v initial sparse vector + * @param b scalar + * @param res_v output vector + */ +template<typename SparseVectorT, typename NumericT> +void add_sparse_vectors(SparseVectorT const & v, NumericT b, SparseVectorT & res_v) +{ + for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it) + res_v[v_it->first] += b*v_it->second; +} + +//sparse-matrix - vector product +/** @brief Computation of residual res = A*v - e + * + * @param A_v_c column major vectorized input sparse matrix + * @param v sparse vector, in this case new column of preconditioner matrix + * @param ind index for current column + * @param res residual + */ +template<typename SparseVectorT, typename NumericT> +void compute_spai_residual(std::vector<SparseVectorT> const & A_v_c, + SparseVectorT const & v, + unsigned int ind, + SparseVectorT & res) +{ + for (typename SparseVectorT::const_iterator v_it = v.begin(); v_it != v.end(); ++v_it) + add_sparse_vectors(A_v_c[v_it->first], v_it->second, res); + + res[ind] -= NumericT(1); +} + +/** @brief Setting up index set of columns and rows for certain column + * + * @param A_v_c column major vectorized initial sparse matrix + * @param v current column of preconditioner matrix + * @param J set of column indices + * @param I set of row indices + */ +template<typename SparseVectorT> +void build_index_set(std::vector<SparseVectorT> const & A_v_c, + SparseVectorT const & v, + std::vector<unsigned int> & J, + std::vector<unsigned int> & I) +{ + buildColumnIndexSet(v, J); + projectRows(A_v_c, J, I); +} + +/** @brief Initializes a dense matrix from a sparse one + * + * @param A_in Oiginal sparse matrix + * @param J Set of column indices + * @param I Set of row indices + * @param A_out dense matrix output + */ +template<typename SparseMatrixT, typename DenseMatrixT> +void initProjectSubMatrix(SparseMatrixT const & A_in, + std::vector<unsigned int> const & J, + std::vector<unsigned int> & I, + DenseMatrixT & A_out) +{ + A_out.resize(I.size(), J.size(), false); + for (vcl_size_t j = 0; j < J.size(); ++j) + for (vcl_size_t i = 0; i < I.size(); ++i) + A_out(i,j) = A_in(I[i],J[j]); +} + + +/************************************************** CPU BLOCK SET UP ***************************************/ + +/** @brief Setting up blocks and QR factorizing them on CPU + * + * @param A initial sparse matrix + * @param A_v_c column major vectorized initial sparse matrix + * @param M_v initialized preconditioner + * @param g_I container of row indices + * @param g_J container of column indices + * @param g_A_I_J container of dense matrices -> R matrices after QR factorization + * @param g_b_v container of vectors beta, necessary for Q recovery + */ +template<typename SparseMatrixT, typename DenseMatrixT, typename SparseVectorT, typename VectorT> +void block_set_up(SparseMatrixT const & A, + std::vector<SparseVectorT> const & A_v_c, + std::vector<SparseVectorT> const & M_v, + std::vector<std::vector<unsigned int> >& g_I, + std::vector<std::vector<unsigned int> >& g_J, + std::vector<DenseMatrixT>& g_A_I_J, + std::vector<VectorT>& g_b_v) +{ +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2) + { + vcl_size_t i = static_cast<vcl_size_t>(i2); + build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]); + initProjectSubMatrix(A, g_J[i], g_I[i], g_A_I_J[i]); + //print_matrix(g_A_I_J[i]); + single_qr(g_A_I_J[i], g_b_v[i]); + //print_matrix(g_A_I_J[i]); + } +} + +/** @brief Setting up index set of columns and rows for all columns + * + * @param A_v_c column major vectorized initial sparse matrix + * @param M_v initialized preconditioner + * @param g_J container of column indices + * @param g_I container of row indices + */ +template<typename SparseVectorT> +void index_set_up(std::vector<SparseVectorT> const & A_v_c, + std::vector<SparseVectorT> const & M_v, + std::vector<std::vector<unsigned int> > & g_J, + std::vector<std::vector<unsigned int> > & g_I) +{ +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2) + { + vcl_size_t i = static_cast<vcl_size_t>(i2); + build_index_set(A_v_c, M_v[i], g_J[i], g_I[i]); + } +} + +/************************************************** GPU BLOCK SET UP ***************************************/ + +/** @brief Setting up blocks and QR factorizing them on GPU + * + * @param A initial sparse matrix + * @param A_v_c column major vectorized initial sparse matrix + * @param M_v initialized preconditioner + * @param g_is_update container that indicates which blocks are active + * @param g_I container of row indices + * @param g_J container of column indices + * @param g_A_I_J container of dense matrices -> R matrices after QR factorization + * @param g_bv container of vectors beta, necessary for Q recovery + */ +template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT> +void block_set_up(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, + std::vector<SparseVectorT> const & A_v_c, + std::vector<SparseVectorT> const & M_v, + std::vector<cl_uint> g_is_update, + std::vector<std::vector<unsigned int> > & g_I, + std::vector<std::vector<unsigned int> > & g_J, + block_matrix & g_A_I_J, + block_vector & g_bv) +{ + viennacl::context ctx = viennacl::traits::context(A); + bool is_empty_block; + + //build index set + index_set_up(A_v_c, M_v, g_J, g_I); + block_assembly(A, g_J, g_I, g_A_I_J, g_is_update, is_empty_block); + block_qr<NumericT>(g_I, g_J, g_A_I_J, g_bv, g_is_update, ctx); +} + + +/***************************************************************************************************/ +/******************************** SOLVING LS PROBLEMS ON GPU ***************************************/ +/***************************************************************************************************/ + +/** @brief Elicitation of sparse vector m for particular column from m_in - contigious vector for all columns + * + * @param m_in contigious sparse vector for all columns + * @param start_m_ind start index of particular vector + * @param J column index set + * @param m sparse vector for particular column + */ +template<typename NumericT, typename SparseVectorT> +void custom_fan_out(std::vector<NumericT> const & m_in, + unsigned int start_m_ind, + std::vector<unsigned int> const & J, + SparseVectorT & m) +{ + unsigned int cnt = 0; + for (vcl_size_t i = 0; i < J.size(); ++i) + m[J[i]] = m_in[start_m_ind + cnt++]; +} + + + +//GPU based least square problem +/** @brief Solution of Least square problem on GPU + * + * @param A_v_c column-major vectorized initial sparse matrix + * @param M_v column-major vectorized sparse preconditioner matrix + * @param g_I container of row set indices + * @param g_J container of column set indices + * @param g_A_I_J_vcl contigious matrix that consists of blocks A(I_k, J_k) + * @param g_bv_vcl contigious vector that consists of betas, necessary for Q recovery + * @param g_res container of residuals + * @param g_is_update container with indicators which blocks are active + * @param tag spai tag + * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host) + */ +template<typename SparseVectorT, typename NumericT> +void least_square_solve(std::vector<SparseVectorT> & A_v_c, + std::vector<SparseVectorT> & M_v, + std::vector<std::vector<unsigned int> >& g_I, + std::vector<std::vector<unsigned int> > & g_J, + block_matrix & g_A_I_J_vcl, + block_vector & g_bv_vcl, + std::vector<SparseVectorT> & g_res, + std::vector<cl_uint> & g_is_update, + const spai_tag & tag, + viennacl::context ctx) +{ + viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); + unsigned int y_sz, m_sz; + std::vector<cl_uint> y_inds(M_v.size() + 1, static_cast<cl_uint>(0)); + std::vector<cl_uint> m_inds(M_v.size() + 1, static_cast<cl_uint>(0)); + + get_size(g_I, y_sz); + init_start_inds(g_I, y_inds); + init_start_inds(g_J, m_inds); + + //create y_v + std::vector<NumericT> y_v(y_sz, NumericT(0)); + for (vcl_size_t i = 0; i < M_v.size(); ++i) + { + for (vcl_size_t j = 0; j < g_I[i].size(); ++j) + { + if (g_I[i][j] == i) + y_v[y_inds[i] + j] = NumericT(1.0); + } + } + //compute m_v + get_size(g_J, m_sz); + std::vector<NumericT> m_v(m_sz, static_cast<cl_uint>(0)); + + block_vector y_v_vcl; + block_vector m_v_vcl; + //prepearing memory for least square problem on GPU + y_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*y_v.size()), + &(y_v[0])); + m_v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*m_v.size()), + &(m_v[0])); + y_v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), + &(y_inds[0])); + viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())), + &(g_is_update[0])); + viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx); + viennacl::ocl::kernel & ls_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_least_squares"); + ls_kernel.local_work_size(0, 1); + ls_kernel.global_work_size(0, 256); + viennacl::ocl::enqueue(ls_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_bv_vcl.handle(), g_bv_vcl.handle1(), m_v_vcl.handle(), + y_v_vcl.handle(), y_v_vcl.handle1(), + g_A_I_J_vcl.handle1(), g_is_update_vcl, + //viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(ScalarType)*(local_r_n*local_c_n))), + static_cast<unsigned int>(M_v.size()))); + //copy vector m_v back from GPU to CPU + cl_int vcl_err = clEnqueueReadBuffer(opencl_ctx.get_queue().handle().get(), + m_v_vcl.handle().get(), CL_TRUE, 0, + sizeof(NumericT)*(m_v.size()), + &(m_v[0]), 0, NULL, NULL); + VIENNACL_ERR_CHECK(vcl_err); + + //fan out vector in parallel + //#pragma omp parallel for + for (long i = 0; i < static_cast<long>(M_v.size()); ++i) + { + if (g_is_update[static_cast<vcl_size_t>(i)]) + { + //faned out onto sparse vector + custom_fan_out(m_v, m_inds[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], M_v[static_cast<vcl_size_t>(i)]); + g_res[static_cast<vcl_size_t>(i)].clear(); + compute_spai_residual<SparseVectorT, NumericT>(A_v_c, M_v[static_cast<vcl_size_t>(i)], static_cast<unsigned int>(i), g_res[static_cast<vcl_size_t>(i)]); + NumericT res_norm = 0; + //compute norm of res - just to make sure that this implementatino works correct + sparse_norm_2(g_res[static_cast<vcl_size_t>(i)], res_norm); + //std::cout<<"Residual norm of column #: "<<i<<std::endl; + //std::cout<<res_norm<<std::endl; + //std::cout<<"************************"<<std::endl; + g_is_update[static_cast<vcl_size_t>(i)] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic())?(1):(0); + } + } +} + +//CPU based least square problems +/** @brief Solution of Least square problem on CPU + * + * @param A_v_c column-major vectorized initial sparse matrix + * @param g_R blocks for least square solution + * @param g_b_v vectors beta, necessary for Q recovery + * @param g_I container of row index set for all columns of matrix M + * @param g_J container of column index set for all columns of matrix M + * @param g_res container of residuals + * @param g_is_update container with indicators which blocks are active + * @param M_v column-major vectorized sparse matrix, final preconditioner + * @param tag spai tag + */ +template<typename SparseVectorT, typename DenseMatrixT, typename VectorT> +void least_square_solve(std::vector<SparseVectorT> const & A_v_c, + std::vector<DenseMatrixT> & g_R, + std::vector<VectorT> & g_b_v, + std::vector<std::vector<unsigned int> > & g_I, + std::vector<std::vector<unsigned int> > & g_J, + std::vector<SparseVectorT> & g_res, + std::vector<bool> & g_is_update, + std::vector<SparseVectorT> & M_v, + spai_tag const & tag) +{ + typedef typename DenseMatrixT::value_type NumericType; + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i2 = 0; i2 < static_cast<long>(M_v.size()); ++i2) + { + vcl_size_t i = static_cast<vcl_size_t>(i2); + if (g_is_update[i]) + { + VectorT y = boost::numeric::ublas::zero_vector<NumericType>(g_I[i].size()); + + projectI<VectorT, NumericType>(g_I[i], y, static_cast<unsigned int>(tag.getBegInd() + long(i))); + apply_q_trans_vec(g_R[i], g_b_v[i], y); + + VectorT m_new = boost::numeric::ublas::zero_vector<NumericType>(g_R[i].size2()); + backwardSolve(g_R[i], y, m_new); + fanOutVector(m_new, g_J[i], M_v[i]); + g_res[i].clear(); + + compute_spai_residual<SparseVectorT, NumericType>(A_v_c, M_v[i], static_cast<unsigned int>(tag.getBegInd() + long(i)), g_res[i]); + + NumericType res_norm = 0; + sparse_norm_2(g_res[i], res_norm); +// std::cout<<"Residual norm of column #: "<<i<<std::endl; +// std::cout<<res_norm<<std::endl; +// std::cout<<"************************"<<std::endl; + g_is_update[i] = (res_norm > tag.getResidualNormThreshold())&& (!tag.getIsStatic()); + } + } +} + +//************************************ UPDATE CHECK ***************************************************// + +template<typename VectorType> +bool is_all_update(VectorType& parallel_is_update) +{ + for (unsigned int i = 0; i < parallel_is_update.size(); ++i) + { + if (parallel_is_update[i]) + return true; + } + return false; +} + +//********************************** MATRIX VECTORIZATION ***********************************************// + +//Matrix vectorization, column based approach +/** @brief Solution of Least square problem on CPU + * + * @param M_in input sparse, boost::numeric::ublas::compressed_matrix + * @param M_v array of sparse vectors + */ +template<typename SparseMatrixT, typename SparseVectorT> +void vectorize_column_matrix(SparseMatrixT const & M_in, + std::vector<SparseVectorT> & M_v) +{ + for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it) + for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it) + M_v[static_cast<unsigned int>(col_it.index2())][static_cast<unsigned int>(col_it.index1())] = *col_it; +} + +//Matrix vectorization row based approach +template<typename SparseMatrixT, typename SparseVectorT> +void vectorize_row_matrix(SparseMatrixT const & M_in, + std::vector<SparseVectorT> & M_v) +{ + for (typename SparseMatrixT::const_iterator1 row_it = M_in.begin1(); row_it!= M_in.end1(); ++row_it) + for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it) + M_v[static_cast<unsigned int>(col_it.index1())][static_cast<unsigned int>(col_it.index2())] = *col_it; +} + +//************************************* BLOCK ASSEMBLY CODE *********************************************// + + +template<typename SizeT> +void write_set_to_array(std::vector<std::vector<SizeT> > const & ind_set, + std::vector<cl_uint> & a) +{ + vcl_size_t cnt = 0; + + for (vcl_size_t i = 0; i < ind_set.size(); ++i) + for (vcl_size_t j = 0; j < ind_set[i].size(); ++j) + a[cnt++] = static_cast<cl_uint>(ind_set[i][j]); +} + + + +//assembling blocks on GPU +/** @brief Assembly of blocks on GPU by a gived set of row indices: g_I and column indices: g_J + * + * @param A intial sparse matrix + * @param g_J container of column index set + * @param g_I container of row index set + * @param g_A_I_J_vcl contigious blocks A(I, J) using GPU memory + * @param g_is_update container with indicators which blocks are active + * @param is_empty_block parameter that indicates if no block were assembled + */ +template<typename NumericT, unsigned int AlignmentV> +void block_assembly(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, + std::vector<std::vector<unsigned int> > const & g_J, + std::vector<std::vector<unsigned int> > const & g_I, + block_matrix & g_A_I_J_vcl, + std::vector<cl_uint> & g_is_update, + bool & is_empty_block) +{ + //computing start indices for index sets and start indices for block matrices + unsigned int sz_I, sz_J, sz_blocks; + std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0)); + std::vector<cl_uint> i_ind(g_I.size() + 1, static_cast<cl_uint>(0)); + std::vector<cl_uint> j_ind(g_I.size() + 1, static_cast<cl_uint>(0)); + std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0)); + // + init_start_inds(g_J, j_ind); + init_start_inds(g_I, i_ind); + // + get_size(g_J, sz_J); + get_size(g_I, sz_I); + std::vector<cl_uint> I_set(sz_I, static_cast<cl_uint>(0)); + // + std::vector<cl_uint> J_set(sz_J, static_cast<cl_uint>(0)); + + // computing size for blocks + // writing set to arrays + write_set_to_array(g_I, I_set); + write_set_to_array(g_J, J_set); + + // if block for assembly does exist + if (I_set.size() > 0 && J_set.size() > 0) + { + viennacl::context ctx = viennacl::traits::context(A); + viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); + compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims); + std::vector<NumericT> con_A_I_J(sz_blocks, NumericT(0)); + + block_vector set_I_vcl, set_J_vcl; + //init memory on GPU + //contigious g_A_I_J + g_A_I_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*(sz_blocks)), + &(con_A_I_J[0])); + g_A_I_J_vcl.handle().context(opencl_ctx); + + //matrix_dimensions + g_A_I_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<cl_uint>(g_I.size())), + &(matrix_dims[0])); + g_A_I_J_vcl.handle1().context(opencl_ctx); + + //start_block inds + g_A_I_J_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), + &(blocks_ind[0])); + g_A_I_J_vcl.handle2().context(opencl_ctx); + + //set_I + set_I_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*sz_I), + &(I_set[0])); + set_I_vcl.handle().context(opencl_ctx); + + //set_J + set_J_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*sz_J), + &(J_set[0])); + set_J_vcl.handle().context(opencl_ctx); + + //i_ind + set_I_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), + &(i_ind[0])); + set_I_vcl.handle().context(opencl_ctx); + + //j_ind + set_J_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), + &(j_ind[0])); + set_J_vcl.handle().context(opencl_ctx); + + viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()), + &(g_is_update[0])); + + viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx); + viennacl::ocl::kernel& assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "assemble_blocks"); + assembly_kernel.local_work_size(0, 1); + assembly_kernel.global_work_size(0, 256); + viennacl::ocl::enqueue(assembly_kernel(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle().opencl_handle(), + set_I_vcl.handle(), set_J_vcl.handle(), set_I_vcl.handle1(), + set_J_vcl.handle1(), + g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), g_A_I_J_vcl.handle(), + g_is_update_vcl, + static_cast<unsigned int>(g_I.size()))); + is_empty_block = false; + } + else + is_empty_block = true; +} + +/************************************************************************************************************************/ + +/** @brief Insertion of vectorized matrix column into original sparse matrix + * + * @param M_v column-major vectorized matrix + * @param M original sparse matrix + * @param is_right indicates if matrix should be transposed in the output + */ +template<typename SparseMatrixT, typename SparseVectorT> +void insert_sparse_columns(std::vector<SparseVectorT> const & M_v, + SparseMatrixT& M, + bool is_right) +{ + if (is_right) + { + for (unsigned int i = 0; i < M_v.size(); ++i) + for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it) + M(vec_it->first, i) = vec_it->second; + } + else //transposed fill of M + { + for (unsigned int i = 0; i < M_v.size(); ++i) + for (typename SparseVectorT::const_iterator vec_it = M_v[i].begin(); vec_it!=M_v[i].end(); ++vec_it) + M(i, vec_it->first) = vec_it->second; + } +} + +/** @brief Transposition of sparse matrix + * + * @param A_in intial sparse matrix + * @param A output transposed matrix + */ +template<typename MatrixT> +void sparse_transpose(MatrixT const & A_in, MatrixT & A) +{ + typedef typename MatrixT::value_type NumericType; + + std::vector<std::map<vcl_size_t, NumericType> > temp_A(A_in.size2()); + A.resize(A_in.size2(), A_in.size1(), false); + + for (typename MatrixT::const_iterator1 row_it = A_in.begin1(); + row_it != A_in.end1(); + ++row_it) + { + for (typename MatrixT::const_iterator2 col_it = row_it.begin(); + col_it != row_it.end(); + ++col_it) + { + temp_A[col_it.index2()][col_it.index1()] = *col_it; + } + } + + for (vcl_size_t i=0; i<temp_A.size(); ++i) + { + for (typename std::map<vcl_size_t, NumericType>::const_iterator it = temp_A[i].begin(); + it != temp_A[i].end(); + ++it) + A(i, it->first) = it->second; + } +} + + + + +// template<typename SparseVectorType> +// void custom_copy(std::vector<SparseVectorType> & M_v, std::vector<SparseVectorType> & l_M_v, const unsigned int beg_ind){ +// for (int i = 0; i < l_M_v.size(); ++i){ +// l_M_v[i] = M_v[i + beg_ind]; +// } +// } + +//CPU version +/** @brief Construction of SPAI preconditioner on CPU + * + * @param A initial sparse matrix + * @param M output preconditioner + * @param tag spai tag + */ +template<typename MatrixT> +void computeSPAI(MatrixT const & A, + MatrixT & M, + spai_tag & tag) +{ + typedef typename MatrixT::value_type NumericT; + typedef typename boost::numeric::ublas::vector<NumericT> VectorType; + typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT> SparseVectorType; + typedef typename boost::numeric::ublas::matrix<NumericT> DenseMatrixType; + + //sparse matrix transpose... + unsigned int cur_iter = 0; + tag.setBegInd(0); tag.setEndInd(VIENNACL_SPAI_K_b); + bool go_on = true; + std::vector<SparseVectorType> A_v_c(M.size2()); + std::vector<SparseVectorType> M_v(M.size2()); + vectorize_column_matrix(A, A_v_c); + vectorize_column_matrix(M, M_v); + + + while (go_on) + { + go_on = (tag.getEndInd() < static_cast<long>(M.size2())); + cur_iter = 0; + unsigned int l_sz = static_cast<unsigned int>(tag.getEndInd() - tag.getBegInd()); + //std::vector<bool> g_is_update(M.size2(), true); + std::vector<bool> g_is_update(l_sz, true); + + //init is update + //init_parallel_is_update(g_is_update); + //std::vector<SparseVectorType> A_v_c(K); + //std::vector<SparseVectorType> M_v(K); + //vectorization of marices + //print_matrix(M_v); + + std::vector<SparseVectorType> l_M_v(l_sz); + //custom_copy(M_v, l_M_v, beg_ind); + std::copy(M_v.begin() + tag.getBegInd(), M_v.begin() + tag.getEndInd(), l_M_v.begin()); + + //print_matrix(l_M_v); + //std::vector<SparseVectorType> l_A_v_c(K); + //custom_copy(A_v_c, l_A_v_c, beg_ind); + //std::copy(A_v_c.begin() + beg_ind, A_v_c.begin() + end_ind, l_A_v_c.begin()); + //print_matrix(l_A_v_c); + //vectorize_row_matrix(A, A_v_r); + //working blocks + + std::vector<DenseMatrixType> g_A_I_J(l_sz); + std::vector<VectorType> g_b_v(l_sz); + std::vector<SparseVectorType> g_res(l_sz); + std::vector<std::vector<unsigned int> > g_I(l_sz); + std::vector<std::vector<unsigned int> > g_J(l_sz); + + while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update)) + { + // SET UP THE BLOCKS.. + // PHASE ONE + if (cur_iter == 0) + block_set_up(A, A_v_c, l_M_v, g_I, g_J, g_A_I_J, g_b_v); + else + block_update(A, A_v_c, g_res, g_is_update, g_I, g_J, g_b_v, g_A_I_J, tag); + + //PHASE TWO, LEAST SQUARE SOLUTION + least_square_solve(A_v_c, g_A_I_J, g_b_v, g_I, g_J, g_res, g_is_update, l_M_v, tag); + + if (tag.getIsStatic()) break; + cur_iter++; + } + + std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd()); + tag.setBegInd(tag.getEndInd());//beg_ind = end_ind; + tag.setEndInd(std::min(static_cast<long>(tag.getBegInd() + VIENNACL_SPAI_K_b), static_cast<long>(M.size2()))); + //std::copy(l_M_v.begin(), l_M_v.end(), M_v.begin() + tag.getBegInd()); + } + + M.resize(M.size1(), M.size2(), false); + insert_sparse_columns(M_v, M, tag.getIsRight()); +} + + +//GPU - based version +/** @brief Construction of SPAI preconditioner on GPU + * + * @param A initial sparse matrix + * @param cpu_A copy of initial matrix on CPU + * @param cpu_M output preconditioner on CPU + * @param M output preconditioner + * @param tag SPAI tag class with parameters + */ +template<typename NumericT, unsigned int AlignmentV> +void computeSPAI(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, //input + boost::numeric::ublas::compressed_matrix<NumericT> const & cpu_A, + boost::numeric::ublas::compressed_matrix<NumericT> & cpu_M, //output + viennacl::compressed_matrix<NumericT, AlignmentV> & M, + spai_tag const & tag) +{ + typedef typename viennacl::linalg::detail::spai::sparse_vector<NumericT> SparseVectorType; + + //typedef typename viennacl::compressed_matrix<ScalarType> GPUSparseMatrixType; + //sparse matrix transpose... + unsigned int cur_iter = 0; + std::vector<cl_uint> g_is_update(cpu_M.size2(), static_cast<cl_uint>(1)); + //init is update + //init_parallel_is_update(g_is_update); + std::vector<SparseVectorType> A_v_c(cpu_M.size2()); + std::vector<SparseVectorType> M_v(cpu_M.size2()); + vectorize_column_matrix(cpu_A, A_v_c); + vectorize_column_matrix(cpu_M, M_v); + std::vector<SparseVectorType> g_res(cpu_M.size2()); + std::vector<std::vector<unsigned int> > g_I(cpu_M.size2()); + std::vector<std::vector<unsigned int> > g_J(cpu_M.size2()); + + //OpenCL variables + block_matrix g_A_I_J_vcl; + block_vector g_bv_vcl; + while ((cur_iter < tag.getIterationLimit())&&is_all_update(g_is_update)) + { + // SET UP THE BLOCKS.. + // PHASE ONE.. + //timer.start(); + //index set up on CPU + if (cur_iter == 0) + block_set_up(A, A_v_c, M_v, g_is_update, g_I, g_J, g_A_I_J_vcl, g_bv_vcl); + else + block_update(A, A_v_c, g_is_update, g_res, g_J, g_I, g_A_I_J_vcl, g_bv_vcl, tag); + //std::cout<<"Phase 2 timing: "<<timer.get()<<std::endl; + //PERFORM LEAST SQUARE problems solution + //PHASE TWO + //timer.start(); + least_square_solve<SparseVectorType, NumericT>(A_v_c, M_v, g_I, g_J, g_A_I_J_vcl, g_bv_vcl, g_res, g_is_update, tag, viennacl::traits::context(A)); + //std::cout<<"Phase 3 timing: "<<timer.get()<<std::endl; + if (tag.getIsStatic()) + break; + cur_iter++; + } + + cpu_M.resize(cpu_M.size1(), cpu_M.size2(), false); + insert_sparse_columns(M_v, cpu_M, tag.getIsRight()); + //copy back to GPU + M.resize(static_cast<unsigned int>(cpu_M.size1()), static_cast<unsigned int>(cpu_M.size2())); + viennacl::copy(cpu_M, M); +} + +} +} +} +} +#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp new file mode 100644 index 0000000..d8c718c --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai_tag.hpp @@ -0,0 +1,143 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_TAG_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/detail/spai/spai_tag.hpp + @brief Implementation of the spai tag holding SPAI configuration parameters. Experimental. + + SPAI code contributed by Nikolay Lukash +*/ + + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <cmath> +#include <sstream> +#include "viennacl/ocl/backend.hpp" +#include "boost/numeric/ublas/vector.hpp" +#include "boost/numeric/ublas/matrix.hpp" +#include "boost/numeric/ublas/matrix_proxy.hpp" +#include "boost/numeric/ublas/storage.hpp" +#include "boost/numeric/ublas/io.hpp" +#include "boost/numeric/ublas/matrix_expression.hpp" +#include "boost/numeric/ublas/detail/matrix_assign.hpp" + +#include "viennacl/linalg/detail/spai/block_matrix.hpp" +#include "viennacl/linalg/detail/spai/block_vector.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +/** @brief A tag for SPAI + * + * Contains values for the algorithm. + * Must be passed to spai_precond constructor + */ +class spai_tag +{ + /** @brief Constructor + * + * @param residual_norm_threshold Calculate until the norm of the residual falls below this threshold + * @param iteration_limit maximum number of iterations + * @param residual_threshold determines starting threshold in residual vector for including new indices into set J + * @param is_static determines if static version of SPAI should be used + * @param is_right determines if left or right preconditioner should be used + */ +public: + spai_tag(double residual_norm_threshold = 1e-3, + unsigned int iteration_limit = 5, + double residual_threshold = 1e-2, + bool is_static = false, + bool is_right = false) + : residual_norm_threshold_(residual_norm_threshold), + iteration_limit_(iteration_limit), + residual_threshold_(residual_threshold), + is_static_(is_static), + is_right_(is_right) {} + + double getResidualNormThreshold() const { return residual_norm_threshold_; } + + double getResidualThreshold() const { return residual_threshold_; } + + unsigned int getIterationLimit () const { return iteration_limit_; } + + bool getIsStatic() const { return is_static_; } + + bool getIsRight() const { return is_right_; } + + long getBegInd() const { return beg_ind_; } + + long getEndInd() const { return end_ind_; } + + + + void setResidualNormThreshold(double residual_norm_threshold) + { + if (residual_norm_threshold > 0) + residual_norm_threshold_ = residual_norm_threshold; + } + + void setResidualThreshold(double residual_threshold) + { + if (residual_threshold > 0) + residual_threshold_ = residual_threshold; + } + + void setIterationLimit(unsigned int iteration_limit) + { + if (iteration_limit > 0) + iteration_limit_ = iteration_limit; + } + + void setIsRight(bool is_right) { is_right_ = is_right; } + + void setIsStatic(bool is_static) { is_static_ = is_static; } + + void setBegInd(long beg_ind) { beg_ind_ = beg_ind; } + + void setEndInd(long end_ind){ end_ind_ = end_ind; } + + +private: + double residual_norm_threshold_; + unsigned int iteration_limit_; + long beg_ind_; + long end_ind_; + double residual_threshold_; + bool is_static_; + bool is_right_; +}; + +} +} +} +} +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp new file mode 100644 index 0000000..c99eda1 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/sparse_vector.hpp @@ -0,0 +1,85 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_SPARSE_VECTOR_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/detail/spai/sparse_vector.hpp + @brief Implementation of a helper sparse vector class for SPAI. Experimental. + + SPAI code contributed by Nikolay Lukash +*/ + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <map> + + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +/** + * @brief Represents a sparse vector based on std::map<unsigned int, NumericT> + */ +template<typename NumericT> +class sparse_vector +{ +public: + typedef typename std::map<unsigned int, NumericT>::iterator iterator; + typedef typename std::map<unsigned int, NumericT>::const_iterator const_iterator; + + sparse_vector() {} + + /** @brief Set the index of the vector in the original matrix + * + * May only be called once. + */ + //getter + NumericT & operator[] (unsigned int ind) { return v_[ind]; } + + void clear() { v_.clear(); } + + const_iterator find(unsigned int var) const { return v_.find(var); } + iterator find(unsigned int var) { return v_.find(var); } + + const_iterator begin() const { return v_.begin(); } + iterator begin() { return v_.begin(); } + const_iterator end() const { return v_.end(); } + iterator end() { return v_.end(); } + +private: + unsigned int size_; + std::map<unsigned int, NumericT> v_; +}; + +} +} +} +} + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp new file mode 100644 index 0000000..a3340d7 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/direct_solve.hpp @@ -0,0 +1,580 @@ +#ifndef VIENNACL_LINALG_DIRECT_SOLVE_HPP_ +#define VIENNACL_LINALG_DIRECT_SOLVE_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/direct_solve.hpp + @brief Implementations of dense direct solvers are found here. +*/ + +#include "viennacl/forwards.h" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/vector.hpp" +#include "viennacl/vector_proxy.hpp" +#include "viennacl/matrix.hpp" +#include "viennacl/matrix_proxy.hpp" +#include "viennacl/linalg/prod.hpp" +#include "viennacl/linalg/host_based/direct_solve.hpp" + +#ifdef VIENNACL_WITH_OPENCL + #include "viennacl/linalg/opencl/direct_solve.hpp" +#endif + +#ifdef VIENNACL_WITH_CUDA + #include "viennacl/linalg/cuda/direct_solve.hpp" +#endif + +#define VIENNACL_DIRECT_SOLVE_BLOCKSIZE 128 + +namespace viennacl +{ +namespace linalg +{ + +namespace detail +{ + + // + // A \ B: + // + + /** @brief Direct inplace solver for dense triangular systems using a single kernel launch. Matlab notation: A \ B + * + * @param A The system matrix + * @param B The matrix of row vectors, where the solution is directly written to + */ + template<typename NumericT, typename SolverTagT> + void inplace_solve_kernel(const matrix_base<NumericT> & A, const matrix_base<NumericT> & B, SolverTagT) + { + assert( (viennacl::traits::size1(A) == viennacl::traits::size2(A)) && bool("Size check failed in inplace_solve(): size1(A) != size2(A)")); + assert( (viennacl::traits::size1(A) == viennacl::traits::size1(B)) && bool("Size check failed in inplace_solve(): size1(A) != size1(B)")); + switch (viennacl::traits::handle(A).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT()); + break; + #ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT()); + break; + #endif + #ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::inplace_solve(A, const_cast<matrix_base<NumericT> &>(B), SolverTagT()); + break; + #endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } + } + + + // + // A \ b + // + + template<typename NumericT, typename SolverTagT> + void inplace_solve_vec_kernel(const matrix_base<NumericT> & mat, + const vector_base<NumericT> & vec, + SolverTagT) + { + assert( (mat.size1() == vec.size()) && bool("Size check failed in inplace_solve(): size1(A) != size(b)")); + assert( (mat.size2() == vec.size()) && bool("Size check failed in inplace_solve(): size2(A) != size(b)")); + + switch (viennacl::traits::handle(mat).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT()); + break; + #ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT()); + break; + #endif + #ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::inplace_solve(mat, const_cast<vector_base<NumericT> &>(vec), SolverTagT()); + break; + #endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } + } + + + template<typename MatrixT1, typename MatrixT2, typename SolverTagT> + void inplace_solve_lower_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT) + { + typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type NumericType; + + vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE; + if (A.size1() <= blockSize) + inplace_solve_kernel(A, B, SolverTagT()); + else + { + for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize) + { + vcl_size_t Apos1 = i; + vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize); + vcl_size_t Bpos = B.size2(); + inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)), + viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)), + SolverTagT()); + if (Apos2 < A.size1()) + { + viennacl::matrix_range<MatrixT2> B_lower(B, viennacl::range(Apos2, B.size1()), viennacl::range(0, Bpos)); + viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)), + viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)), + B_lower, + NumericType(-1.0), NumericType(1.0)); + } + } + } + } + + template<typename MatrixT1, typename MatrixT2> + void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::lower_tag) + { + inplace_solve_lower_impl(A, B, viennacl::linalg::lower_tag()); + } + + template<typename MatrixT1, typename MatrixT2> + void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::unit_lower_tag) + { + inplace_solve_lower_impl(A, B, viennacl::linalg::unit_lower_tag()); + } + + template<typename MatrixT1, typename MatrixT2, typename SolverTagT> + void inplace_solve_upper_impl(MatrixT1 const & A, MatrixT2 & B, SolverTagT) + { + typedef typename viennacl::result_of::cpu_value_type<MatrixT1>::type NumericType; + + int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE; + if (static_cast<int>(A.size1()) <= blockSize) + inplace_solve_kernel(A, B, SolverTagT()); + else + { + for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize) + { + vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize)); + vcl_size_t Apos2 = vcl_size_t(i); + vcl_size_t Bpos = B.size2(); + inplace_solve_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)), + viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)), + SolverTagT()); + if (Apos1 > 0) + { + viennacl::matrix_range<MatrixT2> B_upper(B, viennacl::range(0, Apos1), viennacl::range(0, Bpos)); + + viennacl::linalg::prod_impl(viennacl::project(A, viennacl::range(0, Apos1), viennacl::range(Apos1, Apos2)), + viennacl::project(B, viennacl::range(Apos1, Apos2), viennacl::range(0, Bpos)), + B_upper, + NumericType(-1.0), NumericType(1.0)); + } + } + } + } + + template<typename MatrixT1, typename MatrixT2> + void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::upper_tag) + { + inplace_solve_upper_impl(A, B, viennacl::linalg::upper_tag()); + } + + template<typename MatrixT1, typename MatrixT2> + void inplace_solve_impl(MatrixT1 const & A, MatrixT2 & B, viennacl::linalg::unit_upper_tag) + { + inplace_solve_upper_impl(A, B, viennacl::linalg::unit_upper_tag()); + } + +} // namespace detail + +/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation) +* +* @param A The system matrix +* @param B The matrix of row vectors, where the solution is directly written to +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(const matrix_base<NumericT> & A, + matrix_base<NumericT> & B, + SolverTagT) +{ + detail::inplace_solve_impl(A,B,SolverTagT()); +} + +/** @brief Direct inplace solver for triangular systems with multiple transposed right hand sides, i.e. A \ B^T (MATLAB notation) +* +* @param A The system matrix +* @param proxy_B The proxy for the transposed matrix of row vectors, where the solution is directly written to +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(const matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> proxy_B, + SolverTagT) +{ + typedef typename matrix_base<NumericT>::handle_type handle_type; + + matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()), + proxy_B.lhs().size2(), proxy_B.lhs().start2(), proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(), + proxy_B.lhs().size1(), proxy_B.lhs().start1(), proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(), + !proxy_B.lhs().row_major()); + + detail::inplace_solve_impl(A,B,SolverTagT()); +} + +//upper triangular solver for transposed lower triangular matrices +/** @brief Direct inplace solver for transposed triangular systems with multiple right hand sides, i.e. A^T \ B (MATLAB notation) +* +* @param proxy_A The transposed system matrix proxy +* @param B The matrix holding the load vectors, where the solution is directly written to +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A, + matrix_base<NumericT> & B, + SolverTagT) +{ + typedef typename matrix_base<NumericT>::handle_type handle_type; + + matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()), + proxy_A.lhs().size2(), proxy_A.lhs().start2(), proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(), + proxy_A.lhs().size1(), proxy_A.lhs().start1(), proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(), + !proxy_A.lhs().row_major()); + + detail::inplace_solve_impl(A,B,SolverTagT()); +} + +/** @brief Direct inplace solver for transposed triangular systems with multiple transposed right hand sides, i.e. A^T \ B^T (MATLAB notation) +* +* @param proxy_A The transposed system matrix proxy +* @param proxy_B The transposed matrix holding the load vectors, where the solution is directly written to +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> const & proxy_A, + matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> proxy_B, + SolverTagT) +{ + typedef typename matrix_base<NumericT>::handle_type handle_type; + + matrix_base<NumericT> A(const_cast<handle_type &>(proxy_A.lhs().handle()), + proxy_A.lhs().size2(), proxy_A.lhs().start2(), proxy_A.lhs().stride2(), proxy_A.lhs().internal_size2(), + proxy_A.lhs().size1(), proxy_A.lhs().start1(), proxy_A.lhs().stride1(), proxy_A.lhs().internal_size1(), + !proxy_A.lhs().row_major()); + + matrix_base<NumericT> B(const_cast<handle_type &>(proxy_B.lhs().handle()), + proxy_B.lhs().size2(), proxy_B.lhs().start2(), proxy_B.lhs().stride2(), proxy_B.lhs().internal_size2(), + proxy_B.lhs().size1(), proxy_B.lhs().start1(), proxy_B.lhs().stride1(), proxy_B.lhs().internal_size1(), + !proxy_B.lhs().row_major()); + + detail::inplace_solve_impl(A,B,SolverTagT()); +} + + +/////////////////// general wrappers for non-inplace solution ////////////////////// + + +/** @brief Convenience functions for C = solve(A, B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve() +* +* @param A The system matrix +* @param B The matrix of load vectors +* @param tag Dispatch tag +*/ +template<typename NumericT, typename SolverTagT> +matrix_base<NumericT> solve(const matrix_base<NumericT> & A, + const matrix_base<NumericT> & B, + SolverTagT tag) +{ + // do an inplace solve on the result vector: + matrix_base<NumericT> result(B); + inplace_solve(A, result, tag); + return result; +} + +/** @brief Convenience functions for C = solve(A, B^T, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve() +* +* @param A The system matrix +* @param proxy The transposed load vector +* @param tag Dispatch tag +*/ +template<typename NumericT, typename SolverTagT> +matrix_base<NumericT> solve(const matrix_base<NumericT> & A, + const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy, + SolverTagT tag) +{ + // do an inplace solve on the result vector: + matrix_base<NumericT> result(proxy); + inplace_solve(A, result, tag); + return result; +} + +/** @brief Convenience functions for result = solve(trans(mat), B, some_tag()); Creates a temporary result matrix and forwards the request to inplace_solve() +* +* @param proxy The transposed system matrix proxy +* @param B The matrix of load vectors +* @param tag Dispatch tag +*/ +template<typename NumericT, typename SolverTagT> +matrix_base<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy, + const matrix_base<NumericT> & B, + SolverTagT tag) +{ + // do an inplace solve on the result vector: + matrix_base<NumericT> result(B); + inplace_solve(proxy, result, tag); + return result; +} + +/** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve() +* +* @param proxy_A The transposed system matrix proxy +* @param proxy_B The transposed matrix of load vectors, where the solution is directly written to +* @param tag Dispatch tag +*/ +template<typename NumericT, typename SolverTagT> +matrix_base<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_A, + const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy_B, + SolverTagT tag) +{ + // run an inplace solve on the result vector: + matrix_base<NumericT> result(proxy_B); + inplace_solve(proxy_A, result, tag); + return result; +} + +// +/////////// solves with vector as right hand side /////////////////// +// + +namespace detail +{ + template<typename MatrixT1, typename VectorT, typename SolverTagT> + void inplace_solve_lower_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT) + { + vcl_size_t blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE; + if (A.size1() <= blockSize) + inplace_solve_vec_kernel(A, b, SolverTagT()); + else + { + VectorT temp(b); + for (vcl_size_t i = 0; i < A.size1(); i = i + blockSize) + { + vcl_size_t Apos1 = i; + vcl_size_t Apos2 = std::min<vcl_size_t>(A.size1(), i + blockSize); + inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)), + viennacl::project(b, viennacl::range(Apos1, Apos2)), + SolverTagT()); + if (Apos2 < A.size1()) + { + viennacl::project(temp, viennacl::range(Apos2, A.size1())) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(Apos2, A.size1()), viennacl::range(Apos1, Apos2)), + viennacl::project(b, viennacl::range(Apos1, Apos2))); + viennacl::project(b, viennacl::range(Apos2, A.size1())) -= viennacl::project(temp, viennacl::range(Apos2, A.size1())); + } + } + } + } + + template<typename MatrixT1, typename VectorT> + void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, viennacl::linalg::lower_tag) + { + inplace_solve_lower_vec_impl(A, B, viennacl::linalg::lower_tag()); + } + + template<typename MatrixT1, typename VectorT> + void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & B, viennacl::linalg::unit_lower_tag) + { + inplace_solve_lower_vec_impl(A, B, viennacl::linalg::unit_lower_tag()); + } + + template<typename MatrixT1, typename VectorT, typename SolverTagT> + void inplace_solve_upper_vec_impl(MatrixT1 const & A, VectorT & b, SolverTagT) + { + int blockSize = VIENNACL_DIRECT_SOLVE_BLOCKSIZE; + if (static_cast<int>(A.size1()) <= blockSize) + inplace_solve_vec_kernel(A, b, SolverTagT()); + else + { + VectorT temp(b); + for (int i = static_cast<int>(A.size1()); i > 0; i = i - blockSize) + { + vcl_size_t Apos1 = vcl_size_t(std::max<int>(0, i - blockSize)); + vcl_size_t Apos2 = vcl_size_t(i); + inplace_solve_vec_kernel(viennacl::project(A, viennacl::range(Apos1, Apos2), viennacl::range(Apos1, Apos2)), + viennacl::project(b, viennacl::range(Apos1, Apos2)), + SolverTagT()); + if (Apos1 > 0) + { + viennacl::project(temp, viennacl::range(0, Apos1)) = viennacl::linalg::prod(viennacl::project(A, viennacl::range(0, Apos1), viennacl::range(Apos1, Apos2)), + viennacl::project(b, viennacl::range(Apos1, Apos2))); + viennacl::project(b, viennacl::range(0, Apos1)) -= viennacl::project(temp, viennacl::range(0, Apos1)); + } + } + } + } + + template<typename MatrixT1, typename VectorT> + void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, viennacl::linalg::upper_tag) + { + inplace_solve_upper_vec_impl(A, b, viennacl::linalg::upper_tag()); + } + + template<typename MatrixT1, typename VectorT> + void inplace_solve_vec_impl(MatrixT1 const & A, VectorT & b, viennacl::linalg::unit_upper_tag) + { + inplace_solve_upper_vec_impl(A, b, viennacl::linalg::unit_upper_tag()); + } + +} // namespace detail + +/** @brief Inplace solution of a triangular system. Matlab notation A \ b. +* +* @param mat The system matrix (a dense matrix for which only the respective triangular form is used) +* @param vec The right hand side vector +* @param tag The tag (either lower_tag, unit_lower_tag, upper_tag, or unit_upper_tag) +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(const matrix_base<NumericT> & mat, + vector_base<NumericT> & vec, + SolverTagT const & tag) +{ + + detail::inplace_solve_vec_impl(mat, vec, tag); +} + +/** @brief Inplace solution of a triangular system with transposed system matrix.. Matlab notation A' \ b. +* +* @param proxy The transposed system matrix (a dense matrix for which only the respective triangular form is used) +* @param vec The right hand side vector +* @param tag The tag (either lower_tag, unit_lower_tag, upper_tag, or unit_upper_tag) +*/ +template<typename NumericT, typename SolverTagT> +void inplace_solve(matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> const & proxy, + vector_base<NumericT> & vec, + SolverTagT const & tag) +{ + typedef typename matrix_base<NumericT>::handle_type handle_type; + + // wrap existing matrix in a new matrix_base object (no data copy) + matrix_base<NumericT> mat(const_cast<handle_type &>(proxy.lhs().handle()), + proxy.lhs().size2(), proxy.lhs().start2(), proxy.lhs().stride2(), proxy.lhs().internal_size2(), + proxy.lhs().size1(), proxy.lhs().start1(), proxy.lhs().stride1(), proxy.lhs().internal_size1(), + !proxy.lhs().row_major()); + detail::inplace_solve_vec_impl(mat, vec, tag); +} + + +/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for an upper triangular solve. +* +* Creates a temporary result vector and forwards the request to inplace_solve() +* +* @param mat The system matrix +* @param vec The load vector +* @param tag Dispatch tag +*/ +template<typename NumericT> +vector<NumericT> solve(const matrix_base<NumericT> & mat, + const vector_base<NumericT> & vec, + viennacl::linalg::upper_tag const & tag) +{ + // run an inplace solve on the result vector: + vector<NumericT> result(vec); + inplace_solve(mat, result, tag); + return result; +} + +/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for an upper triangular solve with unit diagonal. +* +* Creates a temporary result vector and forwards the request to inplace_solve() +* +* @param mat The system matrix +* @param vec The load vector +* @param tag Dispatch tag +*/ +template<typename NumericT> +vector<NumericT> solve(const matrix_base<NumericT> & mat, + const vector_base<NumericT> & vec, + viennacl::linalg::unit_upper_tag const & tag) +{ + // run an inplace solve on the result vector: + vector<NumericT> result(vec); + inplace_solve(mat, result, tag); + return result; +} + +/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for a lower triangular solve. +* +* Creates a temporary result vector and forwards the request to inplace_solve() +* +* @param mat The system matrix +* @param vec The load vector +* @param tag Dispatch tag +*/ +template<typename NumericT> +vector<NumericT> solve(const matrix_base<NumericT> & mat, + const vector_base<NumericT> & vec, + viennacl::linalg::lower_tag const & tag) +{ + // run an inplace solve on the result vector: + vector<NumericT> result(vec); + inplace_solve(mat, result, tag); + return result; +} + +/** @brief Convenience function for result = solve(mat, vec, upper_tag()); for a lower triangular solve with unit diagonal. +* +* Creates a temporary result vector and forwards the request to inplace_solve() +* +* @param mat The system matrix +* @param vec The load vector +* @param tag Dispatch tag +*/ +template<typename NumericT> +vector<NumericT> solve(const matrix_base<NumericT> & mat, + const vector_base<NumericT> & vec, + viennacl::linalg::unit_lower_tag const & tag) +{ + // run an inplace solve on the result vector: + vector<NumericT> result(vec); + inplace_solve(mat, result, tag); + return result; +} + +/** @brief Convenience functions for result = solve(trans(mat), vec, some_tag()); Creates a temporary result vector and forwards the request to inplace_solve() +* +* @param proxy The transposed system matrix proxy +* @param vec The load vector, where the solution is directly written to +* @param tag Dispatch tag +*/ +template<typename NumericT, typename SolverTagT> +vector<NumericT> solve(const matrix_expression< const matrix_base<NumericT>, const matrix_base<NumericT>, op_trans> & proxy, + const vector_base<NumericT> & vec, + SolverTagT const & tag) +{ + // run an inplace solve on the result vector: + vector<NumericT> result(vec); + inplace_solve(proxy, result, tag); + return result; +} + + +} +} + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp new file mode 100644 index 0000000..36be3b3 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/eig.hpp @@ -0,0 +1,29 @@ +#ifndef VIENNACL_LINALG_EIG_HPP_ +#define VIENNACL_LINALG_EIG_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/eig.hpp +* @brief Convenience header file including all available eigenvalue algorithms +*/ + +#include "viennacl/linalg/bisect.hpp" +#include "viennacl/linalg/lanczos.hpp" +#include "viennacl/linalg/power_iter.hpp" + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp new file mode 100644 index 0000000..ae9ade2 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/fft_operations.hpp @@ -0,0 +1,481 @@ +#ifndef VIENNACL_LINALG_FFT_OPERATIONS_HPP_ +#define VIENNACL_LINALG_FFT_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory + ============================================================================= */ + +/** @file viennacl/linalg/fft_operations.hpp + @brief Implementations of Fast Furier Transformation. + */ + +#include <viennacl/vector.hpp> +#include <viennacl/matrix.hpp> + +#include "viennacl/linalg/host_based/fft_operations.hpp" + +#ifdef VIENNACL_WITH_OPENCL +#include "viennacl/linalg/opencl/fft_operations.hpp" +#include "viennacl/linalg/opencl/kernels/fft.hpp" +#endif + +#ifdef VIENNACL_WITH_CUDA +#include "viennacl/linalg/cuda/fft_operations.hpp" +#endif + +namespace viennacl +{ +namespace linalg +{ + +/** + * @brief Direct 1D algorithm for computing Fourier transformation. + * + * Works on any sizes of data. + * Serial implementation has o(n^2) complexity + */ +template<typename NumericT, unsigned int AlignmentV> +void direct(viennacl::vector<NumericT, AlignmentV> const & in, + viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t size, vcl_size_t stride, + vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, sign, data_order); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::direct(in, out, size, stride, batch_num,sign,data_order); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + + } +} + +/** + * @brief Direct 2D algorithm for computing Fourier transformation. + * + * Works on any sizes of data. + * Serial implementation has o(n^2) complexity + */ +template<typename NumericT, unsigned int AlignmentV> +void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in, + viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& out, vcl_size_t size, + vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::direct(in, out, size, stride, batch_num, sign, data_order); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::direct(viennacl::traits::opencl_handle(in), viennacl::traits::opencl_handle(out), size, stride, batch_num, sign,data_order); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::direct(in, out, size, stride, batch_num,sign,data_order); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + + } +} + +/* + * This function performs reorder of input data. Indexes are sorted in bit-reversal order. + * Such reordering should be done before in-place FFT. + */ +template<typename NumericT, unsigned int AlignmentV> +void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride, + vcl_size_t bits_datasize, vcl_size_t batch_num, + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::reorder(in, size, stride, bits_datasize, batch_num, data_order); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::reorder<NumericT>(viennacl::traits::opencl_handle(in), size, stride, bits_datasize, batch_num, data_order); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::reorder(in, size, stride, bits_datasize, batch_num, data_order); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + + } +} + +/** + * @brief Radix-2 1D algorithm for computing Fourier transformation. + * + * Works only on power-of-two sizes of data. + * Serial implementation has o(n * lg n) complexity. + * This is a Cooley-Tukey algorithm + */ +template<typename NumericT, unsigned int AlignmentV> +void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & in, vcl_size_t size, + vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign,data_order); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign, data_order); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Radix-2 2D algorithm for computing Fourier transformation. + * + * Works only on power-of-two sizes of data. + * Serial implementation has o(n * lg n) complexity. + * This is a Cooley-Tukey algorithm + */ +template<typename NumericT, unsigned int AlignmentV> +void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride, + vcl_size_t batch_num, NumericT sign = NumericT(-1), + viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) +{ + + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::radix2(in, size, stride, batch_num, sign, data_order); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::radix2(viennacl::traits::opencl_handle(in), size, stride, batch_num, sign,data_order); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::radix2(in, size, stride, batch_num, sign,data_order); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Bluestein's algorithm for computing Fourier transformation. + * + * Currently, Works only for sizes of input data which less than 2^16. + * Uses a lot of additional memory, but should be fast for any size of data. + * Serial implementation has something about o(n * lg n) complexity + */ +template<typename NumericT, unsigned int AlignmentV> +void bluestein(viennacl::vector<NumericT, AlignmentV> & in, + viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/) +{ + + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::bluestein(in, out, 1); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::bluestein(in, out, 1); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::bluestein(in, out, 1); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Mutiply two complex vectors and store result in output + */ +template<typename NumericT, unsigned int AlignmentV> +void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1, + viennacl::vector<NumericT, AlignmentV> const & input2, + viennacl::vector<NumericT, AlignmentV> & output) +{ + switch (viennacl::traits::handle(input1).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::multiply_complex(input1, input2, output); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::multiply_complex(input1, input2, output); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::multiply_complex(input1, input2, output); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Normalize vector on with his own size + */ +template<typename NumericT, unsigned int AlignmentV> +void normalize(viennacl::vector<NumericT, AlignmentV> & input) +{ + switch (viennacl::traits::handle(input).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::normalize(input); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::normalize(input); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::normalize(input); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Inplace_transpose matrix + */ +template<typename NumericT, unsigned int AlignmentV> +void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input) +{ + switch (viennacl::traits::handle(input).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::transpose(input); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::transpose(input); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::transpose(input); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Transpose matrix + */ +template<typename NumericT, unsigned int AlignmentV> +void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input, + viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output) +{ + switch (viennacl::traits::handle(input).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::transpose(input, output); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::transpose(input, output); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::transpose(input, output); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) + */ +template<typename NumericT> +void real_to_complex(viennacl::vector_base<NumericT> const & in, + viennacl::vector_base<NumericT> & out, vcl_size_t size) +{ + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::real_to_complex(in, out, size); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::real_to_complex(in,out,size); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::real_to_complex(in,out,size); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) + */ +template<typename NumericT> +void complex_to_real(viennacl::vector_base<NumericT> const & in, + viennacl::vector_base<NumericT> & out, vcl_size_t size) +{ + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::complex_to_real(in, out, size); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::complex_to_real(in, out, size); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::complex_to_real(in, out, size); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** + * @brief Reverse vector to oposite order and save it in input vector + */ +template<typename NumericT> +void reverse(viennacl::vector_base<NumericT> & in) +{ + switch (viennacl::traits::handle(in).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::reverse(in); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::reverse(in); + break; +#endif + +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::reverse(in); + break; +#endif + + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +} +} + +#endif /* FFT_OPERATIONS_HPP_ */
