http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp new file mode 100644 index 0000000..907eb57 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/qr.hpp @@ -0,0 +1,497 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_QR_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_QR_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/detail/spai/qr.hpp + @brief Implementation of a simultaneous QR factorization of multiple matrices. Experimental. + + SPAI code contributed by Nikolay Lukash +*/ + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <cmath> +#include <sstream> +#include "viennacl/ocl/backend.hpp" +#include "boost/numeric/ublas/vector.hpp" +#include "boost/numeric/ublas/matrix.hpp" +#include "boost/numeric/ublas/matrix_proxy.hpp" +#include "boost/numeric/ublas/storage.hpp" +#include "boost/numeric/ublas/io.hpp" +#include "boost/numeric/ublas/matrix_expression.hpp" +#include "boost/numeric/ublas/detail/matrix_assign.hpp" + +#include "viennacl/vector.hpp" +#include "viennacl/matrix.hpp" + +#include "viennacl/linalg/detail/spai/block_matrix.hpp" +#include "viennacl/linalg/detail/spai/block_vector.hpp" +#include "viennacl/linalg/opencl/kernels/spai.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +//********** DEBUG FUNCTIONS *****************// +template< typename T, typename InputIteratorT> +void Print(std::ostream & ostr, InputIteratorT it_begin, InputIteratorT it_end) +{ + //std::ostream_iterator<int> it_os(ostr, delimiter); + std::string delimiters = " "; + std::copy(it_begin, it_end, std::ostream_iterator<T>(ostr, delimiters.c_str())); + ostr << std::endl; +} + +template<typename VectorT, typename MatrixT> +void write_to_block(VectorT & con_A_I_J, + unsigned int start_ind, + std::vector<unsigned int> const & I, + std::vector<unsigned int> const & J, + MatrixT& m) +{ + m.resize(I.size(), J.size(), false); + for (vcl_size_t i = 0; i < J.size(); ++i) + for (vcl_size_t j = 0; j < I.size(); ++j) + m(j,i) = con_A_I_J[start_ind + i*I.size() + j]; +} + +template<typename VectorT> +void print_continious_matrix(VectorT & con_A_I_J, + std::vector<cl_uint> & blocks_ind, + std::vector<std::vector<unsigned int> > const & g_I, + std::vector<std::vector<unsigned int> > const & g_J) +{ + typedef typename VectorT::value_type NumericType; + + std::vector<boost::numeric::ublas::matrix<NumericType> > com_A_I_J(g_I.size()); + for (vcl_size_t i = 0; i < g_I.size(); ++i) + { + write_to_block(con_A_I_J, blocks_ind[i], g_I[i], g_J[i], com_A_I_J[i]); + std::cout << com_A_I_J[i] << std::endl; + } +} + +template<typename VectorT> +void print_continious_vector(VectorT & con_v, + std::vector<cl_uint> & block_ind, + std::vector<std::vector<unsigned int> > const & g_J) +{ + typedef typename VectorT::value_type NumericType; + + std::vector<boost::numeric::ublas::vector<NumericType> > com_v(g_J.size()); + //Print<ScalarType>(std::cout, con_v.begin(), con_v.end()); + for (vcl_size_t i = 0; i < g_J.size(); ++i) + { + com_v[i].resize(g_J[i].size()); + for (vcl_size_t j = 0; j < g_J[i].size(); ++j) + com_v[i](j) = con_v[block_ind[i] + j]; + std::cout << com_v[i] << std::endl; + } +} + + +///**************************************** BLOCK FUNCTIONS ************************************// + +/** @brief Computes size of elements, start indices and matrix dimensions for a certain block + * + * @param g_I container of row indices + * @param g_J container of column indices + * @param sz general size for all elements in a certain block + * @param blocks_ind start indices in a certain + * @param matrix_dims matrix dimensions for each block + */ +inline void compute_blocks_size(std::vector<std::vector<unsigned int> > const & g_I, + std::vector<std::vector<unsigned int> > const & g_J, + unsigned int& sz, + std::vector<cl_uint> & blocks_ind, + std::vector<cl_uint> & matrix_dims) +{ + sz = 0; + for (vcl_size_t i = 0; i < g_I.size(); ++i) + { + sz += static_cast<unsigned int>(g_I[i].size()*g_J[i].size()); + matrix_dims[2*i] = static_cast<cl_uint>(g_I[i].size()); + matrix_dims[2*i + 1] = static_cast<cl_uint>(g_J[i].size()); + blocks_ind[i+1] = blocks_ind[i] + static_cast<cl_uint>(g_I[i].size()*g_J[i].size()); + } +} + +/** @brief Computes size of particular container of index set + * + * @param inds container of index sets + * @param size output size + */ +template<typename SizeT> +void get_size(std::vector<std::vector<SizeT> > const & inds, + SizeT & size) +{ + size = 0; + for (vcl_size_t i = 0; i < inds.size(); ++i) + size += static_cast<unsigned int>(inds[i].size()); +} + +/** @brief Initializes start indices of particular index set + * + * @param inds container of index sets + * @param start_inds output index set + */ +template<typename SizeT> +void init_start_inds(std::vector<std::vector<SizeT> > const & inds, + std::vector<cl_uint>& start_inds) +{ + for (vcl_size_t i = 0; i < inds.size(); ++i) + start_inds[i+1] = start_inds[i] + static_cast<cl_uint>(inds[i].size()); +} + + +//************************************* QR FUNCTIONS ***************************************// + +/** @brief Dot prod of particular column of martix A with it's self starting at a certain index beg_ind + * + * @param A init matrix + * @param beg_ind starting index + * @param res result of dot product + */ +template<typename MatrixT, typename NumericT> +void dot_prod(MatrixT const & A, + unsigned int beg_ind, + NumericT & res) +{ + res = NumericT(0); + for (vcl_size_t i = beg_ind; i < A.size1(); ++i) + res += A(i, beg_ind-1)*A(i, beg_ind-1); +} + +/** @brief Dot prod of particular matrix column with arbitrary vector: A(:, col_ind) + * + * @param A init matrix + * @param v input vector + * @param col_ind starting column index + * @param start_ind starting index inside column + * @param res result of dot product + */ +template<typename MatrixT, typename VectorT, typename NumericT> +void custom_inner_prod(MatrixT const & A, + VectorT const & v, + unsigned int col_ind, + unsigned int start_ind, + NumericT & res) +{ + res = static_cast<NumericT>(0); + for (unsigned int i = start_ind; i < static_cast<unsigned int>(A.size1()); ++i) + res += A(i, col_ind)*v(i); +} + +/** @brief Copying part of matrix column + * + * @param A init matrix + * @param v output vector + * @param beg_ind start index for copying + */ +template<typename MatrixT, typename VectorT> +void copy_vector(MatrixT const & A, + VectorT & v, + unsigned int beg_ind) +{ + for (unsigned int i = beg_ind; i < static_cast<unsigned int>(A.size1()); ++i) + v(i) = A( i, beg_ind-1); +} + + +//householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210 +/** @brief Computation of Householder vector, householder reflection c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.210 + * + * @param A init matrix + * @param j start index for computations + * @param v output Householder vector + * @param b beta + */ +template<typename MatrixT, typename VectorT, typename NumericT> +void householder_vector(MatrixT const & A, + unsigned int j, + VectorT & v, + NumericT & b) +{ + NumericT sg; + + dot_prod(A, j+1, sg); + copy_vector(A, v, j+1); + NumericT mu; + v(j) = static_cast<NumericT>(1.0); + if (!sg) + b = 0; + else + { + mu = std::sqrt(A(j,j)*A(j, j) + sg); + if (A(j, j) <= 0) + v(j) = A(j, j) - mu; + else + v(j) = -sg/(A(j, j) + mu); + + b = 2*(v(j)*v(j))/(sg + v(j)*v(j)); + v = v/v(j); + } +} + + +/** @brief Inplace application of Householder vector to a matrix A + * + * @param A init matrix + * @param iter_cnt current iteration + * @param v Householder vector + * @param b beta + */ +template<typename MatrixT, typename VectorT, typename NumericT> +void apply_householder_reflection(MatrixT & A, + unsigned int iter_cnt, + VectorT & v, + NumericT b) +{ + //update every column of matrix A + NumericT in_prod_res; + + for (unsigned int i = iter_cnt; i < static_cast<unsigned int>(A.size2()); ++i) + { + //update each column in a fashion: ai = ai - b*v*(v'*ai) + custom_inner_prod(A, v, i, iter_cnt, in_prod_res); + for (unsigned int j = iter_cnt; j < static_cast<unsigned int>(A.size1()); ++j) + A(j, i) -= b*in_prod_res*v(j); + } +} + +/** @brief Storage of vector v in column(A, ind), starting from ind-1 index of a column + * + * @param A init matrix + * @param ind index of a column + * @param v vector that should be stored + */ +template<typename MatrixT, typename VectorT> +void store_householder_vector(MatrixT & A, + unsigned int ind, + VectorT & v) +{ + for (unsigned int i = ind; i < static_cast<unsigned int>(A.size1()); ++i) + A(i, ind-1) = v(i); +} + + +//QR algorithm +/** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224 + * + * @param R input matrix + * @param b_v vector of betas + */ +template<typename MatrixT, typename VectorT> +void single_qr(MatrixT & R, VectorT & b_v) +{ + typedef typename MatrixT::value_type NumericType; + + if ((R.size1() > 0) && (R.size2() > 0)) + { + VectorT v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size1())); + b_v = static_cast<VectorT>(boost::numeric::ublas::zero_vector<NumericType>(R.size2())); + + for (unsigned int i = 0; i < static_cast<unsigned int>(R.size2()); ++i) + { + householder_vector(R, i, v, b_v[i]); + apply_householder_reflection(R, i, v, b_v[i]); + if (i < R.size1()) + store_householder_vector(R, i+1, v); + } + } +} + +//********************** HELP FUNCTIONS FOR GPU-based QR factorization *************************// + +/** @brief Getting max size of rows/columns from container of index set + * + * @param inds container of index set + * @param max_size max size that corresponds to that container + */ +template<typename SizeT> +void get_max_block_size(std::vector<std::vector<SizeT> > const & inds, + SizeT & max_size) +{ + max_size = 0; + for (vcl_size_t i = 0; i < inds.size(); ++i) + if (inds[i].size() > max_size) + max_size = static_cast<SizeT>(inds[i].size()); +} + +/** @brief Dot_prod(column(A, ind), v) starting from index ind+1 + * + * @param A input matrix + * @param v input vector + * @param ind index + * @param res result value + */ +template<typename MatrixT, typename VectorT, typename NumericT> +void custom_dot_prod(MatrixT const & A, + VectorT const & v, + unsigned int ind, + NumericT & res) +{ + res = static_cast<NumericT>(0); + for (unsigned int j = ind; j < A.size1(); ++j) + { + if (j == ind) + res += v(j); + else + res += A(j, ind)*v(j); + } +} + +/** @brief Recovery Q from matrix R and vector of betas b_v + * + * @param R input matrix + * @param b_v vector of betas + * @param y output vector + */ +template<typename MatrixT, typename VectorT> +void apply_q_trans_vec(MatrixT const & R, + VectorT const & b_v, + VectorT & y) +{ + typedef typename MatrixT::value_type NumericT; + + NumericT inn_prod = NumericT(0); + for (vcl_size_t i = 0; i < R.size2(); ++i) + { + custom_dot_prod(R, y, static_cast<unsigned int>(i), inn_prod); + for (vcl_size_t j = i; j < R.size1(); ++j) + { + if (i == j) + y(j) -= b_v(i)*inn_prod; + else + y(j) -= b_v(i)*inn_prod*R(j,i); + } + } +} + +/** @brief Multiplication of Q'*A, where Q is in implicit for lower part of R and vector of betas - b_v + * + * @param R input matrix + * @param b_v vector of betas + * @param A output matrix + */ +template<typename MatrixT, typename VectorT> +void apply_q_trans_mat(MatrixT const & R, + VectorT const & b_v, + MatrixT & A) +{ + VectorT tmp_v; + for (vcl_size_t i = 0; i < A.size2(); ++i) + { + tmp_v = static_cast<VectorT>(column(A,i)); + apply_q_trans_vec(R, b_v, tmp_v); + column(A,i) = tmp_v; + } +} + +//parallel QR for GPU +/** @brief Inplace QR factorization via Householder reflections c.f. Gene H. Golub, Charles F. Van Loan "Matrix Computations" 3rd edition p.224 performed on GPU + * + * @param g_I container of row indices + * @param g_J container of column indices + * @param g_A_I_J_vcl contigious matrices, GPU memory is used + * @param g_bv_vcl contigiuos vectors beta, GPU memory is used + * @param g_is_update container of indicators that show active blocks + * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host) + */ +template<typename NumericT> +void block_qr(std::vector<std::vector<unsigned int> > & g_I, + std::vector<std::vector<unsigned int> > & g_J, + block_matrix & g_A_I_J_vcl, + block_vector & g_bv_vcl, + std::vector<cl_uint> & g_is_update, + viennacl::context ctx) +{ + viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); + + //typedef typename MatrixType::value_type ScalarType; + unsigned int bv_size = 0; + unsigned int v_size = 0; + //set up arguments for GPU + //find maximum size of rows/columns + unsigned int local_r_n = 0; + unsigned int local_c_n = 0; + //find max size for blocks + get_max_block_size(g_I, local_r_n); + get_max_block_size(g_J, local_c_n); + //get size + get_size(g_J, bv_size); + get_size(g_I, v_size); + //get start indices + std::vector<cl_uint> start_bv_inds(g_I.size() + 1, 0); + std::vector<cl_uint> start_v_inds(g_I.size() + 1, 0); + init_start_inds(g_J, start_bv_inds); + init_start_inds(g_I, start_v_inds); + //init arrays + std::vector<NumericT> b_v(bv_size, NumericT(0)); + std::vector<NumericT> v(v_size, NumericT(0)); + //call qr program + block_vector v_vcl; + + g_bv_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*bv_size), + &(b_v[0])); + + v_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*v_size), + &(v[0])); + //the same as j_start_inds + g_bv_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), + &(start_bv_inds[0])); + + v_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*g_I.size()), + &(start_v_inds[0])); + viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*g_is_update.size()), + &(g_is_update[0])); + //local memory + //viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp)); + viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx); + viennacl::ocl::kernel & qr_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr"); + + qr_kernel.local_work_size(0, local_c_n); + qr_kernel.global_work_size(0, local_c_n*256); + viennacl::ocl::enqueue(qr_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle1(), g_bv_vcl.handle(), + v_vcl.handle(), g_A_I_J_vcl.handle2(), + g_bv_vcl.handle1(), v_vcl.handle1(), g_is_update_vcl, + viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))), + static_cast<cl_uint>(g_I.size()))); + +} +} +} +} +} +#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp new file mode 100644 index 0000000..3cfdbb3 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/small_matrix.hpp @@ -0,0 +1,113 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_SMALL_MATRIX_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/detail/spai/small_matrix.hpp + @brief Implementation of a routines for small matrices (helper for SPAI). Experimental. + + SPAI code contributed by Nikolay Lukash +*/ + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <map> +#include "boost/numeric/ublas/vector.hpp" +#include "boost/numeric/ublas/matrix.hpp" +#include "boost/numeric/ublas/matrix_proxy.hpp" +#include "boost/numeric/ublas/vector_proxy.hpp" +#include "boost/numeric/ublas/storage.hpp" +#include "boost/numeric/ublas/io.hpp" +#include "boost/numeric/ublas/lu.hpp" +#include "boost/numeric/ublas/triangular.hpp" +#include "boost/numeric/ublas/matrix_expression.hpp" +#include "boost/numeric/ublas/detail/matrix_assign.hpp" + +#include "viennacl/forwards.h" + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +// +// Constructs an orthonormal sparse matrix M (with M^T M = Id). Is composed of elementary 2x2 rotation matrices with suitable renumbering. +// +template<typename MatrixT> +void make_rotation_matrix(MatrixT & mat, + vcl_size_t new_size, + vcl_size_t off_diagonal_distance = 4) +{ + mat.resize(new_size, new_size, false); + mat.clear(); + + double val = 1.0 / std::sqrt(2.0); + + for (vcl_size_t i=0; i<new_size; ++i) + mat(i,i) = val; + + for (vcl_size_t i=off_diagonal_distance; i<new_size; ++i) + { + mat(i-off_diagonal_distance, i) = val; + mat(i, i-off_diagonal_distance) = -val; + } + +} + + +//calcualtes matrix determinant +template<typename MatrixT> +double determinant(boost::numeric::ublas::matrix_expression<MatrixT> const & mat_r) +{ + double det = 1.0; + + MatrixT mLu(mat_r()); + boost::numeric::ublas::permutation_matrix<vcl_size_t> pivots(mat_r().size1()); + + int is_singular = static_cast<int>(lu_factorize(mLu, pivots)); + + if (!is_singular) + { + for (vcl_size_t i=0; i < pivots.size(); ++i) + { + if (pivots(i) != i) + det *= -1.0; + + det *= mLu(i,i); + } + } + else + det = 0.0; + + return det; +} + +} +} +} +} +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp new file mode 100644 index 0000000..bac0b9e --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-dynamic.hpp @@ -0,0 +1,687 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_DYNAMIC_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/detail/spai/spai-dynamic.hpp + @brief Implementation of a dynamic SPAI. Provides the routines for automatic pattern updates Experimental. + + SPAI code contributed by Nikolay Lukash +*/ + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <map> +//#include "block_matrix.hpp" +//#include "block_vector.hpp" +//#include "benchmark-utils.hpp" +#include "boost/numeric/ublas/vector.hpp" +#include "boost/numeric/ublas/matrix.hpp" +#include "boost/numeric/ublas/matrix_proxy.hpp" +#include "boost/numeric/ublas/vector_proxy.hpp" +#include "boost/numeric/ublas/storage.hpp" +#include "boost/numeric/ublas/io.hpp" +#include "boost/numeric/ublas/lu.hpp" +#include "boost/numeric/ublas/triangular.hpp" +#include "boost/numeric/ublas/matrix_expression.hpp" +// ViennaCL includes +#include "viennacl/linalg/prod.hpp" +#include "viennacl/matrix.hpp" +#include "viennacl/compressed_matrix.hpp" +#include "viennacl/linalg/sparse_matrix_operations.hpp" +#include "viennacl/linalg/matrix_operations.hpp" +#include "viennacl/scalar.hpp" +#include "viennacl/linalg/cg.hpp" +#include "viennacl/linalg/inner_prod.hpp" +#include "viennacl/linalg/ilu.hpp" +#include "viennacl/ocl/backend.hpp" + +#include "viennacl/linalg/detail/spai/block_matrix.hpp" +#include "viennacl/linalg/detail/spai/block_vector.hpp" +#include "viennacl/linalg/detail/spai/qr.hpp" +#include "viennacl/linalg/detail/spai/spai-static.hpp" +#include "viennacl/linalg/detail/spai/spai.hpp" +#include "viennacl/linalg/detail/spai/spai_tag.hpp" +#include "viennacl/linalg/opencl/kernels/spai.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +/** @brief Helper functor for comparing std::pair<> based on the second member. */ +struct CompareSecond +{ + template<typename T1, typename T2> + bool operator()(std::pair<T1, T2> const & left, std::pair<T1, T2> const & right) + { + return static_cast<double>(left.second) > static_cast<double>(right.second); + } +}; + + +/** @brief Composition of new matrix R, that is going to be used in Least Square problem solving + * + * @param A matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices + * @param R_n matrix A_Iu_J_u after QR factorization + * @param R previously composed matrix R + */ +template<typename MatrixT> +void composeNewR(MatrixT const & A, + MatrixT const & R_n, + MatrixT & R) +{ + typedef typename MatrixT::value_type NumericType; + + vcl_size_t row_n = R_n.size1() - (A.size1() - R.size2()); + MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(R.size1() + row_n, R.size2() + A.size2()); + + //write original R to new Composite R + boost::numeric::ublas::project(C, boost::numeric::ublas::range(0,R.size1()), boost::numeric::ublas::range(0, R.size2())) += R; + //write upper part of Q'*A_I_\hatJ, all columns and number of rows that equals to R.size2() + boost::numeric::ublas::project(C, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(R.size2(), + R.size2() + A.size2())) += + boost::numeric::ublas::project(A, boost::numeric::ublas::range(0, R.size2()), boost::numeric::ublas::range(0, A.size2())); + + //adding decomposed(QR) block to Composite R + if (R_n.size1() > 0 && R_n.size2() > 0) + boost::numeric::ublas::project(C, + boost::numeric::ublas::range(R.size2(), R.size1() + row_n), + boost::numeric::ublas::range(R.size2(), R.size2() + A.size2())) += R_n; + R = C; +} + +/** @brief Composition of new vector of coefficients beta from QR factorizations(necessary for Q recovery) + * + * @param v_n new vector from last QR factorization + * @param v composition of previous vectors from QR factorizations + */ +template<typename VectorT> +void composeNewVector(VectorT const & v_n, + VectorT & v) +{ + typedef typename VectorT::value_type NumericType; + + VectorT w = boost::numeric::ublas::zero_vector<NumericType>(v.size() + v_n.size()); + boost::numeric::ublas::project(w, boost::numeric::ublas::range(0, v.size())) += v; + boost::numeric::ublas::project(w, boost::numeric::ublas::range(v.size(), v.size() + v_n.size())) += v_n; + v = w; +} + +/** @brief Computation of Euclidean norm for sparse vector + * + * @param v initial sparse vector + * @param norm scalar that represents Euclidean norm + */ +template<typename SparseVectorT, typename NumericT> +void sparse_norm_2(SparseVectorT const & v, + NumericT & norm) +{ + for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it) + norm += (vec_it->second)*(vec_it->second); + + norm = std::sqrt(norm); +} + +/** @brief Dot product of two sparse vectors + * + * @param v1 initial sparse vector + * @param v2 initial sparse vector + * @param res_v scalar that represents dot product result + */ +template<typename SparseVectorT, typename NumericT> +void sparse_inner_prod(SparseVectorT const & v1, + SparseVectorT const & v2, + NumericT & res_v) +{ + typename SparseVectorT::const_iterator v_it1 = v1.begin(); + typename SparseVectorT::const_iterator v_it2 = v2.begin(); + + while ((v_it1 != v1.end())&&(v_it2 != v2.end())) + { + if (v_it1->first == v_it2->first) + { + res_v += (v_it1->second)*(v_it2->second); + ++v_it1; + ++v_it2; + } + else if (v_it1->first < v_it2->first) + ++v_it1; + else + ++v_it2; + } +} + +/** @brief Building a new set of column indices J_u, cf. Kallischko dissertation p.31 + * + * @param A_v_c vectorized column-wise initial matrix + * @param res residual vector + * @param J set of column indices + * @param J_u set of new column indices + * @param tag SPAI tag with parameters + */ +template<typename SparseVectorT, typename NumericT> +bool buildAugmentedIndexSet(std::vector<SparseVectorT> const & A_v_c, + SparseVectorT const & res, + std::vector<unsigned int> & J, + std::vector<unsigned int> & J_u, + spai_tag const & tag) +{ + std::vector<std::pair<unsigned int, NumericT> > p; + vcl_size_t cur_size = 0; + NumericT inprod, norm2; + + for (typename SparseVectorT::const_iterator res_it = res.begin(); res_it != res.end(); ++res_it) + { + if (!isInIndexSet(J, res_it->first) && (std::fabs(res_it->second) > tag.getResidualThreshold())) + { + inprod = norm2 = 0; + sparse_inner_prod(res, A_v_c[res_it->first], inprod); + sparse_norm_2(A_v_c[res_it->first], norm2); + p.push_back(std::pair<unsigned int, NumericT>(res_it->first, (inprod*inprod)/(norm2*norm2))); + } + } + + std::sort(p.begin(), p.end(), CompareSecond()); + while ((cur_size < J.size()) && (p.size() > 0)) + { + J_u.push_back(p[0].first); + p.erase(p.begin()); + cur_size++; + } + p.clear(); + return (cur_size > 0); +} + +/** @brief Building a new indices to current set of row indices I_n, cf. Kallischko dissertation p.32 + * + * @param A_v_c vectorized column-wise initial matrix + * @param I set of previous determined row indices + * @param J_n set of new column indices + * @param I_n set of new indices + */ +template<typename SparseVectorT> +void buildNewRowSet(std::vector<SparseVectorT> const & A_v_c, + std::vector<unsigned int> const & I, + std::vector<unsigned int> const & J_n, + std::vector<unsigned int> & I_n) +{ + for (vcl_size_t i = 0; i < J_n.size(); ++i) + { + for (typename SparseVectorT::const_iterator col_it = A_v_c[J_n[i]].begin(); col_it!=A_v_c[J_n[i]].end(); ++col_it) + { + if (!isInIndexSet(I, col_it->first) && !isInIndexSet(I_n, col_it->first)) + I_n.push_back(col_it->first); + } + } +} + +/** @brief Composition of new block for QR factorization cf. Kallischko dissertation p.82, figure 4.7 + * + * @param A_I_J previously composed block + * @param A_I_J_u matrix Q'*A(I, \\tilde J), where \\tilde J - set of new column indices + * @param A_I_u_J_u is composition of lower part A(I, \\tilde J) and A(\\tilde I, \\tilde J) - new block for QR decomposition + */ +template<typename MatrixT> +void QRBlockComposition(MatrixT const & A_I_J, + MatrixT const & A_I_J_u, + MatrixT & A_I_u_J_u) +{ + typedef typename MatrixT::value_type NumericType; + + vcl_size_t row_n1 = A_I_J_u.size1() - A_I_J.size2(); + vcl_size_t row_n2 = A_I_u_J_u.size1(); + vcl_size_t row_n = row_n1 + row_n2; + vcl_size_t col_n = A_I_J_u.size2(); + + MatrixT C = boost::numeric::ublas::zero_matrix<NumericType>(row_n, col_n); + boost::numeric::ublas::project(C, + boost::numeric::ublas::range(0, row_n1), + boost::numeric::ublas::range(0, col_n)) + += boost::numeric::ublas::project(A_I_J_u, + boost::numeric::ublas::range(A_I_J.size2(), A_I_J_u.size1()), + boost::numeric::ublas::range(0, col_n)); + + boost::numeric::ublas::project(C, + boost::numeric::ublas::range(row_n1, row_n1 + row_n2), + boost::numeric::ublas::range(0, col_n)) += A_I_u_J_u; + A_I_u_J_u = C; +} + +/** @brief CPU-based dynamic update for SPAI preconditioner + * + * @param A initial sparse matrix + * @param A_v_c vectorized column-wise initial matrix + * @param g_res container of residuals for all columns + * @param g_is_update container with identificators that shows which block should be modified + * @param g_I container of row index sets for all columns + * @param g_J container of column index sets for all columns + * @param g_b_v container of vectors of beta for Q recovery(cf. Golub Van Loan "Matrix Computations", 3rd edition p.211) + * @param g_A_I_J container of block matrices from previous update + * @param tag SPAI configuration tag + */ +template<typename SparseMatrixT, + typename SparseVectorT, + typename DenseMatrixT, + typename VectorT> +void block_update(SparseMatrixT const & A, + std::vector<SparseVectorT> const & A_v_c, + std::vector<SparseVectorT> & g_res, + std::vector<bool> & g_is_update, + std::vector<std::vector<unsigned int> >& g_I, + std::vector<std::vector<unsigned int> >& g_J, + std::vector<VectorT> & g_b_v, + std::vector<DenseMatrixT> & g_A_I_J, + spai_tag const & tag) +{ + typedef typename DenseMatrixT::value_type NumericType; + + + std::vector<std::vector<unsigned int> > g_J_u(g_J.size()); // set of new column indices + std::vector<std::vector<unsigned int> > g_I_u(g_J.size()); // set of new row indices + std::vector<DenseMatrixT> g_A_I_J_u(g_J.size()); // matrix A(I, \tilde J), cf. Kallischko p.31-32 + std::vector<DenseMatrixT> g_A_I_u_J_u(g_J.size()); // matrix A(\tilde I, \tilde J), cf. Kallischko + std::vector<VectorT> g_b_v_u(g_J.size()); // new vector of beta coefficients from QR factorization + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i = 0; i < static_cast<long>(g_J.size()); ++i) + { + if (g_is_update[static_cast<vcl_size_t>(i)]) + { + if (buildAugmentedIndexSet<SparseVectorT, NumericType>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag)) + { + //initialize matrix A_I_\hatJ + initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]); + //multiplication of Q'*A_I_\hatJ + apply_q_trans_mat(g_A_I_J[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)]); + //building new rows index set \hatI + buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]); + initProjectSubMatrix(A, g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]); + //composition of block for new QR factorization + QRBlockComposition(g_A_I_J[static_cast<vcl_size_t>(i)], g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)]); + //QR factorization + single_qr(g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_b_v_u[static_cast<vcl_size_t>(i)]); + //composition of new R and new vector b_v + composeNewR(g_A_I_J_u[static_cast<vcl_size_t>(i)], g_A_I_u_J_u[static_cast<vcl_size_t>(i)], g_A_I_J[static_cast<vcl_size_t>(i)]); + composeNewVector(g_b_v_u[static_cast<vcl_size_t>(i)], g_b_v[static_cast<vcl_size_t>(i)]); + //composition of new sets: I and J + g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end()); + g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end()); + } + else + { + g_is_update[static_cast<vcl_size_t>(i)] = false; + } + } + } +} + + +/**************************************************** GPU SPAI Update ****************************************************************/ + + +//performs Q'*A(I, \tilde J) on GPU +/** @brief Performs multiplication Q'*A(I, \\tilde J) on GPU + * + * @param g_J_u container of sets of new column indices + * @param g_I container of row indices + * @param g_A_I_J_vcl block matrix composed from previous blocks, they are blocks of R + * @param g_bv_vcl block of beta vectors + * @param g_A_I_J_u_vcl block of matrices A(I, \\tilde J) + * @param g_is_update indicators, that show if a certain block should be processed + * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host) + */ +template<typename NumericT> +void block_q_multiplication(std::vector<std::vector<unsigned int> > const & g_J_u, + std::vector<std::vector<unsigned int> > const & g_I, + block_matrix & g_A_I_J_vcl, + block_vector & g_bv_vcl, + block_matrix & g_A_I_J_u_vcl, + std::vector<cl_uint> & g_is_update, + viennacl::context ctx) +{ + viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); + unsigned int local_r_n = 0; + unsigned int local_c_n = 0; + unsigned int sz_blocks = 0; + + get_max_block_size(g_I, local_r_n); + get_max_block_size(g_J_u, local_c_n); + + //for debug + std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0)); + std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0)); + compute_blocks_size(g_I, g_J_u, sz_blocks, blocks_ind, matrix_dims); + //std::vector<ScalarType> con_A_I_J(sz_blocks, static_cast<ScalarType>(0)); + + viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())), + &(g_is_update[0])); + viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx); + viennacl::ocl::kernel& block_q_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_q_mult"); + + block_q_kernel.local_work_size(0, local_c_n); + block_q_kernel.global_work_size(0, 128*local_c_n); + viennacl::ocl::enqueue(block_q_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), + g_bv_vcl.handle(), + g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_A_I_J_u_vcl.handle1(), g_is_update_vcl, + viennacl::ocl::local_mem(static_cast<unsigned int>(sizeof(NumericT)*(local_r_n*local_c_n))), + static_cast<cl_uint>(g_I.size()))); +} + +/** @brief Assembly of container of index row sets: I_q, row indices for new "QR block" + * + * @param g_I container of row indices + * @param g_J container of column indices + * @param g_I_u container of new row indices + * @param g_I_q container of row indices for new QR blocks + */ +template<typename SizeT> +void assemble_qr_row_inds(std::vector<std::vector<SizeT> > const & g_I, + std::vector<std::vector<SizeT> > const & g_J, + std::vector<std::vector<SizeT> > const & g_I_u, + std::vector<std::vector<SizeT> > & g_I_q) +{ +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i = 0; i < static_cast<long>(g_I.size()); ++i) + { + for (vcl_size_t j = g_J[static_cast<vcl_size_t>(i)].size(); j < g_I[static_cast<vcl_size_t>(i)].size(); ++j) + g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I[static_cast<vcl_size_t>(i)][j]); + + for (vcl_size_t j = 0; j < g_I_u[static_cast<vcl_size_t>(i)].size(); ++j) + g_I_q[static_cast<vcl_size_t>(i)].push_back(g_I_u[static_cast<vcl_size_t>(i)][j]); + } +} + +/** @brief Performs assembly for new QR block + * + * @param g_J container of column indices + * @param g_I container of row indices + * @param g_J_u container of new column indices + * @param g_I_u container of new row indices + * @param g_I_q container of row indices for new QR blocks + * @param g_A_I_J_u_vcl blocks of Q'*A(I, \\tilde J) + * @param matrix_dimensions array with matrix dimensions for all blocks + * @param g_A_I_u_J_u_vcl blocks A(\\tilde I, \\tilde J) + * @param g_is_update container with update indicators + * @param is_empty_block indicator if all previous blocks A(\\tilde I, \\tilde J) - are empty, in case if they are empty kernel with smaller number of arguments is used + * @param ctx Optional context in which the matrix is created (one out of multiple OpenCL contexts, CUDA, host) +*/ +template<typename NumericT> +void assemble_qr_block(std::vector<std::vector<unsigned int> > const & g_J, + std::vector<std::vector<unsigned int> > const& g_I, + std::vector<std::vector<unsigned int> > const& g_J_u, + std::vector<std::vector<unsigned int> > const& g_I_u, + std::vector<std::vector<unsigned int> >& g_I_q, + block_matrix & g_A_I_J_u_vcl, + viennacl::ocl::handle<cl_mem> & matrix_dimensions, + block_matrix & g_A_I_u_J_u_vcl, + std::vector<cl_uint> & g_is_update, + bool is_empty_block, + viennacl::context ctx) +{ + viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); + + //std::vector<std::vector<unsigned int> > g_I_q(g_I.size()); + assemble_qr_row_inds(g_I, g_J, g_I_u, g_I_q); + unsigned int sz_blocks; + std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0)); + std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0)); + + compute_blocks_size(g_I_q, g_J_u, sz_blocks, blocks_ind, matrix_dims); + + std::vector<NumericT> con_A_I_J_q(sz_blocks, static_cast<NumericT>(0)); + + block_matrix g_A_I_J_q_vcl; + //need to allocate memory for QR block + g_A_I_J_q_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*sz_blocks), + &(con_A_I_J_q[0])); + g_A_I_J_q_vcl.handle().context(opencl_ctx); + + g_A_I_J_q_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())), + &(matrix_dims[0])); + g_A_I_J_q_vcl.handle1().context(opencl_ctx); + + g_A_I_J_q_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)), + &(blocks_ind[0])); + g_A_I_J_q_vcl.handle2().context(opencl_ctx); + + viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())), + &(g_is_update[0])); + + viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx); + if (!is_empty_block) + { + viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly"); + qr_assembly_kernel.local_work_size(0, 1); + qr_assembly_kernel.global_work_size(0, 256); + viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, + g_A_I_J_u_vcl.handle(), + g_A_I_J_u_vcl.handle2(), + g_A_I_J_u_vcl.handle1(), + g_A_I_u_J_u_vcl.handle(), + g_A_I_u_J_u_vcl.handle2(), + g_A_I_u_J_u_vcl.handle1(), + g_A_I_J_q_vcl.handle(), + g_A_I_J_q_vcl.handle2(), + g_A_I_J_q_vcl.handle1(), + g_is_update_vcl, + static_cast<unsigned int>(g_I.size()))); + } + else + { + viennacl::ocl::kernel& qr_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_qr_assembly_1"); + qr_assembly_kernel.local_work_size(0, 1); + qr_assembly_kernel.global_work_size(0, 256); + viennacl::ocl::enqueue(qr_assembly_kernel(matrix_dimensions, g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), + g_A_I_J_u_vcl.handle1(), + g_A_I_J_q_vcl.handle(), + g_A_I_J_q_vcl.handle2(), g_A_I_J_q_vcl.handle1(), + g_is_update_vcl, + static_cast<unsigned int>(g_I.size()))); + } + g_A_I_u_J_u_vcl.handle() = g_A_I_J_q_vcl.handle(); + g_A_I_u_J_u_vcl.handle1() = g_A_I_J_q_vcl.handle1(); + g_A_I_u_J_u_vcl.handle2() = g_A_I_J_q_vcl.handle2(); +} + +/** @brief Performs assembly for new R matrix on GPU + * + * @param g_I container of row indices + * @param g_J container of column indices + * @param g_A_I_J_vcl container of block matrices from previous update + * @param g_A_I_J_u_vcl container of block matrices Q'*A(I, \\tilde J) + * @param g_A_I_u_J_u_vcl container of block matrices QR factored on current iteration + * @param g_bv_vcl block of beta vectors from previous iteration + * @param g_bv_vcl_u block of updated beta vectors got after recent QR factorization + * @param g_is_update container with identificators that shows which block should be modified + * @param ctx Optional context in which the auxiliary data is created (one out of multiple OpenCL contexts, CUDA, host) + */ +template<typename NumericT> +void assemble_r(std::vector<std::vector<unsigned int> > & g_I, + std::vector<std::vector<unsigned int> > & g_J, + block_matrix & g_A_I_J_vcl, + block_matrix & g_A_I_J_u_vcl, + block_matrix & g_A_I_u_J_u_vcl, + block_vector & g_bv_vcl, + block_vector & g_bv_vcl_u, + std::vector<cl_uint> & g_is_update, + viennacl::context ctx) +{ + viennacl::ocl::context & opencl_ctx = const_cast<viennacl::ocl::context &>(ctx.opencl_context()); + std::vector<cl_uint> matrix_dims(g_I.size()*2, static_cast<cl_uint>(0)); + std::vector<cl_uint> blocks_ind(g_I.size() + 1, static_cast<cl_uint>(0)); + std::vector<cl_uint> start_bv_r_inds(g_I.size() + 1, 0); + unsigned int sz_blocks, bv_size; + + compute_blocks_size(g_I, g_J, sz_blocks, blocks_ind, matrix_dims); + get_size(g_J, bv_size); + init_start_inds(g_J, start_bv_r_inds); + + std::vector<NumericT> con_A_I_J_r(sz_blocks, static_cast<NumericT>(0)); + std::vector<NumericT> b_v_r(bv_size, static_cast<NumericT>(0)); + + block_matrix g_A_I_J_r_vcl; + block_vector g_bv_r_vcl; + g_A_I_J_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*sz_blocks), + &(con_A_I_J_r[0])); + g_A_I_J_r_vcl.handle().context(opencl_ctx); + + g_A_I_J_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*2*static_cast<unsigned int>(g_I.size())), + &(matrix_dims[0])); + g_A_I_J_r_vcl.handle1().context(opencl_ctx); + + g_A_I_J_r_vcl.handle2() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*static_cast<unsigned int>(g_I.size() + 1)), + &(blocks_ind[0])); + g_A_I_J_r_vcl.handle2().context(opencl_ctx); + + g_bv_r_vcl.handle() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(NumericT)*bv_size), + &(b_v_r[0])); + g_bv_r_vcl.handle().context(opencl_ctx); + + g_bv_r_vcl.handle1() = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_I.size() + 1)), + &(start_bv_r_inds[0])); + g_bv_r_vcl.handle().context(opencl_ctx); + + viennacl::ocl::handle<cl_mem> g_is_update_vcl = opencl_ctx.create_memory(CL_MEM_READ_WRITE, + static_cast<unsigned int>(sizeof(cl_uint)*(g_is_update.size())), + &(g_is_update[0])); + viennacl::linalg::opencl::kernels::spai<NumericT>::init(opencl_ctx); + viennacl::ocl::kernel& r_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_r_assembly"); + r_assembly_kernel.local_work_size(0, 1); + r_assembly_kernel.global_work_size(0, 256); + + viennacl::ocl::enqueue(r_assembly_kernel(g_A_I_J_vcl.handle(), g_A_I_J_vcl.handle2(), g_A_I_J_vcl.handle1(), + g_A_I_J_u_vcl.handle(), g_A_I_J_u_vcl.handle2(), g_A_I_J_u_vcl.handle1(), + g_A_I_u_J_u_vcl.handle(), g_A_I_u_J_u_vcl.handle2(), g_A_I_u_J_u_vcl.handle1(), + g_A_I_J_r_vcl.handle(), g_A_I_J_r_vcl.handle2(), g_A_I_J_r_vcl.handle1(), + g_is_update_vcl, static_cast<cl_uint>(g_I.size()))); + + viennacl::ocl::kernel & bv_assembly_kernel = opencl_ctx.get_kernel(viennacl::linalg::opencl::kernels::spai<NumericT>::program_name(), "block_bv_assembly"); + bv_assembly_kernel.local_work_size(0, 1); + bv_assembly_kernel.global_work_size(0, 256); + viennacl::ocl::enqueue(bv_assembly_kernel(g_bv_vcl.handle(), g_bv_vcl.handle1(), g_A_I_J_vcl.handle1(), g_bv_vcl_u.handle(), + g_bv_vcl_u.handle1(), g_A_I_J_u_vcl.handle1(), + g_bv_r_vcl.handle(), g_bv_r_vcl.handle1(), g_A_I_J_r_vcl.handle1(), g_is_update_vcl, + static_cast<cl_uint>(g_I.size()))); + g_bv_vcl.handle() = g_bv_r_vcl.handle(); + g_bv_vcl.handle1() = g_bv_r_vcl.handle1(); + + g_A_I_J_vcl.handle() = g_A_I_J_r_vcl.handle(); + g_A_I_J_vcl.handle2() = g_A_I_J_r_vcl.handle2(); + g_A_I_J_vcl.handle1() = g_A_I_J_r_vcl.handle1(); +} + +/** @brief GPU-based block update + * + * @param A sparse matrix + * @param A_v_c vectorized column-wise initial matrix + * @param g_is_update container with identificators that shows which block should be modified + * @param g_res container of residuals for all columns + * @param g_J container of column index sets for all columns + * @param g_I container of row index sets for all columns + * @param g_A_I_J_vcl container of block matrices from previous update + * @param g_bv_vcl block of beta vectors from previous iteration + * @param tag SPAI configuration tag + */ +template<typename NumericT, unsigned int AlignmentV, typename SparseVectorT> +void block_update(viennacl::compressed_matrix<NumericT, AlignmentV> const & A, + std::vector<SparseVectorT> const & A_v_c, + std::vector<cl_uint> & g_is_update, + std::vector<SparseVectorT> & g_res, + std::vector<std::vector<unsigned int> > & g_J, + std::vector<std::vector<unsigned int> > & g_I, + block_matrix & g_A_I_J_vcl, + block_vector & g_bv_vcl, + spai_tag const & tag) +{ + viennacl::context ctx = viennacl::traits::context(A); + //updated index set for columns + std::vector<std::vector<unsigned int> > g_J_u(g_J.size()); + //updated index set for rows + std::vector<std::vector<unsigned int> > g_I_u(g_J.size()); + //mixed index set of old and updated indices for rows + std::vector<std::vector<unsigned int> > g_I_q(g_J.size()); + //GPU memory for A_I_\hatJ + block_matrix g_A_I_J_u_vcl; + //GPU memory for A_\hatI_\hatJ + block_matrix g_A_I_u_J_u_vcl; + bool is_empty_block; + //GPU memory for new b_v + block_vector g_bv_u_vcl; + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i = 0; i < static_cast<long>(g_J.size()); ++i) + { + if (g_is_update[static_cast<vcl_size_t>(i)]) + { + if (buildAugmentedIndexSet<SparseVectorT, NumericT>(A_v_c, g_res[static_cast<vcl_size_t>(i)], g_J[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], tag)) + buildNewRowSet(A_v_c, g_I[static_cast<vcl_size_t>(i)], g_J_u[static_cast<vcl_size_t>(i)], g_I_u[static_cast<vcl_size_t>(i)]); + } + } + //assemble new A_I_J_u blocks on GPU and multiply them with Q' + block_assembly(A, g_J_u, g_I, g_A_I_J_u_vcl, g_is_update, is_empty_block); + //I have matrix A_I_J_u ready.. + block_q_multiplication<NumericT>(g_J_u, g_I, g_A_I_J_vcl, g_bv_vcl, g_A_I_J_u_vcl, g_is_update, ctx); + //assemble A_\hatI_\hatJ + block_assembly(A, g_J_u, g_I_u, g_A_I_u_J_u_vcl, g_is_update, is_empty_block); + assemble_qr_block<NumericT>(g_J, g_I, g_J_u, g_I_u, g_I_q, g_A_I_J_u_vcl, g_A_I_J_vcl.handle1(), + g_A_I_u_J_u_vcl, g_is_update, is_empty_block, ctx); + + block_qr<NumericT>(g_I_q, g_J_u, g_A_I_u_J_u_vcl, g_bv_u_vcl, g_is_update, ctx); + //concatanation of new and old indices +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for +#endif + for (long i = 0; i < static_cast<long>(g_J.size()); ++i) + { + g_J[static_cast<vcl_size_t>(i)].insert(g_J[static_cast<vcl_size_t>(i)].end(), g_J_u[static_cast<vcl_size_t>(i)].begin(), g_J_u[static_cast<vcl_size_t>(i)].end()); + g_I[static_cast<vcl_size_t>(i)].insert(g_I[static_cast<vcl_size_t>(i)].end(), g_I_u[static_cast<vcl_size_t>(i)].begin(), g_I_u[static_cast<vcl_size_t>(i)].end()); + } + assemble_r<NumericT>(g_I, g_J, g_A_I_J_vcl, g_A_I_J_u_vcl, g_A_I_u_J_u_vcl, g_bv_vcl, g_bv_u_vcl, g_is_update, ctx); +} + +} +} +} +} +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp new file mode 100644 index 0000000..0fd11146 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/detail/spai/spai-static.hpp @@ -0,0 +1,192 @@ +#ifndef VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP +#define VIENNACL_LINALG_DETAIL_SPAI_SPAI_STATIC_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/detail/spai/spai-static.hpp + @brief Implementation of a static SPAI. Experimental. + + SPAI code contributed by Nikolay Lukash +*/ + +#include <utility> +#include <iostream> +#include <fstream> +#include <string> +#include <algorithm> +#include <vector> +#include <math.h> +#include <map> +//#include "spai-dynamic.hpp" +#include "boost/numeric/ublas/vector.hpp" +#include "boost/numeric/ublas/matrix.hpp" +#include "boost/numeric/ublas/matrix_proxy.hpp" +#include "boost/numeric/ublas/vector_proxy.hpp" +#include "boost/numeric/ublas/storage.hpp" +#include "boost/numeric/ublas/io.hpp" +#include "boost/numeric/ublas/lu.hpp" +#include "boost/numeric/ublas/triangular.hpp" +#include "boost/numeric/ublas/matrix_expression.hpp" +// ViennaCL includes +#include "viennacl/linalg/prod.hpp" +#include "viennacl/matrix.hpp" +#include "viennacl/compressed_matrix.hpp" +#include "viennacl/linalg/sparse_matrix_operations.hpp" +#include "viennacl/linalg/matrix_operations.hpp" +#include "viennacl/scalar.hpp" +#include "viennacl/linalg/cg.hpp" +#include "viennacl/linalg/inner_prod.hpp" + +//#include "boost/numeric/ublas/detail/matrix_assign.hpp" + +namespace viennacl +{ +namespace linalg +{ +namespace detail +{ +namespace spai +{ + +/** @brief Determines if element ind is in set {J} + * + * @param J current set + * @param ind current element + */ +template<typename SizeT> +bool isInIndexSet(std::vector<SizeT> const & J, SizeT ind) +{ + return (std::find(J.begin(), J.end(), ind) != J.end()); +} + + + +/********************************* STATIC SPAI FUNCTIONS******************************************/ + +/** @brief Projects solution of LS problem onto original column m + * + * @param m_in solution of LS + * @param J set of non-zero columns + * @param m original column of M + */ +template<typename VectorT, typename SparseVectorT> +void fanOutVector(VectorT const & m_in, std::vector<unsigned int> const & J, SparseVectorT & m) +{ + unsigned int cnt = 0; + for (vcl_size_t i = 0; i < J.size(); ++i) + m[J[i]] = m_in(cnt++); +} + +/** @brief Solution of linear:R*x=y system by backward substitution + * + * @param R uppertriangular matrix + * @param y right handside vector + * @param x solution vector + */ +template<typename MatrixT, typename VectorT> +void backwardSolve(MatrixT const & R, VectorT const & y, VectorT & x) +{ + for (long i2 = static_cast<long>(R.size2())-1; i2 >= 0; i2--) + { + vcl_size_t i = static_cast<vcl_size_t>(i2); + x(i) = y(i); + for (vcl_size_t j = static_cast<vcl_size_t>(i)+1; j < R.size2(); ++j) + x(i) -= R(i,j)*x(j); + + x(i) /= R(i,i); + } +} + +/** @brief Perform projection of set I on the unit-vector + * + * @param I set of non-zero rows + * @param y result vector + * @param ind index of unit vector + */ +template<typename VectorT, typename NumericT> +void projectI(std::vector<unsigned int> const & I, VectorT & y, unsigned int ind) +{ + for (vcl_size_t i = 0; i < I.size(); ++i) + { + //y.resize(y.size()+1); + if (I[i] == ind) + y(i) = NumericT(1.0); + else + y(i) = NumericT(0.0); + } +} + +/** @brief Builds index set of projected columns for current column of preconditioner + * + * @param v current column of preconditioner + * @param J output - index set of non-zero columns + */ +template<typename SparseVectorT> +void buildColumnIndexSet(SparseVectorT const & v, std::vector<unsigned int> & J) +{ + for (typename SparseVectorT::const_iterator vec_it = v.begin(); vec_it != v.end(); ++vec_it) + J.push_back(vec_it->first); + + std::sort(J.begin(), J.end()); +} + +/** @brief Initialize preconditioner with sparcity pattern = p(A) + * + * @param A input matrix + * @param M output matrix - initialized preconditioner + */ +template<typename SparseMatrixT> +void initPreconditioner(SparseMatrixT const & A, SparseMatrixT & M) +{ + typedef typename SparseMatrixT::value_type NumericType; + + M.resize(A.size1(), A.size2(), false); + for (typename SparseMatrixT::const_iterator1 row_it = A.begin1(); row_it!= A.end1(); ++row_it) + for (typename SparseMatrixT::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it) + M(col_it.index1(),col_it.index2()) = NumericType(1); +} + +/** @brief Row projection for matrix A(:,J) -> A(I,J), building index set of non-zero rows + * + * @param A_v_c input matrix + * @param J set of non-zero rows + * @param I output matrix + */ +template<typename SparseVectorT> +void projectRows(std::vector<SparseVectorT> const & A_v_c, + std::vector<unsigned int> const & J, + std::vector<unsigned int> & I) +{ + for (vcl_size_t i = 0; i < J.size(); ++i) + { + for (typename SparseVectorT::const_iterator col_it = A_v_c[J[i]].begin(); col_it!=A_v_c[J[i]].end(); ++col_it) + { + if (!isInIndexSet(I, col_it->first)) + I.push_back(col_it->first); + } + } + std::sort(I.begin(), I.end()); +} + + +} //namespace spai +} //namespace detail +} //namespace linalg +} //namespace viennacl + +#endif
