http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp new file mode 100644 index 0000000..540ff82 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu.hpp @@ -0,0 +1,33 @@ +#ifndef VIENNACL_LINALG_ILU_HPP_ +#define VIENNACL_LINALG_ILU_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/ilu.hpp + @brief Implementations of incomplete factorization preconditioners. Convenience header file. +*/ + +#include "viennacl/linalg/detail/ilu/ilut.hpp" +#include "viennacl/linalg/detail/ilu/ilu0.hpp" +#include "viennacl/linalg/detail/ilu/block_ilu.hpp" +#include "viennacl/linalg/detail/ilu/chow_patel_ilu.hpp" + +#endif + + +
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp new file mode 100644 index 0000000..febd347 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/ilu_operations.hpp @@ -0,0 +1,334 @@ +#ifndef VIENNACL_LINALG_ILU_OPERATIONS_HPP_ +#define VIENNACL_LINALG_ILU_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/ilu_operations.hpp + @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner +*/ + +#include "viennacl/forwards.h" +#include "viennacl/range.hpp" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/traits/handle.hpp" +#include "viennacl/traits/stride.hpp" +#include "viennacl/linalg/host_based/ilu_operations.hpp" + +#ifdef VIENNACL_WITH_OPENCL + #include "viennacl/linalg/opencl/ilu_operations.hpp" +#endif + +#ifdef VIENNACL_WITH_CUDA + #include "viennacl/linalg/cuda/ilu_operations.hpp" +#endif + +namespace viennacl +{ +namespace linalg +{ + +/** 
@brief Extracts the lower triangular part L from A. + * + * Diagonal of L is stored explicitly in order to enable better code reuse. + * + */ +template<typename NumericT> +void extract_L(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L) +{ + switch (viennacl::traits::handle(A).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::extract_L(A, L); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::extract_L(A, L); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::extract_L(A, L); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L accordingly. + * + * Since A should not be modified (const-correctness), updates are in L. + * + */ +template<typename NumericT> +void icc_scale(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L) +{ + switch (viennacl::traits::handle(A).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::icc_scale(A, L); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::icc_scale(A, L); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::icc_scale(A, L); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC (cf. 
Algorithm 3 in paper, but for L rather than U) + * + * We use a fully synchronous (Jacobi-like) variant, because asynchronous methods as described in the paper are a nightmare to debug + * (and particularly funny if they sometimes fail, sometimes not) + * + * @param L Factor L to be updated for the incomplete Cholesky factorization + * @param aij_L Lower triangular portion from system matrix + */ +template<typename NumericT> +void icc_chow_patel_sweep(compressed_matrix<NumericT> & L, + vector<NumericT> & aij_L) +{ + switch (viennacl::traits::handle(L).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::icc_chow_patel_sweep(L, aij_L); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::icc_chow_patel_sweep(L, aij_L); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::icc_chow_patel_sweep(L, aij_L); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + + +//////////////////////// ILU //////////////////// + +/** @brief Extracts the lower triangular part L and the upper triangular part U from A. + * + * Diagonals of L and U are stored explicitly in order to enable better code reuse. 
+ * + */ +template<typename NumericT> +void extract_LU(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L, + compressed_matrix<NumericT> & U) +{ + switch (viennacl::traits::handle(A).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::extract_LU(A, L, U); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::extract_LU(A, L, U); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::extract_LU(A, L, U); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. + * + * Since A should not be modified (const-correctness), updates are in L and U. + * + */ +template<typename NumericT> +void ilu_scale(compressed_matrix<NumericT> const & A, + compressed_matrix<NumericT> & L, + compressed_matrix<NumericT> & U) +{ + switch (viennacl::traits::handle(A).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::ilu_scale(A, L, U); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::ilu_scale(A, L, U); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::ilu_scale(A, L, U); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Transposition B <- A^T, where the aij-vector is permuted in the same way as the value array in A when assigned to B + * + * @param A Input matrix to be transposed + * @param B Output matrix containing the transposed matrix + */ +template<typename NumericT> +void ilu_transpose(compressed_matrix<NumericT> const & A, + 
compressed_matrix<NumericT> & B) +{ + viennacl::context orig_ctx = viennacl::traits::context(A); + viennacl::context cpu_ctx(viennacl::MAIN_MEMORY); + (void)orig_ctx; + (void)cpu_ctx; + + viennacl::compressed_matrix<NumericT> A_host(0, 0, 0, cpu_ctx); + (void)A_host; + + switch (viennacl::traits::handle(A).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::ilu_transpose(A, B); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + A_host = A; + B.switch_memory_context(cpu_ctx); + viennacl::linalg::host_based::ilu_transpose(A_host, B); + B.switch_memory_context(orig_ctx); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + A_host = A; + B.switch_memory_context(cpu_ctx); + viennacl::linalg::host_based::ilu_transpose(A_host, B); + B.switch_memory_context(orig_ctx); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + + +/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU (cf. 
Algorithm 2 in paper) + * + * We use a fully synchronous (Jacobi-like) variant, because asynchronous methods as described in the paper are a nightmare to debug + * (and particularly funny if they sometimes fail, sometimes not) + * + * @param L Lower-triangular matrix L in LU factorization + * @param aij_L Lower-triangular matrix L from A + * @param U_trans Upper-triangular matrix U in CSC-storage, which is the same as U^trans in CSR-storage + * @param aij_U_trans Upper-triangular matrix from A in CSC-storage, which is the same as U^trans in CSR-storage + */ +template<typename NumericT> +void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L, + vector<NumericT> const & aij_L, + compressed_matrix<NumericT> & U_trans, + vector<NumericT> const & aij_U_trans) +{ + switch (viennacl::traits::handle(L).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::ilu_chow_patel_sweep(L, aij_L, U_trans, aij_U_trans); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Forms the matrix R and the diagonal diag_R needed for approximating the triangular solves by a truncated Neumann series. + * + * NOTE(review): the previous text here was copy-pasted from extract_LU and did not describe ilu_form_neumann_matrix. 
+ * + */ +template<typename NumericT> +void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R, + vector<NumericT> & diag_R) +{ + switch (viennacl::traits::handle(R).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::ilu_form_neumann_matrix(R, diag_R); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::ilu_form_neumann_matrix(R, diag_R); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::ilu_form_neumann_matrix(R, diag_R); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +} //namespace linalg +} //namespace viennacl + + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp new file mode 100644 index 0000000..b31a82a --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/inner_prod.hpp @@ -0,0 +1,186 @@ +#ifndef VIENNACL_LINALG_INNER_PROD_HPP_ +#define VIENNACL_LINALG_INNER_PROD_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. 
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/inner_prod.hpp + @brief Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations. +*/ + +#include "viennacl/forwards.h" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/meta/tag_of.hpp" +#include "viennacl/meta/result_of.hpp" + +namespace viennacl +{ +// +// generic inner_prod function +// uses tag dispatch to identify which algorithm +// should be called +// +namespace linalg +{ + +#ifdef VIENNACL_WITH_ARMADILLO +// ---------------------------------------------------- +// Armadillo +// +template<typename NumericT> +NumericT inner_prod(arma::Col<NumericT> const& v1, arma::Col<NumericT> const& v2) +{ + return dot(v1, v2); +} +#endif + +#ifdef VIENNACL_WITH_EIGEN +// ---------------------------------------------------- +// EIGEN +// +template<typename VectorT1, typename VectorT2> +typename viennacl::enable_if< viennacl::is_eigen< typename viennacl::traits::tag_of< VectorT1 >::type >::value, + typename VectorT1::RealScalar>::type +inner_prod(VectorT1 const & v1, VectorT2 const & v2) +{ + //std::cout << "eigen .. " << std::endl; + return v1.dot(v2); +} +#endif + +#ifdef VIENNACL_WITH_MTL4 +// ---------------------------------------------------- +// MTL4 +// +template<typename VectorT1, typename VectorT2> +typename viennacl::enable_if< viennacl::is_mtl4< typename viennacl::traits::tag_of< VectorT1 >::type >::value, + typename VectorT1::value_type>::type +inner_prod(VectorT1 const & v1, VectorT2 const & v2) +{ + //std::cout << "mtl4 .. 
" << std::endl; + return mtl::dot(v1, v2); +} +#endif + +#ifdef VIENNACL_WITH_UBLAS +// ---------------------------------------------------- +// UBLAS +// +template<typename VectorT1, typename VectorT2> +typename viennacl::enable_if< viennacl::is_ublas< typename viennacl::traits::tag_of< VectorT1 >::type >::value, + typename VectorT1::value_type>::type +inner_prod(VectorT1 const & v1, VectorT2 const & v2) +{ + //std::cout << "ublas .. " << std::endl; + return boost::numeric::ublas::inner_prod(v1, v2); +} +#endif + +// ---------------------------------------------------- +// STL +// +template<typename VectorT1, typename VectorT2> +typename viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value, + typename VectorT1::value_type>::type +inner_prod(VectorT1 const & v1, VectorT2 const & v2) +{ + assert(v1.size() == v2.size() && bool("Vector sizes mismatch")); + //std::cout << "stl .. " << std::endl; + typename VectorT1::value_type result = 0; + for (typename VectorT1::size_type i=0; i<v1.size(); ++i) + result += v1[i] * v2[i]; + + return result; +} + +// ---------------------------------------------------- +// VIENNACL +// +template<typename NumericT> +viennacl::scalar_expression< const vector_base<NumericT>, const vector_base<NumericT>, viennacl::op_inner_prod > +inner_prod(vector_base<NumericT> const & vector1, + vector_base<NumericT> const & vector2) +{ + //std::cout << "viennacl .. " << std::endl; + return viennacl::scalar_expression< const vector_base<NumericT>, + const vector_base<NumericT>, + viennacl::op_inner_prod >(vector1, vector2); +} + + +// expression on lhs: +template< typename LHS, typename RHS, typename OP, typename NumericT> +viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>, + const vector_base<NumericT>, + viennacl::op_inner_prod > +inner_prod(viennacl::vector_expression<LHS, RHS, OP> const & vector1, + vector_base<NumericT> const & vector2) +{ + //std::cout << "viennacl .. 
" << std::endl; + return viennacl::scalar_expression< const viennacl::vector_expression<LHS, RHS, OP>, + const vector_base<NumericT>, + viennacl::op_inner_prod >(vector1, vector2); +} + +// expression on rhs: +template<typename NumericT, typename LHS, typename RHS, typename OP> +viennacl::scalar_expression< const vector_base<NumericT>, + const viennacl::vector_expression<LHS, RHS, OP>, + viennacl::op_inner_prod > +inner_prod(vector_base<NumericT> const & vector1, + viennacl::vector_expression<LHS, RHS, OP> const & vector2) +{ + //std::cout << "viennacl .. " << std::endl; + return viennacl::scalar_expression< const vector_base<NumericT>, + const viennacl::vector_expression<LHS, RHS, OP>, + viennacl::op_inner_prod >(vector1, vector2); +} + +// expression on lhs and rhs: +template<typename LHS1, typename RHS1, typename OP1, + typename LHS2, typename RHS2, typename OP2> +viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>, + const viennacl::vector_expression<LHS2, RHS2, OP2>, + viennacl::op_inner_prod > +inner_prod(viennacl::vector_expression<LHS1, RHS1, OP1> const & vector1, + viennacl::vector_expression<LHS2, RHS2, OP2> const & vector2) +{ + //std::cout << "viennacl .. 
" << std::endl; + return viennacl::scalar_expression< const viennacl::vector_expression<LHS1, RHS1, OP1>, + const viennacl::vector_expression<LHS2, RHS2, OP2>, + viennacl::op_inner_prod >(vector1, vector2); +} + + +// Multiple inner products: +template<typename NumericT> +viennacl::vector_expression< const vector_base<NumericT>, const vector_tuple<NumericT>, viennacl::op_inner_prod > +inner_prod(vector_base<NumericT> const & x, + vector_tuple<NumericT> const & y_tuple) +{ + return viennacl::vector_expression< const vector_base<NumericT>, + const vector_tuple<NumericT>, + viennacl::op_inner_prod >(x, y_tuple); +} + + +} // end namespace linalg +} // end namespace viennacl +#endif + + http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp new file mode 100644 index 0000000..78a813d --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/iterative_operations.hpp @@ -0,0 +1,425 @@ +#ifndef VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_ +#define VIENNACL_LINALG_ITERATIVE_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. 
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/iterative_operations.hpp + @brief Implementations of specialized routines for the iterative solvers. +*/ + +#include "viennacl/forwards.h" +#include "viennacl/range.hpp" +#include "viennacl/scalar.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/meta/predicate.hpp" +#include "viennacl/meta/enable_if.hpp" +#include "viennacl/traits/size.hpp" +#include "viennacl/traits/start.hpp" +#include "viennacl/traits/handle.hpp" +#include "viennacl/traits/stride.hpp" +#include "viennacl/linalg/host_based/iterative_operations.hpp" + +#ifdef VIENNACL_WITH_OPENCL + #include "viennacl/linalg/opencl/iterative_operations.hpp" +#endif + +#ifdef VIENNACL_WITH_CUDA + #include "viennacl/linalg/cuda/iterative_operations.hpp" +#endif + +namespace viennacl +{ +namespace linalg +{ + +/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm. 
+ * + * This routines computes for vectors 'result', 'p', 'r', 'Ap': + * result += alpha * p; + * r -= alpha * Ap; + * p = r + beta * p; + * and runs the parallel reduction stage for computing inner_prod(r,r) + */ +template<typename NumericT> +void pipelined_cg_vector_update(vector_base<NumericT> & result, + NumericT alpha, + vector_base<NumericT> & p, + vector_base<NumericT> & r, + vector_base<NumericT> const & Ap, + NumericT beta, + vector_base<NumericT> & inner_prod_buffer) +{ + switch (viennacl::traits::handle(result).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_cg_vector_update(result, alpha, p, r, Ap, beta, inner_prod_buffer); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + +/** @brief Performs a joint vector update operation needed for an efficient pipelined CG algorithm. 
 + * + * This routine computes for a matrix A and vectors 'p' and 'Ap': + * Ap = prod(A, p); + * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap) + */ +template<typename MatrixT, typename NumericT> +void pipelined_cg_prod(MatrixT const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> & inner_prod_buffer) +{ + switch (viennacl::traits::handle(p).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_cg_prod(A, p, Ap, inner_prod_buffer); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_cg_prod(A, p, Ap, inner_prod_buffer); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_cg_prod(A, p, Ap, inner_prod_buffer); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +//////////////////////////////////////////// + +/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm. 
+ * + * This routines computes for vectors 's', 'r', 'Ap': + * s = r - alpha * Ap + * with alpha obtained from a reduction step on the 0th and the 3rd out of 6 chunks in inner_prod_buffer + * and runs the parallel reduction stage for computing inner_prod(s,s) + */ +template<typename NumericT> +void pipelined_bicgstab_update_s(vector_base<NumericT> & s, + vector_base<NumericT> & r, + vector_base<NumericT> const & Ap, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + switch (viennacl::traits::handle(s).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_bicgstab_update_s(s, r, Ap, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Performs a joint vector update operation needed for an efficient pipelined BiCGStab algorithm. 
 + * + * x_{j+1} = x_j + alpha * p_j + omega * s_j + * r_{j+1} = s_j - omega * t_j + * p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j) + * and compute first stage of r_dot_r0 = <r_{j+1}, r_0^*> for use in next iteration + */ +template<typename NumericT> +void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s, + vector_base<NumericT> & residual, vector_base<NumericT> const & As, + NumericT beta, vector_base<NumericT> const & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size) +{ + switch (viennacl::traits::handle(s).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size); + break; + #ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size); + break; + #endif + #ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_bicgstab_vector_update(result, alpha, p, omega, s, residual, As, beta, Ap, r0star, inner_prod_buffer, buffer_chunk_size); + break; + #endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + +/** @brief Performs the matrix-vector product needed for an efficient pipelined BiCGStab algorithm. 
+ * + * This routines computes for a matrix A and vectors 'p' and 'Ap': + * Ap = prod(A, p); + * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap) + */ +template<typename MatrixT, typename NumericT> +void pipelined_bicgstab_prod(MatrixT const & A, + vector_base<NumericT> const & p, + vector_base<NumericT> & Ap, + vector_base<NumericT> const & r0star, + vector_base<NumericT> & inner_prod_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + switch (viennacl::traits::handle(p).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +//////////////////////////////////////////// + +/** @brief Performs a vector normalization needed for an efficient pipelined GMRES algorithm. 
+ * + * This routines computes for vectors 'r', 'v_k': + * Second reduction step for ||v_k|| + * v_k /= ||v_k|| + * First reduction step for <r, v_k> + */ +template <typename T> +void pipelined_gmres_normalize_vk(vector_base<T> & v_k, + vector_base<T> const & residual, + vector_base<T> & R_buffer, + vcl_size_t offset_in_R, + vector_base<T> const & inner_prod_buffer, + vector_base<T> & r_dot_vk_buffer, + vcl_size_t buffer_chunk_size, + vcl_size_t buffer_chunk_offset) +{ + switch (viennacl::traits::handle(v_k).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_gmres_normalize_vk(v_k, residual, R_buffer, offset_in_R, inner_prod_buffer, r_dot_vk_buffer, buffer_chunk_size, buffer_chunk_offset); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + + +/** @brief Computes the first reduction stage for multiple inner products <v_i, v_k>, i=0..k-1 + * + * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size' + */ +template <typename T> +void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vcl_size_t k, + vector_base<T> & vi_in_vk_buffer, + vcl_size_t buffer_chunk_size) +{ + switch 
(viennacl::traits::handle(device_krylov_basis).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage1(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, buffer_chunk_size); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + +/** @brief Computes the second reduction stage for multiple inner products <v_i, v_k>, i=0..k-1, then updates v_k -= <v_i, v_k> v_i and computes the first reduction stage for ||v_k|| + * + * All vectors v_i are stored column-major in the array 'device_krylov_basis', where each vector has an actual length 'v_k_size', but might be padded to have 'v_k_internal_size' + */ +template <typename T> +void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vcl_size_t k, + vector_base<T> const & vi_in_vk_buffer, + vector_base<T> & R_buffer, + vcl_size_t krylov_dim, + vector_base<T> & inner_prod_buffer, + vcl_size_t buffer_chunk_size) +{ + switch (viennacl::traits::handle(device_krylov_basis).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + 
viennacl::linalg::opencl::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_gmres_gram_schmidt_stage2(device_krylov_basis, v_k_size, v_k_internal_size, k, vi_in_vk_buffer, R_buffer, krylov_dim, inner_prod_buffer, buffer_chunk_size); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + +/** @brief Computes x += eta_0 r + sum_{i=1}^{k-1} eta_i v_{i-1} */ +template <typename T> +void pipelined_gmres_update_result(vector_base<T> & result, + vector_base<T> const & residual, + vector_base<T> const & krylov_basis, + vcl_size_t v_k_size, + vcl_size_t v_k_internal_size, + vector_base<T> const & coefficients, + vcl_size_t k) +{ + switch (viennacl::traits::handle(result).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_gmres_update_result(result, residual, krylov_basis, v_k_size, v_k_internal_size, coefficients, k); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + +/** @brief Performs a joint vector update operation needed for an efficient pipelined GMRES algorithm. 
+ * + * This routines computes for a matrix A and vectors 'p' and 'Ap': + * Ap = prod(A, p); + * and computes the two reduction stages for computing inner_prod(p,Ap), inner_prod(Ap,Ap) + */ +template <typename MatrixType, typename T> +void pipelined_gmres_prod(MatrixType const & A, + vector_base<T> const & p, + vector_base<T> & Ap, + vector_base<T> & inner_prod_buffer) +{ + switch (viennacl::traits::handle(p).get_active_handle_id()) + { + case viennacl::MAIN_MEMORY: + viennacl::linalg::host_based::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer); + break; +#ifdef VIENNACL_WITH_OPENCL + case viennacl::OPENCL_MEMORY: + viennacl::linalg::opencl::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer); + break; +#endif +#ifdef VIENNACL_WITH_CUDA + case viennacl::CUDA_MEMORY: + viennacl::linalg::cuda::pipelined_gmres_prod(A, p, Ap, inner_prod_buffer); + break; +#endif + case viennacl::MEMORY_NOT_INITIALIZED: + throw memory_exception("not initialised!"); + default: + throw memory_exception("not implemented"); + } +} + + +} //namespace linalg +} //namespace viennacl + + +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp new file mode 100644 index 0000000..0b16964 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/jacobi_precond.hpp @@ -0,0 +1,141 @@ +#ifndef VIENNACL_LINALG_JACOBI_PRECOND_HPP_ +#define VIENNACL_LINALG_JACOBI_PRECOND_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. 
+ + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/jacobi_precond.hpp + @brief Implementation of a simple Jacobi preconditioner +*/ + +#include <vector> +#include <cmath> +#include "viennacl/forwards.h" +#include "viennacl/vector.hpp" +#include "viennacl/compressed_matrix.hpp" +#include "viennacl/tools/tools.hpp" +#include "viennacl/linalg/sparse_matrix_operations.hpp" +#include "viennacl/linalg/row_scaling.hpp" + +#include <map> + +namespace viennacl +{ +namespace linalg +{ + +/** @brief A tag for a jacobi preconditioner +*/ +class jacobi_tag {}; + + +/** @brief Jacobi preconditioner class, can be supplied to solve()-routines. Generic version for non-ViennaCL matrices. +*/ +template<typename MatrixT, + bool is_viennacl = detail::row_scaling_for_viennacl<MatrixT>::value > +class jacobi_precond +{ + typedef typename MatrixT::value_type NumericType; + + public: + jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(viennacl::traits::size1(mat)) + { + init(mat); + } + + void init(MatrixT const & mat) + { + diag_A_.resize(viennacl::traits::size1(mat)); //resize without preserving values + + for (typename MatrixT::const_iterator1 row_it = mat.begin1(); + row_it != mat.end1(); + ++row_it) + { + bool diag_found = false; + for (typename MatrixT::const_iterator2 col_it = row_it.begin(); + col_it != row_it.end(); + ++col_it) + { + if (col_it.index1() == col_it.index2()) + { + diag_A_[col_it.index1()] = *col_it; + diag_found = true; + } + } + if (!diag_found) + throw zero_on_diagonal_exception("ViennaCL: Zero in diagonal encountered while setting up Jacobi preconditioner!"); + } + } + + + /** @brief Apply to res = b - Ax, i.e. 
jacobi applied vec (right hand side), */ + template<typename VectorT> + void apply(VectorT & vec) const + { + assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch")); + for (vcl_size_t i=0; i<diag_A_.size(); ++i) + vec[i] /= diag_A_[i]; + } + + private: + std::vector<NumericType> diag_A_; +}; + + +/** @brief Jacobi preconditioner class, can be supplied to solve()-routines. +* +* Specialization for compressed_matrix +*/ +template<typename MatrixT> +class jacobi_precond<MatrixT, true> +{ + typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type NumericType; + + public: + jacobi_precond(MatrixT const & mat, jacobi_tag const &) : diag_A_(mat.size1(), viennacl::traits::context(mat)) + { + init(mat); + } + + + void init(MatrixT const & mat) + { + detail::row_info(mat, diag_A_, detail::SPARSE_ROW_DIAGONAL); + } + + + template<unsigned int AlignmentV> + void apply(viennacl::vector<NumericType, AlignmentV> & vec) const + { + assert(viennacl::traits::size(diag_A_) == viennacl::traits::size(vec) && bool("Size mismatch")); + vec = element_div(vec, diag_A_); + } + + private: + viennacl::vector<NumericType> diag_A_; +}; + +} +} + + + + +#endif + + + http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp new file mode 100644 index 0000000..ffac471 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lanczos.hpp @@ -0,0 +1,515 @@ +#ifndef VIENNACL_LINALG_LANCZOS_HPP_ +#define VIENNACL_LINALG_LANCZOS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. 
+ Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/lanczos.hpp +* @brief Generic interface for the Lanczos algorithm. +* +* Contributed by Guenther Mader and Astrid Rupp. +*/ + +#include <cmath> +#include <vector> +#include "viennacl/vector.hpp" +#include "viennacl/compressed_matrix.hpp" +#include "viennacl/linalg/prod.hpp" +#include "viennacl/linalg/inner_prod.hpp" +#include "viennacl/linalg/norm_2.hpp" +#include "viennacl/io/matrix_market.hpp" +#include "viennacl/linalg/bisect.hpp" +#include "viennacl/tools/random.hpp" + +namespace viennacl +{ +namespace linalg +{ + +/** @brief A tag for the lanczos algorithm. 
+*/ +class lanczos_tag +{ +public: + + enum + { + partial_reorthogonalization = 0, + full_reorthogonalization, + no_reorthogonalization + }; + + /** @brief The constructor + * + * @param factor Exponent of epsilon - tolerance for batches of Reorthogonalization + * @param numeig Number of eigenvalues to be returned + * @param met Method for Lanczos-Algorithm: 0 for partial Reorthogonalization, 1 for full Reorthogonalization and 2 for Lanczos without Reorthogonalization + * @param krylov Maximum krylov-space size + */ + + lanczos_tag(double factor = 0.75, + vcl_size_t numeig = 10, + int met = 0, + vcl_size_t krylov = 100) : factor_(factor), num_eigenvalues_(numeig), method_(met), krylov_size_(krylov) {} + + /** @brief Sets the number of eigenvalues */ + void num_eigenvalues(vcl_size_t numeig){ num_eigenvalues_ = numeig; } + + /** @brief Returns the number of eigenvalues */ + vcl_size_t num_eigenvalues() const { return num_eigenvalues_; } + + /** @brief Sets the exponent of epsilon. Values between 0.6 and 0.9 usually give best results. */ + void factor(double fct) { factor_ = fct; } + + /** @brief Returns the exponent */ + double factor() const { return factor_; } + + /** @brief Sets the size of the kylov space. Must be larger than number of eigenvalues to compute. */ + void krylov_size(vcl_size_t max) { krylov_size_ = max; } + + /** @brief Returns the size of the kylov space */ + vcl_size_t krylov_size() const { return krylov_size_; } + + /** @brief Sets the reorthogonalization method */ + void method(int met){ method_ = met; } + + /** @brief Returns the reorthogonalization method */ + int method() const { return method_; } + + +private: + double factor_; + vcl_size_t num_eigenvalues_; + int method_; // see enum defined above for possible values + vcl_size_t krylov_size_; +}; + + +namespace detail +{ + /** @brief Inverse iteration for finding an eigenvector for an eigenvalue. + * + * beta[0] to be ignored for consistency. 
+ */ + template<typename NumericT> + void inverse_iteration(std::vector<NumericT> const & alphas, std::vector<NumericT> const & betas, + NumericT & eigenvalue, std::vector<NumericT> & eigenvector) + { + std::vector<NumericT> alpha_sweeped = alphas; + for (vcl_size_t i=0; i<alpha_sweeped.size(); ++i) + alpha_sweeped[i] -= eigenvalue; + for (vcl_size_t row=1; row < alpha_sweeped.size(); ++row) + alpha_sweeped[row] -= betas[row] * betas[row] / alpha_sweeped[row-1]; + + // starting guess: ignore last equation + eigenvector[alphas.size() - 1] = 1.0; + + for (vcl_size_t iter=0; iter<1; ++iter) + { + // solve first n-1 equations (A - \lambda I) y = -beta[n] + eigenvector[alphas.size() - 1] /= alpha_sweeped[alphas.size() - 1]; + for (vcl_size_t row2=1; row2 < alphas.size(); ++row2) + { + vcl_size_t row = alphas.size() - row2 - 1; + eigenvector[row] -= eigenvector[row+1] * betas[row+1]; + eigenvector[row] /= alpha_sweeped[row]; + } + + // normalize eigenvector: + NumericT norm_vector = 0; + for (vcl_size_t i=0; i<eigenvector.size(); ++i) + norm_vector += eigenvector[i] * eigenvector[i]; + norm_vector = std::sqrt(norm_vector); + for (vcl_size_t i=0; i<eigenvector.size(); ++i) + eigenvector[i] /= norm_vector; + } + + //eigenvalue = (alphas[0] * eigenvector[0] + betas[1] * eigenvector[1]) / eigenvector[0]; + } + + /** + * @brief Implementation of the Lanczos PRO algorithm (partial reorthogonalization) + * + * @param A The system matrix + * @param r Random start vector + * @param eigenvectors_A Dense matrix holding the eigenvectors of A (one eigenvector per column) + * @param size Size of krylov-space + * @param tag Lanczos_tag with several options for the algorithm + * @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues. 
+ * @return Returns the eigenvalues (number of eigenvalues equals size of krylov-space) + */ + + template<typename MatrixT, typename DenseMatrixT, typename NumericT> + std::vector<NumericT> + lanczosPRO (MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t size, lanczos_tag const & tag, bool compute_eigenvectors) + { + // generation of some random numbers, used for lanczos PRO algorithm + viennacl::tools::normal_random_numbers<NumericT> get_N; + + std::vector<vcl_size_t> l_bound(size/2), u_bound(size/2); + vcl_size_t n = r.size(); + std::vector<NumericT> w(size), w_old(size); //w_k, w_{k-1} + + NumericT inner_rt; + std::vector<NumericT> alphas, betas; + viennacl::matrix<NumericT, viennacl::column_major> Q(n, size); //column-major matrix holding the Krylov basis vectors + + bool second_step = false; + NumericT eps = std::numeric_limits<NumericT>::epsilon(); + NumericT squ_eps = std::sqrt(eps); + NumericT eta = std::exp(std::log(eps) * tag.factor()); + + NumericT beta = viennacl::linalg::norm_2(r); + + r /= beta; + + viennacl::vector_base<NumericT> q_0(Q.handle(), Q.size1(), 0, 1); + q_0 = r; + + viennacl::vector<NumericT> u = viennacl::linalg::prod(A, r); + NumericT alpha = viennacl::linalg::inner_prod(u, r); + alphas.push_back(alpha); + w[0] = 1; + betas.push_back(beta); + + vcl_size_t batches = 0; + for (vcl_size_t i = 1; i < size; i++) // Main loop for setting up the Krylov space + { + viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1); + r = u - alpha * q_iminus1; + beta = viennacl::linalg::norm_2(r); + + betas.push_back(beta); + r = r / beta; + + // + // Update recurrence relation for estimating orthogonality loss + // + w_old = w; + w[0] = (betas[1] * w_old[1] + (alphas[0] - alpha) * w_old[0] - betas[i - 1] * w_old[0]) / beta + eps * 0.3 * get_N() * (betas[1] + beta); + for (vcl_size_t j = 1; j < i - 1; j++) + w[j] = (betas[j + 1] * w_old[j + 1] + (alphas[j] - alpha) * w_old[j] + 
betas[j] * w_old[j - 1] - betas[i - 1] * w_old[j]) / beta + eps * 0.3 * get_N() * (betas[j + 1] + beta); + w[i-1] = 0.6 * eps * NumericT(n) * get_N() * betas[1] / beta; + + // + // Check whether there has been a need for reorthogonalization detected in the previous iteration. + // If so, run the reorthogonalization for each batch + // + if (second_step) + { + for (vcl_size_t j = 0; j < batches; j++) + { + for (vcl_size_t k = l_bound[j] + 1; k < u_bound[j] - 1; k++) + { + viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1); + inner_rt = viennacl::linalg::inner_prod(r, q_k); + r = r - inner_rt * q_k; + w[k] = 1.5 * eps * get_N(); + } + } + NumericT temp = viennacl::linalg::norm_2(r); + r = r / temp; + beta = beta * temp; + second_step = false; + } + batches = 0; + + // + // Check for semiorthogonality + // + for (vcl_size_t j = 0; j < i; j++) + { + if (std::fabs(w[j]) >= squ_eps) // tentative loss of orthonormality, hence reorthonomalize + { + viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1); + inner_rt = viennacl::linalg::inner_prod(r, q_j); + r = r - inner_rt * q_j; + w[j] = 1.5 * eps * get_N(); + vcl_size_t k = j - 1; + + // orthogonalization with respect to earlier basis vectors + while (std::fabs(w[k]) > eta) + { + viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1); + inner_rt = viennacl::linalg::inner_prod(r, q_k); + r = r - inner_rt * q_k; + w[k] = 1.5 * eps * get_N(); + if (k == 0) break; + k--; + } + l_bound[batches] = k; + + // orthogonalization with respect to later basis vectors + k = j + 1; + while (k < i && std::fabs(w[k]) > eta) + { + viennacl::vector_base<NumericT> q_k(Q.handle(), Q.size1(), k * Q.internal_size1(), 1); + inner_rt = viennacl::linalg::inner_prod(r, q_k); + r = r - inner_rt * q_k; + w[k] = 1.5 * eps * get_N(); + k++; + } + u_bound[batches] = k - 1; + batches++; + + j = k-1; // go to end of current batch + } + } + + // + // Normalize 
basis vector and reorthogonalize as needed + // + if (batches > 0) + { + NumericT temp = viennacl::linalg::norm_2(r); + r = r / temp; + beta = beta * temp; + second_step = true; + } + + // store Krylov vector in Q: + viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1); + q_i = r; + + // + // determine and store alpha = <r, u> with u = A q_i - beta q_{i-1} + // + u = viennacl::linalg::prod(A, r); + u += (-beta) * q_iminus1; + alpha = viennacl::linalg::inner_prod(u, r); + alphas.push_back(alpha); + } + + // + // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations: + // + std::vector<NumericT> eigenvalues = bisect(alphas, betas); + + // + // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature. + // + if (compute_eigenvectors) + { + std::vector<NumericT> eigenvector_tridiag(alphas.size()); + for (std::size_t i=0; i < tag.num_eigenvalues(); ++i) + { + // compute eigenvector of tridiagonal matrix via inverse: + inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag); + + // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix + viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size()); + viennacl::copy(eigenvector_tridiag, eigenvector_u); + + viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(), + eigenvectors_A.size1(), + eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(), + eigenvectors_A.row_major() ? 
eigenvectors_A.internal_size2() : 1); + eigenvector_A = viennacl::linalg::prod(project(Q, + range(0, Q.size1()), + range(0, eigenvector_u.size())), + eigenvector_u); + } + } + + return eigenvalues; + } + + + /** + * @brief Implementation of the Lanczos FRO algorithm + * + * @param A The system matrix + * @param r Random start vector + * @param eigenvectors_A A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported. + * @param krylov_dim Size of krylov-space + * @param tag The Lanczos tag holding tolerances, etc. + * @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues. + * @return Returns the eigenvalues (number of eigenvalues equals size of krylov-space) + */ + template< typename MatrixT, typename DenseMatrixT, typename NumericT> + std::vector<NumericT> + lanczos(MatrixT const& A, vector_base<NumericT> & r, DenseMatrixT & eigenvectors_A, vcl_size_t krylov_dim, lanczos_tag const & tag, bool compute_eigenvectors) + { + std::vector<NumericT> alphas, betas; + viennacl::vector<NumericT> Aq(r.size()); + viennacl::matrix<NumericT, viennacl::column_major> Q(r.size(), krylov_dim + 1); // Krylov basis (each Krylov vector is one column) + + NumericT norm_r = norm_2(r); + NumericT beta = norm_r; + r /= norm_r; + + // first Krylov vector: + viennacl::vector_base<NumericT> q0(Q.handle(), Q.size1(), 0, 1); + q0 = r; + + // + // Step 1: Run Lanczos' method to obtain tridiagonal matrix + // + for (vcl_size_t i = 0; i < krylov_dim; i++) + { + betas.push_back(beta); + // last available vector from Krylov basis: + viennacl::vector_base<NumericT> q_i(Q.handle(), Q.size1(), i * Q.internal_size1(), 1); + + // Lanczos algorithm: + // - Compute A * q: + Aq = viennacl::linalg::prod(A, q_i); + + // - Form Aq <- Aq - <Aq, q_i> * q_i - beta * q_{i-1}, where beta is ||q_i|| before normalization in previous iteration + NumericT alpha = 
viennacl::linalg::inner_prod(Aq, q_i); + Aq -= alpha * q_i; + + if (i > 0) + { + viennacl::vector_base<NumericT> q_iminus1(Q.handle(), Q.size1(), (i-1) * Q.internal_size1(), 1); + Aq -= beta * q_iminus1; + + // Extra measures for improved numerical stability? + if (tag.method() == lanczos_tag::full_reorthogonalization) + { + // Gram-Schmidt (re-)orthogonalization: + // TODO: Reuse fast (pipelined) routines from GMRES or GEMV + for (vcl_size_t j = 0; j < i; j++) + { + viennacl::vector_base<NumericT> q_j(Q.handle(), Q.size1(), j * Q.internal_size1(), 1); + NumericT inner_rq = viennacl::linalg::inner_prod(Aq, q_j); + Aq -= inner_rq * q_j; + } + } + } + + // normalize Aq and add to Krylov basis at column i+1 in Q: + beta = viennacl::linalg::norm_2(Aq); + viennacl::vector_base<NumericT> q_iplus1(Q.handle(), Q.size1(), (i+1) * Q.internal_size1(), 1); + q_iplus1 = Aq / beta; + + alphas.push_back(alpha); + } + + // + // Step 2: Compute eigenvalues of tridiagonal matrix obtained during Lanczos iterations: + // + std::vector<NumericT> eigenvalues = bisect(alphas, betas); + + // + // Step 3: Compute eigenvectors via inverse iteration. Does not update eigenvalues, so only approximate by nature. + // + if (compute_eigenvectors) + { + std::vector<NumericT> eigenvector_tridiag(alphas.size()); + for (std::size_t i=0; i < tag.num_eigenvalues(); ++i) + { + // compute eigenvector of tridiagonal matrix via inverse: + inverse_iteration(alphas, betas, eigenvalues[eigenvalues.size() - i - 1], eigenvector_tridiag); + + // eigenvector w of full matrix A. Given as w = Q * u, where u is the eigenvector of the tridiagonal matrix + viennacl::vector<NumericT> eigenvector_u(eigenvector_tridiag.size()); + viennacl::copy(eigenvector_tridiag, eigenvector_u); + + viennacl::vector_base<NumericT> eigenvector_A(eigenvectors_A.handle(), + eigenvectors_A.size1(), + eigenvectors_A.row_major() ? i : i * eigenvectors_A.internal_size1(), + eigenvectors_A.row_major() ? 
eigenvectors_A.internal_size2() : 1); + eigenvector_A = viennacl::linalg::prod(project(Q, + range(0, Q.size1()), + range(0, eigenvector_u.size())), + eigenvector_u); + } + } + + return eigenvalues; + } + +} // end namespace detail + +/** +* @brief Implementation of the calculation of eigenvalues using lanczos (with and without reorthogonalization). +* +* Implementation of Lanczos with partial reorthogonalization is implemented separately. +* +* @param matrix The system matrix +* @param eigenvectors_A A dense matrix in which the eigenvectors of A will be stored. Both row- and column-major matrices are supported. +* @param tag Tag with several options for the lanczos algorithm +* @param compute_eigenvectors Boolean flag. If true, eigenvectors are computed. Otherwise the routine returns after calculating eigenvalues. +* @return Returns the n largest eigenvalues (n defined in the lanczos_tag) +*/ +template<typename MatrixT, typename DenseMatrixT> +std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type > +eig(MatrixT const & matrix, DenseMatrixT & eigenvectors_A, lanczos_tag const & tag, bool compute_eigenvectors = true) +{ + typedef typename viennacl::result_of::value_type<MatrixT>::type NumericType; + typedef typename viennacl::result_of::cpu_value_type<NumericType>::type CPU_NumericType; + typedef typename viennacl::result_of::vector_for_matrix<MatrixT>::type VectorT; + + viennacl::tools::uniform_random_numbers<CPU_NumericType> random_gen; + + std::vector<CPU_NumericType> eigenvalues; + vcl_size_t matrix_size = matrix.size1(); + VectorT r(matrix_size); + std::vector<CPU_NumericType> s(matrix_size); + + for (vcl_size_t i=0; i<s.size(); ++i) + s[i] = CPU_NumericType(0.5) + random_gen(); + + detail::copy_vec_to_vec(s,r); + + vcl_size_t size_krylov = (matrix_size < tag.krylov_size()) ? 
matrix_size + : tag.krylov_size(); + + switch (tag.method()) + { + case lanczos_tag::partial_reorthogonalization: + eigenvalues = detail::lanczosPRO(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors); + break; + case lanczos_tag::full_reorthogonalization: + case lanczos_tag::no_reorthogonalization: + eigenvalues = detail::lanczos(matrix, r, eigenvectors_A, size_krylov, tag, compute_eigenvectors); + break; + } + + std::vector<CPU_NumericType> largest_eigenvalues; + + for (vcl_size_t i = 1; i<=tag.num_eigenvalues(); i++) + largest_eigenvalues.push_back(eigenvalues[size_krylov-i]); + + + return largest_eigenvalues; +} + + +/** +* @brief Implementation of the calculation of eigenvalues using lanczos (with and without reorthogonalization). +* +* Implementation of Lanczos with partial reorthogonalization is implemented separately. +* +* @param matrix The system matrix +* @param tag Tag with several options for the lanczos algorithm +* @return Returns the n largest eigenvalues (n defined in the lanczos_tag) +*/ +template<typename MatrixT> +std::vector< typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type > +eig(MatrixT const & matrix, lanczos_tag const & tag) +{ + typedef typename viennacl::result_of::cpu_value_type<typename MatrixT::value_type>::type NumericType; + + viennacl::matrix<NumericType> eigenvectors(matrix.size1(), tag.num_eigenvalues()); + return eig(matrix, eigenvectors, tag, false); +} + +} // end namespace linalg +} // end namespace viennacl +#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp new file mode 100644 index 0000000..0bdd037 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/lu.hpp @@ -0,0 +1,227 @@ +#ifndef VIENNACL_LINALG_LU_HPP 
+#define VIENNACL_LINALG_LU_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/lu.hpp + @brief Implementations of LU factorization for row-major and column-major dense matrices. +*/ + +#include <algorithm> //for std::min + +#include "viennacl/matrix.hpp" +#include "viennacl/matrix_proxy.hpp" + +#include "viennacl/linalg/prod.hpp" +#include "viennacl/linalg/direct_solve.hpp" + +namespace viennacl +{ +namespace linalg +{ +/** @brief LU factorization of a row-major dense matrix. +* +* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written. 
+*/ +template<typename NumericT> +void lu_factorize(matrix<NumericT, viennacl::row_major> & A) +{ + typedef matrix<NumericT, viennacl::row_major> MatrixType; + + vcl_size_t max_block_size = 32; + vcl_size_t num_blocks = (A.size2() - 1) / max_block_size + 1; + std::vector<NumericT> temp_buffer(A.internal_size2() * max_block_size); + + // Iterate over panels + for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id) + { + vcl_size_t row_start = panel_id * max_block_size; + vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - row_start, max_block_size); + + viennacl::range block_range(row_start, row_start + current_block_size); + viennacl::range remainder_range(row_start + current_block_size, A.size1()); + + // + // Perform LU factorization on panel: + // + + + // Read from matrix to buffer: + viennacl::backend::memory_read(A.handle(), + sizeof(NumericT) * row_start * A.internal_size2(), + sizeof(NumericT) * current_block_size * A.internal_size2(), + &(temp_buffer[0])); + + // Factorize (kij-version): + for (vcl_size_t k=0; k < current_block_size - 1; ++k) + { + for (vcl_size_t i=k+1; i < current_block_size; ++i) + { + temp_buffer[row_start + i * A.internal_size2() + k] /= temp_buffer[row_start + k * A.internal_size2() + k]; // write l_ik + + NumericT l_ik = temp_buffer[row_start + i * A.internal_size2() + k]; + + for (vcl_size_t j = row_start + k + 1; j < A.size1(); ++j) + temp_buffer[i * A.internal_size2() + j] -= l_ik * temp_buffer[k * A.internal_size2() + j]; // l_ik * a_kj + } + } + + // Write back: + viennacl::backend::memory_write(A.handle(), + sizeof(NumericT) * row_start * A.internal_size2(), + sizeof(NumericT) * current_block_size * A.internal_size2(), + &(temp_buffer[0])); + + if (remainder_range.size() > 0) + { + // + // Compute L_12 = [ (U_11)^{T}^{-1} A_{21}^T ]^T + // + viennacl::matrix_range<MatrixType> U_11(A, block_range, block_range); + viennacl::matrix_range<MatrixType> A_21(A, remainder_range, block_range); + 
viennacl::linalg::inplace_solve(trans(U_11), trans(A_21), viennacl::linalg::lower_tag()); + + // + // Update remainder of A + // + viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range); + viennacl::matrix_range<MatrixType> U_12(A, block_range, remainder_range); + viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range); + + A_22 -= viennacl::linalg::prod(L_21, U_12); + } + } + +} + + +/** @brief LU factorization of a column-major dense matrix. +* +* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written. +*/ +template<typename NumericT> +void lu_factorize(matrix<NumericT, viennacl::column_major> & A) +{ + typedef matrix<NumericT, viennacl::column_major> MatrixType; + + vcl_size_t max_block_size = 32; + vcl_size_t num_blocks = (A.size1() - 1) / max_block_size + 1; + std::vector<NumericT> temp_buffer(A.internal_size1() * max_block_size); + + // Iterate over panels + for (vcl_size_t panel_id = 0; panel_id < num_blocks; ++panel_id) + { + vcl_size_t col_start = panel_id * max_block_size; + vcl_size_t current_block_size = std::min<vcl_size_t>(A.size1() - col_start, max_block_size); + + viennacl::range block_range(col_start, col_start + current_block_size); + viennacl::range remainder_range(col_start + current_block_size, A.size1()); + + // + // Perform LU factorization on panel: + // + + + // Read from matrix to buffer: + viennacl::backend::memory_read(A.handle(), + sizeof(NumericT) * col_start * A.internal_size1(), + sizeof(NumericT) * current_block_size * A.internal_size1(), + &(temp_buffer[0])); + + // Factorize (kji-version): + for (vcl_size_t k=0; k < current_block_size; ++k) + { + NumericT a_kk = temp_buffer[col_start + k + k * A.internal_size1()]; + for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i) + temp_buffer[i + k * A.internal_size1()] /= a_kk; // write l_ik + + for (vcl_size_t j=k+1; j < current_block_size; ++j) + { + NumericT a_kj = 
temp_buffer[col_start + k + j * A.internal_size1()]; + for (vcl_size_t i=col_start+k+1; i < A.size1(); ++i) + temp_buffer[i + j * A.internal_size1()] -= temp_buffer[i + k * A.internal_size1()] * a_kj; // l_ik * a_kj + } + } + + // Write back: + viennacl::backend::memory_write(A.handle(), + sizeof(NumericT) * col_start * A.internal_size1(), + sizeof(NumericT) * current_block_size * A.internal_size1(), + &(temp_buffer[0])); + + if (remainder_range.size() > 0) + { + // + // Compute U_12: + // + viennacl::matrix_range<MatrixType> L_11(A, block_range, block_range); + viennacl::matrix_range<MatrixType> A_12(A, block_range, remainder_range); + viennacl::linalg::inplace_solve(L_11, A_12, viennacl::linalg::unit_lower_tag()); + + // + // Update remainder of A + // + viennacl::matrix_range<MatrixType> L_21(A, remainder_range, block_range); + viennacl::matrix_range<MatrixType> U_12(A, block_range, remainder_range); + viennacl::matrix_range<MatrixType> A_22(A, remainder_range, remainder_range); + + A_22 -= viennacl::linalg::prod(L_21, U_12); + } + + } + +} + + +// +// Convenience layer: +// + +/** @brief LU substitution for the system LU = rhs. +* +* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written. +* @param B The matrix of load vectors, where the solution is directly written to +*/ +template<typename NumericT, typename F1, typename F2, unsigned int AlignmentV1, unsigned int AlignmentV2> +void lu_substitute(matrix<NumericT, F1, AlignmentV1> const & A, + matrix<NumericT, F2, AlignmentV2> & B) +{ + assert(A.size1() == A.size2() && bool("Matrix must be square")); + assert(A.size1() == B.size1() && bool("Matrix must be square")); + inplace_solve(A, B, unit_lower_tag()); + inplace_solve(A, B, upper_tag()); +} + +/** @brief LU substitution for the system LU = rhs. +* +* @param A The system matrix, where the LU matrices are directly written to. The implicit unit diagonal of L is not written. 
+* @param vec The load vector, where the solution is directly written to +*/ +template<typename NumericT, typename F, unsigned int MatAlignmentV, unsigned int VecAlignmentV> +void lu_substitute(matrix<NumericT, F, MatAlignmentV> const & A, + vector<NumericT, VecAlignmentV> & vec) +{ + assert(A.size1() == A.size2() && bool("Matrix must be square")); + inplace_solve(A, vec, unit_lower_tag()); + inplace_solve(A, vec, upper_tag()); +} + +} +} + +#endif
