http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp deleted file mode 100644 index 1d212c2..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/direct_solve.hpp +++ /dev/null @@ -1,307 +0,0 @@ -#ifndef VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP -#define VIENNACL_LINALG_HOST_BASED_DIRECT_SOLVE_HPP - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/host_based/direct_solve.hpp - @brief Implementations of dense direct triangular solvers are found here. -*/ - -#include "viennacl/vector.hpp" -#include "viennacl/matrix.hpp" - -#include "viennacl/linalg/host_based/common.hpp" - -namespace viennacl -{ -namespace linalg -{ -namespace host_based -{ - -namespace detail -{ - // - // Upper solve: - // - template<typename MatrixT1, typename MatrixT2> - void upper_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal) - { - typedef typename MatrixT2::value_type value_type; - - for (vcl_size_t i = 0; i < A_size; ++i) - { - vcl_size_t current_row = A_size - i - 1; - - for (vcl_size_t j = current_row + 1; j < A_size; ++j) - { - value_type A_element = A(current_row, j); - for (vcl_size_t k=0; k < B_size; ++k) - B(current_row, k) -= A_element * B(j, k); - } - - if (!unit_diagonal) - { - value_type A_diag = A(current_row, current_row); - for (vcl_size_t k=0; k < B_size; ++k) - B(current_row, k) /= A_diag; - } - } - } - - template<typename MatrixT1, typename MatrixT2> - void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_upper_tag) - { - upper_inplace_solve_matrix(A, B, A_size, B_size, true); - } - - template<typename MatrixT1, typename MatrixT2> - void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::upper_tag) - { - upper_inplace_solve_matrix(A, B, A_size, B_size, false); - } - - // - // Lower solve: - // - template<typename MatrixT1, typename MatrixT2> - void lower_inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, bool unit_diagonal) - { - typedef typename MatrixT2::value_type value_type; - - for (vcl_size_t i = 0; i < A_size; ++i) - { - for (vcl_size_t j = 0; j < i; ++j) - { - value_type A_element = A(i, j); - for (vcl_size_t k=0; k < B_size; ++k) - B(i, k) -= A_element * B(j, k); - } - - if (!unit_diagonal) - { - value_type A_diag = A(i, i); - for (vcl_size_t k=0; k < B_size; ++k) - B(i, k) /= A_diag; - } - } - } - - template<typename MatrixT1, typename MatrixT2> - void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::unit_lower_tag) - { - lower_inplace_solve_matrix(A, B, A_size, B_size, true); - } - - template<typename MatrixT1, typename MatrixT2> - void inplace_solve_matrix(MatrixT1 & A, MatrixT2 & B, vcl_size_t A_size, vcl_size_t B_size, viennacl::linalg::lower_tag) - { - lower_inplace_solve_matrix(A, B, A_size, B_size, false); - } - -} - -// -// Note: By convention, all size checks are performed in the calling frontend. No need to double-check here. -// - -////////////////// upper triangular solver (upper_tag) ////////////////////////////////////// -/** @brief Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notation) -* -* @param A The system matrix -* @param B The matrix of row vectors, where the solution is directly written to -*/ -template<typename NumericT, typename SolverTagT> -void inplace_solve(matrix_base<NumericT> const & A, - matrix_base<NumericT> & B, - SolverTagT) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(A); - value_type * data_B = detail::extract_raw_pointer<value_type>(B); - - vcl_size_t A_start1 = viennacl::traits::start1(A); - vcl_size_t A_start2 = viennacl::traits::start2(A); - vcl_size_t A_inc1 = viennacl::traits::stride1(A); - vcl_size_t A_inc2 = viennacl::traits::stride2(A); - //vcl_size_t A_size1 = viennacl::traits::size1(A); - vcl_size_t A_size2 = viennacl::traits::size2(A); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); - - vcl_size_t B_start1 = viennacl::traits::start1(B); - vcl_size_t B_start2 = viennacl::traits::start2(B); - vcl_size_t B_inc1 = viennacl::traits::stride1(B); - vcl_size_t B_inc2 = viennacl::traits::stride2(B); - //vcl_size_t B_size1 = viennacl::traits::size1(B); - vcl_size_t B_size2 = viennacl::traits::size2(B); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B); - - - if (A.row_major() && B.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - - detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); - } - else if (A.row_major() && !B.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - - detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); - } - else if (!A.row_major() && B.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - - detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - - detail::inplace_solve_matrix(wrapper_A, wrapper_B, A_size2, B_size2, SolverTagT()); - } -} - - -// -// Solve on vector -// - -namespace detail -{ - // - // Upper solve: - // - template<typename MatrixT, typename VectorT> - void upper_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal) - { - typedef typename VectorT::value_type value_type; - - for (vcl_size_t i = 0; i < A_size; ++i) - { - vcl_size_t current_row = A_size - i - 1; - - for (vcl_size_t j = current_row + 1; j < A_size; ++j) - { - value_type A_element = A(current_row, j); - b(current_row) -= A_element * b(j); - } - - if (!unit_diagonal) - b(current_row) /= A(current_row, current_row); - } - } - - template<typename MatrixT, typename VectorT> - void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_upper_tag) - { - upper_inplace_solve_vector(A, b, A_size, true); - } - - template<typename MatrixT, typename VectorT> - void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::upper_tag) - { - upper_inplace_solve_vector(A, b, A_size, false); - } - - // - // Lower solve: - // - template<typename MatrixT, typename VectorT> - void lower_inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, bool unit_diagonal) - { - typedef typename VectorT::value_type value_type; - - for (vcl_size_t i = 0; i < A_size; ++i) - { - for (vcl_size_t j = 0; j < i; ++j) - { - value_type A_element = A(i, j); - b(i) -= A_element * b(j); - } - - if (!unit_diagonal) - b(i) /= A(i, i); - } - } - - template<typename MatrixT, typename VectorT> - void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::unit_lower_tag) - { - lower_inplace_solve_vector(A, b, A_size, true); - } - - template<typename MatrixT, typename VectorT> - void inplace_solve_vector(MatrixT & A, VectorT & b, vcl_size_t A_size, viennacl::linalg::lower_tag) - { - lower_inplace_solve_vector(A, b, A_size, false); - } - -} - -template<typename NumericT, typename SolverTagT> -void inplace_solve(matrix_base<NumericT> const & mat, - vector_base<NumericT> & vec, - SolverTagT) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type * data_v = detail::extract_raw_pointer<value_type>(vec); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - vcl_size_t start1 = viennacl::traits::start(vec); - vcl_size_t inc1 = viennacl::traits::stride(vec); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1); - - detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT()); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::vector_array_wrapper<value_type> wrapper_v(data_v, start1, inc1); - - detail::inplace_solve_vector(wrapper_A, wrapper_v, A_size2, SolverTagT()); - } -} - - -} // namespace host_based -} // namespace linalg -} // namespace viennacl - -#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp deleted file mode 100644 index f53f8f2..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/fft_operations.hpp +++ /dev/null @@ -1,856 +0,0 @@ -#ifndef VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_ -#define VIENNACL_LINALG_HOST_BASED_FFT_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/host_based/fft_operations.hpp - @brief Implementations of Fast Furier Transformation using a plain single-threaded or OpenMP-enabled execution on CPU - */ - -//TODO openom Conditions -#include <viennacl/vector.hpp> -#include <viennacl/matrix.hpp> - -#include "viennacl/linalg/host_based/vector_operations.hpp" - -#include <stdexcept> -#include <cmath> -#include <complex> - -namespace viennacl -{ -namespace linalg -{ -namespace host_based -{ -namespace detail -{ - namespace fft - { - const vcl_size_t MAX_LOCAL_POINTS_NUM = 512; - - namespace FFT_DATA_ORDER - { - enum DATA_ORDER - { - ROW_MAJOR, COL_MAJOR - }; - } - - inline vcl_size_t num_bits(vcl_size_t size) - { - vcl_size_t bits_datasize = 0; - vcl_size_t ds = 1; - - while (ds < size) - { - ds = ds << 1; - bits_datasize++; - } - - return bits_datasize; - } - - inline vcl_size_t next_power_2(vcl_size_t n) - { - n = n - 1; - - vcl_size_t power = 1; - - while (power < sizeof(vcl_size_t) * 8) - { - n = n | (n >> power); - power *= 2; - } - - return n + 1; - } - - inline vcl_size_t get_reorder_num(vcl_size_t v, vcl_size_t bit_size) - { - v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); - v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); - v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); - v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); - v = (v >> 16) | (v << 16); - v = v >> (32 - bit_size); - return v; - } - - template<typename NumericT, unsigned int AlignmentV> - void copy_to_complex_array(std::complex<NumericT> * input_complex, - viennacl::vector<NumericT, AlignmentV> const & in, vcl_size_t size) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size * 2); i2 += 2) - { //change array to complex array - vcl_size_t i = vcl_size_t(i2); - input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]); - } - } - - template<typename NumericT> - void copy_to_complex_array(std::complex<NumericT> * input_complex, - viennacl::vector_base<NumericT> const & in, vcl_size_t size) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size * 2); i2 += 2) - { //change array to complex array - vcl_size_t i = vcl_size_t(i2); - input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]); - } - } - - template<typename NumericT, unsigned int AlignmentV> - void copy_to_vector(std::complex<NumericT> * input_complex, - viennacl::vector<NumericT, AlignmentV> & in, vcl_size_t size) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - in(i * 2) = static_cast<NumericT>(std::real(input_complex[i])); - in(i * 2 + 1) = static_cast<NumericT>(std::imag(input_complex[i])); - } - } - - template<typename NumericT> - void copy_to_complex_array(std::complex<NumericT> * input_complex, - NumericT const * in, vcl_size_t size) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size * 2); i2 += 2) - { //change array to complex array - vcl_size_t i = vcl_size_t(i2); - input_complex[i / 2] = std::complex<NumericT>(in[i], in[i + 1]); - } - } - - template<typename NumericT> - void copy_to_vector(std::complex<NumericT> * input_complex, NumericT * in, vcl_size_t size) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - in[i * 2] = static_cast<NumericT>(std::real(input_complex[i])); - in[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i])); - } - } - - template<typename NumericT> - void copy_to_vector(std::complex<NumericT> * input_complex, - viennacl::vector_base<NumericT> & in, vcl_size_t size) - { - std::vector<NumericT> temp(2 * size); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - temp[i * 2] = static_cast<NumericT>(std::real(input_complex[i])); - temp[i * 2 + 1] = static_cast<NumericT>(std::imag(input_complex[i])); - } - viennacl::copy(temp, in); - } - - template<typename NumericT> - void zero2(NumericT *input1, NumericT *input2, vcl_size_t size) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - input1[i] = 0; - input2[i] = 0; - } - } - - } //namespace fft - -} //namespace detail - -/** - * @brief Direct algoritm kenrnel - */ -template<typename NumericT> -void fft_direct(std::complex<NumericT> * input_complex, std::complex<NumericT> * output, - vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign, - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - NumericT const NUM_PI = NumericT(3.14159265358979323846); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel -#endif - for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) - { - vcl_size_t batch_id = vcl_size_t(batch_id2); - for (vcl_size_t k = 0; k < size; k += 1) - { - std::complex<NumericT> f = 0; - for (vcl_size_t n = 0; n < size; n++) - { - std::complex<NumericT> input; - if (!data_order) - input = input_complex[batch_id * stride + n]; //input index here - else - input = input_complex[n * stride + batch_id]; - NumericT arg = sign * 2 * NUM_PI * NumericT(k) / NumericT(size * n); - NumericT sn = std::sin(arg); - NumericT cs = std::cos(arg); - - std::complex<NumericT> ex(cs, sn); - std::complex<NumericT> tmp(input.real() * ex.real() - input.imag() * ex.imag(), - input.real() * ex.imag() + input.imag() * ex.real()); - f = f + tmp; - } - if (!data_order) - output[batch_id * stride + k] = f; // output index here - else - output[k * stride + batch_id] = f; - } - } - -} - -/** - * @brief Direct 1D algorithm for computing Fourier transformation. - * - * Works on any sizes of data. - * Serial implementation has o(n^2) complexity - */ -template<typename NumericT, unsigned int AlignmentV> -void direct(viennacl::vector<NumericT, AlignmentV> const & in, - viennacl::vector<NumericT, AlignmentV> & out, - vcl_size_t size, vcl_size_t stride, - vcl_size_t batch_num, NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - std::vector<std::complex<NumericT> > input_complex(size * batch_num); - std::vector<std::complex<NumericT> > output(size * batch_num); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num); - - fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order); - - viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], out, size * batch_num); -} - -/** - * @brief Direct 2D algorithm for computing Fourier transformation. - * - * Works on any sizes of data. - * Serial implementation has o(n^2) complexity - */ -template<typename NumericT, unsigned int AlignmentV> -void direct(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & in, - viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & out, vcl_size_t size, - vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - vcl_size_t row_num = in.internal_size1(); - vcl_size_t col_num = in.internal_size2() >> 1; - - vcl_size_t size_mat = row_num * col_num; - - std::vector<std::complex<NumericT> > input_complex(size_mat); - std::vector<std::complex<NumericT> > output(size_mat); - - NumericT const * data_A = detail::extract_raw_pointer<NumericT>(in); - NumericT * data_B = detail::extract_raw_pointer<NumericT>(out); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size_mat); - - fft_direct(&input_complex[0], &output[0], size, stride, batch_num, sign, data_order); - - viennacl::linalg::host_based::detail::fft::copy_to_vector(&output[0], data_B, size_mat); -} - -/* - * This function performs reorder of 1D input data. Indexes are sorted in bit-reversal order. - * Such reordering should be done before in-place FFT. - */ -template<typename NumericT, unsigned int AlignmentV> -void reorder(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride, - vcl_size_t bits_datasize, vcl_size_t batch_num, - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - std::vector<std::complex<NumericT> > input(size * batch_num); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], in, size * batch_num); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) - { - vcl_size_t batch_id = vcl_size_t(batch_id2); - for (vcl_size_t i = 0; i < size; i++) - { - vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize); - if (i < v) - { - if (!data_order) - { - std::complex<NumericT> tmp = input[batch_id * stride + i]; // index - input[batch_id * stride + i] = input[batch_id * stride + v]; //index - input[batch_id * stride + v] = tmp; //index - } - else - { - std::complex<NumericT> tmp = input[i * stride + batch_id]; // index - input[i * stride + batch_id] = input[v * stride + batch_id]; //index - input[v * stride + batch_id] = tmp; //index - } - } - } - } - viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], in, size * batch_num); -} - -/* - * This function performs reorder of 2D input data. Indexes are sorted in bit-reversal order. - * Such reordering should be done before in-place FFT. - */ -template<typename NumericT, unsigned int AlignmentV> -void reorder(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, - vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num, - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - - NumericT * data = detail::extract_raw_pointer<NumericT>(in); - vcl_size_t row_num = in.internal_size1(); - vcl_size_t col_num = in.internal_size2() >> 1; - vcl_size_t size_mat = row_num * col_num; - - std::vector<std::complex<NumericT> > input(size_mat); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input[0], data, size_mat); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) - { - vcl_size_t batch_id = vcl_size_t(batch_id2); - for (vcl_size_t i = 0; i < size; i++) - { - vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(i, bits_datasize); - if (i < v) - { - if (!data_order) - { - std::complex<NumericT> tmp = input[batch_id * stride + i]; // index - input[batch_id * stride + i] = input[batch_id * stride + v]; //index - input[batch_id * stride + v] = tmp; //index - } else - { - std::complex<NumericT> tmp = input[i * stride + batch_id]; // index - input[i * stride + batch_id] = input[v * stride + batch_id]; //index - input[v * stride + batch_id] = tmp; //index - } - } - } - } - viennacl::linalg::host_based::detail::fft::copy_to_vector(&input[0], data, size_mat); -} - -/** - * @brief Radix-2 algorithm for computing Fourier transformation. - * Kernel for computing smaller amount of data - */ -template<typename NumericT> -void fft_radix2(std::complex<NumericT> * input_complex, vcl_size_t batch_num, - vcl_size_t bit_size, vcl_size_t size, vcl_size_t stride, NumericT sign, - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - NumericT const NUM_PI = NumericT(3.14159265358979323846); - - for (vcl_size_t step = 0; step < bit_size; step++) - { - vcl_size_t ss = 1 << step; - vcl_size_t half_size = size >> 1; - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long batch_id2 = 0; batch_id2 < long(batch_num); batch_id2++) - { - vcl_size_t batch_id = vcl_size_t(batch_id2); - for (vcl_size_t tid = 0; tid < half_size; tid++) - { - vcl_size_t group = (tid & (ss - 1)); - vcl_size_t pos = ((tid >> step) << (step + 1)) + group; - std::complex<NumericT> in1; - std::complex<NumericT> in2; - vcl_size_t offset; - if (!data_order) - { - offset = batch_id * stride + pos; - in1 = input_complex[offset]; - in2 = input_complex[offset + ss]; - } - else - { - offset = pos * stride + batch_id; - in1 = input_complex[offset]; - in2 = input_complex[offset + ss * stride]; - } - NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss); - NumericT sn = std::sin(arg); - NumericT cs = std::cos(arg); - std::complex<NumericT> ex(cs, sn); - std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(), - in2.real() * ex.imag() + in2.imag() * ex.real()); - if (!data_order) - input_complex[offset + ss] = in1 - tmp; - else - input_complex[offset + ss * stride] = in1 - tmp; - input_complex[offset] = in1 + tmp; - } - } - } - -} - -/** - * @brief Radix-2 algorithm for computing Fourier transformation. - * Kernel for computing bigger amount of data - */ -template<typename NumericT> -void fft_radix2_local(std::complex<NumericT> * input_complex, - std::complex<NumericT> * lcl_input, vcl_size_t batch_num, vcl_size_t bit_size, - vcl_size_t size, vcl_size_t stride, NumericT sign, - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - NumericT const NUM_PI = NumericT(3.14159265358979323846); - - for (vcl_size_t batch_id = 0; batch_id < batch_num; batch_id++) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long p2 = 0; p2 < long(size); p2 += 1) - { - vcl_size_t p = vcl_size_t(p2); - vcl_size_t v = viennacl::linalg::host_based::detail::fft::get_reorder_num(p, bit_size); - - if (!data_order) - lcl_input[v] = input_complex[batch_id * stride + p]; //index - else - lcl_input[v] = input_complex[p * stride + batch_id]; - } - - for (vcl_size_t s = 0; s < bit_size; s++) - { - vcl_size_t ss = 1 << s; -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long tid2 = 0; tid2 < long(size)/2; tid2++) - { - vcl_size_t tid = vcl_size_t(tid2); - vcl_size_t group = (tid & (ss - 1)); - vcl_size_t pos = ((tid >> s) << (s + 1)) + group; - - std::complex<NumericT> in1 = lcl_input[pos]; - std::complex<NumericT> in2 = lcl_input[pos + ss]; - - NumericT arg = NumericT(group) * sign * NUM_PI / NumericT(ss); - - NumericT sn = std::sin(arg); - NumericT cs = std::cos(arg); - std::complex<NumericT> ex(cs, sn); - - std::complex<NumericT> tmp(in2.real() * ex.real() - in2.imag() * ex.imag(), - in2.real() * ex.imag() + in2.imag() * ex.real()); - - lcl_input[pos + ss] = in1 - tmp; - lcl_input[pos] = in1 + tmp; - } - - } -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - //copy local array back to global memory - for (long p2 = 0; p2 < long(size); p2 += 1) - { - vcl_size_t p = vcl_size_t(p2); - if (!data_order) - input_complex[batch_id * stride + p] = lcl_input[p]; - else - input_complex[p * stride + batch_id] = lcl_input[p]; - - } - - } - -} - -/** - * @brief Radix-2 1D algorithm for computing Fourier transformation. - * - * Works only on power-of-two sizes of data. - * Serial implementation has o(n * lg n) complexity. - * This is a Cooley-Tukey algorithm - */ -template<typename NumericT, unsigned int AlignmentV> -void radix2(viennacl::vector<NumericT, AlignmentV>& in, vcl_size_t size, vcl_size_t stride, - vcl_size_t batch_num, NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - - vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size); - - std::vector<std::complex<NumericT> > input_complex(size * batch_num); - std::vector<std::complex<NumericT> > lcl_input(size * batch_num); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num); - - if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM) - { - viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order); - } - else - { - viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size * batch_num); - viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order); - } - - viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size * batch_num); -} - -/** - * @brief Radix-2 2D algorithm for computing Fourier transformation. - * - * Works only on power-of-two sizes of data. - * Serial implementation has o(n * lg n) complexity. - * This is a Cooley-Tukey algorithm - */ -template<typename NumericT, unsigned int AlignmentV> -void radix2(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV>& in, vcl_size_t size, - vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1), - viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR) -{ - - vcl_size_t bit_size = viennacl::linalg::host_based::detail::fft::num_bits(size); - - NumericT * data = detail::extract_raw_pointer<NumericT>(in); - - vcl_size_t row_num = in.internal_size1(); - vcl_size_t col_num = in.internal_size2() >> 1; - vcl_size_t size_mat = row_num * col_num; - - std::vector<std::complex<NumericT> > input_complex(size_mat); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat); - if (size <= viennacl::linalg::host_based::detail::fft::MAX_LOCAL_POINTS_NUM) - { - //std::cout<<bit_size<<","<<size<<","<<stride<<","<<batch_num<<","<<size<<","<<sign<<","<<data_order<<std::endl; - std::vector<std::complex<NumericT> > lcl_input(size_mat); - viennacl::linalg::host_based::fft_radix2_local(&input_complex[0], &lcl_input[0], batch_num, bit_size, size, stride, sign, data_order); - } - else - { - viennacl::linalg::host_based::reorder<NumericT>(in, size, stride, bit_size, batch_num, data_order); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size_mat); - viennacl::linalg::host_based::fft_radix2(&input_complex[0], batch_num, bit_size, size, stride, sign, data_order); - } - - viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size_mat); - -} - -/** - * @brief Bluestein's algorithm for computing Fourier transformation. - * - * Currently, Works only for sizes of input data which less than 2^16. - * Uses a lot of additional memory, but should be fast for any size of data. - * Serial implementation has something about o(n * lg n) complexity - */ -template<typename NumericT, unsigned int AlignmentV> -void bluestein(viennacl::vector<NumericT, AlignmentV>& in, viennacl::vector<NumericT, AlignmentV>& out, vcl_size_t /*batch_num*/) -{ - - vcl_size_t size = in.size() >> 1; - vcl_size_t ext_size = viennacl::linalg::host_based::detail::fft::next_power_2(2 * size - 1); - - viennacl::vector<NumericT, AlignmentV> A(ext_size << 1); - viennacl::vector<NumericT, AlignmentV> B(ext_size << 1); - viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1); - - std::vector<std::complex<NumericT> > input_complex(size); - std::vector<std::complex<NumericT> > output_complex(size); - - std::vector<std::complex<NumericT> > A_complex(ext_size); - std::vector<std::complex<NumericT> > B_complex(ext_size); - std::vector<std::complex<NumericT> > Z_complex(ext_size); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], in, size); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long i2 = 0; i2 < long(ext_size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - A_complex[i] = 0; - B_complex[i] = 0; - } - - vcl_size_t double_size = size << 1; - - NumericT const NUM_PI = NumericT(3.14159265358979323846); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - vcl_size_t rm = i * i % (double_size); - NumericT angle = NumericT(rm) / NumericT(size) * NumericT(NUM_PI); - - NumericT sn_a = std::sin(-angle); - NumericT cs_a = std::cos(-angle); - - std::complex<NumericT> a_i(cs_a, sn_a); - std::complex<NumericT> b_i(cs_a, -sn_a); - - A_complex[i] = std::complex<NumericT>(input_complex[i].real() * a_i.real() - input_complex[i].imag() * a_i.imag(), - input_complex[i].real() * a_i.imag() + input_complex[i].imag() * a_i.real()); - B_complex[i] = b_i; - - // very bad instruction, to be fixed - if (i) - B_complex[ext_size - i] = b_i; - } - - viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], in, size); - viennacl::linalg::host_based::detail::fft::copy_to_vector(&A_complex[0], A, ext_size); - viennacl::linalg::host_based::detail::fft::copy_to_vector(&B_complex[0], B, ext_size); - - viennacl::linalg::convolve_i(A, B, Z); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&Z_complex[0], Z, ext_size); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - vcl_size_t rm = i * i % (double_size); - NumericT angle = NumericT(rm) / NumericT(size) * NumericT(-NUM_PI); - NumericT sn_a = std::sin(angle); - NumericT cs_a = std::cos(angle); - std::complex<NumericT> b_i(cs_a, sn_a); - output_complex[i] = std::complex<NumericT>(Z_complex[i].real() * b_i.real() - Z_complex[i].imag() * b_i.imag(), - Z_complex[i].real() * b_i.imag() + Z_complex[i].imag() * b_i.real()); - } - viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], out, size); - -} - -/** - * @brief Normalize vector with his own size - */ -template<typename NumericT, unsigned int AlignmentV> -void normalize(viennacl::vector<NumericT, AlignmentV> & input) -{ - vcl_size_t size = input.size() >> 1; - NumericT norm_factor = static_cast<NumericT>(size); - for (vcl_size_t i = 0; i < size * 2; i++) - input[i] /= norm_factor; - -} - -/** - * @brief Complex multiplikation of two vectors - */ -template<typename NumericT, unsigned int AlignmentV> -void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1, - viennacl::vector<NumericT, AlignmentV> const & input2, - viennacl::vector<NumericT, AlignmentV> & output) -{ - vcl_size_t size = input1.size() >> 1; - - std::vector<std::complex<NumericT> > input1_complex(size); - std::vector<std::complex<NumericT> > input2_complex(size); - std::vector<std::complex<NumericT> > output_complex(size); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input1_complex[0], input1, size); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input2_complex[0], input2, size); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - std::complex<NumericT> in1 = input1_complex[i]; - std::complex<NumericT> in2 = input2_complex[i]; - output_complex[i] = std::complex<NumericT>(in1.real() * in2.real() - in1.imag() * in2.imag(), - in1.real() * in2.imag() + in1.imag() * in2.real()); - } - viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], output, size); - -} -/** - * @brief Inplace transpose of matrix - */ -template<typename NumericT, unsigned int AlignmentV> -void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & input) -{ - vcl_size_t row_num = input.internal_size1() / 2; - vcl_size_t col_num = input.internal_size2() / 2; - - vcl_size_t size = row_num * col_num; - - NumericT * data = detail::extract_raw_pointer<NumericT>(input); - - std::vector<std::complex<NumericT> > input_complex(size); - - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data, size); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - vcl_size_t row = i / col_num; - vcl_size_t col = i - row * col_num; - vcl_size_t new_pos = col * row_num + row; - - if (i < new_pos) - { - std::complex<NumericT> val = input_complex[i]; - input_complex[i] = input_complex[new_pos]; - input_complex[new_pos] = val; - } - } - viennacl::linalg::host_based::detail::fft::copy_to_vector(&input_complex[0], data, size); - -} - -/** - * @brief Transpose matrix - */ -template<typename NumericT, unsigned int AlignmentV> -void transpose(viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> const & input, - viennacl::matrix<NumericT, viennacl::row_major, AlignmentV> & output) -{ - - vcl_size_t row_num = input.internal_size1() / 2; - vcl_size_t col_num = input.internal_size2() / 2; - vcl_size_t size = row_num * col_num; - - NumericT const * data_A = detail::extract_raw_pointer<NumericT>(input); - NumericT * data_B = detail::extract_raw_pointer<NumericT>(output); - - std::vector<std::complex<NumericT> > input_complex(size); - viennacl::linalg::host_based::detail::fft::copy_to_complex_array(&input_complex[0], data_A, size); - - std::vector<std::complex<NumericT> > output_complex(size); -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - vcl_size_t row = i / col_num; - vcl_size_t col = i % col_num; - vcl_size_t new_pos = col * row_num + row; - output_complex[new_pos] = input_complex[i]; - } - viennacl::linalg::host_based::detail::fft::copy_to_vector(&output_complex[0], data_B, size); -} - -/** - * @brief Create complex vector from real vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) - */ -template<typename NumericT> -void real_to_complex(viennacl::vector_base<NumericT> const & in, - viennacl::vector_base<NumericT> & out, vcl_size_t size) -{ - NumericT const * data_in = detail::extract_raw_pointer<NumericT>(in); - NumericT * data_out = detail::extract_raw_pointer<NumericT>(out); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = static_cast<vcl_size_t>(i2); - data_out[2*i ] = data_in[i]; - data_out[2*i+1] = NumericT(0); - } -} - -/** - * @brief Create real vector from complex vector (even elements(2*k) = real part, odd elements(2*k+1) = imaginary part) - */ -template<typename NumericT> -void complex_to_real(viennacl::vector_base<NumericT> const & in, - viennacl::vector_base<NumericT> & out, vcl_size_t size) -{ - NumericT const * data_in = detail::extract_raw_pointer<NumericT>(in); - NumericT * data_out = detail::extract_raw_pointer<NumericT>(out); - -#ifdef VIENNACL_WITH_OPENMP -#pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i = 0; i < long(size); i++) - data_out[i] = data_in[2*i]; -} - -/** - * @brief Reverse vector to opposite order and save it in input vector - */ -template<typename NumericT> -void reverse(viennacl::vector_base<NumericT> & in) -{ - vcl_size_t size = in.size(); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (size > VIENNACL_OPENMP_VECTOR_MIN_SIZE) -#endif - for (long i2 = 0; i2 < long(size); i2++) - { - vcl_size_t i = vcl_size_t(i2); - NumericT val1 = in[i]; - NumericT val2 = in[size - i - 1]; - in[i] = val2; - in[size - i - 1] = val1; - } -} - -} //namespace host_based -} //namespace linalg -} //namespace viennacl - -#endif /* FFT_OPERATIONS_HPP_ */ http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp deleted file mode 100644 index 62c885a..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/ilu_operations.hpp +++ /dev/null @@ -1,672 +0,0 @@ -#ifndef VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_ -#define VIENNACL_LINALG_HOST_BASED_ILU_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the PDF manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/host_based/ilu_operations.hpp - @brief Implementations of specialized routines for the Chow-Patel parallel ILU preconditioner using the host (OpenMP) -*/ - -#include <cmath> -#include <algorithm> //for std::max and std::min - -#include "viennacl/forwards.h" -#include "viennacl/scalar.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/meta/predicate.hpp" -#include "viennacl/meta/enable_if.hpp" -#include "viennacl/traits/size.hpp" -#include "viennacl/traits/start.hpp" -#include "viennacl/linalg/host_based/common.hpp" -#include "viennacl/linalg/vector_operations.hpp" -#include "viennacl/traits/stride.hpp" - - -// Minimum vector size for using OpenMP on vector operations: -#ifndef VIENNACL_OPENMP_ILU_MIN_SIZE - #define VIENNACL_OPENMP_ILU_MIN_SIZE 5000 -#endif - -namespace viennacl -{ -namespace linalg -{ -namespace host_based -{ - -template<typename NumericT> -void extract_L(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & L) -{ - // L is known to have correct dimensions - - unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1()); - unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2()); - NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle()); - - unsigned int *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1()); - - // - // Step 1: Count elements in L - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = A_row_buffer[row]; - unsigned int col_end = A_row_buffer[row+1]; - - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = A_col_buffer[j]; - if (long(col) <= row) - ++L_row_buffer[row]; - } - } - - // - // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices - // - viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1); - viennacl::linalg::exclusive_scan(wrapped_L_row_buffer); - L.reserve(wrapped_L_row_buffer[L.size1()], false); - - unsigned int *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2()); - NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle()); - - // - // Step 3: Write entries: - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = A_row_buffer[row]; - unsigned int col_end = A_row_buffer[row+1]; - - unsigned int index_L = L_row_buffer[row]; - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = A_col_buffer[j]; - NumericT value = A_elements[j]; - - if (long(col) <= row) - { - L_col_buffer[index_L] = col; - L_elements[index_L] = value; - ++index_L; - } - } - } - -} // extract_L - - -/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */ -template<typename NumericT> -void icc_scale(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & L) -{ - viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A)); - - unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1()); - unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2()); - NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle()); - - NumericT *D_elements = detail::extract_raw_pointer<NumericT>(D.handle()); - - // - // Step 1: Determine D - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = A_row_buffer[row]; - unsigned int col_end = A_row_buffer[row+1]; - - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = A_col_buffer[j]; - if (row == col) - { - D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j])); - break; - } - } - } - - // - // Step 2: Scale values in L: - // - unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1()); - unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2()); - NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle()); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = L_row_buffer[row]; - unsigned int col_end = L_row_buffer[row+1]; - - NumericT D_row = D_elements[row]; - - for (unsigned int j = col_begin; j < col_end; ++j) - L_elements[j] *= D_row * D_elements[L_col_buffer[j]]; - } - - L.generate_row_block_information(); -} - - - -/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ICC using OpenMP (cf. Algorithm 3 in paper, but for L rather than U) */ -template<typename NumericT> -void icc_chow_patel_sweep(compressed_matrix<NumericT> & L, - vector<NumericT> & aij_L) -{ - unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1()); - unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2()); - NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle()); - - NumericT *aij_ptr = detail::extract_raw_pointer<NumericT>(aij_L.handle()); - - // temporary workspace - NumericT *L_backup = (NumericT *)malloc(sizeof(NumericT) * L.nnz()); - - // backup: -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(L.nnz()); ++i) - L_backup[i] = L_elements[i]; - - - // sweep -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(L.size1()); ++row) - { - // - // update L: - // - unsigned int row_Li_start = L_row_buffer[row]; - unsigned int row_Li_end = L_row_buffer[row + 1]; - - for (unsigned int i = row_Li_start; i < row_Li_end; ++i) - { - unsigned int col = L_col_buffer[i]; - - unsigned int row_Lj_start = L_row_buffer[col]; - unsigned int row_Lj_end = L_row_buffer[col+1]; - - // compute \sum_{k=1}^{j-1} l_ik l_jk - unsigned int index_Lj = row_Lj_start; - unsigned int col_Lj = L_col_buffer[index_Lj]; - NumericT s = aij_ptr[i]; - for (unsigned int index_Li = row_Li_start; index_Li < i; ++index_Li) - { - unsigned int col_Li = L_col_buffer[index_Li]; - - // find element in row j - while (col_Lj < col_Li) - { - ++index_Lj; - col_Lj = L_col_buffer[index_Lj]; - } - - if (col_Lj == col_Li) - s -= L_backup[index_Li] * L_backup[index_Lj]; - } - - if (row != col) - L_elements[i] = s / L_backup[row_Lj_end - 1]; // diagonal element is last in row! - else - L_elements[i] = std::sqrt(s); - } - } - - free(L_backup); -} - - - -//////////////////////// ILU //////////////////////// - -template<typename NumericT> -void extract_LU(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & L, - compressed_matrix<NumericT> & U) -{ - // L and U are known to have correct dimensions - - unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1()); - unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2()); - NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle()); - - unsigned int *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1()); - unsigned int *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1()); - - // - // Step 1: Count elements in L and U - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = A_row_buffer[row]; - unsigned int col_end = A_row_buffer[row+1]; - - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = A_col_buffer[j]; - if (long(col) <= row) - ++L_row_buffer[row]; - if (long(col) >= row) - ++U_row_buffer[row]; - } - } - - // - // Step 2: Exclusive scan on row_buffer arrays to get correct starting indices - // - viennacl::vector_base<unsigned int> wrapped_L_row_buffer(L.handle1(), L.size1() + 1, 0, 1); - viennacl::linalg::exclusive_scan(wrapped_L_row_buffer); - L.reserve(wrapped_L_row_buffer[L.size1()], false); - - viennacl::vector_base<unsigned int> wrapped_U_row_buffer(U.handle1(), U.size1() + 1, 0, 1); - viennacl::linalg::exclusive_scan(wrapped_U_row_buffer); - U.reserve(wrapped_U_row_buffer[U.size1()], false); - - unsigned int *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2()); - NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle()); - - unsigned int *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2()); - NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U.handle()); - - // - // Step 3: Write entries: - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = A_row_buffer[row]; - unsigned int col_end = A_row_buffer[row+1]; - - unsigned int index_L = L_row_buffer[row]; - unsigned int index_U = U_row_buffer[row]; - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = A_col_buffer[j]; - NumericT value = A_elements[j]; - - if (long(col) <= row) - { - L_col_buffer[index_L] = col; - L_elements[index_L] = value; - ++index_L; - } - - if (long(col) >= row) - { - U_col_buffer[index_U] = col; - U_elements[index_U] = value; - ++index_U; - } - } - } - -} // extract_LU - - - -/** @brief Scales the values extracted from A such that A' = DAD has unit diagonal. Updates values from A in L and U accordingly. */ -template<typename NumericT> -void ilu_scale(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & L, - compressed_matrix<NumericT> & U) -{ - viennacl::vector<NumericT> D(A.size1(), viennacl::traits::context(A)); - - unsigned int const *A_row_buffer = detail::extract_raw_pointer<unsigned int>(A.handle1()); - unsigned int const *A_col_buffer = detail::extract_raw_pointer<unsigned int>(A.handle2()); - NumericT const *A_elements = detail::extract_raw_pointer<NumericT>(A.handle()); - - NumericT *D_elements = detail::extract_raw_pointer<NumericT>(D.handle()); - - // - // Step 1: Determine D - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = A_row_buffer[row]; - unsigned int col_end = A_row_buffer[row+1]; - - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = A_col_buffer[j]; - if (row == col) - { - D_elements[row] = NumericT(1) / std::sqrt(std::fabs(A_elements[j])); - break; - } - } - } - - // - // Step 2: Scale values in L: - // - unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1()); - unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2()); - NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle()); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = L_row_buffer[row]; - unsigned int col_end = L_row_buffer[row+1]; - - NumericT D_row = D_elements[row]; - - for (unsigned int j = col_begin; j < col_end; ++j) - L_elements[j] *= D_row * D_elements[L_col_buffer[j]]; - } - - // - // Step 3: Scale values in U: - // - unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U.handle1()); - unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U.handle2()); - NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U.handle()); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (A.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A.size1()); ++row) - { - unsigned int col_begin = U_row_buffer[row]; - unsigned int col_end = U_row_buffer[row+1]; - - NumericT D_row = D_elements[row]; - - for (unsigned int j = col_begin; j < col_end; ++j) - U_elements[j] *= D_row * D_elements[U_col_buffer[j]]; - } - - L.generate_row_block_information(); - // Note: block information for U will be generated after transposition - -} - -template<typename NumericT> -void ilu_transpose(compressed_matrix<NumericT> const & A, - compressed_matrix<NumericT> & B) -{ - NumericT const * A_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(A.handle()); - unsigned int const * A_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle1()); - unsigned int const * A_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(A.handle2()); - - // initialize datastructures for B: - B = compressed_matrix<NumericT>(A.size2(), A.size1(), A.nnz(), viennacl::traits::context(A)); - - NumericT * B_elements = viennacl::linalg::host_based::detail::extract_raw_pointer<NumericT>(B.handle()); - unsigned int * B_row_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle1()); - unsigned int * B_col_buffer = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(B.handle2()); - - // prepare uninitialized B_row_buffer: - for (std::size_t i = 0; i < B.size1(); ++i) - B_row_buffer[i] = 0; - - // - // Stage 1: Compute pattern for B - // - for (std::size_t row = 0; row < A.size1(); ++row) - { - unsigned int row_start = A_row_buffer[row]; - unsigned int row_stop = A_row_buffer[row+1]; - - for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) - B_row_buffer[A_col_buffer[nnz_index]] += 1; - } - - // Bring row-start array in place using exclusive-scan: - unsigned int offset = B_row_buffer[0]; - B_row_buffer[0] = 0; - for (std::size_t row = 1; row < B.size1(); ++row) - { - unsigned int tmp = B_row_buffer[row]; - B_row_buffer[row] = offset; - offset += tmp; - } - B_row_buffer[B.size1()] = offset; - - // - // Stage 2: Fill with data - // - - std::vector<unsigned int> B_row_offsets(B.size1()); //number of elements already written per row - - for (unsigned int row = 0; row < static_cast<unsigned int>(A.size1()); ++row) - { - //std::cout << "Row " << row << ": "; - unsigned int row_start = A_row_buffer[row]; - unsigned int row_stop = A_row_buffer[row+1]; - - for (unsigned int nnz_index = row_start; nnz_index < row_stop; ++nnz_index) - { - unsigned int col_in_A = A_col_buffer[nnz_index]; - unsigned int B_nnz_index = B_row_buffer[col_in_A] + B_row_offsets[col_in_A]; - B_col_buffer[B_nnz_index] = row; - B_elements[B_nnz_index] = A_elements[nnz_index]; - ++B_row_offsets[col_in_A]; - //B_temp.at(A_col_buffer[nnz_index])[row] = A_elements[nnz_index]; - } - } - - // Step 3: Make datastructure consistent (row blocks!) - B.generate_row_block_information(); -} - - - -/** @brief Performs one nonlinear relaxation step in the Chow-Patel-ILU using OpenMP (cf. Algorithm 2 in paper) */ -template<typename NumericT> -void ilu_chow_patel_sweep(compressed_matrix<NumericT> & L, - vector<NumericT> const & aij_L, - compressed_matrix<NumericT> & U_trans, - vector<NumericT> const & aij_U_trans) -{ - unsigned int const *L_row_buffer = detail::extract_raw_pointer<unsigned int>(L.handle1()); - unsigned int const *L_col_buffer = detail::extract_raw_pointer<unsigned int>(L.handle2()); - NumericT *L_elements = detail::extract_raw_pointer<NumericT>(L.handle()); - - NumericT const *aij_L_ptr = detail::extract_raw_pointer<NumericT>(aij_L.handle()); - - unsigned int const *U_row_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle1()); - unsigned int const *U_col_buffer = detail::extract_raw_pointer<unsigned int>(U_trans.handle2()); - NumericT *U_elements = detail::extract_raw_pointer<NumericT>(U_trans.handle()); - - NumericT const *aij_U_trans_ptr = detail::extract_raw_pointer<NumericT>(aij_U_trans.handle()); - - // temporary workspace - NumericT *L_backup = new NumericT[L.nnz()]; - NumericT *U_backup = new NumericT[U_trans.nnz()]; - - // backup: -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (L.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(L.nnz()); ++i) - L_backup[i] = L_elements[i]; - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (U_trans.nnz() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long i = 0; i < static_cast<long>(U_trans.nnz()); ++i) - U_backup[i] = U_elements[i]; - - // sweep -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (L.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(L.size1()); ++row) - { - // - // update L: - // - unsigned int row_L_start = L_row_buffer[row]; - unsigned int row_L_end = L_row_buffer[row + 1]; - - for (unsigned int j = row_L_start; j < row_L_end; ++j) - { - unsigned int col = L_col_buffer[j]; - - if (col == row) - continue; - - unsigned int row_U_start = U_row_buffer[col]; - unsigned int row_U_end = U_row_buffer[col + 1]; - - // compute \sum_{k=1}^{j-1} l_ik u_kj - unsigned int index_U = row_U_start; - unsigned int col_U = (index_U < row_U_end) ? U_col_buffer[index_U] : static_cast<unsigned int>(U_trans.size2()); - NumericT sum = 0; - for (unsigned int k = row_L_start; k < j; ++k) - { - unsigned int col_L = L_col_buffer[k]; - - // find element in U - while (col_U < col_L) - { - ++index_U; - col_U = U_col_buffer[index_U]; - } - - if (col_U == col_L) - sum += L_backup[k] * U_backup[index_U]; - } - - // update l_ij: - assert(U_col_buffer[row_U_end - 1] == col && bool("Accessing U element which is not a diagonal element!")); - L_elements[j] = (aij_L_ptr[j] - sum) / U_backup[row_U_end - 1]; // diagonal element is last entry in U - } - - - // - // update U: - // - unsigned int row_U_start = U_row_buffer[row]; - unsigned int row_U_end = U_row_buffer[row + 1]; - for (unsigned int j = row_U_start; j < row_U_end; ++j) - { - unsigned int col = U_col_buffer[j]; - - row_L_start = L_row_buffer[col]; - row_L_end = L_row_buffer[col + 1]; - - // compute \sum_{k=1}^{j-1} l_ik u_kj - unsigned int index_L = row_L_start; - unsigned int col_L = (index_L < row_L_end) ? L_col_buffer[index_L] : static_cast<unsigned int>(L.size1()); - NumericT sum = 0; - for (unsigned int k = row_U_start; k < j; ++k) - { - unsigned int col_U = U_col_buffer[k]; - - // find element in L - while (col_L < col_U) - { - ++index_L; - col_L = L_col_buffer[index_L]; - } - - if (col_U == col_L) - sum += L_backup[index_L] * U_backup[k]; - } - - // update u_ij: - U_elements[j] = aij_U_trans_ptr[j] - sum; - } - } - - delete[] L_backup; - delete[] U_backup; -} - - -template<typename NumericT> -void ilu_form_neumann_matrix(compressed_matrix<NumericT> & R, - vector<NumericT> & diag_R) -{ - unsigned int *R_row_buffer = detail::extract_raw_pointer<unsigned int>(R.handle1()); - unsigned int *R_col_buffer = detail::extract_raw_pointer<unsigned int>(R.handle2()); - NumericT *R_elements = detail::extract_raw_pointer<NumericT>(R.handle()); - - NumericT *diag_R_ptr = detail::extract_raw_pointer<NumericT>(diag_R.handle()); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if (R.size1() > VIENNACL_OPENMP_ILU_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(R.size1()); ++row) - { - unsigned int col_begin = R_row_buffer[row]; - unsigned int col_end = R_row_buffer[row+1]; - - // part 1: extract diagonal entry - NumericT diag = 0; - for (unsigned int j = col_begin; j < col_end; ++j) - { - unsigned int col = R_col_buffer[j]; - if (col == row) - { - diag = R_elements[j]; - R_elements[j] = 0; // (I - D^{-1}R) - break; - } - } - diag_R_ptr[row] = diag; - - assert((diag > 0 || diag < 0) && bool("Zero diagonal detected!")); - - // part2: scale - for (unsigned int j = col_begin; j < col_end; ++j) - R_elements[j] /= -diag; - } - - //std::cout << "diag_R: " << diag_R << std::endl; -} - -} //namespace host_based -} //namespace linalg -} //namespace viennacl - - -#endif
