http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp deleted file mode 100644 index f23223f..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp +++ /dev/null @@ -1,2052 +0,0 @@ -#ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_ -#define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - -/** @file viennacl/linalg/host_based/matrix_operations.hpp - @brief Implementations of dense matrix related operations, including matrix-vector products, using a plain single-threaded or OpenMP-enabled execution on CPU. -*/ - -#include "viennacl/forwards.h" -#include "viennacl/scalar.hpp" -#include "viennacl/vector.hpp" -#include "viennacl/vector_proxy.hpp" -#include "viennacl/tools/tools.hpp" -#include "viennacl/meta/enable_if.hpp" -#include "viennacl/meta/predicate.hpp" -#include "viennacl/meta/result_of.hpp" -#include "viennacl/traits/size.hpp" -#include "viennacl/traits/start.hpp" -#include "viennacl/traits/handle.hpp" -#include "viennacl/traits/stride.hpp" -#include "viennacl/linalg/detail/op_applier.hpp" -#include "viennacl/linalg/host_based/common.hpp" -#include "viennacl/linalg/prod.hpp" - -// Minimum Matrix size(size1*size2) for using OpenMP on matrix operations: -#ifndef VIENNACL_OPENMP_MATRIX_MIN_SIZE - #define VIENNACL_OPENMP_MATRIX_MIN_SIZE 5000 -#endif - -namespace viennacl -{ -namespace linalg -{ -namespace host_based -{ - -// -// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! -// - -template<typename DestNumericT, typename SrcNumericT> -void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2) -{ - assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!")); - - DestNumericT * data_A = detail::extract_raw_pointer<DestNumericT>(mat1); - SrcNumericT const * data_B = detail::extract_raw_pointer<SrcNumericT>(mat2); - - vcl_size_t A_start1 = viennacl::traits::start1(mat1); - vcl_size_t A_start2 = viennacl::traits::start2(mat1); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); - vcl_size_t A_size1 = viennacl::traits::size1(mat1); - vcl_size_t A_size2 = viennacl::traits::size2(mat1); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); - - vcl_size_t B_start1 = viennacl::traits::start1(mat2); - vcl_size_t B_start2 = viennacl::traits::start2(mat2); - vcl_size_t B_inc1 = viennacl::traits::stride1(mat2); - vcl_size_t B_inc2 = viennacl::traits::stride2(mat2); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2); - - if (mat1.row_major()) - { - detail::matrix_array_wrapper<DestNumericT, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<SrcNumericT const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col)); - } - else - { - detail::matrix_array_wrapper<DestNumericT, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<SrcNumericT const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col)); - } -} - - - -template<typename NumericT, - typename SizeT, typename DistanceT> -void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>, - const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy, matrix_base<NumericT> & temp_trans) -{ - typedef NumericT value_type; - const value_type * data_A = detail::extract_raw_pointer<value_type>(proxy.lhs()); - value_type * data_B = detail::extract_raw_pointer<value_type>(temp_trans); - - vcl_size_t A_start1 = viennacl::traits::start1(proxy.lhs()); - vcl_size_t A_start2 = viennacl::traits::start2(proxy.lhs()); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(proxy.lhs()); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(proxy.lhs()); - vcl_size_t A_inc1 = viennacl::traits::stride1(proxy.lhs()); - vcl_size_t A_inc2 = viennacl::traits::stride2(proxy.lhs()); - vcl_size_t A_size1 = viennacl::traits::size1(proxy.lhs()); - vcl_size_t A_size2 = viennacl::traits::size2(proxy.lhs()); - - vcl_size_t B_start1 = viennacl::traits::start1(temp_trans); - vcl_size_t B_start2 = viennacl::traits::start2(temp_trans); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(temp_trans); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(temp_trans); - vcl_size_t B_inc1 = viennacl::traits::stride1(temp_trans); - vcl_size_t B_inc2 = viennacl::traits::stride2(temp_trans); - - const vcl_size_t sub_mat_size = 64; //The matrix will be divided into sub-matrices for better storage access. - - vcl_size_t row_count = A_size1 / sub_mat_size; - vcl_size_t col_count = A_size2 / sub_mat_size; - - vcl_size_t row_count_remainder = A_size1 % sub_mat_size; - vcl_size_t col_count_remainder = A_size2 % sub_mat_size; - - if (proxy.lhs().row_major()) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition - { - vcl_size_t row = vcl_size_t(i) / col_count; - vcl_size_t col = vcl_size_t(i) % col_count; - - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size) - , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1 - , A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size) - , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1 - , B_inc2, B_internal_size1, B_internal_size2); - for(vcl_size_t j = 0; j < (sub_mat_size); ++j) - for(vcl_size_t k = 0; k < (sub_mat_size); ++k) - wrapper_B(j, k) = wrapper_A(k, j); - } - { //This is the transposition of the remainder on the right side of the matrix - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 - , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1 - , A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col_count * sub_mat_size) - , B_start2, B_inc1 - , B_inc2, B_internal_size1, B_internal_size2); - for(vcl_size_t j = 0; j < col_count_remainder; ++j) - for(vcl_size_t k = 0 ; k < A_size1; ++k) - wrapper_B(j, k) = wrapper_A(k, j); - } - { //This is the transposition of the remainder on the bottom side of the matrix - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size) - , A_start2, A_inc1 - , A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type , row_major, false> wrapper_B(data_B,B_start1 - , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1 - , B_inc2, B_internal_size1, B_internal_size2); - for(vcl_size_t j = 0; j < row_count_remainder; ++j) - for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k) - wrapper_B(k, j) = wrapper_A(j, k); - } - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition - { - vcl_size_t row = vcl_size_t(i) / col_count; - vcl_size_t col = vcl_size_t(i) % col_count; - - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size) - , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1 - , A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size) - , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1 - , B_inc2, B_internal_size1, B_internal_size2); - for(vcl_size_t j = 0; j < (sub_mat_size); ++j) - for(vcl_size_t k = 0; k < (sub_mat_size); ++k) - wrapper_B(k, j)=wrapper_A(j, k); - } - { //This is the transposition of the remainder on the right side of the matrix - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 - , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1 - , A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B,B_start1 + B_inc1 * (col_count * sub_mat_size) - , B_start2, B_inc1 - , B_inc2, B_internal_size1, B_internal_size2); - for(vcl_size_t j = 0; j < col_count_remainder; ++j) - for(vcl_size_t k = 0; k < A_size1; ++k) - wrapper_B(j, k)=wrapper_A(k, j); - } - { //This is the transposition of the remainder on the bottom side of the matrix - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size) - , A_start2, A_inc1 - , A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1 - , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1 - , B_inc2, B_internal_size1, B_internal_size2); - for(vcl_size_t j = 0; j < row_count_remainder; ++j) - for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k) - wrapper_B(k, j)=wrapper_A(j, k); - } - } -} - -template<typename NumericT, typename ScalarT1> -void am(matrix_base<NumericT> & mat1, - matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha) -{ - assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!")); - - typedef NumericT value_type; - - value_type * data_A = detail::extract_raw_pointer<value_type>(mat1); - value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = -data_alpha; - - vcl_size_t A_start1 = viennacl::traits::start1(mat1); - vcl_size_t A_start2 = viennacl::traits::start2(mat1); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); - vcl_size_t A_size1 = viennacl::traits::size1(mat1); - vcl_size_t A_size2 = viennacl::traits::size2(mat1); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); - - vcl_size_t B_start1 = viennacl::traits::start1(mat2); - vcl_size_t B_start2 = viennacl::traits::start2(mat2); - vcl_size_t B_inc1 = viennacl::traits::stride1(mat2); - vcl_size_t B_inc2 = viennacl::traits::stride2(mat2); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2); - - if (mat1.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - - if (reciprocal_alpha) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = wrapper_B(row, col) / data_alpha; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = wrapper_B(row, col) * data_alpha; - } - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - - if (reciprocal_alpha) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = wrapper_B(row, col) / data_alpha; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = wrapper_B(row, col) * data_alpha; - } - } -} - - -template<typename NumericT, - typename ScalarT1, typename ScalarT2> -void ambm(matrix_base<NumericT> & mat1, - matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha, - matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta) -{ - assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!")); - - typedef NumericT value_type; - - value_type * data_A = detail::extract_raw_pointer<value_type>(mat1); - value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2); - value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = -data_alpha; - - value_type data_beta = beta; - if (flip_sign_beta) - data_beta = -data_beta; - - vcl_size_t A_start1 = viennacl::traits::start1(mat1); - vcl_size_t A_start2 = viennacl::traits::start2(mat1); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); - vcl_size_t A_size1 = viennacl::traits::size1(mat1); - vcl_size_t A_size2 = viennacl::traits::size2(mat1); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); - - vcl_size_t B_start1 = viennacl::traits::start1(mat2); - vcl_size_t B_start2 = viennacl::traits::start2(mat2); - vcl_size_t B_inc1 = viennacl::traits::stride1(mat2); - vcl_size_t B_inc2 = viennacl::traits::stride2(mat2); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2); - - vcl_size_t C_start1 = viennacl::traits::start1(mat3); - vcl_size_t C_start2 = viennacl::traits::start2(mat3); - vcl_size_t C_inc1 = viennacl::traits::stride1(mat3); - vcl_size_t C_inc2 = viennacl::traits::stride2(mat3); - vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3); - vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3); - - if (mat1.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - if (reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta; - } - else if (reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta; - } - else if (!reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta; - } - else if (!reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta; - } - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - if (reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta; - } - else if (reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta; - } - else if (!reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta; - } - else if (!reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta; - } - } - -} - - -template<typename NumericT, - typename ScalarT1, typename ScalarT2> -void ambm_m(matrix_base<NumericT> & mat1, - matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha, - matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta) -{ - assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!")); - - typedef NumericT value_type; - - value_type * data_A = detail::extract_raw_pointer<value_type>(mat1); - value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2); - value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = -data_alpha; - - value_type data_beta = beta; - if (flip_sign_beta) - data_beta = -data_beta; - - vcl_size_t A_start1 = viennacl::traits::start1(mat1); - vcl_size_t A_start2 = viennacl::traits::start2(mat1); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); - vcl_size_t A_size1 = viennacl::traits::size1(mat1); - vcl_size_t A_size2 = viennacl::traits::size2(mat1); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); - - vcl_size_t B_start1 = viennacl::traits::start1(mat2); - vcl_size_t B_start2 = viennacl::traits::start2(mat2); - vcl_size_t B_inc1 = viennacl::traits::stride1(mat2); - vcl_size_t B_inc2 = viennacl::traits::stride2(mat2); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(mat2); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2); - - vcl_size_t C_start1 = viennacl::traits::start1(mat3); - vcl_size_t C_start2 = viennacl::traits::start2(mat3); - vcl_size_t C_inc1 = viennacl::traits::stride1(mat3); - vcl_size_t C_inc2 = viennacl::traits::stride2(mat3); - vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(mat3); - vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(mat3); - - if (mat1.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - if (reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta; - } - else if (reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta; - } - else if (!reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta; - } - else if (!reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta; - } - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - if (reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta; - } - else if (reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta; - } - else if (!reciprocal_alpha && reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta; - } - else if (!reciprocal_alpha && !reciprocal_beta) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta; - } - } - -} - - - - -template<typename NumericT> -void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false) -{ - typedef NumericT value_type; - - value_type * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type alpha = static_cast<value_type>(s); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - vcl_size_t A_size1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat); - vcl_size_t A_size2 = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_A(static_cast<vcl_size_t>(row), col) = alpha; - //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] - // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha; - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_A(row, static_cast<vcl_size_t>(col)) = alpha; - //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] - // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha; - } -} - - - -template<typename NumericT> -void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s) -{ - typedef NumericT value_type; - - value_type * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type alpha = static_cast<value_type>(s); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - vcl_size_t A_size1 = viennacl::traits::size1(mat); - //vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - wrapper_A(row, row) = alpha; - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - wrapper_A(row, row) = alpha; - } -} - -template<typename NumericT> -void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat) -{ - typedef NumericT value_type; - - value_type *data_A = detail::extract_raw_pointer<value_type>(mat); - value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - //vcl_size_t A_size1 = viennacl::traits::size1(mat); - //vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - vcl_size_t v_start = viennacl::traits::start(vec); - vcl_size_t v_inc = viennacl::traits::stride(vec); - vcl_size_t v_size = viennacl::traits::size(vec); - - vcl_size_t row_start = 0; - vcl_size_t col_start = 0; - - if (k >= 0) - col_start = static_cast<vcl_size_t>(k); - else - row_start = static_cast<vcl_size_t>(-k); - - matrix_assign(mat, NumericT(0)); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t i = 0; i < v_size; ++i) - wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc]; - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t i = 0; i < v_size; ++i) - wrapper_A(row_start + i, col_start + i) = data_vec[v_start + i * v_inc]; - } -} - -template<typename NumericT> -void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type * data_vec = detail::extract_raw_pointer<value_type>(vec); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - //vcl_size_t A_size1 = viennacl::traits::size1(mat); - //vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - vcl_size_t v_start = viennacl::traits::start(vec); - vcl_size_t v_inc = viennacl::traits::stride(vec); - vcl_size_t v_size = viennacl::traits::size(vec); - - vcl_size_t row_start = 0; - vcl_size_t col_start = 0; - - if (k >= 0) - col_start = static_cast<vcl_size_t>(k); - else - row_start = static_cast<vcl_size_t>(-k); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t i = 0; i < v_size; ++i) - data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t i = 0; i < v_size; ++i) - data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i); - } -} - -template<typename NumericT> -void matrix_row(const matrix_base<NumericT> & mat, unsigned int i, vector_base<NumericT> & vec) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type * data_vec = detail::extract_raw_pointer<value_type>(vec); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - //vcl_size_t A_size1 = viennacl::traits::size1(mat); - //vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - vcl_size_t v_start = viennacl::traits::start(vec); - vcl_size_t v_inc = viennacl::traits::stride(vec); - vcl_size_t v_size = viennacl::traits::size(vec); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t j = 0; j < v_size; ++j) - data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t j = 0; j < v_size; ++j) - data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j); - } -} - -template<typename NumericT> -void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type * data_vec = detail::extract_raw_pointer<value_type>(vec); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - //vcl_size_t A_size1 = viennacl::traits::size1(mat); - //vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - vcl_size_t v_start = viennacl::traits::start(vec); - vcl_size_t v_inc = viennacl::traits::stride(vec); - vcl_size_t v_size = viennacl::traits::size(vec); - - if (mat.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t i = 0; i < v_size; ++i) - data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j)); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - - for (vcl_size_t i = 0; i < v_size; ++i) - data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j)); - } -} - -// -///////////////////////// Element-wise operation ////////////////////////////////// -// - -// Binary operations A = B .* C and A = B ./ C - -/** @brief Implementation of the element-wise operations A = B .* C and A = B ./ C (using MATLAB syntax) -* -* @param A The result matrix (or -range, or -slice) -* @param proxy The proxy object holding B, C, and the operation -*/ -template<typename NumericT, typename OpT> -void element_op(matrix_base<NumericT> & A, - matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_binary<OpT> > const & proxy) -{ - assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); - - typedef NumericT value_type; - typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor; - - value_type * data_A = detail::extract_raw_pointer<value_type>(A); - value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs()); - value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs()); - - vcl_size_t A_start1 = viennacl::traits::start1(A); - vcl_size_t A_start2 = viennacl::traits::start2(A); - vcl_size_t A_inc1 = viennacl::traits::stride1(A); - vcl_size_t A_inc2 = viennacl::traits::stride2(A); - vcl_size_t A_size1 = viennacl::traits::size1(A); - vcl_size_t A_size2 = viennacl::traits::size2(A); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); - - vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs()); - vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs()); - vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs()); - vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs()); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs()); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs()); - - vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs()); - vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs()); - vcl_size_t C_inc1 = viennacl::traits::stride1(proxy.rhs()); - vcl_size_t C_inc2 = viennacl::traits::stride2(proxy.rhs()); - vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(proxy.rhs()); - vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(proxy.rhs()); - - if (A.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col)); - //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] - // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha - // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta; - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col)); - - //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] - // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha - // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta; - } -} - -// Unary operations - -// A = op(B) -template<typename NumericT, typename OpT> -void element_op(matrix_base<NumericT> & A, - matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<OpT> > const & proxy) -{ - assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); - - typedef NumericT value_type; - typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor; - - value_type * data_A = detail::extract_raw_pointer<value_type>(A); - value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs()); - - vcl_size_t A_start1 = viennacl::traits::start1(A); - vcl_size_t A_start2 = viennacl::traits::start2(A); - vcl_size_t A_inc1 = viennacl::traits::stride1(A); - vcl_size_t A_inc2 = viennacl::traits::stride2(A); - vcl_size_t A_size1 = viennacl::traits::size1(A); - vcl_size_t A_size2 = viennacl::traits::size2(A); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); - - vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs()); - vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs()); - vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs()); - vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs()); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs()); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs()); - - if (A.row_major()) - { - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - for (vcl_size_t col = 0; col < A_size2; ++col) - OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col)); - } - else - { - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) - for (vcl_size_t row = 0; row < A_size1; ++row) - OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col)); - } -} - - - -// -///////////////////////// matrix-vector products ///////////////////////////////// -// - -// A * x - -/** @brief Carries out matrix-vector multiplication -* -* Implementation of the convenience expression result = prod(mat, vec); -* -* @param mat The matrix -* @param trans Flag whether mat is to be transposed -* @param vec The vector -* @param result The result vector -*/ -template<typename NumericT> -void prod_impl(const matrix_base<NumericT> & mat, bool trans, - const vector_base<NumericT> & vec, - vector_base<NumericT> & result) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(mat); - value_type const * data_x = detail::extract_raw_pointer<value_type>(vec); - value_type * data_result = detail::extract_raw_pointer<value_type>(result); - - vcl_size_t A_start1 = viennacl::traits::start1(mat); - vcl_size_t A_start2 = viennacl::traits::start2(mat); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat); - vcl_size_t A_size1 = viennacl::traits::size1(mat); - vcl_size_t A_size2 = viennacl::traits::size2(mat); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); - - vcl_size_t start1 = viennacl::traits::start(vec); - vcl_size_t inc1 = viennacl::traits::stride(vec); - - vcl_size_t start2 = viennacl::traits::start(result); - vcl_size_t inc2 = viennacl::traits::stride(result); - - if (mat.row_major()) - { - if (trans) - { - vcl_size_t thread_count = 1; -#ifdef VIENNACL_WITH_OPENMP - if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) - thread_count = omp_get_max_threads(); -#endif - std::vector<value_type> temp_array(A_size2*thread_count, 0); - detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2); - - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_res(col) = 0; - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - { - vcl_size_t id = 0; -#ifdef VIENNACL_WITH_OPENMP - if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) - id = omp_get_thread_num(); - #endif - vcl_size_t begin = (A_size1 * id) / thread_count; - vcl_size_t end = (A_size1 * (id + 1)) / thread_count; - - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_mat(data_A, A_start1 + A_inc1 * begin, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1); - - for (vcl_size_t row = 0; row < (end - begin); ++row) //run through matrix sequentially - { - value_type temp = wrapper_vec(row); - for (vcl_size_t col = 0; col < A_size2; ++col) - temp_array[A_size2 * id + col] += wrapper_mat(row , col) * temp; - } - } - for (vcl_size_t id = 0; id < thread_count; ++id) - for (vcl_size_t col = 0; col < A_size2; ++col) - wrapper_res(col) += temp_array[A_size2 * id + col]; - } - - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - { - value_type temp = 0; - for (vcl_size_t col = 0; col < A_size2; ++col) - temp += data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1]; - - data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp; - } - } - } - else - { - if (!trans) - { - vcl_size_t thread_count = 1; -#ifdef VIENNACL_WITH_OPENMP - if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) - thread_count = omp_get_max_threads(); -#endif - std::vector<value_type> temp_array(A_size1*thread_count, 0); - detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2); - - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_res(row) = 0; - -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - { - vcl_size_t id = 0; -#ifdef VIENNACL_WITH_OPENMP - if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) - id = omp_get_thread_num(); - #endif - vcl_size_t begin = (A_size2 * id) / thread_count; - vcl_size_t end = (A_size2 * (id + 1)) / thread_count; - - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_mat(data_A, A_start1, A_start2 + A_inc2 * begin, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1); - - for (vcl_size_t col = 0; col < (end - begin); ++col) //run through matrix sequentially - { - value_type temp = wrapper_vec(col); - for (vcl_size_t row = 0; row < A_size1; ++row) - temp_array[A_size1 * id + row] += wrapper_mat(row , col) * temp; - } - } - for (vcl_size_t id = 0; id < thread_count; ++id) - for (vcl_size_t row = 0; row < A_size1; ++row) - wrapper_res(row) += temp_array[A_size1 * id + row]; - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size2); ++row) - { - value_type temp = 0; - for (vcl_size_t col = 0; col < A_size1; ++col) - temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, static_cast<vcl_size_t>(row) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1]; - - data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp; - } - } - } -} - - - -// -///////////////////////// matrix-matrix products ///////////////////////////////// -// - -namespace detail -{ - template<typename MatrixAccT1, typename MatrixAccT2, typename MatrixAccT3, typename NumericT> - void prod(MatrixAccT1 & A, MatrixAccT2 & B, MatrixAccT3 & C, - vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2, - NumericT alpha, NumericT beta) - { - if (C_size1 == 0 || C_size2 == 0 || A_size2 == 0) - return; - - static const vcl_size_t blocksize = 64; - - vcl_size_t num_blocks_C1 = (C_size1 - 1) / blocksize + 1; - vcl_size_t num_blocks_C2 = (C_size2 - 1) / blocksize + 1; - vcl_size_t num_blocks_A2 = (A_size2 - 1) / blocksize + 1; - - // - // outer loop pair: Run over all blocks with indices (block_idx_i, block_idx_j) of the result matrix C: - // -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((C_size1*C_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long block_idx_i2=0; block_idx_i2<static_cast<long>(num_blocks_C1); ++block_idx_i2) - { - // thread-local auxiliary buffers - std::vector<NumericT> buffer_A(blocksize * blocksize); // row-major - std::vector<NumericT> buffer_B(blocksize * blocksize); // column-major - std::vector<NumericT> buffer_C(blocksize * blocksize); // row-major - - vcl_size_t block_idx_i = static_cast<vcl_size_t>(block_idx_i2); - for (vcl_size_t block_idx_j=0; block_idx_j<num_blocks_C2; ++block_idx_j) - { - // Reset block matrix: - std::fill(buffer_C.begin(), buffer_C.end(), NumericT(0)); - - vcl_size_t offset_i = block_idx_i*blocksize; - vcl_size_t offset_j = block_idx_j*blocksize; - - // C(block_idx_i, block_idx_i) += A(block_idx_i, block_idx_k) * B(block_idx_k, block_idx_j) - for (vcl_size_t block_idx_k=0; block_idx_k<num_blocks_A2; ++block_idx_k) - { - // flush buffers: - std::fill(buffer_A.begin(), buffer_A.end(), NumericT(0)); - std::fill(buffer_B.begin(), buffer_B.end(), NumericT(0)); - - vcl_size_t offset_k = block_idx_k*blocksize; - - // load current data: - for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i) - for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k) - buffer_A[(i - offset_i) * blocksize + (k - offset_k)] = A(i, k); - - for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j) - for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k) - buffer_B[(k - offset_k) + (j - offset_j) * blocksize] = B(k, j); - - // multiply (this is the hot spot in terms of flops) - for (vcl_size_t i = 0; i < blocksize; ++i) - { - NumericT const * ptrA = &(buffer_A[i*blocksize]); - for (vcl_size_t j = 0; j < blocksize; ++j) - { - NumericT const * ptrB = &(buffer_B[j*blocksize]); - - NumericT temp = NumericT(0); - for (vcl_size_t k = 0; k < blocksize; ++k) - temp += ptrA[k] * ptrB[k]; // buffer_A[i*blocksize + k] * buffer_B[k + j*blocksize]; - - buffer_C[i*blocksize + j] += temp; - } - } - } - - // write result: - if (beta > 0 || beta < 0) - { - for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i) - for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j) - C(i,j) = beta * C(i,j) + alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)]; - } - else - { - for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i) - for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j) - C(i,j) = alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)]; - } - - } // for block j - } // for block i - - } // prod() - -} // namespace detail - -/** @brief Carries out matrix-matrix multiplication -* -* Implementation of C = prod(A, B); -* -*/ -template<typename NumericT, typename ScalarT1, typename ScalarT2 > -void prod_impl(const matrix_base<NumericT> & A, bool trans_A, - const matrix_base<NumericT> & B, bool trans_B, - matrix_base<NumericT> & C, - ScalarT1 alpha, - ScalarT2 beta) -{ - typedef NumericT value_type; - - value_type const * data_A = detail::extract_raw_pointer<value_type>(A); - value_type const * data_B = detail::extract_raw_pointer<value_type>(B); - value_type * data_C = detail::extract_raw_pointer<value_type>(C); - - vcl_size_t A_start1 = viennacl::traits::start1(A); - vcl_size_t A_start2 = viennacl::traits::start2(A); - vcl_size_t A_inc1 = viennacl::traits::stride1(A); - vcl_size_t A_inc2 = viennacl::traits::stride2(A); - vcl_size_t A_size1 = viennacl::traits::size1(A); - vcl_size_t A_size2 = viennacl::traits::size2(A); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); - - vcl_size_t B_start1 = viennacl::traits::start1(B); - vcl_size_t B_start2 = viennacl::traits::start2(B); - vcl_size_t B_inc1 = viennacl::traits::stride1(B); - vcl_size_t B_inc2 = viennacl::traits::stride2(B); - vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B); - vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(B); - - vcl_size_t C_start1 = viennacl::traits::start1(C); - vcl_size_t C_start2 = viennacl::traits::start2(C); - vcl_size_t C_inc1 = viennacl::traits::stride1(C); - vcl_size_t C_inc2 = viennacl::traits::stride2(C); - vcl_size_t C_size1 = viennacl::traits::size1(C); - vcl_size_t C_size2 = viennacl::traits::size2(C); - vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(C); - vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(C); - - if (!trans_A && !trans_B) - { - if (A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - } - else if (!trans_A && trans_B) - { - if (A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - } - else if (trans_A && !trans_B) - { - if (A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - } - else if (trans_A && trans_B) - { - if (A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (A.row_major() && !B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && B.row_major() && !C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else if (!A.row_major() && !B.row_major() && C.row_major()) - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - else - { - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); - detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); - detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); - - detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); - } - } -} - - - - -// -///////////////////////// miscellaneous operations ///////////////////////////////// -// - - -/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update -* -* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2); -* -* @param mat1 The matrix to be updated -* @param alpha The scaling factor (either a viennacl::scalar<>, float, or double) -* @param reciprocal_alpha Use 1/alpha instead of alpha -* @param flip_sign_alpha Use -alpha instead of alpha -* @param vec1 The first vector -* @param vec2 The second vector -*/ -template<typename NumericT, typename ScalarT> -void scaled_rank_1_update(matrix_base<NumericT> & mat1, - ScalarT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha, - const vector_base<NumericT> & vec1, - const vector_base<NumericT> & vec2) -{ - typedef NumericT value_type; - - value_type * data_A = detail::extract_raw_pointer<value_type>(mat1); - value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1); - value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2); - - vcl_size_t A_start1 = viennacl::traits::start1(mat1); - vcl_size_t A_start2 = viennacl::traits::start2(mat1); - vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); - vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); - vcl_size_t A_size1 = viennacl::traits::size1(mat1); - vcl_size_t A_size2 = viennacl::traits::size2(mat1); - vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); - vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); - - vcl_size_t start1 = viennacl::traits::start(vec1); - vcl_size_t inc1 = viennacl::traits::stride(vec1); - - vcl_size_t start2 = viennacl::traits::start(vec2); - vcl_size_t inc2 = viennacl::traits::stride(vec2); - - value_type data_alpha = alpha; - if (flip_sign_alpha) - data_alpha = -data_alpha; - - if (mat1.row_major()) - { - if(reciprocal_alpha) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - { - value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] / data_alpha; - for (vcl_size_t col = 0; col < A_size2; ++col) - data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2]; - } - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long row = 0; row < static_cast<long>(A_size1); ++row) - { - value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] * data_alpha; - for (vcl_size_t col = 0; col < A_size2; ++col) - data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2]; - } - } - } - else - { - if(reciprocal_alpha) - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially - { - value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] / data_alpha; - for (vcl_size_t row = 0; row < A_size1; ++row) - data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + start1] * value_v2; - } - } - else - { -#ifdef VIENNACL_WITH_OPENMP - #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) -#endif - for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially - { - value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] * data_alpha; - for (vcl_size_t row = 0; row < A_size1; ++row) - data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_in
<TRUNCATED>
