http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp new file mode 100644 index 0000000..f23223f --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/host_based/matrix_operations.hpp @@ -0,0 +1,2052 @@ +#ifndef VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_ +#define VIENNACL_LINALG_HOST_BASED_MATRIX_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/host_based/matrix_operations.hpp + @brief Implementations of dense matrix related operations, including matrix-vector products, using a plain single-threaded or OpenMP-enabled execution on CPU. 
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/vector_proxy.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/result_of.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/stride.hpp"
#include "viennacl/linalg/detail/op_applier.hpp"
#include "viennacl/linalg/host_based/common.hpp"
#include "viennacl/linalg/prod.hpp"

// Minimum Matrix size(size1*size2) for using OpenMP on matrix operations:
#ifndef VIENNACL_OPENMP_MATRIX_MIN_SIZE
  #define VIENNACL_OPENMP_MATRIX_MIN_SIZE 5000
#endif

namespace viennacl
{
namespace linalg
{
namespace host_based
{

//
// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
//

/** @brief Element-wise copy of mat2 into mat1, converting each entry from SrcNumericT to DestNumericT via static_cast.
*
* Both matrices must use the same storage layout (row-major or column-major).
*
* @param mat1  destination matrix
* @param mat2  source matrix
*/
template<typename DestNumericT, typename SrcNumericT>
void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
{
  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));

  DestNumericT      * data_A = detail::extract_raw_pointer<DestNumericT>(mat1);
  SrcNumericT const * data_B = detail::extract_raw_pointer<SrcNumericT>(mat2);

  // Geometry of the destination: offsets, strides, logical sizes, and padded (internal) sizes.
  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);

  // Geometry of the source (sizes are taken from the destination; dimensions were checked by the dispatcher).
  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);

  if (mat1.row_major())
  {
    detail::matrix_array_wrapper<DestNumericT,      row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
    detail::matrix_array_wrapper<SrcNumericT const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);

    // Outer loop over rows so each OpenMP thread works on contiguous row-major storage.
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
    for (long row = 0; row < static_cast<long>(A_size1); ++row)
      for (vcl_size_t col = 0; col < A_size2; ++col)
        wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
  }
  else
  {
    detail::matrix_array_wrapper<DestNumericT,      column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
    detail::matrix_array_wrapper<SrcNumericT const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);

    // Outer loop over columns for the analogous reason in column-major storage.
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
    for (long col = 0; col < static_cast<long>(A_size2); ++col)
      for (vcl_size_t row = 0; row < A_size1; ++row)
        wrapper_A(row, col) = static_cast<DestNumericT>(wrapper_B(row, col));
  }
}
/** @brief Assigns the transpose of proxy.lhs() to temp_trans, using 64x64 blocking for better cache behavior.
*
* The matrix is partitioned into sub_mat_size x sub_mat_size tiles that are transposed independently
* (parallelized over tiles with OpenMP); the ragged strips on the right and bottom edges that do not
* fill a whole tile are handled by two separate remainder passes.
*
* @param proxy       expression wrapping the matrix to be transposed (accessed via proxy.lhs())
* @param temp_trans  destination matrix receiving the transpose
*/
template<typename NumericT,
         typename SizeT, typename DistanceT>
void trans(const matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>,
                                   const matrix_base<NumericT, SizeT, DistanceT>, op_trans> & proxy, matrix_base<NumericT> & temp_trans)
{
  typedef NumericT value_type;
  const value_type * data_A = detail::extract_raw_pointer<value_type>(proxy.lhs());
  value_type * data_B = detail::extract_raw_pointer<value_type>(temp_trans);

  // Source (A) geometry:
  vcl_size_t A_start1 = viennacl::traits::start1(proxy.lhs());
  vcl_size_t A_start2 = viennacl::traits::start2(proxy.lhs());
  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(proxy.lhs());
  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(proxy.lhs());
  vcl_size_t A_inc1 = viennacl::traits::stride1(proxy.lhs());
  vcl_size_t A_inc2 = viennacl::traits::stride2(proxy.lhs());
  vcl_size_t A_size1 = viennacl::traits::size1(proxy.lhs());
  vcl_size_t A_size2 = viennacl::traits::size2(proxy.lhs());

  // Destination (B) geometry:
  vcl_size_t B_start1 = viennacl::traits::start1(temp_trans);
  vcl_size_t B_start2 = viennacl::traits::start2(temp_trans);
  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(temp_trans);
  vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(temp_trans);
  vcl_size_t B_inc1 = viennacl::traits::stride1(temp_trans);
  vcl_size_t B_inc2 = viennacl::traits::stride2(temp_trans);

  const vcl_size_t sub_mat_size = 64; //The matrix will be divided into sub-matrices for better storage access.

  // Number of full tiles per dimension, plus the leftover rows/columns that do not fill a tile:
  vcl_size_t row_count = A_size1 / sub_mat_size;
  vcl_size_t col_count = A_size2 / sub_mat_size;

  vcl_size_t row_count_remainder = A_size1 % sub_mat_size;
  vcl_size_t col_count_remainder = A_size2 % sub_mat_size;

  if (proxy.lhs().row_major())
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
    for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
    {
      // Decode the flat tile index into a (tile-row, tile-column) pair:
      vcl_size_t row = vcl_size_t(i) / col_count;
      vcl_size_t col = vcl_size_t(i) % col_count;

      // wrapper_A views tile (row, col) of the source; wrapper_B views the mirrored tile (col, row) of the destination.
      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
                                                                               , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
                                                                               , A_inc2, A_internal_size1, A_internal_size2);
      detail::matrix_array_wrapper<value_type      , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
                                                                               , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
                                                                               , B_inc2, B_internal_size1, B_internal_size2);
      for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
        for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
          wrapper_B(j, k) = wrapper_A(k, j);
    }
    { //This is the transposition of the remainder on the right side of the matrix
      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1
                                                                               , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
                                                                               , A_inc2, A_internal_size1, A_internal_size2);
      detail::matrix_array_wrapper<value_type      , row_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col_count * sub_mat_size)
                                                                               , B_start2, B_inc1
                                                                               , B_inc2, B_internal_size1, B_internal_size2);
      // The right strip spans the full height A_size1 of the matrix:
      for(vcl_size_t j = 0; j < col_count_remainder; ++j)
        for(vcl_size_t k = 0 ; k < A_size1; ++k)
          wrapper_B(j, k) = wrapper_A(k, j);
    }
    { //This is the transposition of the remainder on the bottom side of the matrix
      detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
                                                                               , A_start2, A_inc1
                                                                               , A_inc2, A_internal_size1, A_internal_size2);
      detail::matrix_array_wrapper<value_type      , row_major, false> wrapper_B(data_B, B_start1
                                                                               , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1
                                                                               , B_inc2, B_internal_size1, B_internal_size2);
      // Columns already covered by the right-strip pass are excluded here (hence A_size2 - col_count_remainder):
      for(vcl_size_t j = 0; j < row_count_remainder; ++j)
        for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
          wrapper_B(k, j) = wrapper_A(j, k);
    }
  }
  else
  {
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
    for(long i = 0; i < static_cast<long>(row_count*col_count); ++i)//This is the main part of the transposition
    {
      vcl_size_t row = vcl_size_t(i) / col_count;
      vcl_size_t col = vcl_size_t(i) % col_count;

      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row * sub_mat_size)
                                                                                  , A_start2 + A_inc2 * (col * sub_mat_size), A_inc1
                                                                                  , A_inc2, A_internal_size1, A_internal_size2);
      detail::matrix_array_wrapper<value_type      , column_major, false> wrapper_B(data_B, B_start1 + B_inc1 * (col * sub_mat_size)
                                                                                  , B_start2 + B_inc2 * (row * sub_mat_size), B_inc1
                                                                                  , B_inc2, B_internal_size1, B_internal_size2);
      // Loop order is swapped relative to the row-major branch to favor column-major storage access:
      for(vcl_size_t j = 0; j < (sub_mat_size); ++j)
        for(vcl_size_t k = 0; k < (sub_mat_size); ++k)
          wrapper_B(k, j)=wrapper_A(j, k);
    }
    { //This is the transposition of the remainder on the right side of the matrix
      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1
                                                                                  , A_start2 + A_inc2 * (col_count * sub_mat_size), A_inc1
                                                                                  , A_inc2, A_internal_size1, A_internal_size2);
      detail::matrix_array_wrapper<value_type      , column_major, false> wrapper_B(data_B,B_start1 + B_inc1 * (col_count * sub_mat_size)
                                                                                  , B_start2, B_inc1
                                                                                  , B_inc2, B_internal_size1, B_internal_size2);
      for(vcl_size_t j = 0; j < col_count_remainder; ++j)
        for(vcl_size_t k = 0; k < A_size1; ++k)
          wrapper_B(j, k)=wrapper_A(k, j);
    }
    { //This is the transposition of the remainder on the bottom side of the matrix
      detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size)
                                                                                  , A_start2, A_inc1
                                                                                  , A_inc2, A_internal_size1, A_internal_size2);
      detail::matrix_array_wrapper<value_type      , column_major, false> wrapper_B(data_B, B_start1
                                                                                  , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1
                                                                                  , B_inc2, B_internal_size1, B_internal_size2);
      for(vcl_size_t j = 0; j < row_count_remainder; ++j)
        for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k)
          wrapper_B(k, j)=wrapper_A(j, k);
    }
  }
}
+ { //This is the transposition of the remainder on the bottom side of the matrix + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1 + A_inc1 * (row_count * sub_mat_size) + , A_start2, A_inc1 + , A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type , column_major, false> wrapper_B(data_B, B_start1 + , B_start2 + B_inc2 * (row_count * sub_mat_size), B_inc1 + , B_inc2, B_internal_size1, B_internal_size2); + for(vcl_size_t j = 0; j < row_count_remainder; ++j) + for(vcl_size_t k = 0; k < (A_size2 - col_count_remainder); ++k) + wrapper_B(k, j)=wrapper_A(j, k); + } + } +} + +template<typename NumericT, typename ScalarT1> +void am(matrix_base<NumericT> & mat1, + matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha) +{ + assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + value_type * data_A = detail::extract_raw_pointer<value_type>(mat1); + value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2); + + value_type data_alpha = alpha; + if (flip_sign_alpha) + data_alpha = -data_alpha; + + vcl_size_t A_start1 = viennacl::traits::start1(mat1); + vcl_size_t A_start2 = viennacl::traits::start2(mat1); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); + vcl_size_t A_size1 = viennacl::traits::size1(mat1); + vcl_size_t A_size2 = viennacl::traits::size2(mat1); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); + + vcl_size_t B_start1 = viennacl::traits::start1(mat2); + vcl_size_t B_start2 = viennacl::traits::start2(mat2); + vcl_size_t B_inc1 = viennacl::traits::stride1(mat2); + vcl_size_t B_inc2 = viennacl::traits::stride2(mat2); + vcl_size_t B_internal_size1 
= viennacl::traits::internal_size1(mat2); + vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(mat2); + + if (mat1.row_major()) + { + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + + if (reciprocal_alpha) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + for (vcl_size_t col = 0; col < A_size2; ++col) + wrapper_A(row, col) = wrapper_B(row, col) / data_alpha; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + for (vcl_size_t col = 0; col < A_size2; ++col) + wrapper_A(row, col) = wrapper_B(row, col) * data_alpha; + } + } + else + { + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + + if (reciprocal_alpha) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) + for (vcl_size_t row = 0; row < A_size1; ++row) + wrapper_A(row, col) = wrapper_B(row, col) / data_alpha; + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) + for (vcl_size_t row = 0; row < A_size1; ++row) + wrapper_A(row, 
/** @brief Computes mat1 = alpha * mat2 + beta * mat3 element-wise.
*
* Each scaling factor can independently be sign-flipped and/or applied as a reciprocal
* (i.e. division instead of multiplication), giving the four loop variants below.
*
* @param mat1  destination matrix; mat2 and mat3 must use the same storage layout
*/
template<typename NumericT,
         typename ScalarT1, typename ScalarT2>
void ambm(matrix_base<NumericT> & mat1,
          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
          matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
{
  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));

  typedef NumericT value_type;

  value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);

  // Fold the sign flips into local copies of the factors:
  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;

  value_type data_beta = beta;
  if (flip_sign_beta)
    data_beta = -data_beta;

  // Geometry of the destination (A) and the two sources (B, C):
  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);

  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);

  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
  vcl_size_t C_inc1   = viennacl::traits::stride1(mat3);
  vcl_size_t C_inc2   = viennacl::traits::stride2(mat3);
  vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(mat3);
  vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(mat3);

  if (mat1.row_major())
  {
    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);

    // The four branches only differ in whether alpha and/or beta divide or multiply:
    if (reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
    }
    else if (!reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (!reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
    }
  }
  else
  {
    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);

    // Column-major: iterate columns in the outer (parallelized) loop.
    if (reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) = wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
    }
    else if (!reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (!reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) = wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
    }
  }

}
/** @brief Computes mat1 += alpha * mat2 + beta * mat3 element-wise (accumulating variant of ambm).
*
* Each scaling factor can independently be sign-flipped and/or applied as a reciprocal
* (division instead of multiplication), giving the four loop variants below.
*
* @param mat1  destination matrix (accumulated into); mat2 and mat3 must use the same storage layout
*/
template<typename NumericT,
         typename ScalarT1, typename ScalarT2>
void ambm_m(matrix_base<NumericT> & mat1,
            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha,
            matrix_base<NumericT> const & mat3, ScalarT2 const & beta, vcl_size_t /*len_beta*/, bool reciprocal_beta, bool flip_sign_beta)
{
  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));

  typedef NumericT value_type;

  value_type       * data_A = detail::extract_raw_pointer<value_type>(mat1);
  value_type const * data_B = detail::extract_raw_pointer<value_type>(mat2);
  value_type const * data_C = detail::extract_raw_pointer<value_type>(mat3);

  // Fold the sign flips into local copies of the factors:
  value_type data_alpha = alpha;
  if (flip_sign_alpha)
    data_alpha = -data_alpha;

  value_type data_beta = beta;
  if (flip_sign_beta)
    data_beta = -data_beta;

  // Geometry of the destination (A) and the two sources (B, C):
  vcl_size_t A_start1 = viennacl::traits::start1(mat1);
  vcl_size_t A_start2 = viennacl::traits::start2(mat1);
  vcl_size_t A_inc1   = viennacl::traits::stride1(mat1);
  vcl_size_t A_inc2   = viennacl::traits::stride2(mat1);
  vcl_size_t A_size1  = viennacl::traits::size1(mat1);
  vcl_size_t A_size2  = viennacl::traits::size2(mat1);
  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat1);
  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat1);

  vcl_size_t B_start1 = viennacl::traits::start1(mat2);
  vcl_size_t B_start2 = viennacl::traits::start2(mat2);
  vcl_size_t B_inc1   = viennacl::traits::stride1(mat2);
  vcl_size_t B_inc2   = viennacl::traits::stride2(mat2);
  vcl_size_t B_internal_size1  = viennacl::traits::internal_size1(mat2);
  vcl_size_t B_internal_size2  = viennacl::traits::internal_size2(mat2);

  vcl_size_t C_start1 = viennacl::traits::start1(mat3);
  vcl_size_t C_start2 = viennacl::traits::start2(mat3);
  vcl_size_t C_inc1   = viennacl::traits::stride1(mat3);
  vcl_size_t C_inc2   = viennacl::traits::stride2(mat3);
  vcl_size_t C_internal_size1  = viennacl::traits::internal_size1(mat3);
  vcl_size_t C_internal_size2  = viennacl::traits::internal_size2(mat3);

  if (mat1.row_major())
  {
    detail::matrix_array_wrapper<value_type,       row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);

    // The four branches only differ in whether alpha and/or beta divide or multiply:
    if (reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
    }
    else if (!reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (!reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
    }
  }
  else
  {
    detail::matrix_array_wrapper<value_type,       column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2);

    // Column-major: iterate columns in the outer (parallelized) loop.
    if (reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) += wrapper_B(row, col) / data_alpha + wrapper_C(row, col) * data_beta;
    }
    else if (!reciprocal_alpha && reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta;
    }
    else if (!reciprocal_alpha && !reciprocal_beta)
    {
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long col = 0; col < static_cast<long>(A_size2); ++col)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta;
    }
  }

}
(vcl_size_t row = 0; row < A_size1; ++row) + wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) / data_beta; + } + else if (!reciprocal_alpha && !reciprocal_beta) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) + for (vcl_size_t row = 0; row < A_size1; ++row) + wrapper_A(row, col) += wrapper_B(row, col) * data_alpha + wrapper_C(row, col) * data_beta; + } + } + +} + + + + +template<typename NumericT> +void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false) +{ + typedef NumericT value_type; + + value_type * data_A = detail::extract_raw_pointer<value_type>(mat); + value_type alpha = static_cast<value_type>(s); + + vcl_size_t A_start1 = viennacl::traits::start1(mat); + vcl_size_t A_start2 = viennacl::traits::start2(mat); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat); + vcl_size_t A_size1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat); + vcl_size_t A_size2 = clear ? 
viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); + + if (mat.row_major()) + { + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + for (vcl_size_t col = 0; col < A_size2; ++col) + wrapper_A(static_cast<vcl_size_t>(row), col) = alpha; + //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] + // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha; + } + else + { + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) + for (vcl_size_t row = 0; row < A_size1; ++row) + wrapper_A(row, static_cast<vcl_size_t>(col)) = alpha; + //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] + // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha; + } +} + + + +template<typename NumericT> +void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s) +{ + typedef NumericT value_type; + + value_type * data_A = detail::extract_raw_pointer<value_type>(mat); + value_type alpha = static_cast<value_type>(s); + + vcl_size_t A_start1 = viennacl::traits::start1(mat); 
+ vcl_size_t A_start2 = viennacl::traits::start2(mat); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat); + vcl_size_t A_size1 = viennacl::traits::size1(mat); + //vcl_size_t A_size2 = viennacl::traits::size2(mat); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); + + if (mat.row_major()) + { + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + wrapper_A(row, row) = alpha; + } + else + { + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size1) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + wrapper_A(row, row) = alpha; + } +} + +template<typename NumericT> +void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat) +{ + typedef NumericT value_type; + + value_type *data_A = detail::extract_raw_pointer<value_type>(mat); + value_type const *data_vec = detail::extract_raw_pointer<value_type>(vec); + + vcl_size_t A_start1 = viennacl::traits::start1(mat); + vcl_size_t A_start2 = viennacl::traits::start2(mat); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat); + //vcl_size_t A_size1 = viennacl::traits::size1(mat); + //vcl_size_t A_size2 = viennacl::traits::size2(mat); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); + + 
/** @brief Reads the k-th diagonal of mat into vec.
*
* @param mat  source matrix
* @param k    diagonal offset: k >= 0 selects a super-diagonal (column offset k), k < 0 a sub-diagonal (row offset -k)
* @param vec  destination vector; its size determines how many diagonal entries are copied
*/
template<typename NumericT>
void matrix_diag_to_vector(const matrix_base<NumericT> & mat, int k, vector_base<NumericT> & vec)
{
  typedef NumericT value_type;

  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
  value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);

  vcl_size_t A_start1 = viennacl::traits::start1(mat);
  vcl_size_t A_start2 = viennacl::traits::start2(mat);
  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
  //vcl_size_t A_size1  = viennacl::traits::size1(mat);
  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);

  vcl_size_t v_start = viennacl::traits::start(vec);
  vcl_size_t v_inc   = viennacl::traits::stride(vec);
  vcl_size_t v_size  = viennacl::traits::size(vec);

  // Starting coordinates of the selected diagonal:
  vcl_size_t row_start = 0;
  vcl_size_t col_start = 0;

  if (k >= 0)
    col_start = static_cast<vcl_size_t>(k);
  else
    row_start = static_cast<vcl_size_t>(-k);

  if (mat.row_major())
  {
    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);

    for (vcl_size_t i = 0; i < v_size; ++i)
      data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
  }
  else
  {
    detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);

    for (vcl_size_t i = 0; i < v_size; ++i)
      data_vec[v_start + i * v_inc] = wrapper_A(row_start + i, col_start + i);
  }
}

/** @brief Copies row i of mat into vec.
*
* NOTE(review): this definition is truncated at the end of the visible chunk — the column-major
* branch continues beyond this view and is reproduced here only as far as it is visible.
*/
template<typename NumericT>
void matrix_row(const matrix_base<NumericT> & mat, unsigned int i, vector_base<NumericT> & vec)
{
  typedef NumericT value_type;

  value_type const * data_A = detail::extract_raw_pointer<value_type>(mat);
  value_type * data_vec = detail::extract_raw_pointer<value_type>(vec);

  vcl_size_t A_start1 = viennacl::traits::start1(mat);
  vcl_size_t A_start2 = viennacl::traits::start2(mat);
  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
  //vcl_size_t A_size1  = viennacl::traits::size1(mat);
  //vcl_size_t A_size2  = viennacl::traits::size2(mat);
  vcl_size_t A_internal_size1  = viennacl::traits::internal_size1(mat);
  vcl_size_t A_internal_size2  = viennacl::traits::internal_size2(mat);

  vcl_size_t v_start = viennacl::traits::start(vec);
  vcl_size_t v_inc   = viennacl::traits::stride(vec);
  vcl_size_t v_size  = viennacl::traits::size(vec);

  if (mat.row_major())
  {
    detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);

    for (vcl_size_t j = 0; j < v_size; ++j)
      data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j);
  }
  else
  {
    detail::matrix_array_wrapper<value_type const,
column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + + for (vcl_size_t j = 0; j < v_size; ++j) + data_vec[v_start + j * v_inc] = wrapper_A(static_cast<vcl_size_t>(i), j); + } +} + +template<typename NumericT> +void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec) +{ + typedef NumericT value_type; + + value_type const * data_A = detail::extract_raw_pointer<value_type>(mat); + value_type * data_vec = detail::extract_raw_pointer<value_type>(vec); + + vcl_size_t A_start1 = viennacl::traits::start1(mat); + vcl_size_t A_start2 = viennacl::traits::start2(mat); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat); + //vcl_size_t A_size1 = viennacl::traits::size1(mat); + //vcl_size_t A_size2 = viennacl::traits::size2(mat); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat); + + vcl_size_t v_start = viennacl::traits::start(vec); + vcl_size_t v_inc = viennacl::traits::stride(vec); + vcl_size_t v_size = viennacl::traits::size(vec); + + if (mat.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + + for (vcl_size_t i = 0; i < v_size; ++i) + data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j)); + } + else + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + + for (vcl_size_t i = 0; i < v_size; ++i) + data_vec[v_start + i * v_inc] = wrapper_A(i, static_cast<vcl_size_t>(j)); + } +} + +// +///////////////////////// Element-wise operation ////////////////////////////////// +// + +// Binary operations A = B .* C and A = B ./ C + +/** @brief Implementation of the element-wise operations 
A = B .* C and A = B ./ C (using MATLAB syntax) +* +* @param A The result matrix (or -range, or -slice) +* @param proxy The proxy object holding B, C, and the operation +*/ +template<typename NumericT, typename OpT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_binary<OpT> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + typedef viennacl::linalg::detail::op_applier<op_element_binary<OpT> > OpFunctor; + + value_type * data_A = detail::extract_raw_pointer<value_type>(A); + value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs()); + value_type const * data_C = detail::extract_raw_pointer<value_type>(proxy.rhs()); + + vcl_size_t A_start1 = viennacl::traits::start1(A); + vcl_size_t A_start2 = viennacl::traits::start2(A); + vcl_size_t A_inc1 = viennacl::traits::stride1(A); + vcl_size_t A_inc2 = viennacl::traits::stride2(A); + vcl_size_t A_size1 = viennacl::traits::size1(A); + vcl_size_t A_size2 = viennacl::traits::size2(A); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); + + vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs()); + vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs()); + vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs()); + vcl_size_t B_inc2 = viennacl::traits::stride2(proxy.lhs()); + vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs()); + vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs()); + + vcl_size_t C_start1 = viennacl::traits::start1(proxy.rhs()); + vcl_size_t C_start2 = viennacl::traits::start2(proxy.rhs()); + vcl_size_t C_inc1 = viennacl::traits::stride1(proxy.rhs()); + vcl_size_t C_inc2 = 
viennacl::traits::stride2(proxy.rhs()); + vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(proxy.rhs()); + vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(proxy.rhs()); + + if (A.row_major()) + { + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + for (vcl_size_t col = 0; col < A_size2; ++col) + OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col)); + //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] + // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha + // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta; + } + else + { + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > 
VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) + for (vcl_size_t row = 0; row < A_size1; ++row) + OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col), wrapper_C(row, col)); + + //data_A[index_generator_A::mem_index(row * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] + // = data_B[index_generator_B::mem_index(row * B_inc1 + B_start1, col * B_inc2 + B_start2, B_internal_size1, B_internal_size2)] * alpha + // + data_C[index_generator_C::mem_index(row * C_inc1 + C_start1, col * C_inc2 + C_start2, C_internal_size1, C_internal_size2)] * beta; + } +} + +// Unary operations + +// A = op(B) +template<typename NumericT, typename OpT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<OpT> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + typedef viennacl::linalg::detail::op_applier<op_element_unary<OpT> > OpFunctor; + + value_type * data_A = detail::extract_raw_pointer<value_type>(A); + value_type const * data_B = detail::extract_raw_pointer<value_type>(proxy.lhs()); + + vcl_size_t A_start1 = viennacl::traits::start1(A); + vcl_size_t A_start2 = viennacl::traits::start2(A); + vcl_size_t A_inc1 = viennacl::traits::stride1(A); + vcl_size_t A_inc2 = viennacl::traits::stride2(A); + vcl_size_t A_size1 = viennacl::traits::size1(A); + vcl_size_t A_size2 = viennacl::traits::size2(A); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A); + + vcl_size_t B_start1 = viennacl::traits::start1(proxy.lhs()); + vcl_size_t B_start2 = viennacl::traits::start2(proxy.lhs()); + vcl_size_t B_inc1 = viennacl::traits::stride1(proxy.lhs()); + vcl_size_t B_inc2 = 
viennacl::traits::stride2(proxy.lhs()); + vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(proxy.lhs()); + vcl_size_t B_internal_size2 = viennacl::traits::internal_size2(proxy.lhs()); + + if (A.row_major()) + { + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + for (vcl_size_t col = 0; col < A_size2; ++col) + OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col)); + } + else + { + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) + for (vcl_size_t row = 0; row < A_size1; ++row) + OpFunctor::apply(wrapper_A(row, col), wrapper_B(row, col)); + } +} + + + +// +///////////////////////// matrix-vector products ///////////////////////////////// +// + +// A * x + +/** @brief Carries out matrix-vector multiplication +* +* Implementation of the convenience expression result = prod(mat, vec); +* +* @param mat The matrix +* @param trans Flag whether mat is to be transposed +* @param vec The vector +* @param result The result vector +*/ +template<typename NumericT> +void prod_impl(const matrix_base<NumericT> & mat, bool trans, + const vector_base<NumericT> & vec, + 
               vector_base<NumericT> & result)
{
  typedef NumericT value_type;

  value_type const * data_A      = detail::extract_raw_pointer<value_type>(mat);
  value_type const * data_x      = detail::extract_raw_pointer<value_type>(vec);
  value_type       * data_result = detail::extract_raw_pointer<value_type>(result);

  // memory layout of the matrix:
  vcl_size_t A_start1 = viennacl::traits::start1(mat);
  vcl_size_t A_start2 = viennacl::traits::start2(mat);
  vcl_size_t A_inc1   = viennacl::traits::stride1(mat);
  vcl_size_t A_inc2   = viennacl::traits::stride2(mat);
  vcl_size_t A_size1  = viennacl::traits::size1(mat);
  vcl_size_t A_size2  = viennacl::traits::size2(mat);
  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat);
  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat);

  // memory layout of the input vector:
  vcl_size_t start1 = viennacl::traits::start(vec);
  vcl_size_t inc1   = viennacl::traits::stride(vec);

  // memory layout of the result vector:
  vcl_size_t start2 = viennacl::traits::start(result);
  vcl_size_t inc2   = viennacl::traits::stride(result);

  if (mat.row_major())
  {
    if (trans)
    {
      // A^T * x with row-major A: the natural (sequential) traversal of A contributes
      // to ALL result entries, so each thread accumulates into a private buffer of
      // length A_size2 and the partial results are reduced afterwards.
      vcl_size_t thread_count = 1;
#ifdef VIENNACL_WITH_OPENMP
      if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
        thread_count = omp_get_max_threads();
#endif
      std::vector<value_type> temp_array(A_size2*thread_count, 0);  // one accumulation buffer per thread
      detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);

      for (vcl_size_t col = 0; col < A_size2; ++col)
        wrapper_res(col) = 0;

#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      {
        vcl_size_t id = 0;
#ifdef VIENNACL_WITH_OPENMP
        if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
          id = omp_get_thread_num();
#endif
        // each thread processes its contiguous slice of rows [begin, end):
        vcl_size_t begin = (A_size1 * id) / thread_count;
        vcl_size_t end   = (A_size1 * (id + 1)) / thread_count;

        detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_mat(data_A, A_start1 + A_inc1 * begin, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
        detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);

        for (vcl_size_t row = 0; row < (end - begin); ++row) //run through matrix sequentially
        {
          value_type temp = wrapper_vec(row);
          for (vcl_size_t col = 0; col < A_size2; ++col)
            temp_array[A_size2 * id + col] += wrapper_mat(row , col) * temp;
        }
      }
      // reduce the per-thread partial sums into the result vector:
      for (vcl_size_t id = 0; id < thread_count; ++id)
        for (vcl_size_t col = 0; col < A_size2; ++col)
          wrapper_res(col) += temp_array[A_size2 * id + col];
    }

    else
    {
      // A * x with row-major A: each result entry is an independent dot product,
      // so rows can be distributed over threads without extra buffers.
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size1); ++row)
      {
        value_type temp = 0;
        for (vcl_size_t col = 0; col < A_size2; ++col)
          temp += data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];

        data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
      }
    }
  }
  else
  {
    if (!trans)
    {
      // A * x with column-major A: sequential traversal runs down columns, each of
      // which contributes to ALL result entries -> per-thread buffers of length A_size1.
      vcl_size_t thread_count = 1;
#ifdef VIENNACL_WITH_OPENMP
      if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
        thread_count = omp_get_max_threads();
#endif
      std::vector<value_type> temp_array(A_size1*thread_count, 0);  // one accumulation buffer per thread
      detail::vector_array_wrapper<value_type> wrapper_res(data_result, start2, inc2);

      for (vcl_size_t row = 0; row < A_size1; ++row)
        wrapper_res(row) = 0;

#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      {
        vcl_size_t id = 0;
#ifdef VIENNACL_WITH_OPENMP
        if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
          id = omp_get_thread_num();
#endif
        // each thread processes its contiguous slice of columns [begin, end):
        vcl_size_t begin = (A_size2 * id) / thread_count;
        vcl_size_t end   = (A_size2 * (id + 1)) / thread_count;

        detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_mat(data_A, A_start1, A_start2 + A_inc2 * begin, A_inc1, A_inc2, A_internal_size1, A_internal_size2);
        detail::vector_array_wrapper<value_type const> wrapper_vec(data_x, start1 + inc1 * begin, inc1);

        for (vcl_size_t col = 0; col < (end - begin); ++col) //run through matrix sequentially
        {
          value_type temp = wrapper_vec(col);
          for (vcl_size_t row = 0; row < A_size1; ++row)
            temp_array[A_size1 * id + row] += wrapper_mat(row , col) * temp;
        }
      }
      // reduce the per-thread partial sums into the result vector:
      for (vcl_size_t id = 0; id < thread_count; ++id)
        for (vcl_size_t row = 0; row < A_size1; ++row)
          wrapper_res(row) += temp_array[A_size1 * id + row];
    }
    else
    {
      // A^T * x with column-major A: each result entry is an independent dot product
      // over a contiguous column of A -> rows of A^T can be distributed over threads.
#ifdef VIENNACL_WITH_OPENMP
      #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
      for (long row = 0; row < static_cast<long>(A_size2); ++row)
      {
        value_type temp = 0;
        for (vcl_size_t col = 0; col < A_size1; ++col)
          temp += data_A[viennacl::column_major::mem_index(col * A_inc1 + A_start1, static_cast<vcl_size_t>(row) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] * data_x[col * inc1 + start1];

        data_result[static_cast<vcl_size_t>(row) * inc2 + start2] = temp;
      }
    }
  }
}



//
///////////////////////// matrix-matrix products /////////////////////////////////
//

namespace detail
{
  /** @brief Blocked implementation of C = beta*C + alpha*A*B for generic matrix accessors.
  *
  * Works on 64x64 blocks copied into thread-local, densely packed buffers so that the
  * innermost loop streams over contiguous memory regardless of the operands' layouts.
  * Transposition is handled by the accessor types, not in here.
  */
  template<typename MatrixAccT1, typename MatrixAccT2, typename MatrixAccT3, typename NumericT>
  void prod(MatrixAccT1 & A, MatrixAccT2 & B, MatrixAccT3 & C,
            vcl_size_t C_size1, vcl_size_t C_size2, vcl_size_t A_size2,
            NumericT alpha, NumericT beta)
  {
    if (C_size1 == 0 || C_size2 == 0 || A_size2 == 0)
      return;

    static const vcl_size_t blocksize = 64;

    // number of blocks per dimension, rounding up for partial blocks at the borders:
    vcl_size_t num_blocks_C1 = (C_size1 - 1) / blocksize + 1;
    vcl_size_t num_blocks_C2 = (C_size2 - 1) / blocksize + 1;
    vcl_size_t num_blocks_A2 = (A_size2 - 1) / blocksize + 1;

    //
    // outer loop pair: Run over all blocks with indices (block_idx_i, block_idx_j) of the result matrix C:
    //
#ifdef VIENNACL_WITH_OPENMP
    #pragma omp parallel for if ((C_size1*C_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE)
#endif
    for (long block_idx_i2=0; block_idx_i2<static_cast<long>(num_blocks_C1); ++block_idx_i2)
    {
      // thread-local auxiliary buffers
      std::vector<NumericT> buffer_A(blocksize * blocksize); // row-major
      std::vector<NumericT> buffer_B(blocksize * blocksize); // column-major
      std::vector<NumericT> buffer_C(blocksize * blocksize); // row-major

      vcl_size_t block_idx_i = static_cast<vcl_size_t>(block_idx_i2);
      for (vcl_size_t block_idx_j=0; block_idx_j<num_blocks_C2; ++block_idx_j)
      {
        // Reset block matrix:
        std::fill(buffer_C.begin(), buffer_C.end(), NumericT(0));

        vcl_size_t offset_i = block_idx_i*blocksize;
        vcl_size_t offset_j = block_idx_j*blocksize;

        // C(block_idx_i, block_idx_j) += A(block_idx_i, block_idx_k) * B(block_idx_k, block_idx_j)
        for (vcl_size_t block_idx_k=0; block_idx_k<num_blocks_A2; ++block_idx_k)
        {
          // flush buffers (partial border blocks rely on the zero padding):
          std::fill(buffer_A.begin(), buffer_A.end(), NumericT(0));
          std::fill(buffer_B.begin(), buffer_B.end(), NumericT(0));

          vcl_size_t offset_k = block_idx_k*blocksize;

          // load current data:
          for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
            for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
              buffer_A[(i - offset_i) * blocksize + (k - offset_k)] = A(i, k);

          for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
            for (vcl_size_t k = offset_k; k < std::min(offset_k + blocksize, A_size2); ++k)
              buffer_B[(k - offset_k) + (j - offset_j) * blocksize] = B(k, j);

          // multiply (this is the hot spot in terms of flops):
          // buffer_A rows and buffer_B columns are both contiguous, so the k-loop streams linearly.
          for (vcl_size_t i = 0; i < blocksize; ++i)
          {
            NumericT const * ptrA = &(buffer_A[i*blocksize]);
            for (vcl_size_t j = 0; j < blocksize; ++j)
            {
              NumericT const * ptrB = &(buffer_B[j*blocksize]);

              NumericT temp = NumericT(0);
              for (vcl_size_t k = 0; k < blocksize; ++k)
                temp += ptrA[k] * ptrB[k]; // buffer_A[i*blocksize + k] * buffer_B[k + j*blocksize];

              buffer_C[i*blocksize + j] += temp;
            }
          }
        }

        // write result:
        // (beta != 0 is expressed via two comparisons, presumably to stay warning-free
        //  for unsigned NumericT — NOTE(review): confirm against upstream intent)
        if (beta > 0 || beta < 0)
        {
          // C <- beta*C + alpha*(block product)
          for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
            for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
              C(i,j) = beta * C(i,j) + alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
        }
        else
        {
          // beta == 0: overwrite C without reading its (possibly uninitialized) contents
          for (vcl_size_t i = offset_i; i < std::min(offset_i + blocksize, C_size1); ++i)
            for (vcl_size_t j = offset_j; j < std::min(offset_j + blocksize, C_size2); ++j)
              C(i,j) = alpha * buffer_C[(i - offset_i) * blocksize + (j - offset_j)];
        }

      } // for block j
    } // for block i

  } // prod()

} // namespace detail

/** @brief Carries out matrix-matrix multiplication
*
* Implementation of C = prod(A, B);
*
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2 >
void prod_impl(const matrix_base<NumericT> & A, bool trans_A,
               const matrix_base<NumericT> & B, bool trans_B,
               matrix_base<NumericT> & C,
               ScalarT1 alpha,
               ScalarT2 beta)
{
  typedef NumericT value_type;

  value_type const * data_A = detail::extract_raw_pointer<value_type>(A);
  value_type const * data_B = detail::extract_raw_pointer<value_type>(B);
  value_type       * data_C = detail::extract_raw_pointer<value_type>(C);

  vcl_size_t A_start1 = viennacl::traits::start1(A);
  vcl_size_t A_start2 = viennacl::traits::start2(A);
  vcl_size_t A_inc1   = viennacl::traits::stride1(A);
  vcl_size_t A_inc2   = viennacl::traits::stride2(A);
  vcl_size_t A_size1  = viennacl::traits::size1(A);
  vcl_size_t A_size2  = viennacl::traits::size2(A);
  vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(A);
  vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(A);

  vcl_size_t B_start1 = viennacl::traits::start1(B);
  vcl_size_t B_start2 = viennacl::traits::start2(B);
  vcl_size_t B_inc1   = viennacl::traits::stride1(B);
  vcl_size_t B_inc2   = viennacl::traits::stride2(B);
  vcl_size_t B_internal_size1 = viennacl::traits::internal_size1(B);
  vcl_size_t
B_internal_size2 = viennacl::traits::internal_size2(B); + + vcl_size_t C_start1 = viennacl::traits::start1(C); + vcl_size_t C_start2 = viennacl::traits::start2(C); + vcl_size_t C_inc1 = viennacl::traits::stride1(C); + vcl_size_t C_inc2 = viennacl::traits::stride2(C); + vcl_size_t C_size1 = viennacl::traits::size1(C); + vcl_size_t C_size2 = viennacl::traits::size2(C); + vcl_size_t C_internal_size1 = viennacl::traits::internal_size1(C); + vcl_size_t C_internal_size2 = viennacl::traits::internal_size2(C); + + if (!trans_A && !trans_B) + { + if (A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && C.row_major()) + { + 
detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if 
(!A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + } + 
else if (!trans_A && trans_B) + { + if (A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, 
static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, 
wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else + { + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size2, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + } + else if (trans_A && !trans_B) + { + if (A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); 
+ + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, 
C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, 
false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, false> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + } + else if (trans_A && trans_B) + { + if (A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + 
detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (A.row_major() && !B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, 
B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && B.row_major() && !C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, row_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else if (!A.row_major() && !B.row_major() && C.row_major()) + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, row_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + else + { + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_A(data_A, A_start1, A_start2, A_inc1, A_inc2, A_internal_size1, A_internal_size2); + detail::matrix_array_wrapper<value_type const, column_major, true> wrapper_B(data_B, B_start1, B_start2, B_inc1, 
B_inc2, B_internal_size1, B_internal_size2); + detail::matrix_array_wrapper<value_type, column_major, false> wrapper_C(data_C, C_start1, C_start2, C_inc1, C_inc2, C_internal_size1, C_internal_size2); + + detail::prod(wrapper_A, wrapper_B, wrapper_C, C_size1, C_size2, A_size1, static_cast<value_type>(alpha), static_cast<value_type>(beta)); + } + } +} + + + + +// +///////////////////////// miscellaneous operations ///////////////////////////////// +// + + +/** @brief The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update +* +* Implementation of the convenience expression result += alpha * outer_prod(vec1, vec2); +* +* @param mat1 The matrix to be updated +* @param alpha The scaling factor (either a viennacl::scalar<>, float, or double) +* @param reciprocal_alpha Use 1/alpha instead of alpha +* @param flip_sign_alpha Use -alpha instead of alpha +* @param vec1 The first vector +* @param vec2 The second vector +*/ +template<typename NumericT, typename ScalarT> +void scaled_rank_1_update(matrix_base<NumericT> & mat1, + ScalarT const & alpha, vcl_size_t /*len_alpha*/, bool reciprocal_alpha, bool flip_sign_alpha, + const vector_base<NumericT> & vec1, + const vector_base<NumericT> & vec2) +{ + typedef NumericT value_type; + + value_type * data_A = detail::extract_raw_pointer<value_type>(mat1); + value_type const * data_v1 = detail::extract_raw_pointer<value_type>(vec1); + value_type const * data_v2 = detail::extract_raw_pointer<value_type>(vec2); + + vcl_size_t A_start1 = viennacl::traits::start1(mat1); + vcl_size_t A_start2 = viennacl::traits::start2(mat1); + vcl_size_t A_inc1 = viennacl::traits::stride1(mat1); + vcl_size_t A_inc2 = viennacl::traits::stride2(mat1); + vcl_size_t A_size1 = viennacl::traits::size1(mat1); + vcl_size_t A_size2 = viennacl::traits::size2(mat1); + vcl_size_t A_internal_size1 = viennacl::traits::internal_size1(mat1); + vcl_size_t A_internal_size2 = viennacl::traits::internal_size2(mat1); + + vcl_size_t start1 
= viennacl::traits::start(vec1); + vcl_size_t inc1 = viennacl::traits::stride(vec1); + + vcl_size_t start2 = viennacl::traits::start(vec2); + vcl_size_t inc2 = viennacl::traits::stride(vec2); + + value_type data_alpha = alpha; + if (flip_sign_alpha) + data_alpha = -data_alpha; + + if (mat1.row_major()) + { + if(reciprocal_alpha) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + { + value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] / data_alpha; + for (vcl_size_t col = 0; col < A_size2; ++col) + data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2]; + } + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long row = 0; row < static_cast<long>(A_size1); ++row) + { + value_type value_v1 = data_v1[static_cast<vcl_size_t>(row) * inc1 + start1] * data_alpha; + for (vcl_size_t col = 0; col < A_size2; ++col) + data_A[viennacl::row_major::mem_index(static_cast<vcl_size_t>(row) * A_inc1 + A_start1, col * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += value_v1 * data_v2[col * inc2 + start2]; + } + } + } + else + { + if(reciprocal_alpha) + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially + { + value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] / data_alpha; + for (vcl_size_t row = 0; row < A_size1; ++row) + data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_internal_size1, A_internal_size2)] += data_v1[row * inc1 + 
start1] * value_v2; + } + } + else + { +#ifdef VIENNACL_WITH_OPENMP + #pragma omp parallel for if ((A_size1*A_size2) > VIENNACL_OPENMP_MATRIX_MIN_SIZE) +#endif + for (long col = 0; col < static_cast<long>(A_size2); ++col) //run through matrix sequentially + { + value_type value_v2 = data_v2[static_cast<vcl_size_t>(col) * inc2 + start2] * data_alpha; + for (vcl_size_t row = 0; row < A_size1; ++row) + data_A[viennacl::column_major::mem_index(row * A_inc1 + A_start1, static_cast<vcl_size_t>(col) * A_inc2 + A_start2, A_intern
<TRUNCATED>
