http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp new file mode 100644 index 0000000..912d24d --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/matrix_operations.hpp @@ -0,0 +1,2725 @@ +#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_ +#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +/** @file viennacl/linalg/cuda/matrix_operations.hpp + @brief Implementations of dense matrix related operations, including matrix-vector products, using CUDA. 
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/vector_proxy.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/meta/enable_if.hpp"
#include "viennacl/meta/predicate.hpp"
#include "viennacl/meta/result_of.hpp"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/start.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/stride.hpp"

#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/linalg/cuda/vector_operations.hpp"
#include "viennacl/linalg/cuda/matrix_operations_row.hpp"
#include "viennacl/linalg/cuda/matrix_operations_col.hpp"
#include "viennacl/linalg/cuda/matrix_operations_prod.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
//

/** @brief Copies the entries of mat2 into mat1, converting each entry from SrcNumericT to DestNumericT.
*
* Both matrices must use the same storage layout; mixed row-/column-major conversion is not implemented.
* The kernel is launched with a fixed <<<128, 128>>> configuration and iterates over the whole matrix internally.
*
* @param mat1   Destination matrix (DestNumericT entries)
* @param mat2   Source matrix (SrcNumericT entries)
*/
template<typename DestNumericT, typename SrcNumericT>
void convert(matrix_base<DestNumericT> & mat1, matrix_base<SrcNumericT> const & mat2)
{
  // Assert message previously said "Addition/subtraction ..." (copy-paste from am()/ambm()); corrected to describe conversion.
  assert(mat1.row_major() == mat2.row_major() && bool("Conversion between mixed matrix layouts not supported yet!"));

  if (mat1.row_major())
  {
    convert_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                     static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                     static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                     static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                     static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                     viennacl::cuda_arg(mat2),
                                     static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                     static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                     static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
                                    );
    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_row_kernel");
  }
  else
  {
    convert_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                     static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                     static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                     static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                     static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                     viennacl::cuda_arg(mat2),
                                     static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                     static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                     static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
                                    );
    VIENNACL_CUDA_LAST_ERROR_CHECK("convert_col_kernel");
  }
}

/** @brief Writes the transpose of proxy.lhs() into temp_trans.
*
* The source layout (row-/column-major) is passed to the kernel as a flag, so source and destination
* may use different layouts. Launched with a fixed <<<128, 128>>> configuration.
*
* @param proxy       Expression wrapper holding the matrix to be transposed (proxy.lhs())
* @param temp_trans  Matrix receiving the transposed entries
*/
template<typename NumericT, typename SizeT, typename DistanceT>
void trans(matrix_expression<const matrix_base<NumericT, SizeT, DistanceT>, const matrix_base<NumericT, SizeT, DistanceT>, op_trans> const & proxy,
           matrix_base<NumericT> & temp_trans)
{
  trans_kernel<<<128, 128>>>(viennacl::cuda_arg(proxy.lhs()),
                             static_cast<unsigned int>(proxy.lhs().start1()),          static_cast<unsigned int>(proxy.lhs().start2()),
                             static_cast<unsigned int>(proxy.lhs().internal_size1()),  static_cast<unsigned int>(proxy.lhs().internal_size2()),
                             static_cast<unsigned int>(proxy.lhs().size1()),           static_cast<unsigned int>(proxy.lhs().size2()),
                             static_cast<unsigned int>(proxy.lhs().stride1()),         static_cast<unsigned int>(proxy.lhs().stride2()),

                             viennacl::cuda_arg(temp_trans),
                             static_cast<unsigned int>(temp_trans.start1()),           static_cast<unsigned int>(temp_trans.start2()),
                             static_cast<unsigned int>(temp_trans.internal_size1()),   static_cast<unsigned int>(temp_trans.internal_size2()),
                             static_cast<unsigned int>(temp_trans.stride1()),          static_cast<unsigned int>(temp_trans.stride2()),
                             static_cast<bool>(proxy.lhs().row_major()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("trans_kernel");
}
/** @brief Computes mat1 = alpha * mat2 (with optional reciprocal/sign-flip of alpha encoded in the option bits).
*
* @param mat1              Destination matrix
* @param mat2              Source matrix
* @param alpha             Scaling factor (CPU or GPU scalar)
* @param len_alpha         Length flag folded into the kernel option bits via detail::make_options
* @param reciprocal_alpha  If true, the kernel divides by alpha instead of multiplying
* @param flip_sign_alpha   If true, the kernel negates alpha
*/
template<typename NumericT, typename ScalarT>
void am(matrix_base<NumericT> & mat1,
        matrix_base<NumericT> const & mat2, ScalarT const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
{
  assert(mat1.row_major() == mat2.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));

  typedef NumericT value_type;

  unsigned int const options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // For a CPU scalar, a host-side copy is handed to the kernel through detail::arg_reference;
  // for a GPU scalar the device handle is forwarded instead.
  value_type host_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT>::value)
    host_alpha = alpha;

  bool const row_major = mat1.row_major();
  if (row_major)
  {
    am_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, host_alpha)),
                                options_alpha,
                                viennacl::cuda_arg(mat2),
                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
  }
  else
  {
    am_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, host_alpha)),
                                options_alpha,
                                viennacl::cuda_arg(mat2),
                                static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
  }
}


/** @brief Computes mat1 = alpha * mat2 + beta * mat3 (reciprocal/sign-flip of each factor encoded in option bits).
*
* @param mat1  Destination matrix
* @param mat2  First source matrix, scaled by alpha
* @param mat3  Second source matrix, scaled by beta
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void ambm(matrix_base<NumericT> & mat1,
          matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
          matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));

  typedef NumericT value_type;

  unsigned int const options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
  unsigned int const options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);

  // Host-side staging for CPU scalars (see am() above); GPU scalars pass their device handle.
  value_type host_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    host_alpha = alpha;

  value_type host_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    host_beta = beta;

  bool const row_major = mat1.row_major();
  if (row_major)
  {
    ambm_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, host_alpha)),
                                  options_alpha,
                                  viennacl::cuda_arg(mat2),
                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),

                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, host_beta)),
                                  options_beta,
                                  viennacl::cuda_arg(mat3),
                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
                                 );
    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
  }
  else
  {
    ambm_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                  static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                  static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                  static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                  viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, host_alpha)),
                                  options_alpha,
                                  viennacl::cuda_arg(mat2),
                                  static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                  static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),

                                  viennacl::cuda_arg<value_type>(detail::arg_reference(beta, host_beta)),
                                  options_beta,
                                  viennacl::cuda_arg(mat3),
                                  static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
                                  static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
                                 );
    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
  }
}
/** @brief Computes mat1 += alpha * mat2 + beta * mat3 (in-place accumulating variant of ambm).
*
* The option bits produced by detail::make_options encode length, reciprocal, and sign-flip
* handling for alpha and beta. All three matrices must share the same storage layout.
*
* @param mat1  Destination matrix (accumulated into)
* @param mat2  First source matrix, scaled by alpha
* @param mat3  Second source matrix, scaled by beta
*/
template<typename NumericT, typename ScalarT1, typename ScalarT2>
void ambm_m(matrix_base<NumericT> & mat1,
            matrix_base<NumericT> const & mat2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
            matrix_base<NumericT> const & mat3, ScalarT2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
{
  assert(mat1.row_major() == mat2.row_major() && mat1.row_major() == mat3.row_major() && bool("Addition/subtraction on mixed matrix layouts not supported yet!"));

  typedef NumericT value_type;

  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // CPU scalars are staged in a host temporary and handed over via detail::arg_reference.
  value_type temporary_alpha = 0;
  if (viennacl::is_cpu_scalar<ScalarT1>::value)
    temporary_alpha = alpha;

  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);

  value_type temporary_beta = 0;
  if (viennacl::is_cpu_scalar<ScalarT2>::value)
    temporary_beta = beta;

  if (mat1.row_major())
  {
    ambm_m_row_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                                    options_alpha,
                                    viennacl::cuda_arg(mat2),
                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),

                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                                    options_beta,
                                    viennacl::cuda_arg(mat3),
                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
                                   );
    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
  }
  else
  {
    ambm_m_col_kernel<<<128, 128>>>(viennacl::cuda_arg(mat1),
                                    static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
                                    static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
                                    static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),

                                    viennacl::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
                                    options_alpha,
                                    viennacl::cuda_arg(mat2),
                                    static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
                                    static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),

                                    viennacl::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
                                    options_beta,
                                    viennacl::cuda_arg(mat3),
                                    static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
                                    static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
                                   );
    VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
  }
}




/** @brief Assigns the scalar s to all entries of mat.
*
* @param mat    Matrix to fill
* @param s      Value written to every entry
* @param clear  If true, the padded region is written as well (internal sizes are used as bounds),
*               which fully zeroes/initializes the underlying buffer including padding.
*/
template<typename NumericT>
void matrix_assign(matrix_base<NumericT> & mat, NumericT s, bool clear = false)
{
  typedef NumericT value_type;
  value_type alpha = s;

  // Explicit casts added for consistency with the rest of this file (vcl_size_t -> unsigned int narrowing).
  unsigned int s1 = clear ? static_cast<unsigned int>(viennacl::traits::internal_size1(mat)) : static_cast<unsigned int>(viennacl::traits::size1(mat));
  unsigned int s2 = clear ? static_cast<unsigned int>(viennacl::traits::internal_size2(mat)) : static_cast<unsigned int>(viennacl::traits::size2(mat));

  if (mat.row_major())
  {
    matrix_row_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                           s1, s2,
                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                           alpha);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
  }
  else
  {
    matrix_col_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                           s1, s2,
                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                           alpha);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
  }
}
/** @brief Writes the scalar s to the main diagonal entries of mat (off-diagonal entries are untouched). */
template<typename NumericT>
void matrix_diagonal_assign(matrix_base<NumericT> & mat, NumericT s)
{
  typedef NumericT value_type;
  value_type alpha = s;

  if (mat.row_major())
  {
    matrix_row_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                    alpha);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
  }
  else
  {
    matrix_col_diagonal_assign_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
                                                    static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
                                                    static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
                                                    static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
                                                    static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
                                                    alpha);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
  }
}


/** @brief Builds a matrix whose k-th diagonal holds the entries of vec; everything else is set to zero.
*
* k > 0 selects a super-diagonal, k < 0 a sub-diagonal, k == 0 the main diagonal.
* The diagonal is addressed as a strided 1D range over the matrix buffer and filled via av_kernel
* (i.e. diag = 1 * vec).
*/
template<typename NumericT>
void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT> & mat)
{
  typedef NumericT value_type;

  // Step 1: zero out the whole matrix.
  matrix_assign(mat, NumericT(0));

  // Step 2: overwrite the selected diagonal with vec.
  unsigned int options_alpha = 0;

  // Offset of the diagonal's first element: a negative k shifts down (rows), a positive k shifts right (columns).
  vcl_size_t const first_row_index = (k < 0) ? vcl_size_t(-k) : 0;
  vcl_size_t const first_col_index = (k < 0) ? 0 : vcl_size_t(k);

  vcl_size_t mat_start  = 0;
  vcl_size_t mat_stride = 0;
  vcl_size_t mat_size   = viennacl::traits::size(vec);
  if (mat.row_major())
  {
    // Row-major: one diagonal step moves one row down (internal_size2 elements) plus one column right.
    mat_start  = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
  }
  else
  {
    // Column-major: one diagonal step moves one column right (internal_size1 elements) plus one row down.
    mat_start  = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
  }

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(mat),
                          static_cast<unsigned int>(mat_start),
                          static_cast<unsigned int>(mat_stride),
                          static_cast<unsigned int>(mat_size),

                          viennacl::cuda_arg<value_type>(NumericT(1)),
                          options_alpha,
                          viennacl::cuda_arg(vec),
                          static_cast<unsigned int>(viennacl::traits::start(vec)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec)) );
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}

/** @brief Extracts the k-th diagonal of mat into vec (inverse of matrix_diag_from_vector).
*
* k > 0 selects a super-diagonal, k < 0 a sub-diagonal, k == 0 the main diagonal.
*/
template<typename NumericT>
void matrix_diag_to_vector(matrix_base<NumericT> const & mat, int k, vector_base<NumericT> & vec)
{
  typedef NumericT value_type;

  unsigned int options_alpha = 0;

  // Same diagonal addressing as in matrix_diag_from_vector, with mat now acting as the source.
  vcl_size_t const first_row_index = (k < 0) ? vcl_size_t(-k) : 0;
  vcl_size_t const first_col_index = (k < 0) ? 0 : vcl_size_t(k);

  vcl_size_t mat_start  = 0;
  vcl_size_t mat_stride = 0;
  if (mat.row_major())
  {
    mat_start  = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
                 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
    mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
  }
  else
  {
    mat_start  = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
                 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
  }

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
                          static_cast<unsigned int>(viennacl::traits::start(vec)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
                          static_cast<unsigned int>(viennacl::traits::size(vec)),

                          viennacl::cuda_arg<value_type>(NumericT(1)),
                          options_alpha,
                          viennacl::cuda_arg(mat),
                          static_cast<unsigned int>(mat_start),
                          static_cast<unsigned int>(mat_stride));
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}

/** @brief Copies row i of mat into vec, treating the row as a strided 1D range over the matrix buffer. */
template<typename NumericT>
void matrix_row(matrix_base<NumericT> const & mat, unsigned int i, vector_base<NumericT> & vec)
{
  typedef NumericT value_type;

  unsigned int options_alpha = 0;

  vcl_size_t mat_start  = 0;
  vcl_size_t mat_stride = 0;
  if (mat.row_major())
  {
    // Row-major: the row is contiguous up to the column stride.
    mat_start  = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
    mat_stride = viennacl::traits::stride2(mat);
  }
  else
  {
    // Column-major: consecutive row entries are internal_size1 elements apart.
    mat_start  = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
  }

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
                          static_cast<unsigned int>(viennacl::traits::start(vec)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
                          static_cast<unsigned int>(viennacl::traits::size(vec)),

                          viennacl::cuda_arg<value_type>(NumericT(1)),
                          options_alpha,
                          viennacl::cuda_arg(mat),
                          static_cast<unsigned int>(mat_start),
                          static_cast<unsigned int>(mat_stride));
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}
/** @brief Copies column j of mat into vec, treating the column as a strided 1D range over the matrix buffer.
*
* @param mat  Source matrix
* @param j    Column index (within the view described by start2/stride2)
* @param vec  Destination vector
*/
template<typename NumericT>
void matrix_column(const matrix_base<NumericT> & mat, unsigned int j, vector_base<NumericT> & vec)
{
  typedef NumericT value_type;

  unsigned int options_alpha = 0;

  vcl_size_t mat_start = 0;
  vcl_size_t mat_stride = 0;
  if (mat.row_major())
  {
    mat_start = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
    // NOTE(review): stepping down the rows of a row-major matrix suggests stride1(mat) * internal_size2(mat)
    // here rather than stride2(mat) * internal_size2(mat); behavior is only identical when stride1 == stride2.
    // Kept as-is to preserve upstream behavior -- verify against strided proxies.
    mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat);
  }
  else
  {
    mat_start = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
    // NOTE(review): same concern -- a column walk in a column-major matrix suggests stride1(mat). Verify.
    mat_stride = viennacl::traits::stride2(mat);
  }

  av_kernel<<<128, 128>>>(viennacl::cuda_arg(vec),
                          static_cast<unsigned int>(viennacl::traits::start(vec)),
                          static_cast<unsigned int>(viennacl::traits::stride(vec)),
                          static_cast<unsigned int>(viennacl::traits::size(vec)),

                          viennacl::cuda_arg<value_type>(NumericT(1)),
                          options_alpha,
                          viennacl::cuda_arg(mat),
                          static_cast<unsigned int>(mat_start),
                          static_cast<unsigned int>(mat_stride));
  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
}


//
///////////////////////// binary element-wise operations /////////////////////////////////
//


/** @brief Generic binary element-wise operation A = lhs OP rhs, dispatching to the integer kernels
*          (the float/double overloads below use the non-"_int" kernels).
*
* OP is element-wise product, division, or power, selected at compile time from OpT and passed
* to the kernel as an integer code (0: product, 1: division, 2: power).
*/
template<typename NumericT, typename SizeT, typename OpT>
void element_op(matrix_base<NumericT, SizeT> & A,
                matrix_expression<const matrix_base<NumericT, SizeT>, const matrix_base<NumericT, SizeT>, op_element_binary<OpT> > const & proxy)
{
  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));

  typedef NumericT value_type;

  unsigned int op_type = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_type = 1;
  else if (viennacl::is_product<OpT>::value)
    op_type = 0;

  if (A.row_major())
  {
    element_op_int_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
                                            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
                                            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
                                            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),

                                            viennacl::cuda_arg(proxy.lhs()),
                                            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
                                            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
                                            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),

                                            viennacl::cuda_arg(proxy.rhs()),
                                            static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
                                            static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
                                            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),

                                            op_type
                                           );
    // Fixed: the label previously said "element_op_row_kernel", misattributing errors to the wrong kernel.
    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_row_kernel");
  }
  else
  {
    element_op_int_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
                                            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
                                            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
                                            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),

                                            viennacl::cuda_arg(proxy.lhs()),
                                            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
                                            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
                                            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),

                                            viennacl::cuda_arg(proxy.rhs()),
                                            static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
                                            static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
                                            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),

                                            op_type
                                           );
    // Fixed: the label previously said "element_op_col_kernel".
    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_col_kernel");
  }
}
/** @brief Binary element-wise operation A = lhs OP rhs for single-precision matrices.
*
* OP is element-wise product, division, or power, encoded as an integer kernel argument
* (0: product, 1: division, 2: power). Uses the floating-point kernels rather than the
* "_int" variants used by the generic overload.
*/
template<typename SizeT, typename OpT>
void element_op(matrix_base<float, SizeT> & A,
                matrix_expression<const matrix_base<float, SizeT>, const matrix_base<float, SizeT>, op_element_binary<OpT> > const & proxy)
{
  assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!"));

  typedef float value_type;

  // Map the compile-time operation tag to the kernel's integer op code; power is the fallback.
  unsigned int op_code = 2; //0: product, 1: division, 2: power
  if (viennacl::is_division<OpT>::value)
    op_code = 1;
  else if (viennacl::is_product<OpT>::value)
    op_code = 0;

  if (!A.row_major())
  {
    element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),

                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),

                                        viennacl::cuda_arg(proxy.rhs()),
                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),

                                        op_code
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
  }
  else
  {
    element_op_row_kernel<<<128, 128>>>(viennacl::cuda_arg(A),
                                        static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
                                        static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
                                        static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
                                        static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),

                                        viennacl::cuda_arg(proxy.lhs()),
                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),

                                        viennacl::cuda_arg(proxy.rhs()),
                                        static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
                                        static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
                                        static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),

                                        op_code
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
  }
}
viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), + + viennacl::cuda_arg(proxy.rhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), + + op_type + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel"); + } + else + { + element_op_col_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), + + 
viennacl::cuda_arg(proxy.rhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), + + op_type + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel"); + } +} + +// +///////////////////////// unary element-wise operations ///////////////////////////////// +// + +// Note: Due to CUDA vs C-proprocessor interference (concatenation seems to be broken in at least CUDA 4.2), +// we could not find a more 'automatic' way of generating the overloads below... + +// abs +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_abs> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + 
static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel"); + } + else + { + matrix_col_element_abs_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel"); + } +} + + +// acos +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_acos> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned 
int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel"); + } + else + { + matrix_col_element_acos_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + 
VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel"); + } +} + + +// asin +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_asin> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel"); + } + else + { + matrix_col_element_asin_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned 
int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel"); + } +} + + +// atan +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_atan> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + 
static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel"); + } + else + { + matrix_col_element_atan_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel"); + } +} + + +// ceil +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_ceil> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), 
static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel"); + } + else + { + matrix_col_element_ceil_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel"); + } +} + + +// cos +template<typename NumericT> +void 
element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cos> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel"); + } + else + { + matrix_col_element_cos_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned 
int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel"); + } +} + + +// cosh +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_cosh> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned 
int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel"); + } + else + { + matrix_col_element_cosh_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel"); + } +} + + +// exp +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_exp> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned 
int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel"); + } + else + { + matrix_col_element_exp_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel"); + } +} + + +// fabs +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const 
matrix_base<NumericT>, op_element_unary<op_fabs> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel"); + } + else + { + matrix_col_element_fabs_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned 
int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel"); + } +} + + +// floor +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_floor> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel"); + } 
+ else + { + matrix_col_element_floor_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel"); + } +} + + +// log +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned 
int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel"); + } + else + { + matrix_col_element_log_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel"); + } +} + + +// log10 +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_log10> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && 
A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel"); + } + else + { + matrix_col_element_log10_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + 
static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel"); + } +} + + +// sin +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sin> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel"); + } + else + { + matrix_col_element_sin_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned 
int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel"); + } +} + + +// sinh +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sinh> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + 
viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel"); + } + else + { + matrix_col_element_sinh_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), + + viennacl::cuda_arg(proxy.lhs()), + static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), + static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) + ); + VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel"); + } +} + + +// sqrt +template<typename NumericT> +void element_op(matrix_base<NumericT> & A, + matrix_expression<const matrix_base<NumericT>, const matrix_base<NumericT>, op_element_unary<op_sqrt> > const & proxy) +{ + assert(A.row_major() == proxy.lhs().row_major() && A.row_major() == proxy.rhs().row_major() && bool("Element-wise operations on mixed matrix layouts not supported 
yet!")); + + typedef NumericT value_type; + + if (A.row_major()) + { + matrix_row_element_sqrt_kernel<<<128, 128>>>(viennacl::cuda_arg(A), + static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), + static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), + static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), + static_cast<unsigned i
<TRUNCATED>
