http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp deleted file mode 100644 index 19f7993..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp +++ /dev/null @@ -1,512 +0,0 @@ -#ifndef VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP -#define VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/device_specific/mapped_objects.hpp - @brief Map ViennaCL objects to generator wrappers -*/ - -#include <string> - -#include "viennacl/scheduler/forwards.h" -#include "viennacl/device_specific/forwards.h" -#include "viennacl/device_specific/utils.hpp" - -namespace viennacl -{ - -namespace device_specific -{ - -/** @brief Mapped Object -* -* This object populates the symbolic mapping associated with a statement. (root_id, LHS|RHS|PARENT) => mapped_object -* The tree can then be reconstructed in its symbolic form -*/ -class mapped_object -{ -private: - virtual void postprocess(std::string &) const { } - -protected: - struct MorphBase { virtual ~MorphBase(){} }; - struct MorphBase1D : public MorphBase { public: virtual std::string operator()(std::string const & i) const = 0; }; - struct MorphBase2D : public MorphBase { public: virtual std::string operator()(std::string const & i, std::string const & j) const = 0; }; - - static void replace_offset(std::string & str, MorphBase const & morph) - { - vcl_size_t pos = 0; - while ((pos=str.find("$OFFSET", pos))!=std::string::npos) - { - std::string postprocessed; - vcl_size_t pos_po = str.find('{', pos); - vcl_size_t pos_pe = str.find('}', pos_po); - - if (MorphBase2D const * p2d = dynamic_cast<MorphBase2D const *>(&morph)) - { - vcl_size_t pos_comma = str.find(',', pos_po); - std::string i = str.substr(pos_po + 1, pos_comma - pos_po - 1); - std::string j = str.substr(pos_comma + 1, pos_pe - pos_comma - 1); - postprocessed = (*p2d)(i, j); - } - else if (MorphBase1D const * p1d = dynamic_cast<MorphBase1D const *>(&morph)) - { - std::string i = str.substr(pos_po + 1, pos_pe - pos_po - 1); - postprocessed = (*p1d)(i); - } - - str.replace(pos, pos_pe + 1 - pos, postprocessed); - pos = pos_pe; - } - } - - void register_attribute(std::string & attribute, std::string const & key, std::string const & value) - { - attribute = value; - keywords_[key] = attribute; - } - -public: - struct node_info - { - node_info(mapping_type const * _mapping, scheduler::statement const * _statement, vcl_size_t _root_idx) : - mapping(_mapping), statement(_statement), root_idx(_root_idx) { } - mapping_type const * mapping; - scheduler::statement const * statement; - vcl_size_t root_idx; - }; - -public: - mapped_object(std::string const & scalartype, unsigned int id, std::string const & type_key) : type_key_(type_key) - { - register_attribute(scalartype_, "#scalartype", scalartype); - register_attribute(name_, "#name", "obj" + tools::to_string(id)); - } - - virtual ~mapped_object(){ } - - virtual std::string & append_kernel_arguments(std::set<std::string> &, std::string & str, unsigned int) const { return str; } - - std::string type_key() const { return type_key_; } - - std::string const & name() const { return name_; } - - std::string process(std::string const & in) const - { - std::string res(in); - for (std::map<std::string,std::string>::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it) - tools::find_and_replace(res, it->first, it->second); - postprocess(res); - return res; - } - - std::string evaluate(std::map<std::string, std::string> const & accessors) const - { - if (accessors.find(type_key_)==accessors.end()) - return name_; - return process(at(accessors, type_key_)); - } - - -protected: - std::string name_; - std::string scalartype_; - std::string type_key_; - std::map<std::string, std::string> keywords_; -}; - - -/** @brief Binary leaf interface -* -* Some subtrees have to be interpret at leaves when reconstructing the final expression. It is the case of trans(), diag(), prod(), etc... -* This interface stores basic infos about the sub-trees -*/ -class binary_leaf -{ -public: - binary_leaf(mapped_object::node_info info) : info_(info){ } - - void process_recursive(utils::kernel_generation_stream & stream, leaf_t leaf, std::string const & key, std::string const & process_str, std::set<std::string> & already_fetched) - { - tree_parsing::process(stream, leaf, key, process_str, *info_.statement, info_.root_idx, *info_.mapping, already_fetched); - } - - std::string evaluate_recursive(leaf_t leaf, std::map<std::string, std::string> const & accessors) - { - return tree_parsing::evaluate(leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping); - } - -protected: - mapped_object::node_info info_; -}; - -/** @brief Matrix product - * - * Maps prod(matrix_expression, matrix_expression) - */ -class mapped_matrix_product : public mapped_object, public binary_leaf -{ -public: - mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_product"), binary_leaf(info) { } -}; - -/** @brief Reduction -* -* Base class for mapping a reduction -*/ -class mapped_reduction : public mapped_object, public binary_leaf -{ -public: - mapped_reduction(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) : mapped_object(scalartype, id, type_key), binary_leaf(info){ } - - vcl_size_t root_idx() const { return info_.root_idx; } - scheduler::statement const & statement() const { return *info_.statement; } - scheduler::statement_node root_node() const { return statement().array()[root_idx()]; } - bool is_index_reduction() const { return utils::is_index_reduction(info_.statement->array()[info_.root_idx].op); } - - scheduler::op_element root_op() const - { - scheduler::op_element res = info_.statement->array()[info_.root_idx].op; - if (res.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE - ||res.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE) - res.type = scheduler::OPERATION_BINARY_ADD_TYPE; - return res; - } -}; - -/** @brief Scalar reduction -* -* Maps a scalar reduction (max, min, argmax, inner_prod, etc..) -*/ -class mapped_scalar_reduction : public mapped_reduction -{ -public: - mapped_scalar_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "scalar_reduction"){ } -}; - -/** @brief Vector reduction -* -* Maps a row-wise reduction (max, min, argmax, matrix-vector product, etc..) -*/ -class mapped_row_wise_reduction : public mapped_reduction -{ -public: - mapped_row_wise_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "row_wise_reduction") { } -}; - -/** @brief Host scalar - * - * Maps a host scalar (passed by value) - */ -class mapped_host_scalar : public mapped_object -{ -public: - mapped_host_scalar(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "host_scalar"){ } - - std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int width) const - { - if (already_generated.insert(name_).second) - str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_); - return str; - } -}; - -/** @brief Handle -* -* Maps an object passed by pointer -*/ -class mapped_handle : public mapped_object -{ -private: - virtual void append_optional_arguments(std::string &) const = 0; - -public: - mapped_handle(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_object(scalartype, id, type_key) - { - register_attribute(pointer_, "#pointer", name_ + "_pointer"); - } - - std::string & append_kernel_arguments(std::set<std::string> & already_generated, std::string & str, unsigned int width) const - { - if (already_generated.insert(name_).second) - { - str += generate_pointer_kernel_argument("__global", utils::append_width(scalartype_, width), pointer_); - append_optional_arguments(str); - } - return str; - } - -private: - std::string pointer_; -}; - - -/** @brief Scalar - * - * Maps a scalar passed by pointer - */ -class mapped_scalar : public mapped_handle -{ -private: - void append_optional_arguments(std::string &) const{ } - -public: - mapped_scalar(std::string const & scalartype, unsigned int id) : mapped_handle(scalartype, id, "scalar") { } -}; - -/** @brief Buffered - * - * Maps a buffered object (vector, matrix) - */ -class mapped_buffer : public mapped_handle -{ -public: - mapped_buffer(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_handle(scalartype, id, type_key){ } -}; - -/** @brief Vector - * - * Maps a vector - */ -class mapped_vector : public mapped_buffer -{ - void append_optional_arguments(std::string & str) const - { - str += generate_value_kernel_argument("unsigned int", start_); - str += generate_value_kernel_argument("unsigned int", stride_); - } - -public: - mapped_vector(std::string const & scalartype, unsigned int id) : mapped_buffer(scalartype, id, "vector") - { - register_attribute(start_, "#start", name_ + "_start"); - register_attribute(stride_, "#stride", name_ + "_stride"); - } - -private: - std::string start_; - std::string stride_; -}; - -/** @brief Matrix - * - * Maps a matrix - */ -class mapped_matrix : public mapped_buffer -{ -private: - void append_optional_arguments(std::string & str) const - { - str += generate_value_kernel_argument("unsigned int", ld_); - str += generate_value_kernel_argument("unsigned int", start1_); - str += generate_value_kernel_argument("unsigned int", start2_); - str += generate_value_kernel_argument("unsigned int", stride1_); - str += generate_value_kernel_argument("unsigned int", stride2_); - } - - void postprocess(std::string & str) const - { - struct Morph : public MorphBase2D - { - Morph(bool _is_row_major, std::string const & _ld) : is_row_major(_is_row_major), ld(_ld){ } - std::string operator()(std::string const & i, std::string const & j) const - { - if (is_row_major) - return "(" + i + ") * " + ld + " + (" + j + ")"; - return "(" + i + ") + (" + j + ") * " + ld; - } - private: - bool is_row_major; - std::string const & ld; - }; - replace_offset(str, Morph(row_major_, ld_)); - } - -public: - mapped_matrix(std::string const & scalartype, unsigned int id, bool row_major) : mapped_buffer(scalartype, id, "matrix"), row_major_(row_major) - { - register_attribute(ld_, "#ld", name_ + "_ld"); - register_attribute(start1_, "#start1", name_ + "_start1"); - register_attribute(start2_, "#start2", name_ + "_start2"); - register_attribute(stride1_, "#stride1", name_ + "_stride1"); - register_attribute(stride2_, "#stride2", name_ + "_stride2"); - if (row_major_) - keywords_["#nldstride"] = "#stride1"; - else - keywords_["#nldstride"] = "#stride2"; - - if (row_major_) - { - std::swap(start1_, start2_); - std::swap(stride1_, stride2_); - } - } - - bool row_major() const - { - return row_major_; - } - -private: - std::string ld_; - std::string start1_; - std::string start2_; - std::string stride1_; - std::string stride2_; - bool row_major_; -}; - -/** @brief Vector diag -* -* Maps a diag(vector_expression) node into a diagonal matrix -*/ -class mapped_vector_diag : public mapped_object, public binary_leaf -{ -private: - void postprocess(std::string &res) const - { - std::map<std::string, std::string> accessors; - tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping)); - accessors["vector"] = res; - res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping); - } - -public: - mapped_vector_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "vector_diag"), binary_leaf(info){ } -}; - - -/** @brief Trans -* -* Maps trans(matrix_expression) into the transposed of matrix_expression -*/ -class mapped_trans: public mapped_object, public binary_leaf -{ -private: - void postprocess(std::string &res) const - { - std::map<std::string, std::string> accessors; - accessors["matrix"] = res; - res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping); - } - -public: - mapped_trans(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_trans"), binary_leaf(info){ } -}; - -/** @brief Matrix row -* -* Maps row(matrix_expression, scalar_expression) into the scalar_expression's row of matrix_expression -*/ -class mapped_matrix_row : public mapped_object, binary_leaf -{ -private: - void postprocess(std::string &res) const - { - std::map<std::string, std::string> accessors; - tools::find_and_replace(res, "#row", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping)); - accessors["matrix"] = res; - res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping); - } - -public: - mapped_matrix_row(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_row"), binary_leaf(info) - { } -}; - - -/** @brief Matrix column -* -* Maps column(matrix_expression, scalar_expression) into the scalar_expression's column of matrix_expression -*/ -class mapped_matrix_column : public mapped_object, binary_leaf -{ -private: - void postprocess(std::string &res) const - { - std::map<std::string, std::string> accessors; - tools::find_and_replace(res, "#column", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping)); - accessors["matrix"] = res; - res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping); - } - -public: - mapped_matrix_column(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_column"), binary_leaf(info) - { } -}; - -/** @brief Matrix diag -* -* Maps a diag(matrix_expression) node into the vector of its diagonal elements -*/ -class mapped_matrix_diag : public mapped_object, binary_leaf -{ -private: - void postprocess(std::string &res) const - { - std::map<std::string, std::string> accessors; - tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping)); - accessors["matrix"] = res; - res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping); - } - -public: - mapped_matrix_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_diag"), binary_leaf(info) - { } -}; - -/** @brief Implicit vector - * - * Maps an implicit vector - */ -class mapped_implicit_vector : public mapped_object -{ -public: - mapped_implicit_vector(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_vector") - { } - - std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int width) const - { - str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_); - return str; - } -}; - -/** @brief Implicit matrix - * - * Maps an implicit matrix - */ -class mapped_implicit_matrix : public mapped_object -{ -public: - mapped_implicit_matrix(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_matrix") - { } - - std::string & append_kernel_arguments(std::set<std::string> & /*already_generated*/, std::string & str, unsigned int width) const - { - str += generate_value_kernel_argument(utils::append_width(scalartype_, width), name_); - return str; - } -}; - -} - -} -#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp deleted file mode 100644 index 1f082ac..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp +++ /dev/null @@ -1,859 +0,0 @@ -#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP -#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP - -/* ========================================================================= -Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. -Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - -Project Head: Karl Rupp [email protected] - -(A list of authors and contributors can be found in the manual) - -License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/device_specific/templates/matrix_product_template.hpp -* -* Kernel template for the matrix product operation -*/ - -#include <vector> - -#include "viennacl/scheduler/forwards.h" - -#include "viennacl/detail/matrix_def.hpp" -#include "viennacl/matrix_proxy.hpp" - -#include "viennacl/device_specific/templates/template_base.hpp" -#include "viennacl/device_specific/mapped_objects.hpp" -#include "viennacl/device_specific/utils.hpp" -#include "viennacl/device_specific/tree_parsing.hpp" -#include "viennacl/forwards.h" - -#include "viennacl/tools/tools.hpp" - -namespace viennacl -{ -namespace device_specific -{ - -struct matrix_product_parameters : public template_base::parameters_type -{ - matrix_product_parameters(unsigned int simd_width - , unsigned int local_size_0, unsigned int KL, unsigned int local_size_1 - , unsigned int ms, unsigned int ks, unsigned int ns - , fetching_policy_type A_fetching_policy_param, fetching_policy_type B_fetching_policy_param - , unsigned int local_fetch_0_param, unsigned int local_fetch_1_param): template_base::parameters_type(simd_width, local_size_0, local_size_1, 1), - kL(KL), mS(ms), kS(ks), nS(ns), A_fetching_policy(A_fetching_policy_param), B_fetching_policy(B_fetching_policy_param), - local_fetch_0(local_fetch_0_param), local_fetch_1(local_fetch_1_param), - mL(ms*local_size_0), nL(ns*local_size_1){} - - unsigned int kL; - - unsigned int mS; - unsigned int kS; - unsigned int nS; - - fetching_policy_type A_fetching_policy; - fetching_policy_type B_fetching_policy; - - unsigned int local_fetch_0; - unsigned int local_fetch_1; - - unsigned int mL; - unsigned int nL; -}; - -class matrix_product_template : public template_base_impl<matrix_product_template, matrix_product_parameters> -{ - -private: - unsigned int n_lmem_elements() const - { - unsigned int N = 0; - if (p_.A_fetching_policy==FETCH_FROM_LOCAL) - N += p_.kL * (p_.mL+1); - if (p_.B_fetching_policy==FETCH_FROM_LOCAL) - N += p_.nL * (p_.kL+1); - return N; - } - - int check_invalid_impl(viennacl::ocl::device const & /*device*/) const - { - if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL&& (p_.local_fetch_0!=0 || p_.local_fetch_1!=0)) - return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH; - - if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0) - return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE; - - if (p_.kS > p_.kL) - return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL; - - if (!(A_trans_=='N' && B_trans_=='T') && p_.simd_width>1) - return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE; - - if (p_.A_fetching_policy==FETCH_FROM_LOCAL || p_.B_fetching_policy==FETCH_FROM_LOCAL) - { - if ((p_.local_fetch_0*p_.local_fetch_1) !=(p_.local_size_0*p_.local_size_1)) - return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT; - } - - if (p_.A_fetching_policy==FETCH_FROM_LOCAL) - { - unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL; - unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL; - - if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0) - return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE; - - if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0) - return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE; - - } - if (p_.B_fetching_policy==FETCH_FROM_LOCAL) - { - unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL; - unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL; - - if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0) - return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE; - - if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0) - return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE; - - } - - return TEMPLATE_VALID; - } - - static void parse(scheduler::statement const & s, - vcl_size_t & C_idx, leaf_t & C_leaf, vcl_size_t & alpha_idx, leaf_t & alpha_leaf, - vcl_size_t & A_idx, leaf_t & A_leaf, bool& A_trans, vcl_size_t & B_idx, leaf_t & B_leaf, bool& B_trans, - vcl_size_t & beta_idx, leaf_t & beta_leaf) - { - using namespace tree_parsing; - using namespace scheduler; - - scheduler::statement::container_type const & array = s.array(); - vcl_size_t root_idx = s.root(); - - C_idx = root_idx; - C_leaf = LHS_NODE_TYPE; - - vcl_size_t node_add_idx = array[root_idx].rhs.node_index; - - vcl_size_t node_1_idx = array[node_add_idx].lhs.node_index; - alpha_idx = node_1_idx; - alpha_leaf = RHS_NODE_TYPE; - - vcl_size_t mat_prod_idx = array[node_1_idx].lhs.node_index; - if (array[mat_prod_idx].lhs.type_family==MATRIX_TYPE_FAMILY) - { - A_trans = false; - A_idx = mat_prod_idx; - } - else - { - A_trans = true; - A_idx = array[mat_prod_idx].lhs.node_index; - } - A_leaf = LHS_NODE_TYPE; - - if (array[mat_prod_idx].rhs.type_family==MATRIX_TYPE_FAMILY) - { - B_trans = false; - B_idx = mat_prod_idx; - B_leaf = RHS_NODE_TYPE; - } - else - { - B_trans = true; - B_idx = array[mat_prod_idx].rhs.node_index; - B_leaf = LHS_NODE_TYPE; - } - - vcl_size_t node_2_idx = array[node_add_idx].rhs.node_index; - beta_idx = node_2_idx; - beta_leaf = RHS_NODE_TYPE; - } - - void VIENNACL_HANDLE_BOUNDS(bool fallback, utils::kernel_generation_stream & stream, std::string const & inbounds, std::string const & do_if, std::string do_else) const - { - if (fallback) - { - stream << "if (" << inbounds << ")" << std::endl; - stream.inc_tab(); - stream << do_if << ";" << std::endl; - stream.dec_tab(); - stream << "else" << std::endl; - stream.inc_tab(); - stream << do_else << ";" << std::endl; - stream.dec_tab(); - } - else - stream << do_if << ";" << std::endl; - } - - - std::string generate_impl(const std::string &kernel_prefix, const statements_container &statements, const std::vector<mapping_type> &mappings, bool fallback) const - { - using std::string; - using tools::to_string; - - parameters_type pfallback(1, p_.local_size_0, p_.kL, p_.local_size_1, p_.mS, 1, p_.nS, p_.A_fetching_policy, p_.B_fetching_policy, p_.local_fetch_0, p_.local_fetch_1); - parameters_type const & p = fallback?pfallback:p_; - -#define VIENNACL_MUL_STRIDE1 string(fallback?"*#stride1":"") -#define VIENNACL_HANDLE_BOUNDS(in_bounds, to_load) (!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + ":0")) -#define VIENNACL_VSTORE(value, offset, ptr) vstore(p.simd_width, value, offset, ptr) - - string widthstr = tools::to_string(p.simd_width); - - ////////////////// - /// INIT - /// ////////////// - utils::kernel_generation_stream stream; - scheduler::statement const & st = statements.data().front(); - mapping_type const & mapping = mappings.front(); - - bool A_trans = false, B_trans = false; - vcl_size_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0; - leaf_t C_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE; - parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf); - - mapped_matrix * C = (mapped_matrix* )at(mapping, mapping_key( C_idx, C_leaf)).get(); - mapped_host_scalar * alpha = (mapped_host_scalar*)at(mapping, mapping_key(alpha_idx, alpha_leaf)).get(); - mapped_matrix * A = (mapped_matrix* )at(mapping, mapping_key( A_idx, A_leaf)).get(); - mapped_matrix * B = (mapped_matrix* )at(mapping, mapping_key( B_idx, B_leaf)).get(); - mapped_host_scalar * beta = (mapped_host_scalar*)at(mapping, mapping_key( beta_idx, beta_leaf)).get(); - - ////////////////// - /// DECLARATIONS - /// ////////////// - - stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," << p.local_size_1 << ",1)))" << std::endl; - std::map<std::string, unsigned int> widths; - widths[A->name()] = p.simd_width; - widths[B->name()] = p.simd_width; - generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N, unsigned int K, ", mappings, statements, widths); - stream << "{" << std::endl; - stream.inc_tab(); - if(!fallback) - { - stream << A->process("#start1 /= " + to_string(p.simd_width) + ";") << std::endl; - stream << A->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl; - stream << B->process("#start1/= " + to_string(p.simd_width) + ";") << std::endl; - stream << B->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl; - } - tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#pointer += $OFFSET{#start1, #start2};", statements, mappings); - tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#ld *= #nldstride;", statements, mappings); - - ///Result Values - stream << C->process("#scalartype rC[" + to_string(p.mS) + "][" + to_string(p.nS) + "] = {{(#scalartype)0}};") << std::endl; - if (p.A_fetching_policy==FETCH_FROM_LOCAL) - stream << A->process("#scalartype rA[" + to_string(p.kS) + "][" + to_string(p.mS) + "];") << std::endl; - else - stream << A->process(utils::append_width("#scalartype",p.simd_width) + " rA[" + to_string(p.kS) + "][" + to_string(p.mS/p.simd_width) + "];") << std::endl; - if (p.B_fetching_policy==FETCH_FROM_LOCAL) - stream << B->process("#scalartype rB[" + to_string(p.kS) + "][" + to_string(p.nS) + "];"); - else - stream << B->process(utils::append_width("#scalartype",p.simd_width) + " rB[" + to_string(p.kS) + "][" + to_string(p.nS/p.simd_width) + "];") << std::endl; - - - if (p.A_fetching_policy==FETCH_FROM_LOCAL) - stream << A->process("__local #scalartype lA[" + to_string(p.kL*(p.mL+1)) + "];"); - if (p.B_fetching_policy==FETCH_FROM_LOCAL) - stream << B->process("__local #scalartype lB[" + to_string(p.kL*(p.nL+1)) + "];"); - stream << std::endl; - - stream << "size_t gidx = get_group_id(0);" << std::endl; - stream << "size_t gidy = get_group_id(1);" << std::endl; - stream << "size_t idx = get_local_id(0);" << std::endl; - stream << "size_t idy = get_local_id(1);" << std::endl; - - if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL) - { - stream << std::endl; - stream << "size_t idt = " << p.local_size_0 << "*idy + idx;" << std::endl; - stream << "size_t idxT = idt % " << p.local_fetch_0 << ";" << std::endl; - stream << "size_t idyT = idt / " << p.local_fetch_0 << ";" << std::endl; - } - stream << std::endl; - - if (fallback) - { - //Bounds checking for M (in A, C) - stream << "bool in_bounds_m[" << p.mS << "];" << std::endl; - stream << "for(size_t m = 0; m < " << p.mS << "; m++)" << std::endl; - stream.inc_tab(); - switch (p.A_fetching_policy) - { - case FETCH_FROM_GLOBAL_CONTIGUOUS: - stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx*" << p.mS << " + m < M;" << std::endl; - break; - default: - stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx + m*" << p.local_size_0 << " < M;" << std::endl; - break; - } - stream.dec_tab(); - - //Bounds checking for A if Local - if (p.A_fetching_policy==FETCH_FROM_LOCAL) - { - unsigned int fetch_size = (A_trans_=='N'?p.local_fetch_0*p.simd_width:p.local_fetch_1); - stream << "bool in_bounds_m_local[" << p.mL/fetch_size << "];" << std::endl; - stream << "for(size_t m = 0; m < " << p.mL/fetch_size << "; m++)" << std::endl; - stream.inc_tab(); - stream << "in_bounds_m_local[m] = gidx*" << p.mL << " + " << (A_trans_=='N'?"idxT":"idyT") << " + m*" << fetch_size << " < M;" << std::endl; - stream.dec_tab(); - } - - //Bounds checking for N (in B, C) - stream << "bool in_bounds_n[" << p.nS << "];" << std::endl; - stream << "for(size_t n = 0; n < " << p.nS << "; n++)" << std::endl; - stream.inc_tab(); - switch (p.B_fetching_policy) - { - case FETCH_FROM_GLOBAL_CONTIGUOUS: - stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy*" << p.nS << " + n < N;" << std::endl; - break; - default: - stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy + n*" << p.local_size_1 << " < N;" << std::endl; - break; - } - stream.dec_tab(); - - //Bounds checking for B if Local - if (p.B_fetching_policy==FETCH_FROM_LOCAL) - { - unsigned int fetch_size = (B_trans_=='T'?p.local_fetch_0*p.simd_width:p.local_fetch_1); - stream << "bool in_bounds_n_local[" << p.nL/fetch_size << "];" << std::endl; - stream << "for(size_t n = 0; n < " << p.nL/fetch_size << "; n++)" << std::endl; - stream.inc_tab(); - stream << "in_bounds_n_local[n] = gidy*" << p.nL << " + " << (B_trans_=='T'?"idxT":"idyT") << " + n*" << fetch_size << " < N;" << std::endl; - stream.dec_tab(); - } - } - - switch (p.A_fetching_policy) - { - case FETCH_FROM_LOCAL: - if (A_trans_=='N') - stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + " + idxT)" + VIENNACL_MUL_STRIDE1 + " + idyT*#ld;") << std::endl; - else - stream << A->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + gidx*" + to_string(p.mL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl; - break; - - case FETCH_FROM_GLOBAL_CONTIGUOUS: - if (A_trans_=='N') - stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - else - stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")*#ld;") << std::endl; - break; - - case FETCH_FROM_GLOBAL_STRIDED: - if (A_trans_=='N') - stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx" + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - else - stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx)*#ld;") << std::endl; - break; - - //default: break; - } - - switch (p.B_fetching_policy) - { - case FETCH_FROM_LOCAL: - if (B_trans_=='T') - stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + " + idxT" + ")" + VIENNACL_MUL_STRIDE1 + " + idyT*#ld;") << std::endl; - else - stream << B->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + gidy*" + to_string(p.nL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl; - break; - - case FETCH_FROM_GLOBAL_CONTIGUOUS: - if (B_trans_=='T') - stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - else - stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")*#ld;") << std::endl; - break; - - case FETCH_FROM_GLOBAL_STRIDED: - if (B_trans_=='T') - stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy" + ")" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - else - stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy)*#ld;") << std::endl; - break; - - //default: break; - } - - stream << std::endl; - stream << "size_t K_size_t = K;" << std::endl; - stream << "for(size_t block_k=0; block_k < K_size_t; block_k+=" << p.kL << "){" << std::endl; - stream.inc_tab(); - - if (p.A_fetching_policy==FETCH_FROM_LOCAL) - { - if (A_trans_=='N') - stream << A->process("__local #scalartype* plA = lA + idyT*" + to_string(p.mL + 1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl; - else - stream << A->process("__local #scalartype* plA = lA + idxT*" + to_string(p.mL + 1) + " + idyT;") << std::endl; - } - - - if (p.B_fetching_policy==FETCH_FROM_LOCAL) - { - if (B_trans_=='T') - stream << B->process("__local #scalartype* plB = lB + idyT*" + to_string(p.nL+1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl; - else - stream << B->process("__local #scalartype* plB = lB + idxT*" + to_string(p.nL+1) + "+ idyT;") <<std::endl; - } - - - if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL) - stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl; - - ///Fetch LHS to Local Memory - if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='N') - for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1) - for (unsigned int m = 0; m < p.mL; m += p.local_fetch_0*p.simd_width) - { - string in_bounds = "in_bounds_m_local[" + to_string(m/(p.local_fetch_0*p.simd_width)) + "]"; - string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]"; - stream << A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(k*(p.mL+1)+m))) << ";" << std::endl; - } - else if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='T') - for (unsigned int k = 0; k < p.mL; k += p.local_fetch_1) - for (unsigned int m = 0; m < p.kL; m += p.local_fetch_0*p.simd_width) - { - string in_bounds = "in_bounds_m_local[" + to_string(k/p.local_fetch_1) + "]"; - string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]"; - stream << A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(m*(p.mL+1)+k))) << ";" << std::endl; - } - - if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='T') - for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1) - for (unsigned int n = 0; n < p.nL; n += p.local_fetch_0*p.simd_width) - { - string in_bounds = "in_bounds_n_local[" + to_string(n/(p.local_fetch_0*p.simd_width)) + "]"; - string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]"; - stream << B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(k*(p.nL+1)+n))) << ";" << std::endl; - } - else if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='N') - for (unsigned int k = 0; k < p.nL; k += p.local_fetch_1) - for (unsigned int n = 0; n < p.kL; n += p.local_fetch_0*p.simd_width) - { - string in_bounds = "in_bounds_n_local[" + to_string(k/p.local_fetch_1) + "]"; - string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]"; - stream << B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(n*(p.nL+1)+k))) << ";" << std::endl; - } - - if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy == FETCH_FROM_LOCAL) - { - stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl; - stream << "size_t offA = " << p.simd_width << "*idx;" << std::endl; - stream << "size_t offB = " << p.simd_width << "*idy;" << std::endl; - } - - if (fallback) - stream << "for(size_t k = 0; k < " << p.kL << " && (block_k + k < K_size_t); k+=" << p.kS << "){" << std::endl; - else - stream << "for(size_t k = 0; k < " << p.kL << "; k+=" << p.kS << "){" << std::endl; - stream.inc_tab(); - - ///Fetch LHS to registers - stream << "#pragma unroll " << p.kS << std::endl; - stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl; - stream << "#pragma unroll " << p.mS/p.simd_width << std::endl; - stream << "for(size_t mm = 0; mm < " << p.mS/p.simd_width << "; mm++)" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - switch (p.A_fetching_policy) - { - case FETCH_FROM_LOCAL: - for (unsigned int ss = 0; ss < p.simd_width; ++ss) - stream << "rA[kk][mm*" << p.simd_width << "+" << ss << "] = lA[offA + mm*" << p.local_size_0*p.simd_width << "+" << ss << "+ kk*" << (p.mL+1) << "];" << std::endl; - break; - - case FETCH_FROM_GLOBAL_CONTIGUOUS: - { - if (A_trans_=='N') - stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - else - stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - break; - } - - case FETCH_FROM_GLOBAL_STRIDED: - { - if (A_trans_=='N') - stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm*" + to_string(p.local_size_0) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - else - stream << "rA[kk][mm] = " << A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld*" + to_string(p.local_size_0) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - break; - } - - //default: break; - } - stream.dec_tab(); - stream << "}" << std::endl; - - stream << "#pragma unroll " << p.kS << std::endl; - stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl; - stream << "#pragma unroll " << p.nS/p.simd_width << std::endl; - stream << "for(size_t nn = 0; nn < " << p.nS/p.simd_width << "; nn++)" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - switch (p.B_fetching_policy) - { - case FETCH_FROM_LOCAL: - for (unsigned int ss = 0; ss < p.simd_width; ++ss) - stream << "rB[kk][nn*" << p.simd_width << "+" << ss << "] = lB[offB + nn*" << p.local_size_1*p.simd_width << "+" << ss << "+ kk*" << (p.nL+1) << "];" << std::endl; - break; - - case FETCH_FROM_GLOBAL_CONTIGUOUS: - { - if (B_trans_=='T') - stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - else - stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - break; - } - - case FETCH_FROM_GLOBAL_STRIDED: - { - if (B_trans_=='T') - stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn*" + to_string(p.local_size_1) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - else - stream << "rB[kk][nn] = " << B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld*" + to_string(p.local_size_1) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl; - break; - } - - //default: break; - } - stream.dec_tab(); - stream << "}" << std::endl; - - - ///Increment pointers - switch (p.A_fetching_policy) - { - case FETCH_FROM_LOCAL: - stream << "offA += " << p.kS*(p.mL+1) << ";" << std::endl; - break; - - default: - if (A_trans_=='N') - stream << A->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl; - else - stream << A->process("#pointer += " + to_string(p.kS) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - break; - } - - - switch (p.B_fetching_policy) - { - case FETCH_FROM_LOCAL: - stream << "offB += " << p.kS*(p.nL+1) << ";" << std::endl; - break; - - default: - if (B_trans_=='T') - stream << B->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl; - else - stream << B->process("#pointer += " + to_string(p.kS) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - break; - } - - - stream << "#pragma unroll " << p.kS << std::endl; - stream << "for(size_t kk = 0; kk <" << p.kS << "; ++kk)" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - for (unsigned int nn=0; nn < p.nS; ++nn) - for (unsigned int mm=0; mm < p.mS; ++mm) - { - string res_str, lhs_str, rhs_str; - res_str = "rC[" + tools::to_string(mm) + "][" + tools::to_string(nn) + "]"; - if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1) - lhs_str = "rA[kk][" + tools::to_string(mm) + "]"; - else - lhs_str = "rA[kk][" + tools::to_string(mm/p.simd_width) + "].s" + tools::to_string(mm%p.simd_width); - if (p.B_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1) - rhs_str = "rB[kk]["+tools::to_string(nn)+"]"; - else - rhs_str = "rB[kk]["+tools::to_string(nn/p.simd_width)+"].s"+tools::to_string(nn%p.simd_width); - stream << res_str << "=" << "fma(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl; - } - stream.dec_tab(); - stream << "}" << std::endl; - - - - - stream.dec_tab(); - stream << "}" << std::endl; - - //Increment global pointer if local memory is used - //Else, it's incremented directly when fetching - if (p.A_fetching_policy==FETCH_FROM_LOCAL) - { - if (A_trans_=='N') - stream << A->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl; - else - stream << A->process("#pointer += " + to_string(p.kL) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - } - - if (p.B_fetching_policy==FETCH_FROM_LOCAL) - { - if (B_trans_=='T') - stream << B->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl; - else - stream << B->process("#pointer += " + to_string(p.kL) + "" + VIENNACL_MUL_STRIDE1 + ";") << std::endl; - } - - stream.dec_tab(); - stream << "}" << std::endl; - - - if (C->row_major()) - { - unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width; - unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width; - - stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#ld;") << std::endl; - stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#ld;") << std::endl; - stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#stride2;") << std::endl; - stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#stride2;") << std::endl; - - for (unsigned int n=0; n < p.nS; ++n) - { - for (unsigned int m=0; m < p.mS; ++m) - { - unsigned int ministride1 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_0; - string Cj = to_string((m/p.simd_width)*(ministride1*p.simd_width) + m%p.simd_width); - if (fallback) - { - stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl; - stream.inc_tab(); - } - stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + "+ #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl; - if (fallback) - stream.dec_tab(); - } - if ((n+1)%p.simd_width>0 || p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS) - stream << C->process("#pointer += #stride2;") << std::endl; - else - stream << C->process("#pointer += " + to_string((p.local_size_1*p.simd_width) - (p.simd_width-1)) + "*#stride2;") << std::endl; - } - - } - else - { - unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width; - unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width; - - stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#stride1;") << std::endl; - stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#stride1;") << std::endl; - stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#ld;") << std::endl; - stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#ld;") << std::endl; - - for (unsigned int m=0; m < p.mS; ++m) - { - for (unsigned int n=0; n < p.nS; ++n) - { - unsigned int ministride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_1; - string Cj = to_string((n/p.simd_width)*(ministride1*p.simd_width) + n%p.simd_width); - if (fallback) - { - stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl; - stream.inc_tab(); - } - stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + " + #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl; - if (fallback) - stream.dec_tab(); - } - - if ((m+1)%p.simd_width>0 || p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS) - stream << C->process("#pointer += #stride1;") << std::endl; - else - stream << C->process("#pointer += " + to_string((p.local_size_0*p.simd_width) - (p.simd_width-1)) + "*#stride1;") << std::endl; - } - } - - stream.dec_tab(); - stream << "}" << std::endl; - - return stream.str(); - -#undef VIENNACL_MUL_STRIDE1 -#undef VIENNACL_HANDLE_BOUNDS -#undef VIENNACL_VSTORE - } - - std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const - { - std::vector<std::string> res; - res.push_back(generate_impl(kernel_prefix, statements, mappings, false)); - res.push_back(generate_impl(kernel_prefix, statements, mappings, true)); - return res; - } - - template<class NumericT> - void enqueue_block(scheduler::statement & statement, - scheduler::lhs_rhs_element& eA, scheduler::lhs_rhs_element& eB, scheduler::lhs_rhs_element& eC, scheduler::lhs_rhs_element& ebeta, - matrix_base<NumericT> const & A, matrix_base<NumericT> const & B, matrix_base<NumericT> const & C, NumericT beta, - std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix, vcl_size_t id) - { - if (A.size1()==0 || A.size2()==0 || B.size1()==0 || B.size2()==0 || C.size1()==0 || C.size2()==0) - return; - - viennacl::ocl::kernel& kernel = programs[id].program().get_kernel(kernel_prefix); - - kernel.local_work_size(0, p_.local_size_0); - kernel.local_work_size(1, p_.local_size_1); - - scheduler::statement::assign_element(eA, A); - scheduler::statement::assign_element(eB, B); - scheduler::statement::assign_element(eC, C); - scheduler::statement::assign_element(ebeta, beta); - - if (id==1) - { - kernel.global_work_size(0, tools::align_to_multiple(tools::align_to_multiple((unsigned int)C.size1(),p_.mS)/p_.mS, p_.local_size_0)); - kernel.global_work_size(1, tools::align_to_multiple(tools::align_to_multiple((unsigned int)C.size2(),p_.nS)/p_.nS, p_.local_size_1)); - } - else - { - kernel.global_work_size(0, C.size1()/p_.mS); - kernel.global_work_size(1, C.size2()/p_.nS); - } - unsigned int current_arg = 0; - kernel.arg(current_arg++, cl_uint(C.size1())); - kernel.arg(current_arg++, cl_uint(C.size2())); - if (A.row_major()) - kernel.arg(current_arg++, cl_uint(A_trans_=='T'?A.size2():A.size1())); - else - kernel.arg(current_arg++, cl_uint(A_trans_=='N'?A.size2():A.size1())); - set_arguments(statement, kernel, current_arg); - viennacl::ocl::enqueue(kernel); - - } - - template<class NumericT> - matrix_slice< viennacl::matrix_base<NumericT> > create_slice(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr, scheduler::lhs_rhs_element const & element, - vcl_size_t s0_0, vcl_size_t s0_1, vcl_size_t s1_0, vcl_size_t s1_1, bool swap) - { - matrix_base<NumericT> & M = *(element.*ptr); - slice s0(s0_0, 1, s0_1 - s0_0); - slice s1(s1_0, 1, s1_1 - s1_0); - if (swap) - std::swap(s0, s1); - return matrix_slice<viennacl::matrix_base<NumericT> >(M, s0, s1); - } - - template<class NumericT> - void enqueue_impl(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr_matrix, - scheduler::statement & statement, scheduler::lhs_rhs_element & A, scheduler::lhs_rhs_element & B, scheduler::lhs_rhs_element & C, scheduler::lhs_rhs_element & beta, - NumericT beta_value, std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix) - { - using namespace device_specific::utils; - vcl_size_t ldstrideA = call_on_matrix(A, leading_stride()); - vcl_size_t ldstrideB = call_on_matrix(B, leading_stride()); - vcl_size_t ldstrideC = call_on_matrix(C, leading_stride()); - vcl_size_t ldstartA = call_on_matrix(A, leading_start()); - vcl_size_t ldstartB = call_on_matrix(B, leading_start()); - bool swap_A = ((A_trans_=='T') ^ utils::call_on_matrix(A, row_major_fun())); - bool swap_B = ((B_trans_=='T') ^ utils::call_on_matrix(B, row_major_fun())); - - vcl_size_t M = call_on_matrix(C, size1_fun()); - vcl_size_t N = call_on_matrix(C, size2_fun()); - vcl_size_t K; - if (utils::call_on_matrix(A, row_major_fun())) - K = A_trans_=='T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun()); - else - K = A_trans_=='N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun()); - - if (M < p_.mL || N < p_.nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 || ldstrideC > 1 || - (p_.simd_width>1 && (ldstartA % p_.simd_width > 0 || ldstartB % p_.simd_width > 0))) - { - enqueue_block(statement, A, B, C, beta, create_slice(ptr_matrix, A, 0, M, 0, K, swap_A), - create_slice(ptr_matrix, B, 0, K, 0, N, swap_B), - create_slice(ptr_matrix, C, 0, M, 0, N, false), beta_value, programs, kernel_prefix, 1); - return; - } - - - scheduler::lhs_rhs_element Acopy = A; - scheduler::lhs_rhs_element Bcopy = B; - scheduler::lhs_rhs_element Ccopy = C; - - vcl_size_t lM = M / p_.mL * p_.mL; - vcl_size_t lN = N / p_.nL * p_.nL; - vcl_size_t lK = K / p_.kL * p_.kL; - - - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), beta_value, programs, kernel_prefix, 0); - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1); - - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), beta_value, programs, kernel_prefix, 1); - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), (NumericT)1, programs, kernel_prefix, 1); - - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), beta_value, programs, kernel_prefix, 1); - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1); - - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), beta_value, programs, kernel_prefix, 1); - enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), (NumericT)1, programs, kernel_prefix, 1); - } - -public: - matrix_product_template(matrix_product_template::parameters_type const & parameters, char A_trans, char B_trans) : template_base_impl<matrix_product_template, matrix_product_parameters>(parameters, BIND_ALL_UNIQUE), A_trans_(A_trans), B_trans_(B_trans){ } - - virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) - { - using namespace device_specific::utils; - using namespace tree_parsing; - - scheduler::statement const & st = statements.data().front(); - bool A_trans, B_trans; - vcl_size_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0; - leaf_t C_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE; - parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf); - - scheduler::statement stcopy = st; - scheduler::lhs_rhs_element& A = utils::lhs_rhs_element(stcopy, A_idx, A_leaf); - scheduler::lhs_rhs_element& B = utils::lhs_rhs_element(stcopy, B_idx, B_leaf); - scheduler::lhs_rhs_element& C = utils::lhs_rhs_element(stcopy, C_idx, C_leaf); - scheduler::lhs_rhs_element& beta = utils::lhs_rhs_element(stcopy, beta_idx, beta_leaf); - - - - - - - if (C.numeric_type==scheduler::FLOAT_TYPE) - enqueue_impl<float>(&scheduler::lhs_rhs_element::matrix_float, stcopy, A, B, C, beta, beta.host_float, programs, kernel_prefix); - else if (C.numeric_type==scheduler::DOUBLE_TYPE) - enqueue_impl<double>(&scheduler::lhs_rhs_element::matrix_double, stcopy, A, B, C, beta, beta.host_double, programs, kernel_prefix); - else - throw generator_not_supported_exception("GEMM only supported for float/double"); - - } - -private: - const char A_trans_; - const char B_trans_; -}; - -} - -} - -#endif http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp deleted file mode 100644 index 40e3168..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp +++ /dev/null @@ -1,596 +0,0 @@ -#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_ -#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/device_specific/templates/template_base.hpp - * - * Base classes for the profiles -*/ - -#include <list> -#include <set> - -#include "viennacl/ocl/kernel.hpp" -#include "viennacl/ocl/device.hpp" -#include "viennacl/ocl/device_utils.hpp" - -#include "viennacl/scheduler/forwards.h" -#include "viennacl/scheduler/io.hpp" - -#include "viennacl/device_specific/lazy_program_compiler.hpp" -#include "viennacl/device_specific/mapped_objects.hpp" -#include "viennacl/device_specific/tree_parsing.hpp" -#include "viennacl/device_specific/utils.hpp" - -namespace viennacl -{ -namespace device_specific -{ - -enum fetching_policy_type -{ - FETCH_FROM_LOCAL, - FETCH_FROM_GLOBAL_STRIDED, - FETCH_FROM_GLOBAL_CONTIGUOUS -}; - -class template_base -{ -public: - struct parameters_type - { - parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels) : simd_width(_simd_width), local_size_0(_local_size_1), local_size_1(_local_size_2), num_kernels(_num_kernels){ } - - unsigned int simd_width; - unsigned int local_size_0; - unsigned int local_size_1; - unsigned int num_kernels; - }; - -private: - /** @brief Functor to map the statements to the types defined in mapped_objects.hpp */ - class map_functor : public tree_parsing::traversal_functor - { - - scheduler::statement_node_numeric_type numeric_type(scheduler::statement const * statement, vcl_size_t root_idx) const - { - scheduler::statement_node const * root_node = &statement->array()[root_idx]; - while (root_node->lhs.numeric_type==scheduler::INVALID_NUMERIC_TYPE) - root_node = &statement->array()[root_node->lhs.node_index]; - return root_node->lhs.numeric_type; - } - - public: - typedef tools::shared_ptr<mapped_object> result_type; - - map_functor(symbolic_binder & binder, mapping_type & mapping) : binder_(binder), mapping_(mapping){ } - - /** @brief Binary leaf */ - template<class T> - result_type binary_leaf(scheduler::statement const * statement, vcl_size_t root_idx, mapping_type const * mapping) const - { - return result_type(new T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx))); - } - - template<class NumericT> - result_type operator()(NumericT const & /*scalar*/) const - { - return result_type(new mapped_host_scalar(utils::type_to_string<NumericT>::value(), binder_.get(NULL))); - } - - /** @brief Scalar mapping */ - template<class NumericT> - result_type operator()(scalar<NumericT> const & scal) const - { - return result_type(new mapped_scalar(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(scal)))); - } - - /** @brief Vector mapping */ - template<class NumericT> - result_type operator()(vector_base<NumericT> const & vec) const - { - return result_type(new mapped_vector(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(vec)))); - } - - /** @brief Implicit vector mapping */ - template<class NumericT> - result_type operator()(implicit_vector_base<NumericT> const & /*vec*/) const - { - return result_type(new mapped_implicit_vector(utils::type_to_string<NumericT>::value(), binder_.get(NULL))); - } - - /** @brief Matrix mapping */ - template<class NumericT> - result_type operator()(matrix_base<NumericT> const & mat) const - { - return result_type(new mapped_matrix(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(mat)), - viennacl::traits::row_major(mat))); - } - - /** @brief Implicit matrix mapping */ - template<class NumericT> - result_type operator()(implicit_matrix_base<NumericT> const & /*mat*/) const - { - return result_type(new mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), binder_.get(NULL))); - } - - /** @brief Traversal functor */ - void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const { - mapping_type::key_type key(root_idx, leaf_t); - scheduler::statement_node const & root_node = statement.array()[root_idx]; - - if (leaf_t == LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY) - mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.lhs, *this))); - else if (leaf_t == RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY) - mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.rhs, *this))); - else if ( leaf_t== PARENT_NODE_TYPE) - { - if (root_node.op.type==scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_))); - else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_))); - else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_))); - else if (root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_))); - else if (is_scalar_reduction(root_node)) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_))); - else if (is_vector_reduction(root_node)) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_))); - else if (root_node.op.type == scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_))); - else if (root_node.op.type == scheduler::OPERATION_UNARY_TRANS_TYPE) - mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_trans>(&statement, root_idx, &mapping_))); - } - } - - private: - symbolic_binder & binder_; - mapping_type & mapping_; - }; - - /** @brief functor for generating the prototype of a statement */ - class prototype_generation_traversal : public tree_parsing::traversal_functor - { - private: - std::set<std::string> & already_generated_; - std::string & str_; - mapping_type const & mapping_; - std::map<std::string, unsigned int> const & widths_; - public: - prototype_generation_traversal(std::set<std::string> & already_generated, std::string & str, mapping_type const & mapping, std::map<std::string, unsigned int> const & widths) : - already_generated_(already_generated), str_(str), mapping_(mapping), widths_(widths){ } - - void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf) const - { - scheduler::statement_node const & root_node = statement.array()[root_idx]; - if ( (leaf==LHS_NODE_TYPE && root_node.lhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY) - ||(leaf==RHS_NODE_TYPE && root_node.rhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY) ) - { - mapped_object * obj = at(mapping_, std::make_pair(root_idx,leaf)).get(); - if(widths_.find(obj->name())!=widths_.end()) - obj->append_kernel_arguments(already_generated_, str_, at(widths_, obj->name())); - else - obj->append_kernel_arguments(already_generated_, str_, 1); - } - } - }; - - - - /** @brief functor for setting the arguments of a kernel */ - class set_arguments_functor : public tree_parsing::traversal_functor - { - public: - typedef void result_type; - - set_arguments_functor(symbolic_binder & binder, unsigned int & current_arg, viennacl::ocl::kernel & kernel) : binder_(binder), current_arg_(current_arg), kernel_(kernel){ } - - template<class NumericT> - result_type operator()(NumericT const & scal) const { - typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype; - kernel_.arg(current_arg_++, cl_scalartype(scal)); - } - - /** @brief Scalar mapping */ - template<class NumericT> - result_type operator()(scalar<NumericT> const & scal) const { - if (binder_.bind(&viennacl::traits::handle(scal))) - kernel_.arg(current_arg_++, scal.handle().opencl_handle()); - } - - /** @brief Vector mapping */ - template<class NumericT> - result_type operator()(vector_base<NumericT> const & vec) const { - if (binder_.bind(&viennacl::traits::handle(vec))) - { - kernel_.arg(current_arg_++, vec.handle().opencl_handle()); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec))); - } - } - - /** @brief Implicit vector mapping */ - template<class NumericT> - result_type operator()(implicit_vector_base<NumericT> const & vec) const - { - typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype; - kernel_.arg(current_arg_++, cl_scalartype(vec.value())); - if (vec.has_index()) - kernel_.arg(current_arg_++, cl_uint(vec.index())); - } - - /** @brief Matrix mapping */ - template<class NumericT> - result_type operator()(matrix_base<NumericT> const & mat) const - { - if (binder_.bind(&viennacl::traits::handle(mat))) - { - kernel_.arg(current_arg_++, mat.handle().opencl_handle()); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::ld(mat))); - if (mat.row_major()) - { - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat))); - } - else - { - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat))); - kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat))); - } - } - } - - /** @brief Implicit matrix mapping */ - template<class NumericT> - result_type operator()(implicit_matrix_base<NumericT> const & mat) const - { - kernel_.arg(current_arg_++, typename viennacl::result_of::cl_type<NumericT>::type(mat.value())); - } - - /** @brief Traversal functor: */ - void operator()(scheduler::statement const & statement, vcl_size_t root_idx, leaf_t leaf_t) const - { - scheduler::statement_node const & root_node = statement.array()[root_idx]; - if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY) - utils::call_on_element(root_node.lhs, *this); - else if (leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != scheduler::COMPOSITE_OPERATION_FAMILY) - utils::call_on_element(root_node.rhs, *this); - } - - private: - symbolic_binder & binder_; - unsigned int & current_arg_; - viennacl::ocl::kernel & kernel_; - }; - -protected: - - static void generate_prototype(utils::kernel_generation_stream & stream, std::string const & name, std::string const & first_arguments, std::vector<mapping_type> const & mappings, statements_container const &statements, - std::map<std::string, unsigned int> const & widths) - { - statements_container::data_type::const_iterator sit; - std::vector<mapping_type>::const_iterator mit; - std::set<std::string> already_generated; - - std::string arguments = first_arguments; - for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit) - tree_parsing::traverse(*sit, sit->root(), prototype_generation_traversal(already_generated, arguments, *mit, widths), true); - arguments.erase(arguments.size()-1); //Last comma pruned - stream << "__kernel " << "void " << name << "(" << arguments << ")" << std::endl; - } - - static void generate_prototype(utils::kernel_generation_stream & stream, std::string const & name, std::string const & first_arguments, std::vector<mapping_type> const & mappings, statements_container const & statements) - { - generate_prototype(stream, name, first_arguments, mappings, statements, std::map<std::string, unsigned int>()); - } - - void set_arguments(statements_container const & statements, viennacl::ocl::kernel & kernel, unsigned int & current_arg) - { - tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_); - for (statements_container::data_type::const_iterator itt = statements.data().begin(); itt != statements.data().end(); ++itt) - tree_parsing::traverse(*itt, itt->root(), set_arguments_functor(*binder,current_arg,kernel), true); - } - - class invalid_template_exception : public std::exception - { - public: - invalid_template_exception() : message_() {} - invalid_template_exception(std::string message) : - message_("ViennaCL: Internal error: The generator cannot apply the given template to the given statement: " + message + "\n" - "If you are using a builtin template, please report on [email protected]! We will provide a fix as soon as possible\n" - "If you are using your own template, please try using other parameters") {} - virtual const char* what() const throw() { return message_.c_str(); } - virtual ~invalid_template_exception() throw() {} - private: - std::string message_; - }; - - static void fetching_loop_info(fetching_policy_type policy, std::string const & bound, utils::kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size) - { - if (policy==FETCH_FROM_GLOBAL_STRIDED) - { - init = domain_id; - upper_bound = bound; - inc = domain_size; - } - else if (policy==FETCH_FROM_GLOBAL_CONTIGUOUS) - { - std::string chunk_size = "chunk_size"; - std::string chunk_start = "chunk_start"; - std::string chunk_end = "chunk_end"; - - stream << "unsigned int " << chunk_size << " = (" << bound << "+" << domain_size << "-1)/" << domain_size << ";" << std::endl; - stream << "unsigned int " << chunk_start << " =" << domain_id << "*" << chunk_size << ";" << std::endl; - stream << "unsigned int " << chunk_end << " = min(" << chunk_start << "+" << chunk_size << ", " << bound << ");" << std::endl; - init = chunk_start; - upper_bound = chunk_end; - inc = "1"; - } - } - - static bool is_node_trans(scheduler::statement::container_type const & array, vcl_size_t root_idx, leaf_t leaf_type) - { - bool res = false; - scheduler::lhs_rhs_element scheduler::statement_node::*ptr; - if (leaf_type==LHS_NODE_TYPE) - ptr = &scheduler::statement_node::lhs; - else - ptr = &scheduler::statement_node::rhs; - scheduler::statement_node const * node = &array[root_idx]; - while ((node->*ptr).type_family==scheduler::COMPOSITE_OPERATION_FAMILY) - { - if (array[(node->*ptr).node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE) - res = !res; - node = &array[(node->*ptr).node_index]; - } - return res; - } - -protected: - - static std::string append_simd_suffix(std::string const & str, unsigned int i) - { - assert(i < 16); - static char suffixes[] = {'0','1','2','3','4','5','6','7','8','9', - 'a','b','c','d','e','f'}; - return str + tools::to_string(suffixes[i]); - } - - static bool is_striding_operator(scheduler::statement_node const & node) - { - return node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE - || node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE - || node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE; - } - - static bool has_strided_access(statements_container const & statements) - { - for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it) - { - //checks for vectors - std::vector<scheduler::lhs_rhs_element> vectors; - tree_parsing::traverse(*it, it->root(), tree_parsing::filter_elements(scheduler::DENSE_VECTOR_TYPE, vectors), true); - for (std::vector<scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt) - if (utils::call_on_vector(*itt, utils::stride_fun())>1) - return true; - - //checks for matrix - std::vector<scheduler::lhs_rhs_element> matrices; - tree_parsing::traverse(*it, it->root(), tree_parsing::filter_elements(scheduler::DENSE_MATRIX_TYPE, matrices), true); - for (std::vector<scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt) - if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>2) - return true; - - std::vector<vcl_size_t> striding_operators; - tree_parsing::traverse(*it, it->root(), tree_parsing::filter(&is_striding_operator, striding_operators), false); - if(striding_operators.size() > 0) - return true; - } - return false; - } - - static vcl_size_t vector_size(scheduler::statement_node const & node, bool up_to_internal_size) - { - using namespace scheduler; - using namespace utils; - if (node.op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE) - { - vcl_size_t size1 = up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun()); - vcl_size_t size2 = up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun()); - return std::min<vcl_size_t>(size1, size2); - } - else if (node.op.type==OPERATION_BINARY_MATRIX_ROW_TYPE) - return up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun()); - else if (node.op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE) - return up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun()); - else - return up_to_internal_size?call_on_vector(node.lhs, internal_size_fun()):call_on_vector(node.lhs, size_fun()); - } - - //NB : templates are not used here because declaring a functor out of the generate() functions would be harder to read - struct loop_body_base - { - virtual void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const = 0; - virtual ~loop_body_base() {} - }; - - static void element_wise_loop_1D(utils::kernel_generation_stream & stream, loop_body_base const & loop_body, - fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size) - { - std::string strwidth = tools::to_string(simd_width); - std::string boundround = bound + "/" + strwidth; - - std::string init, upper_bound, inc; - fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size); - stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - loop_body(stream, simd_width); - stream.dec_tab(); - stream << "}" << std::endl; - - if (simd_width>1) - { - stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - loop_body(stream, 1); - stream.dec_tab(); - stream << "}" << std::endl; - } - } - - static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr) - { - if (simd_width==1) - return "(" + ptr + ")[" + offset + "] = " + value; - else - return utils::append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")"; - } - - static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr) - { - if (simd_width==1) - return "(" + ptr + ")[" + offset + "]"; - else - return utils::append_width("vload", simd_width) + "(" + offset + ", " + ptr + ")"; - } - -private: - /** @brief Generates the body of the associated kernel function */ - virtual std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mapping) const = 0; - -public: - template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {} - - virtual ~template_base(){ } - - std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device) - { - statements_container::data_type::const_iterator sit; - std::vector<mapping_type>::iterator mit; - - if(int err = check_invalid(statements, device)) - throw generator_not_supported_exception("The supplied parameters for this template are invalid : err " + tools::to_string(err)); - - //Create mapping - std::vector<mapping_type> mappings(statements.data().size()); - tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_); - for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit) - tree_parsing::traverse(*sit, sit->root(), map_functor(*binder,*mit), true); - - return generate_impl(kernel_prefix, statements, mappings); - } - - /** @brief returns whether or not the profile has undefined behavior on particular device */ - virtual int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const = 0; - - virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) = 0; - - virtual tools::shared_ptr<template_base> clone() const = 0; -private: - binding_policy_t binding_policy_; -}; - - -template<class TemplateType, class ParametersType> -class template_base_impl : public template_base -{ -private: - virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const { return TEMPLATE_VALID; } - - virtual unsigned int n_lmem_elements() const { return 0; } - -public: - typedef ParametersType parameters_type; - - /** @brief The constructor */ - template_base_impl(parameters_type const & parameters, binding_policy_t binding_policy) : template_base(binding_policy), p_(parameters){ } - - parameters_type const & parameters() const - { - return p_; - } - - tools::shared_ptr<template_base> clone() const - { - return tools::shared_ptr<template_base>(new TemplateType(*dynamic_cast<TemplateType const *>(this))); - } - - /** @brief returns whether or not the profile has undefined behavior on particular device */ - int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const - { - using namespace viennacl::tools; - - scheduler::statement const & statement = statements.data().front(); - unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type); - - //Query device informations - vcl_size_t lmem_available = static_cast<vcl_size_t>(device.local_mem_size()); - vcl_size_t lmem_usage = scalartype_size*n_lmem_elements(); - if (lmem_usage>lmem_available) - return TEMPLATE_LOCAL_MEMORY_OVERFLOW; - - //Invalid work group size - vcl_size_t max_workgroup_size = device.max_work_group_size(); - std::vector<vcl_size_t> max_work_item_sizes = device.max_work_item_sizes(); - if (p_.local_size_0*p_.local_size_1 > max_workgroup_size) - return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW; - if (p_.local_size_0 > max_work_item_sizes[0]) - return TEMPLATE_LOCAL_SIZE_0_OVERFLOW; - - if (p_.local_size_1 > max_work_item_sizes[1]) - return TEMPLATE_LOCAL_SIZE_1_OVERFLOW; - - //Advice from the Intel guide - unsigned int warp_size = 8; - if (device.type()==CL_DEVICE_TYPE_GPU) - { - //Advice from the nvidia guide - warp_size = 32; - //Advice from the AMD guide - if (device.vendor_id()==4098) - warp_size = 64; - } - if (((p_.local_size_0*p_.local_size_1)%warp_size)>0) - return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE; - - //Invalid SIMD Width - if (p_.simd_width!=1 && p_.simd_width!=2 && - p_.simd_width!=4 && p_.simd_width!=8 && - p_.simd_width!=16) - return TEMPLATE_INVALID_SIMD_WIDTH; - - return check_invalid_impl(device); - } - -protected: - parameters_type p_; -}; - -} -} - -#endif
