http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp 
b/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
deleted file mode 100644
index 19f7993..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/device_specific/mapped_objects.hpp
+++ /dev/null
@@ -1,512 +0,0 @@
-#ifndef VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP
-#define VIENNACL_DEVICE_SPECIFIC_MAPPED_TYPE_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/device_specific/mapped_objects.hpp
-    @brief Map ViennaCL objects to generator wrappers
-*/
-
-#include <string>
-
-#include "viennacl/scheduler/forwards.h"
-#include "viennacl/device_specific/forwards.h"
-#include "viennacl/device_specific/utils.hpp"
-
-namespace viennacl
-{
-
-namespace device_specific
-{
-
-/** @brief Mapped Object
-*
-* This object populates the symbolic mapping associated with a statement. 
(root_id, LHS|RHS|PARENT) => mapped_object
-* The tree can then be reconstructed in its symbolic form
-*/
-class mapped_object
-{
-private:
-  virtual void postprocess(std::string &) const { }
-
-protected:
-  struct MorphBase { virtual ~MorphBase(){} };
-  struct MorphBase1D : public MorphBase { public: virtual std::string 
operator()(std::string const & i) const = 0; };
-  struct MorphBase2D : public MorphBase { public: virtual std::string 
operator()(std::string const & i, std::string const & j) const = 0; };
-
-  static void replace_offset(std::string & str, MorphBase const & morph)
-  {
-    vcl_size_t pos = 0;
-    while ((pos=str.find("$OFFSET", pos))!=std::string::npos)
-    {
-      std::string postprocessed;
-      vcl_size_t pos_po = str.find('{', pos);
-      vcl_size_t pos_pe = str.find('}', pos_po);
-
-      if (MorphBase2D const * p2d = dynamic_cast<MorphBase2D const *>(&morph))
-      {
-        vcl_size_t pos_comma = str.find(',', pos_po);
-        std::string i = str.substr(pos_po + 1, pos_comma - pos_po - 1);
-        std::string j = str.substr(pos_comma + 1, pos_pe - pos_comma - 1);
-        postprocessed = (*p2d)(i, j);
-      }
-      else if (MorphBase1D const * p1d = dynamic_cast<MorphBase1D const 
*>(&morph))
-      {
-        std::string i = str.substr(pos_po + 1, pos_pe - pos_po - 1);
-        postprocessed = (*p1d)(i);
-      }
-
-      str.replace(pos, pos_pe + 1 - pos, postprocessed);
-      pos = pos_pe;
-    }
-  }
-
-  void register_attribute(std::string & attribute, std::string const & key, 
std::string const & value)
-  {
-    attribute = value;
-    keywords_[key] = attribute;
-  }
-
-public:
-  struct node_info
-  {
-    node_info(mapping_type const * _mapping, scheduler::statement const * 
_statement, vcl_size_t _root_idx) :
-      mapping(_mapping), statement(_statement), root_idx(_root_idx) { }
-    mapping_type const * mapping;
-    scheduler::statement const * statement;
-    vcl_size_t root_idx;
-  };
-
-public:
-  mapped_object(std::string const & scalartype, unsigned int id, std::string 
const & type_key) : type_key_(type_key)
-  {
-    register_attribute(scalartype_, "#scalartype", scalartype);
-    register_attribute(name_, "#name", "obj" + tools::to_string(id));
-  }
-
-  virtual ~mapped_object(){ }
-
-  virtual std::string & append_kernel_arguments(std::set<std::string> &, 
std::string & str, unsigned int) const { return str; }
-
-  std::string type_key() const { return type_key_; }
-
-  std::string const & name() const { return name_; }
-
-  std::string process(std::string const & in) const
-  {
-    std::string res(in);
-    for (std::map<std::string,std::string>::const_iterator it = 
keywords_.begin(); it != keywords_.end(); ++it)
-      tools::find_and_replace(res, it->first, it->second);
-    postprocess(res);
-    return res;
-  }
-
-  std::string evaluate(std::map<std::string, std::string> const & accessors) 
const
-  {
-    if (accessors.find(type_key_)==accessors.end())
-      return name_;
-    return process(at(accessors, type_key_));
-  }
-
-
-protected:
-  std::string name_;
-  std::string scalartype_;
-  std::string type_key_;
-  std::map<std::string, std::string> keywords_;
-};
-
-
-/** @brief Binary leaf interface
-*
-*  Some subtrees have to be interpret at leaves when reconstructing the final 
expression. It is the case of trans(), diag(), prod(), etc...
-*  This interface stores basic infos about the sub-trees
-*/
-class binary_leaf
-{
-public:
-  binary_leaf(mapped_object::node_info info) : info_(info){ }
-
-  void process_recursive(utils::kernel_generation_stream & stream, leaf_t 
leaf, std::string const & key, std::string const & process_str, 
std::set<std::string> & already_fetched)
-  {
-    tree_parsing::process(stream, leaf, key, process_str, *info_.statement, 
info_.root_idx, *info_.mapping, already_fetched);
-  }
-
-  std::string evaluate_recursive(leaf_t leaf, std::map<std::string, 
std::string> const & accessors)
-  {
-    return tree_parsing::evaluate(leaf, accessors, *info_.statement, 
info_.root_idx, *info_.mapping);
-  }
-
-protected:
-  mapped_object::node_info info_;
-};
-
-/** @brief Matrix product
-  *
-  * Maps prod(matrix_expression, matrix_expression)
-  */
-class mapped_matrix_product : public mapped_object, public binary_leaf
-{
-public:
-  mapped_matrix_product(std::string const & scalartype, unsigned int id, 
node_info info) : mapped_object(scalartype, id, "matrix_product"), 
binary_leaf(info) { }
-};
-
-/** @brief Reduction
-*
-* Base class for mapping a reduction
-*/
-class mapped_reduction : public mapped_object, public binary_leaf
-{
-public:
-  mapped_reduction(std::string const & scalartype, unsigned int id, node_info 
info, std::string const & type_key) : mapped_object(scalartype, id, type_key), 
binary_leaf(info){ }
-
-  vcl_size_t root_idx() const { return info_.root_idx; }
-  scheduler::statement const & statement() const { return *info_.statement; }
-  scheduler::statement_node root_node() const { return 
statement().array()[root_idx()]; }
-  bool is_index_reduction() const { return 
utils::is_index_reduction(info_.statement->array()[info_.root_idx].op); }
-
-  scheduler::op_element root_op() const
-  {
-    scheduler::op_element res = info_.statement->array()[info_.root_idx].op;
-    if (res.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
-        ||res.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
-      res.type        = scheduler::OPERATION_BINARY_ADD_TYPE;
-    return res;
-  }
-};
-
-/** @brief Scalar reduction
-*
-* Maps a scalar reduction (max, min, argmax, inner_prod, etc..)
-*/
-class mapped_scalar_reduction : public mapped_reduction
-{
-public:
-  mapped_scalar_reduction(std::string const & scalartype, unsigned int id, 
node_info info) : mapped_reduction(scalartype, id, info, "scalar_reduction"){ }
-};
-
-/** @brief Vector reduction
-*
-* Maps a row-wise reduction (max, min, argmax, matrix-vector product, etc..)
-*/
-class mapped_row_wise_reduction : public mapped_reduction
-{
-public:
-  mapped_row_wise_reduction(std::string const & scalartype, unsigned int id, 
node_info info) : mapped_reduction(scalartype, id, info, "row_wise_reduction") 
{ }
-};
-
-/** @brief Host scalar
- *
- * Maps a host scalar (passed by value)
- */
-class mapped_host_scalar : public mapped_object
-{
-public:
-  mapped_host_scalar(std::string const & scalartype, unsigned int id) : 
mapped_object(scalartype, id, "host_scalar"){ }
-
-  std::string & append_kernel_arguments(std::set<std::string> & 
already_generated, std::string & str, unsigned int width) const
-  {
-    if (already_generated.insert(name_).second)
-      str += generate_value_kernel_argument(utils::append_width(scalartype_, 
width), name_);
-    return str;
-  }
-};
-
-/** @brief Handle
-*
-* Maps an object passed by pointer
-*/
-class mapped_handle : public mapped_object
-{
-private:
-  virtual void append_optional_arguments(std::string &) const = 0;
-
-public:
-  mapped_handle(std::string const & scalartype, unsigned int id, std::string 
const & type_key) : mapped_object(scalartype, id, type_key)
-  {
-    register_attribute(pointer_, "#pointer", name_ + "_pointer");
-  }
-
-  std::string & append_kernel_arguments(std::set<std::string> & 
already_generated, std::string & str, unsigned int width) const
-  {
-    if (already_generated.insert(name_).second)
-    {
-      str += generate_pointer_kernel_argument("__global", 
utils::append_width(scalartype_, width), pointer_);
-      append_optional_arguments(str);
-    }
-    return str;
-  }
-
-private:
-  std::string pointer_;
-};
-
-
-/** @brief Scalar
- *
- * Maps a scalar passed by pointer
- */
-class mapped_scalar : public mapped_handle
-{
-private:
-  void append_optional_arguments(std::string &) const{ }
-
-public:
-  mapped_scalar(std::string const & scalartype, unsigned int id) : 
mapped_handle(scalartype, id, "scalar") { }
-};
-
-/** @brief Buffered
- *
- * Maps a buffered object (vector, matrix)
- */
-class mapped_buffer : public mapped_handle
-{
-public:
-  mapped_buffer(std::string const & scalartype, unsigned int id, std::string 
const & type_key) : mapped_handle(scalartype, id, type_key){ }
-};
-
-/** @brief Vector
- *
- * Maps a vector
- */
-class mapped_vector : public mapped_buffer
-{
-  void append_optional_arguments(std::string & str) const
-  {
-    str += generate_value_kernel_argument("unsigned int", start_);
-    str += generate_value_kernel_argument("unsigned int", stride_);
-  }
-
-public:
-  mapped_vector(std::string const & scalartype, unsigned int id) : 
mapped_buffer(scalartype, id, "vector")
-  {
-    register_attribute(start_, "#start", name_ + "_start");
-    register_attribute(stride_, "#stride", name_ + "_stride");
-  }
-
-private:
-  std::string start_;
-  std::string stride_;
-};
-
-/** @brief Matrix
- *
- * Maps a matrix
- */
-class mapped_matrix : public mapped_buffer
-{
-private:
-  void append_optional_arguments(std::string & str) const
-  {
-    str += generate_value_kernel_argument("unsigned int", ld_);
-    str += generate_value_kernel_argument("unsigned int", start1_);
-    str += generate_value_kernel_argument("unsigned int", start2_);
-    str += generate_value_kernel_argument("unsigned int", stride1_);
-    str += generate_value_kernel_argument("unsigned int", stride2_);
-  }
-
-  void postprocess(std::string & str) const
-  {
-    struct Morph : public MorphBase2D
-    {
-      Morph(bool _is_row_major, std::string const & _ld) : 
is_row_major(_is_row_major), ld(_ld){ }
-      std::string operator()(std::string const & i, std::string const & j) 
const
-      {
-        if (is_row_major)
-          return "(" + i + ") * " + ld +  " + (" + j + ")";
-        return "(" + i + ") +  (" + j + ") * " + ld;
-      }
-    private:
-      bool is_row_major;
-      std::string const & ld;
-    };
-    replace_offset(str, Morph(row_major_, ld_));
-  }
-
-public:
-  mapped_matrix(std::string const & scalartype, unsigned int id, bool 
row_major) : mapped_buffer(scalartype, id, "matrix"), row_major_(row_major)
-  {
-    register_attribute(ld_, "#ld", name_ + "_ld");
-    register_attribute(start1_, "#start1", name_ + "_start1");
-    register_attribute(start2_, "#start2", name_ + "_start2");
-    register_attribute(stride1_, "#stride1", name_ + "_stride1");
-    register_attribute(stride2_, "#stride2", name_ + "_stride2");
-    if (row_major_)
-      keywords_["#nldstride"] = "#stride1";
-    else
-      keywords_["#nldstride"] = "#stride2";
-
-    if (row_major_)
-    {
-      std::swap(start1_, start2_);
-      std::swap(stride1_, stride2_);
-    }
-  }
-
-  bool row_major() const
-  {
-    return row_major_;
-  }
-
-private:
-  std::string ld_;
-  std::string start1_;
-  std::string start2_;
-  std::string stride1_;
-  std::string stride2_;
-  bool row_major_;
-};
-
-/** @brief Vector diag
-*
-*  Maps a diag(vector_expression) node into a diagonal matrix
-*/
-class mapped_vector_diag : public mapped_object, public binary_leaf
-{
-private:
-  void postprocess(std::string &res) const
-  {
-    std::map<std::string, std::string> accessors;
-    tools::find_and_replace(res, "#diag_offset", 
tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping));
-    accessors["vector"] = res;
-    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping);
-  }
-
-public:
-  mapped_vector_diag(std::string const & scalartype, unsigned int id, 
node_info info) : mapped_object(scalartype, id, "vector_diag"), 
binary_leaf(info){ }
-};
-
-
-/** @brief Trans
-*
-*  Maps trans(matrix_expression) into the transposed of matrix_expression
-*/
-class mapped_trans: public mapped_object, public binary_leaf
-{
-private:
-  void postprocess(std::string &res) const
-  {
-    std::map<std::string, std::string> accessors;
-    accessors["matrix"] = res;
-    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping);
-  }
-
-public:
-  mapped_trans(std::string const & scalartype, unsigned int id, node_info 
info) : mapped_object(scalartype, id, "matrix_trans"), binary_leaf(info){ }
-};
-
-/** @brief Matrix row
-*
-*  Maps row(matrix_expression, scalar_expression) into the scalar_expression's 
row of matrix_expression
-*/
-class mapped_matrix_row : public mapped_object, binary_leaf
-{
-private:
-  void postprocess(std::string &res) const
-  {
-    std::map<std::string, std::string> accessors;
-    tools::find_and_replace(res, "#row", tree_parsing::evaluate(RHS_NODE_TYPE, 
accessors, *info_.statement, info_.root_idx, *info_.mapping));
-    accessors["matrix"] = res;
-    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping);
-  }
-
-public:
-  mapped_matrix_row(std::string const & scalartype, unsigned int id, node_info 
info) : mapped_object(scalartype, id, "matrix_row"), binary_leaf(info)
-  { }
-};
-
-
-/** @brief Matrix column
-*
-*  Maps column(matrix_expression, scalar_expression) into the 
scalar_expression's column of matrix_expression
-*/
-class mapped_matrix_column : public mapped_object, binary_leaf
-{
-private:
-  void postprocess(std::string &res) const
-  {
-    std::map<std::string, std::string> accessors;
-    tools::find_and_replace(res, "#column", 
tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping));
-    accessors["matrix"] = res;
-    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping);
-  }
-
-public:
-  mapped_matrix_column(std::string const & scalartype, unsigned int id, 
node_info info) : mapped_object(scalartype, id, "matrix_column"), 
binary_leaf(info)
-  { }
-};
-
-/** @brief Matrix diag
-*
-*  Maps a diag(matrix_expression) node into the vector of its diagonal elements
-*/
-class mapped_matrix_diag : public mapped_object, binary_leaf
-{
-private:
-  void postprocess(std::string &res) const
-  {
-    std::map<std::string, std::string> accessors;
-    tools::find_and_replace(res, "#diag_offset", 
tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping));
-    accessors["matrix"] = res;
-    res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, 
info_.root_idx, *info_.mapping);
-  }
-
-public:
-  mapped_matrix_diag(std::string const & scalartype, unsigned int id, 
node_info info) : mapped_object(scalartype, id, "matrix_diag"), 
binary_leaf(info)
-  { }
-};
-
-/** @brief Implicit vector
- *
- * Maps an implicit vector
- */
-class mapped_implicit_vector : public mapped_object
-{
-public:
-  mapped_implicit_vector(std::string const & scalartype, unsigned int id) : 
mapped_object(scalartype, id, "implicit_vector")
-  { }
-
-  std::string & append_kernel_arguments(std::set<std::string> & 
/*already_generated*/, std::string & str, unsigned int width) const
-  {
-    str += generate_value_kernel_argument(utils::append_width(scalartype_, 
width), name_);
-    return str;
-  }
-};
-
-/** @brief Implicit matrix
- *
- * Maps an implicit matrix
- */
-class mapped_implicit_matrix : public mapped_object
-{
-public:
-  mapped_implicit_matrix(std::string const & scalartype, unsigned int id) : 
mapped_object(scalartype, id, "implicit_matrix")
-  { }
-
-  std::string & append_kernel_arguments(std::set<std::string> & 
/*already_generated*/, std::string & str, unsigned int width) const
-  {
-    str += generate_value_kernel_argument(utils::append_width(scalartype_, 
width), name_);
-    return str;
-  }
-};
-
-}
-
-}
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
deleted file mode 100644
index 1f082ac..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/matrix_product_template.hpp
+++ /dev/null
@@ -1,859 +0,0 @@
-#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
-#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_MATRIX_PRODUCT_HPP
-
-/* =========================================================================
-Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                ViennaCL - The Vienna Computing Library
-                            -----------------
-
-Project Head:    Karl Rupp                   [email protected]
-
-(A list of authors and contributors can be found in the manual)
-
-License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/device_specific/templates/matrix_product_template.hpp
-*
-* Kernel template for the matrix product operation
-*/
-
-#include <vector>
-
-#include "viennacl/scheduler/forwards.h"
-
-#include "viennacl/detail/matrix_def.hpp"
-#include "viennacl/matrix_proxy.hpp"
-
-#include "viennacl/device_specific/templates/template_base.hpp"
-#include "viennacl/device_specific/mapped_objects.hpp"
-#include "viennacl/device_specific/utils.hpp"
-#include "viennacl/device_specific/tree_parsing.hpp"
-#include "viennacl/forwards.h"
-
-#include "viennacl/tools/tools.hpp"
-
-namespace viennacl
-{
-namespace device_specific
-{
-
-struct matrix_product_parameters : public template_base::parameters_type
-{
-  matrix_product_parameters(unsigned int simd_width
-                            , unsigned int local_size_0, unsigned int KL, 
unsigned int local_size_1
-                            , unsigned int ms, unsigned int ks, unsigned int ns
-                            , fetching_policy_type A_fetching_policy_param, 
fetching_policy_type B_fetching_policy_param
-                            , unsigned int local_fetch_0_param, unsigned int 
local_fetch_1_param): template_base::parameters_type(simd_width, local_size_0, 
local_size_1, 1),
-    kL(KL), mS(ms), kS(ks), nS(ns), 
A_fetching_policy(A_fetching_policy_param), 
B_fetching_policy(B_fetching_policy_param),
-    local_fetch_0(local_fetch_0_param), local_fetch_1(local_fetch_1_param),
-    mL(ms*local_size_0), nL(ns*local_size_1){}
-
-  unsigned int kL;
-
-  unsigned int mS;
-  unsigned int kS;
-  unsigned int nS;
-
-  fetching_policy_type A_fetching_policy;
-  fetching_policy_type B_fetching_policy;
-
-  unsigned int local_fetch_0;
-  unsigned int local_fetch_1;
-
-  unsigned int mL;
-  unsigned int nL;
-};
-
-class matrix_product_template : public 
template_base_impl<matrix_product_template, matrix_product_parameters>
-{
-
-private:
-  unsigned int n_lmem_elements() const
-  {
-    unsigned int N = 0;
-    if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
-      N += p_.kL * (p_.mL+1);
-    if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
-      N += p_.nL * (p_.kL+1);
-    return N;
-  }
-
-  int check_invalid_impl(viennacl::ocl::device const & /*device*/) const
-  {
-    if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && 
p_.B_fetching_policy!=FETCH_FROM_LOCAL&& (p_.local_fetch_0!=0 || 
p_.local_fetch_1!=0))
-      return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
-
-    if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
-      return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
-
-    if (p_.kS > p_.kL)
-      return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
-
-    if (!(A_trans_=='N' && B_trans_=='T') && p_.simd_width>1)
-      return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE;
-
-    if (p_.A_fetching_policy==FETCH_FROM_LOCAL || 
p_.B_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      if ((p_.local_fetch_0*p_.local_fetch_1) 
!=(p_.local_size_0*p_.local_size_1))
-        return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
-    }
-
-    if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL;
-      unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL;
-
-      if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
-        return 
A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
-
-      if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 
0)
-        return 
A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
-
-    }
-    if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL;
-      unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL;
-
-      if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
-        return 
B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
-
-      if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 
0)
-        return 
B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
-
-    }
-
-    return TEMPLATE_VALID;
-  }
-
-  static void parse(scheduler::statement const & s,
-                    vcl_size_t & C_idx, leaf_t & C_leaf, vcl_size_t & 
alpha_idx, leaf_t & alpha_leaf,
-                    vcl_size_t & A_idx, leaf_t & A_leaf, bool& A_trans, 
vcl_size_t & B_idx, leaf_t & B_leaf, bool& B_trans,
-                    vcl_size_t & beta_idx, leaf_t & beta_leaf)
-  {
-    using namespace tree_parsing;
-    using namespace scheduler;
-
-    scheduler::statement::container_type const & array = s.array();
-    vcl_size_t root_idx = s.root();
-
-    C_idx = root_idx;
-    C_leaf = LHS_NODE_TYPE;
-
-    vcl_size_t node_add_idx = array[root_idx].rhs.node_index;
-
-    vcl_size_t node_1_idx = array[node_add_idx].lhs.node_index;
-    alpha_idx = node_1_idx;
-    alpha_leaf = RHS_NODE_TYPE;
-
-    vcl_size_t mat_prod_idx = array[node_1_idx].lhs.node_index;
-    if (array[mat_prod_idx].lhs.type_family==MATRIX_TYPE_FAMILY)
-    {
-      A_trans = false;
-      A_idx = mat_prod_idx;
-    }
-    else
-    {
-      A_trans = true;
-      A_idx = array[mat_prod_idx].lhs.node_index;
-    }
-    A_leaf = LHS_NODE_TYPE;
-
-    if (array[mat_prod_idx].rhs.type_family==MATRIX_TYPE_FAMILY)
-    {
-      B_trans = false;
-      B_idx = mat_prod_idx;
-      B_leaf = RHS_NODE_TYPE;
-    }
-    else
-    {
-      B_trans = true;
-      B_idx = array[mat_prod_idx].rhs.node_index;
-      B_leaf = LHS_NODE_TYPE;
-    }
-
-    vcl_size_t node_2_idx = array[node_add_idx].rhs.node_index;
-    beta_idx = node_2_idx;
-    beta_leaf = RHS_NODE_TYPE;
-  }
-
-  void VIENNACL_HANDLE_BOUNDS(bool fallback, utils::kernel_generation_stream & 
stream, std::string const & inbounds, std::string const & do_if, std::string 
do_else) const
-  {
-    if (fallback)
-    {
-      stream << "if (" << inbounds << ")" << std::endl;
-      stream.inc_tab();
-      stream << do_if << ";" << std::endl;
-      stream.dec_tab();
-      stream << "else" << std::endl;
-      stream.inc_tab();
-      stream << do_else << ";" << std::endl;
-      stream.dec_tab();
-    }
-    else
-      stream << do_if << ";" << std::endl;
-  }
-
-
-  std::string generate_impl(const std::string &kernel_prefix, const 
statements_container &statements, const std::vector<mapping_type> &mappings, 
bool fallback) const
-  {
-    using std::string;
-    using tools::to_string;
-
-    parameters_type pfallback(1, p_.local_size_0, p_.kL, p_.local_size_1, 
p_.mS, 1, p_.nS, p_.A_fetching_policy, p_.B_fetching_policy, p_.local_fetch_0, 
p_.local_fetch_1);
-    parameters_type const & p = fallback?pfallback:p_;
-
-#define VIENNACL_MUL_STRIDE1 string(fallback?"*#stride1":"")
-#define VIENNACL_HANDLE_BOUNDS(in_bounds, to_load) 
(!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + 
":0"))
-#define VIENNACL_VSTORE(value, offset, ptr) vstore(p.simd_width, value, 
offset, ptr)
-
-    string widthstr = tools::to_string(p.simd_width);
-
-    //////////////////
-    /// INIT
-    /// //////////////
-    utils::kernel_generation_stream stream;
-    scheduler::statement const & st = statements.data().front();
-    mapping_type const & mapping = mappings.front();
-
-    bool A_trans = false, B_trans = false;
-    vcl_size_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0;
-    leaf_t C_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, 
A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
-    parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, 
B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
-
-    mapped_matrix      * C     = (mapped_matrix*     )at(mapping, mapping_key( 
   C_idx,     C_leaf)).get();
-    mapped_host_scalar * alpha = (mapped_host_scalar*)at(mapping, 
mapping_key(alpha_idx, alpha_leaf)).get();
-    mapped_matrix      * A     = (mapped_matrix*     )at(mapping, mapping_key( 
   A_idx,     A_leaf)).get();
-    mapped_matrix      * B     = (mapped_matrix*     )at(mapping, mapping_key( 
   B_idx,     B_leaf)).get();
-    mapped_host_scalar * beta  = (mapped_host_scalar*)at(mapping, mapping_key( 
beta_idx,  beta_leaf)).get();
-
-    //////////////////
-    /// DECLARATIONS
-    /// //////////////
-
-    stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," 
<< p.local_size_1 << ",1)))" << std::endl;
-    std::map<std::string, unsigned int> widths;
-    widths[A->name()] = p.simd_width;
-    widths[B->name()] = p.simd_width;
-    generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N, 
unsigned int K, ", mappings, statements, widths);
-    stream << "{" << std::endl;
-    stream.inc_tab();
-    if(!fallback)
-    {
-      stream << A->process("#start1 /= " + to_string(p.simd_width) + ";") << 
std::endl;
-      stream << A->process("#ld /= " + to_string(p.simd_width) + ";") << 
std::endl;
-      stream << B->process("#start1/= "  + to_string(p.simd_width) + ";") << 
std::endl;
-      stream << B->process("#ld /= " + to_string(p.simd_width) + ";") << 
std::endl;
-    }
-    tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#pointer += 
$OFFSET{#start1, #start2};", statements, mappings);
-    tree_parsing::process(stream, PARENT_NODE_TYPE, "matrix", "#ld *= 
#nldstride;", statements, mappings);
-
-    ///Result Values
-    stream << C->process("#scalartype rC[" + to_string(p.mS) + "][" + 
to_string(p.nS) + "] = {{(#scalartype)0}};") << std::endl;
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
-      stream << A->process("#scalartype rA[" + to_string(p.kS) + "][" + 
to_string(p.mS) + "];") << std::endl;
-    else
-      stream << A->process(utils::append_width("#scalartype",p.simd_width) + " 
rA[" + to_string(p.kS) + "][" + to_string(p.mS/p.simd_width) + "];") << 
std::endl;
-    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
-      stream << B->process("#scalartype rB[" + to_string(p.kS) + "][" + 
to_string(p.nS) + "];");
-    else
-      stream << B->process(utils::append_width("#scalartype",p.simd_width) + " 
rB[" + to_string(p.kS) + "][" + to_string(p.nS/p.simd_width) + "];") << 
std::endl;
-
-
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
-      stream << A->process("__local #scalartype lA[" + 
to_string(p.kL*(p.mL+1)) + "];");
-    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
-      stream << B->process("__local #scalartype lB[" + 
to_string(p.kL*(p.nL+1)) + "];");
-    stream << std::endl;
-
-    stream << "size_t gidx = get_group_id(0);" << std::endl;
-    stream << "size_t gidy = get_group_id(1);" << std::endl;
-    stream << "size_t idx = get_local_id(0);" << std::endl;
-    stream << "size_t idy = get_local_id(1);" << std::endl;
-
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL || 
p.B_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      stream << std::endl;
-      stream << "size_t idt = " << p.local_size_0 << "*idy + idx;" << 
std::endl;
-      stream << "size_t idxT = idt % " << p.local_fetch_0 << ";" << std::endl;
-      stream << "size_t idyT = idt / " << p.local_fetch_0 << ";" << std::endl;
-    }
-    stream << std::endl;
-
-    if (fallback)
-    {
-      //Bounds checking for M (in A, C)
-      stream << "bool in_bounds_m[" << p.mS << "];" << std::endl;
-      stream << "for(size_t m = 0; m < " << p.mS << "; m++)" << std::endl;
-      stream.inc_tab();
-      switch (p.A_fetching_policy)
-      {
-      case FETCH_FROM_GLOBAL_CONTIGUOUS:
-        stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx*" << p.mS << " + 
m < M;" << std::endl;
-        break;
-      default:
-        stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx + m*" << 
p.local_size_0 << " < M;" << std::endl;
-        break;
-      }
-      stream.dec_tab();
-
-      //Bounds checking for A if Local
-      if (p.A_fetching_policy==FETCH_FROM_LOCAL)
-      {
-        unsigned int fetch_size = 
(A_trans_=='N'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
-        stream << "bool in_bounds_m_local[" << p.mL/fetch_size << "];" << 
std::endl;
-        stream << "for(size_t m = 0; m < " << p.mL/fetch_size << "; m++)" << 
std::endl;
-        stream.inc_tab();
-        stream << "in_bounds_m_local[m] = gidx*" << p.mL << " + " << 
(A_trans_=='N'?"idxT":"idyT") << " + m*" << fetch_size << " < M;" << std::endl;
-        stream.dec_tab();
-      }
-
-      //Bounds checking for N (in B, C)
-      stream << "bool in_bounds_n[" << p.nS << "];" << std::endl;
-      stream << "for(size_t n = 0; n < " << p.nS << "; n++)" << std::endl;
-      stream.inc_tab();
-      switch (p.B_fetching_policy)
-      {
-      case FETCH_FROM_GLOBAL_CONTIGUOUS:
-        stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy*" << p.nS << " + 
n < N;" << std::endl;
-        break;
-      default:
-        stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy + n*" << 
p.local_size_1 << " < N;" << std::endl;
-        break;
-      }
-      stream.dec_tab();
-
-      //Bounds checking for B if Local
-      if (p.B_fetching_policy==FETCH_FROM_LOCAL)
-      {
-        unsigned int fetch_size = 
(B_trans_=='T'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
-        stream << "bool in_bounds_n_local[" << p.nL/fetch_size << "];" << 
std::endl;
-        stream << "for(size_t n = 0; n < " <<  p.nL/fetch_size << "; n++)" << 
std::endl;
-        stream.inc_tab();
-        stream << "in_bounds_n_local[n] = gidy*" << p.nL << " + " << 
(B_trans_=='T'?"idxT":"idyT") << " + n*" << fetch_size << " < N;" << std::endl;
-        stream.dec_tab();
-      }
-    }
-
-    switch (p.A_fetching_policy)
-    {
-    case FETCH_FROM_LOCAL:
-      if (A_trans_=='N')
-        stream << A->process("#pointer += (gidx*" + 
to_string(p.mL/p.simd_width) + " + idxT)" + VIENNACL_MUL_STRIDE1 + " + 
idyT*#ld;") << std::endl;
-      else
-        stream << A->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + 
gidx*" + to_string(p.mL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
-      break;
-
-    case FETCH_FROM_GLOBAL_CONTIGUOUS:
-      if (A_trans_=='N')
-        stream << A->process("#pointer += (gidx*" + 
to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")" + 
VIENNACL_MUL_STRIDE1 + ";") << std::endl;
-      else
-        stream << A->process("#pointer += (gidx*" + 
to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + 
")*#ld;") << std::endl;
-      break;
-
-    case FETCH_FROM_GLOBAL_STRIDED:
-      if (A_trans_=='N')
-        stream << A->process("#pointer += (gidx*" + 
to_string(p.mL/p.simd_width) + "+ idx" + ")" + VIENNACL_MUL_STRIDE1 + ";") << 
std::endl;
-      else
-        stream << A->process("#pointer += (gidx*" + 
to_string(p.mL/p.simd_width) + "+ idx)*#ld;") << std::endl;
-      break;
-
-    //default: break;
-    }
-
-    switch (p.B_fetching_policy)
-    {
-    case FETCH_FROM_LOCAL:
-      if (B_trans_=='T')
-        stream << B->process("#pointer += (gidy*" + 
to_string(p.nL/p.simd_width) + " + idxT" + ")" + VIENNACL_MUL_STRIDE1 + " + 
idyT*#ld;") << std::endl;
-      else
-        stream << B->process("#pointer += idxT" + VIENNACL_MUL_STRIDE1 + " + 
gidy*" + to_string(p.nL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
-      break;
-
-    case FETCH_FROM_GLOBAL_CONTIGUOUS:
-      if (B_trans_=='T')
-        stream << B->process("#pointer += (gidy*" + 
to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")" + 
VIENNACL_MUL_STRIDE1 + ";") << std::endl;
-      else
-        stream << B->process("#pointer += (gidy*" + 
to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + 
")*#ld;") << std::endl;
-      break;
-
-    case FETCH_FROM_GLOBAL_STRIDED:
-      if (B_trans_=='T')
-        stream << B->process("#pointer += (gidy*" + 
to_string(p.nL/p.simd_width) + "+ idy" + ")" + VIENNACL_MUL_STRIDE1 + ";") << 
std::endl;
-      else
-        stream << B->process("#pointer += (gidy*" + 
to_string(p.nL/p.simd_width) + "+ idy)*#ld;") << std::endl;
-      break;
-
-    //default: break;
-    }
-
-    stream << std::endl;
-    stream << "size_t K_size_t = K;" << std::endl;
-    stream << "for(size_t block_k=0; block_k < K_size_t; block_k+=" << p.kL << 
"){" << std::endl;
-    stream.inc_tab();
-
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      if (A_trans_=='N')
-        stream << A->process("__local #scalartype* plA = lA + idyT*" + 
to_string(p.mL + 1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
-      else
-        stream << A->process("__local #scalartype* plA = lA + idxT*" + 
to_string(p.mL + 1) + " + idyT;") << std::endl;
-    }
-
-
-    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      if (B_trans_=='T')
-        stream  << B->process("__local #scalartype* plB = lB + idyT*" + 
to_string(p.nL+1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
-      else
-        stream << B->process("__local #scalartype* plB = lB + idxT*" + 
to_string(p.nL+1) + "+ idyT;") <<std::endl;
-    }
-
-
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL || 
p.B_fetching_policy==FETCH_FROM_LOCAL)
-      stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
-
-    ///Fetch LHS to Local Memory
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='N')
-      for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
-        for (unsigned int m = 0; m < p.mL; m += p.local_fetch_0*p.simd_width)
-        {
-          string in_bounds = "in_bounds_m_local[" + 
to_string(m/(p.local_fetch_0*p.simd_width)) + "]";
-          string to_load = "#pointer[" + to_string(k) + "*#ld + " + 
to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
-          stream << 
A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", 
"plA + " + to_string(k*(p.mL+1)+m))) << ";" << std::endl;
-        }
-    else if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='T')
-      for (unsigned int k = 0; k < p.mL; k += p.local_fetch_1)
-        for (unsigned int m = 0; m < p.kL; m += p.local_fetch_0*p.simd_width)
-        {
-          string in_bounds = "in_bounds_m_local[" + 
to_string(k/p.local_fetch_1) + "]";
-          string to_load = "#pointer[" + to_string(k) + "*#ld + " + 
to_string(m/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
-          stream << 
A->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", 
"plA + " + to_string(m*(p.mL+1)+k))) << ";" << std::endl;
-        }
-
-    if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='T')
-      for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
-        for (unsigned int n = 0; n < p.nL; n += p.local_fetch_0*p.simd_width)
-        {
-          string in_bounds = "in_bounds_n_local[" + 
to_string(n/(p.local_fetch_0*p.simd_width)) + "]";
-          string to_load = "#pointer[" + to_string(k) + "*#ld + " + 
to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
-          stream << 
B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", 
"plB + " + to_string(k*(p.nL+1)+n))) << ";" << std::endl;
-        }
-    else if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='N')
-      for (unsigned int k = 0; k < p.nL; k += p.local_fetch_1)
-        for (unsigned int n = 0; n < p.kL; n += p.local_fetch_0*p.simd_width)
-        {
-          string in_bounds = "in_bounds_n_local[" + 
to_string(k/p.local_fetch_1) + "]";
-          string to_load = "#pointer[" + to_string(k) + "*#ld + " + 
to_string(n/p.simd_width) + VIENNACL_MUL_STRIDE1 + "]";
-          stream << 
B->process(VIENNACL_VSTORE(VIENNACL_HANDLE_BOUNDS(in_bounds, to_load), "0", 
"plB + " + to_string(n*(p.nL+1)+k))) << ";" << std::endl;
-        }
-
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy == 
FETCH_FROM_LOCAL)
-    {
-      stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
-      stream << "size_t offA = " << p.simd_width << "*idx;" << std::endl;
-      stream << "size_t offB = " << p.simd_width << "*idy;" << std::endl;
-    }
-
-    if (fallback)
-      stream << "for(size_t k = 0; k < " << p.kL << " && (block_k + k < 
K_size_t); k+=" << p.kS << "){" << std::endl;
-    else
-      stream << "for(size_t k = 0; k < " << p.kL << "; k+=" << p.kS << "){" << 
std::endl;
-    stream.inc_tab();
-
-    ///Fetch LHS to registers
-    stream << "#pragma unroll " << p.kS <<  std::endl;
-    stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
-    stream << "#pragma unroll " << p.mS/p.simd_width << std::endl;
-    stream << "for(size_t mm = 0; mm < " << p.mS/p.simd_width << "; mm++)" << 
std::endl;
-    stream << "{" << std::endl;
-    stream.inc_tab();
-    switch (p.A_fetching_policy)
-    {
-    case FETCH_FROM_LOCAL:
-      for (unsigned int ss = 0; ss < p.simd_width; ++ss)
-        stream << "rA[kk][mm*" << p.simd_width << "+" << ss << "] = lA[offA + 
mm*" << p.local_size_0*p.simd_width << "+" << ss << "+ kk*" << (p.mL+1) << "];" 
<< std::endl;
-      break;
-
-    case FETCH_FROM_GLOBAL_CONTIGUOUS:
-    {
-      if (A_trans_=='N')
-        stream << "rA[kk][mm] = " << 
A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm" + 
VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
-      else
-        stream << "rA[kk][mm] = " << 
A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld + kk" + 
VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
-      break;
-    }
-
-    case FETCH_FROM_GLOBAL_STRIDED:
-    {
-      if (A_trans_=='N')
-        stream << "rA[kk][mm] = " << 
A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm*" + 
to_string(p.local_size_0) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
-      else
-        stream << "rA[kk][mm] = " << 
A->process(VIENNACL_HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld*" + 
to_string(p.local_size_0) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << 
std::endl;
-      break;
-    }
-
-    //default: break;
-    }
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-    stream << "#pragma unroll " << p.kS << std::endl;
-    stream << "for(size_t kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
-    stream << "#pragma unroll " << p.nS/p.simd_width << std::endl;
-    stream << "for(size_t nn = 0; nn < " << p.nS/p.simd_width << "; nn++)" << 
std::endl;
-    stream << "{" << std::endl;
-    stream.inc_tab();
-    switch (p.B_fetching_policy)
-    {
-    case FETCH_FROM_LOCAL:
-      for (unsigned int ss = 0; ss < p.simd_width; ++ss)
-        stream << "rB[kk][nn*" << p.simd_width << "+" << ss << "] = lB[offB + 
nn*" << p.local_size_1*p.simd_width << "+" << ss  << "+ kk*" << (p.nL+1) << 
"];" << std::endl;
-      break;
-
-    case FETCH_FROM_GLOBAL_CONTIGUOUS:
-    {
-      if (B_trans_=='T')
-        stream << "rB[kk][nn] = " << 
B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn" + 
VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
-      else
-        stream << "rB[kk][nn] = " << 
B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld + kk" + 
VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
-      break;
-    }
-
-    case FETCH_FROM_GLOBAL_STRIDED:
-    {
-      if (B_trans_=='T')
-        stream << "rB[kk][nn] = " << 
B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn*" + 
to_string(p.local_size_1) + VIENNACL_MUL_STRIDE1 + "]")) << ";" << std::endl;
-      else
-        stream << "rB[kk][nn] = " << 
B->process(VIENNACL_HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld*" + 
to_string(p.local_size_1) + " + kk" + VIENNACL_MUL_STRIDE1 + "]")) << ";" << 
std::endl;
-      break;
-    }
-
-    //default: break;
-    }
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-
-    ///Increment pointers
-    switch (p.A_fetching_policy)
-    {
-    case FETCH_FROM_LOCAL:
-      stream << "offA += " << p.kS*(p.mL+1) << ";" << std::endl;
-      break;
-
-    default:
-      if (A_trans_=='N')
-        stream << A->process("#pointer += " + to_string(p.kS) + "*#ld;") << 
std::endl;
-      else
-        stream << A->process("#pointer += " + to_string(p.kS) + "" + 
VIENNACL_MUL_STRIDE1 + ";") << std::endl;
-      break;
-    }
-
-
-    switch (p.B_fetching_policy)
-    {
-    case FETCH_FROM_LOCAL:
-      stream << "offB += " << p.kS*(p.nL+1) << ";" << std::endl;
-      break;
-
-    default:
-      if (B_trans_=='T')
-        stream << B->process("#pointer += " + to_string(p.kS) + "*#ld;") << 
std::endl;
-      else
-        stream << B->process("#pointer += " + to_string(p.kS) + "" + 
VIENNACL_MUL_STRIDE1 + ";") << std::endl;
-      break;
-    }
-
-
-    stream << "#pragma unroll " << p.kS << std::endl;
-    stream << "for(size_t kk = 0; kk <" << p.kS << "; ++kk)" << std::endl;
-    stream << "{" << std::endl;
-    stream.inc_tab();
-    for (unsigned int nn=0; nn < p.nS; ++nn)
-      for (unsigned int mm=0; mm < p.mS; ++mm)
-      {
-        string res_str, lhs_str, rhs_str;
-        res_str = "rC[" + tools::to_string(mm) + "][" + tools::to_string(nn) + 
"]";
-        if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
-          lhs_str = "rA[kk][" + tools::to_string(mm) + "]";
-        else
-          lhs_str = "rA[kk][" + tools::to_string(mm/p.simd_width) + "].s" + 
tools::to_string(mm%p.simd_width);
-        if (p.B_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
-          rhs_str = "rB[kk]["+tools::to_string(nn)+"]";
-        else
-          rhs_str = 
"rB[kk]["+tools::to_string(nn/p.simd_width)+"].s"+tools::to_string(nn%p.simd_width);
-        stream << res_str << "=" << "fma(" << lhs_str << "," << rhs_str << "," 
<< res_str << ");" << std::endl;
-      }
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-
-
-
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-    //Increment global pointer if local memory is used
-    //Else, it's incremented directly when fetching
-    if (p.A_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      if (A_trans_=='N')
-        stream << A->process("#pointer += " + to_string(p.kL) + "*#ld;") << 
std::endl;
-      else
-        stream << A->process("#pointer += " + to_string(p.kL) + "" + 
VIENNACL_MUL_STRIDE1 + ";") << std::endl;
-    }
-
-    if (p.B_fetching_policy==FETCH_FROM_LOCAL)
-    {
-      if (B_trans_=='T')
-        stream << B->process("#pointer += " + to_string(p.kL) + "*#ld;") << 
std::endl;
-      else
-        stream << B->process("#pointer += " + to_string(p.kL) + "" + 
VIENNACL_MUL_STRIDE1 + ";") << std::endl;
-    }
-
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-
-    if (C->row_major())
-    {
-      unsigned int ministartstride0 = 
p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
-      unsigned int ministartstride1 = 
p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
-
-      stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#ld;") << 
std::endl;
-      stream << C->process("#pointer += idx*" + to_string(ministartstride0) + 
"*#ld;") << std::endl;
-      stream << C->process("#pointer += gidy*" + to_string(p.nL) + 
"*#stride2;") << std::endl;
-      stream << C->process("#pointer += idy*" + to_string(ministartstride1) + 
"*#stride2;") << std::endl;
-
-      for (unsigned int n=0; n < p.nS; ++n)
-      {
-        for (unsigned int m=0; m < p.mS; ++m)
-        {
-          unsigned int ministride1 = 
p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_0;
-          string Cj = to_string((m/p.simd_width)*(ministride1*p.simd_width) + 
m%p.simd_width);
-          if (fallback)
-          {
-            stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" 
+ to_string(n) + "])" << std::endl;
-            stream.inc_tab();
-          }
-          stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) 
+ "][" + to_string(n) + "]*" + alpha->name() + "+ #pointer[" + Cj + "*#ld]*" + 
beta->name() + ";") << std::endl;
-          if (fallback)
-            stream.dec_tab();
-        }
-        if ((n+1)%p.simd_width>0 || 
p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
-          stream << C->process("#pointer += #stride2;") << std::endl;
-        else
-          stream << C->process("#pointer += " + 
to_string((p.local_size_1*p.simd_width) - (p.simd_width-1)) + "*#stride2;") << 
std::endl;
-      }
-
-    }
-    else
-    {
-      unsigned int ministartstride0 = 
p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
-      unsigned int ministartstride1 = 
p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
-
-      stream << C->process("#pointer += gidx*" + to_string(p.mL) + 
"*#stride1;") << std::endl;
-      stream << C->process("#pointer += idx*" + to_string(ministartstride0) + 
"*#stride1;") << std::endl;
-      stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#ld;") << 
std::endl;
-      stream << C->process("#pointer += idy*" + to_string(ministartstride1) + 
"*#ld;") << std::endl;
-
-      for (unsigned int m=0; m < p.mS; ++m)
-      {
-        for (unsigned int n=0; n < p.nS; ++n)
-        {
-          unsigned int ministride1 = 
p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_1;
-          string Cj = to_string((n/p.simd_width)*(ministride1*p.simd_width) + 
n%p.simd_width);
-          if (fallback)
-          {
-            stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" 
+ to_string(n) + "])" << std::endl;
-            stream.inc_tab();
-          }
-          stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) 
+ "][" + to_string(n) + "]*" + alpha->name() + " + #pointer[" + Cj + "*#ld]*" + 
beta->name() + ";") << std::endl;
-          if (fallback)
-            stream.dec_tab();
-        }
-
-        if ((m+1)%p.simd_width>0 || 
p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
-          stream << C->process("#pointer += #stride1;") << std::endl;
-        else
-          stream << C->process("#pointer += " + 
to_string((p.local_size_0*p.simd_width) - (p.simd_width-1)) + "*#stride1;") << 
std::endl;
-      }
-    }
-
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-    return stream.str();
-
-#undef VIENNACL_MUL_STRIDE1
-#undef VIENNACL_HANDLE_BOUNDS
-#undef VIENNACL_VSTORE
-  }
-
-  std::vector<std::string> generate_impl(std::string const & kernel_prefix, 
statements_container const & statements, std::vector<mapping_type> const & 
mappings) const
-  {
-    std::vector<std::string> res;
-    res.push_back(generate_impl(kernel_prefix, statements, mappings, false));
-    res.push_back(generate_impl(kernel_prefix, statements, mappings, true));
-    return res;
-  }
-
-  template<class NumericT>
-  void enqueue_block(scheduler::statement & statement,
-                     scheduler::lhs_rhs_element& eA, 
scheduler::lhs_rhs_element& eB, scheduler::lhs_rhs_element& eC, 
scheduler::lhs_rhs_element& ebeta,
-                     matrix_base<NumericT> const & A, matrix_base<NumericT> 
const & B, matrix_base<NumericT> const & C, NumericT beta,
-                     std::vector<lazy_program_compiler> & programs, 
std::string const & kernel_prefix, vcl_size_t id)
-  {
-    if (A.size1()==0 || A.size2()==0 || B.size1()==0 || B.size2()==0 || 
C.size1()==0 || C.size2()==0)
-      return;
-
-    viennacl::ocl::kernel& kernel = 
programs[id].program().get_kernel(kernel_prefix);
-
-    kernel.local_work_size(0, p_.local_size_0);
-    kernel.local_work_size(1, p_.local_size_1);
-
-    scheduler::statement::assign_element(eA, A);
-    scheduler::statement::assign_element(eB, B);
-    scheduler::statement::assign_element(eC, C);
-    scheduler::statement::assign_element(ebeta, beta);
-
-    if (id==1)
-    {
-      kernel.global_work_size(0, 
tools::align_to_multiple(tools::align_to_multiple((unsigned 
int)C.size1(),p_.mS)/p_.mS, p_.local_size_0));
-      kernel.global_work_size(1, 
tools::align_to_multiple(tools::align_to_multiple((unsigned 
int)C.size2(),p_.nS)/p_.nS, p_.local_size_1));
-    }
-    else
-    {
-      kernel.global_work_size(0, C.size1()/p_.mS);
-      kernel.global_work_size(1, C.size2()/p_.nS);
-    }
-    unsigned int current_arg = 0;
-    kernel.arg(current_arg++, cl_uint(C.size1()));
-    kernel.arg(current_arg++, cl_uint(C.size2()));
-    if (A.row_major())
-      kernel.arg(current_arg++, cl_uint(A_trans_=='T'?A.size2():A.size1()));
-    else
-      kernel.arg(current_arg++, cl_uint(A_trans_=='N'?A.size2():A.size1()));
-    set_arguments(statement, kernel, current_arg);
-    viennacl::ocl::enqueue(kernel);
-
-  }
-
-  template<class NumericT>
-  matrix_slice< viennacl::matrix_base<NumericT> >  
create_slice(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr, 
scheduler::lhs_rhs_element const & element,
-                                                                          
vcl_size_t s0_0, vcl_size_t s0_1, vcl_size_t s1_0, vcl_size_t s1_1, bool swap)
-  {
-    matrix_base<NumericT> & M = *(element.*ptr);
-    slice s0(s0_0, 1, s0_1 - s0_0);
-    slice s1(s1_0, 1, s1_1 - s1_0);
-    if (swap)
-      std::swap(s0, s1);
-    return matrix_slice<viennacl::matrix_base<NumericT> >(M, s0, s1);
-  }
-
-  template<class NumericT>
-  void enqueue_impl(viennacl::matrix_base<NumericT>* 
scheduler::lhs_rhs_element::*ptr_matrix,
-                    scheduler::statement & statement, 
scheduler::lhs_rhs_element & A, scheduler::lhs_rhs_element & B, 
scheduler::lhs_rhs_element & C, scheduler::lhs_rhs_element & beta,
-                    NumericT beta_value, std::vector<lazy_program_compiler> & 
programs, std::string const & kernel_prefix)
-  {
-    using namespace device_specific::utils;
-    vcl_size_t ldstrideA = call_on_matrix(A, leading_stride());
-    vcl_size_t ldstrideB = call_on_matrix(B, leading_stride());
-    vcl_size_t ldstrideC = call_on_matrix(C, leading_stride());
-    vcl_size_t ldstartA = call_on_matrix(A, leading_start());
-    vcl_size_t ldstartB = call_on_matrix(B, leading_start());
-    bool swap_A = ((A_trans_=='T') ^ utils::call_on_matrix(A, 
row_major_fun()));
-    bool swap_B = ((B_trans_=='T') ^ utils::call_on_matrix(B, 
row_major_fun()));
-
-    vcl_size_t M = call_on_matrix(C, size1_fun());
-    vcl_size_t N = call_on_matrix(C, size2_fun());
-    vcl_size_t K;
-    if (utils::call_on_matrix(A, row_major_fun()))
-      K = A_trans_=='T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, 
size1_fun());
-    else
-      K = A_trans_=='N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, 
size1_fun());
-
-    if (M < p_.mL || N < p_.nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 
|| ldstrideC > 1 ||
-        (p_.simd_width>1 && (ldstartA % p_.simd_width > 0 || ldstartB % 
p_.simd_width > 0)))
-    {
-      enqueue_block(statement, A, B, C, beta, create_slice(ptr_matrix, A, 0, 
M, 0, K, swap_A),
-                    create_slice(ptr_matrix, B, 0, K, 0, N,  swap_B),
-                    create_slice(ptr_matrix, C, 0, M, 0, N, false), 
beta_value, programs, kernel_prefix, 1);
-      return;
-    }
-
-
-    scheduler::lhs_rhs_element Acopy = A;
-    scheduler::lhs_rhs_element Bcopy = B;
-    scheduler::lhs_rhs_element Ccopy = C;
-
-    vcl_size_t lM = M / p_.mL * p_.mL;
-    vcl_size_t lN = N / p_.nL * p_.nL;
-    vcl_size_t lK = K / p_.kL * p_.kL;
-
-
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 
0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), 
beta_value, programs, kernel_prefix, 0);
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 
0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), 
(NumericT)1, programs, kernel_prefix, 1);
-
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 
lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), 
beta_value, programs, kernel_prefix, 1);
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 
lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), 
(NumericT)1, programs, kernel_prefix, 1);
-
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 
0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), 
beta_value, programs, kernel_prefix, 1);
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 
0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), 
(NumericT)1, programs, kernel_prefix, 1);
-
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 
lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), 
beta_value, programs, kernel_prefix, 1);
-    enqueue_block(statement, A, B, C, beta, create_slice<NumericT>(ptr_matrix, 
Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 
lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), 
(NumericT)1, programs, kernel_prefix, 1);
-  }
-
-public:
-  matrix_product_template(matrix_product_template::parameters_type const & 
parameters, char A_trans, char B_trans) : 
template_base_impl<matrix_product_template, 
matrix_product_parameters>(parameters, BIND_ALL_UNIQUE), A_trans_(A_trans), 
B_trans_(B_trans){ }
-
-  virtual void enqueue(std::string const & kernel_prefix, 
std::vector<lazy_program_compiler> & programs, statements_container const & 
statements)
-  {
-    using namespace device_specific::utils;
-    using namespace tree_parsing;
-
-    scheduler::statement const & st = statements.data().front();
-    bool A_trans, B_trans;
-    vcl_size_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0;
-    leaf_t C_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, 
alpha_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
-    parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, 
B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
-
-    scheduler::statement stcopy = st;
-    scheduler::lhs_rhs_element& A = utils::lhs_rhs_element(stcopy, A_idx, 
A_leaf);
-    scheduler::lhs_rhs_element& B = utils::lhs_rhs_element(stcopy, B_idx, 
B_leaf);
-    scheduler::lhs_rhs_element& C = utils::lhs_rhs_element(stcopy, C_idx, 
C_leaf);
-    scheduler::lhs_rhs_element& beta = utils::lhs_rhs_element(stcopy, 
beta_idx, beta_leaf);
-
-
-
-
-
-
-    if (C.numeric_type==scheduler::FLOAT_TYPE)
-      enqueue_impl<float>(&scheduler::lhs_rhs_element::matrix_float, stcopy, 
A, B, C, beta, beta.host_float, programs, kernel_prefix);
-    else if (C.numeric_type==scheduler::DOUBLE_TYPE)
-      enqueue_impl<double>(&scheduler::lhs_rhs_element::matrix_double, stcopy, 
A, B, C, beta, beta.host_double, programs, kernel_prefix);
-    else
-      throw generator_not_supported_exception("GEMM only supported for 
float/double");
-
-  }
-
-private:
-  const char A_trans_;
-  const char B_trans_;
-};
-
-}
-
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
deleted file mode 100644
index 40e3168..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/device_specific/templates/template_base.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
-#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_TEMPLATE_BASE_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/device_specific/templates/template_base.hpp
- *
- * Base classes for the profiles
-*/
-
-#include <list>
-#include <set>
-
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/ocl/device.hpp"
-#include "viennacl/ocl/device_utils.hpp"
-
-#include "viennacl/scheduler/forwards.h"
-#include "viennacl/scheduler/io.hpp"
-
-#include "viennacl/device_specific/lazy_program_compiler.hpp"
-#include "viennacl/device_specific/mapped_objects.hpp"
-#include "viennacl/device_specific/tree_parsing.hpp"
-#include "viennacl/device_specific/utils.hpp"
-
-namespace viennacl
-{
-namespace device_specific
-{
-
-enum fetching_policy_type
-{
-  FETCH_FROM_LOCAL,
-  FETCH_FROM_GLOBAL_STRIDED,
-  FETCH_FROM_GLOBAL_CONTIGUOUS
-};
-
-class template_base
-{
-public:
-  struct parameters_type
-  {
-    parameters_type(unsigned int _simd_width, unsigned int _local_size_1, 
unsigned int _local_size_2, unsigned int _num_kernels) : 
simd_width(_simd_width), local_size_0(_local_size_1), 
local_size_1(_local_size_2), num_kernels(_num_kernels){ }
-
-    unsigned int simd_width;
-    unsigned int local_size_0;
-    unsigned int local_size_1;
-    unsigned int num_kernels;
-  };
-
-private:
-  /** @brief Functor to map the statements to the types defined in 
mapped_objects.hpp */
-  class map_functor : public tree_parsing::traversal_functor
-  {
-
-    scheduler::statement_node_numeric_type numeric_type(scheduler::statement 
const * statement, vcl_size_t root_idx) const
-    {
-      scheduler::statement_node const * root_node = 
&statement->array()[root_idx];
-      while (root_node->lhs.numeric_type==scheduler::INVALID_NUMERIC_TYPE)
-        root_node = &statement->array()[root_node->lhs.node_index];
-      return root_node->lhs.numeric_type;
-    }
-
-  public:
-    typedef tools::shared_ptr<mapped_object> result_type;
-
-    map_functor(symbolic_binder & binder, mapping_type & mapping) : 
binder_(binder), mapping_(mapping){ }
-
-    /** @brief Binary leaf */
-    template<class T>
-    result_type binary_leaf(scheduler::statement const * statement, vcl_size_t 
root_idx, mapping_type const * mapping) const
-    {
-      return result_type(new 
T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), 
binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx)));
-    }
-
-    template<class NumericT>
-    result_type operator()(NumericT const & /*scalar*/) const
-    {
-      return result_type(new 
mapped_host_scalar(utils::type_to_string<NumericT>::value(), 
binder_.get(NULL)));
-    }
-
-    /** @brief Scalar mapping */
-    template<class NumericT>
-    result_type operator()(scalar<NumericT> const & scal) const
-    {
-      return result_type(new 
mapped_scalar(utils::type_to_string<NumericT>::value(), 
binder_.get(&viennacl::traits::handle(scal))));
-    }
-
-    /** @brief Vector mapping */
-    template<class NumericT>
-    result_type operator()(vector_base<NumericT> const & vec) const
-    {
-      return result_type(new 
mapped_vector(utils::type_to_string<NumericT>::value(), 
binder_.get(&viennacl::traits::handle(vec))));
-    }
-
-    /** @brief Implicit vector mapping */
-    template<class NumericT>
-    result_type operator()(implicit_vector_base<NumericT> const & /*vec*/) 
const
-    {
-      return result_type(new 
mapped_implicit_vector(utils::type_to_string<NumericT>::value(), 
binder_.get(NULL)));
-    }
-
-    /** @brief Matrix mapping */
-    template<class NumericT>
-    result_type operator()(matrix_base<NumericT> const & mat) const
-    {
-      return result_type(new 
mapped_matrix(utils::type_to_string<NumericT>::value(), 
binder_.get(&viennacl::traits::handle(mat)),
-                                           viennacl::traits::row_major(mat)));
-    }
-
-    /** @brief Implicit matrix mapping */
-    template<class NumericT>
-    result_type operator()(implicit_matrix_base<NumericT> const & /*mat*/) 
const
-    {
-      return result_type(new 
mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), 
binder_.get(NULL)));
-    }
-
-    /** @brief Traversal functor */
-    void operator()(scheduler::statement const & statement, vcl_size_t 
root_idx, leaf_t leaf_t) const {
-      mapping_type::key_type key(root_idx, leaf_t);
-      scheduler::statement_node const & root_node = 
statement.array()[root_idx];
-
-      if (leaf_t == LHS_NODE_TYPE && root_node.lhs.type_family != 
scheduler::COMPOSITE_OPERATION_FAMILY)
-        mapping_.insert(mapping_type::value_type(key, 
utils::call_on_element(root_node.lhs, *this)));
-      else if (leaf_t == RHS_NODE_TYPE && root_node.rhs.type_family != 
scheduler::COMPOSITE_OPERATION_FAMILY)
-        mapping_.insert(mapping_type::value_type(key,  
utils::call_on_element(root_node.rhs, *this)));
-      else if ( leaf_t== PARENT_NODE_TYPE)
-      {
-        if (root_node.op.type==scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE)
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_)));
-        else if 
(root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE)
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_)));
-        else if 
(root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE)
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_)));
-        else if 
(root_node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE)
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_)));
-        else if (is_scalar_reduction(root_node))
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_)));
-        else if (is_vector_reduction(root_node))
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_)));
-        else if (root_node.op.type == 
scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_)));
-        else if (root_node.op.type == scheduler::OPERATION_UNARY_TRANS_TYPE)
-          mapping_.insert(mapping_type::value_type(key, 
binary_leaf<mapped_trans>(&statement, root_idx, &mapping_)));
-      }
-    }
-
-  private:
-    symbolic_binder & binder_;
-    mapping_type & mapping_;
-  };
-
-  /** @brief functor for generating the prototype of a statement */
-  class prototype_generation_traversal : public tree_parsing::traversal_functor
-  {
-  private:
-    std::set<std::string> & already_generated_;
-    std::string & str_;
-    mapping_type const & mapping_;
-    std::map<std::string, unsigned int> const & widths_;
-  public:
-    prototype_generation_traversal(std::set<std::string> & already_generated, 
std::string & str, mapping_type const & mapping, std::map<std::string, unsigned 
int> const & widths) :
-      already_generated_(already_generated), str_(str),  mapping_(mapping), 
widths_(widths){ }
-
-    void operator()(scheduler::statement const & statement, vcl_size_t 
root_idx, leaf_t leaf) const
-    {
-      scheduler::statement_node const & root_node = 
statement.array()[root_idx];
-      if ( (leaf==LHS_NODE_TYPE && 
root_node.lhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY)
-           ||(leaf==RHS_NODE_TYPE && 
root_node.rhs.type_family!=scheduler::COMPOSITE_OPERATION_FAMILY) )
-      {
-        mapped_object * obj = at(mapping_, 
std::make_pair(root_idx,leaf)).get();
-        if(widths_.find(obj->name())!=widths_.end())
-          obj->append_kernel_arguments(already_generated_, str_, at(widths_, 
obj->name()));
-        else
-          obj->append_kernel_arguments(already_generated_, str_, 1);
-      }
-    }
-  };
-
-
-
-  /** @brief functor for setting the arguments of a kernel */
-  class set_arguments_functor : public tree_parsing::traversal_functor
-  {
-  public:
-    typedef void result_type;
-
-    set_arguments_functor(symbolic_binder & binder, unsigned int & 
current_arg, viennacl::ocl::kernel & kernel) : binder_(binder), 
current_arg_(current_arg), kernel_(kernel){ }
-
-    template<class NumericT>
-    result_type operator()(NumericT const & scal) const {
-      typedef typename viennacl::result_of::cl_type<NumericT>::type 
cl_scalartype;
-      kernel_.arg(current_arg_++, cl_scalartype(scal));
-    }
-
-    /** @brief Scalar mapping */
-    template<class NumericT>
-    result_type operator()(scalar<NumericT> const & scal) const {
-      if (binder_.bind(&viennacl::traits::handle(scal)))
-        kernel_.arg(current_arg_++, scal.handle().opencl_handle());
-    }
-
-    /** @brief Vector mapping */
-    template<class NumericT>
-    result_type operator()(vector_base<NumericT> const & vec) const {
-      if (binder_.bind(&viennacl::traits::handle(vec)))
-      {
-        kernel_.arg(current_arg_++, vec.handle().opencl_handle());
-        kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec)));
-        kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec)));
-      }
-    }
-
-    /** @brief Implicit vector mapping */
-    template<class NumericT>
-    result_type operator()(implicit_vector_base<NumericT> const & vec) const
-    {
-      typedef typename viennacl::result_of::cl_type<NumericT>::type 
cl_scalartype;
-      kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
-      if (vec.has_index())
-        kernel_.arg(current_arg_++, cl_uint(vec.index()));
-    }
-
-    /** @brief Matrix mapping */
-    template<class NumericT>
-    result_type operator()(matrix_base<NumericT> const & mat) const
-    {
-      if (binder_.bind(&viennacl::traits::handle(mat)))
-      {
-        kernel_.arg(current_arg_++, mat.handle().opencl_handle());
-        kernel_.arg(current_arg_++, cl_uint(viennacl::traits::ld(mat)));
-        if (mat.row_major())
-        {
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
-        }
-        else
-        {
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
-          kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
-        }
-      }
-    }
-
-    /** @brief Implicit matrix mapping */
-    template<class NumericT>
-    result_type operator()(implicit_matrix_base<NumericT> const & mat) const
-    {
-      kernel_.arg(current_arg_++, typename 
viennacl::result_of::cl_type<NumericT>::type(mat.value()));
-    }
-
-    /** @brief Traversal functor: */
-    void operator()(scheduler::statement const & statement, vcl_size_t 
root_idx, leaf_t leaf_t) const
-    {
-      scheduler::statement_node const & root_node = 
statement.array()[root_idx];
-      if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != 
scheduler::COMPOSITE_OPERATION_FAMILY)
-        utils::call_on_element(root_node.lhs, *this);
-      else if (leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != 
scheduler::COMPOSITE_OPERATION_FAMILY)
-        utils::call_on_element(root_node.rhs, *this);
-    }
-
-  private:
-    symbolic_binder & binder_;
-    unsigned int & current_arg_;
-    viennacl::ocl::kernel & kernel_;
-  };
-
-protected:
-
-  static void generate_prototype(utils::kernel_generation_stream & stream, 
std::string const & name, std::string const & first_arguments, 
std::vector<mapping_type> const & mappings, statements_container const 
&statements,
-                                 std::map<std::string, unsigned int> const & 
widths)
-  {
-    statements_container::data_type::const_iterator sit;
-    std::vector<mapping_type>::const_iterator mit;
-    std::set<std::string> already_generated;
-
-    std::string arguments = first_arguments;
-    for (mit = mappings.begin(), sit = statements.data().begin(); sit != 
statements.data().end(); ++sit, ++mit)
-      tree_parsing::traverse(*sit, sit->root(), 
prototype_generation_traversal(already_generated, arguments, *mit, widths), 
true);
-    arguments.erase(arguments.size()-1); //Last comma pruned
-    stream << "__kernel " << "void " << name << "(" << arguments << ")" << 
std::endl;
-  }
-
-  static void generate_prototype(utils::kernel_generation_stream & stream, 
std::string const & name, std::string const & first_arguments, 
std::vector<mapping_type> const & mappings, statements_container const & 
statements)
-  {
-    generate_prototype(stream, name, first_arguments, mappings, statements, 
std::map<std::string, unsigned int>());
-  }
-
-  void set_arguments(statements_container const & statements, 
viennacl::ocl::kernel & kernel, unsigned int & current_arg)
-  {
-    tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
-    for (statements_container::data_type::const_iterator itt = 
statements.data().begin(); itt != statements.data().end(); ++itt)
-      tree_parsing::traverse(*itt, itt->root(), 
set_arguments_functor(*binder,current_arg,kernel), true);
-  }
-
-  class invalid_template_exception : public std::exception
-  {
-  public:
-    invalid_template_exception() : message_() {}
-    invalid_template_exception(std::string message) :
-      message_("ViennaCL: Internal error: The generator cannot apply the given 
template to the given statement: " + message + "\n"
-               "If you are using a builtin template, please report on 
[email protected]! We will provide a fix as soon as 
possible\n"
-               "If you are using your own template, please try using other 
parameters") {}
-    virtual const char* what() const throw() { return message_.c_str(); }
-    virtual ~invalid_template_exception() throw() {}
-  private:
-    std::string message_;
-  };
-
-  static void fetching_loop_info(fetching_policy_type policy, std::string 
const & bound, utils::kernel_generation_stream & stream, std::string & init, 
std::string & upper_bound, std::string & inc, std::string const & domain_id, 
std::string const & domain_size)
-  {
-    if (policy==FETCH_FROM_GLOBAL_STRIDED)
-    {
-      init = domain_id;
-      upper_bound = bound;
-      inc = domain_size;
-    }
-    else if (policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
-    {
-      std::string chunk_size = "chunk_size";
-      std::string chunk_start = "chunk_start";
-      std::string chunk_end = "chunk_end";
-
-      stream << "unsigned int " << chunk_size << " = (" << bound << "+" << 
domain_size << "-1)/" << domain_size << ";" << std::endl;
-      stream << "unsigned int " << chunk_start << " =" << domain_id << "*" << 
chunk_size << ";" << std::endl;
-      stream << "unsigned int " << chunk_end << " = min(" << chunk_start << 
"+" << chunk_size << ", " << bound << ");" << std::endl;
-      init = chunk_start;
-      upper_bound = chunk_end;
-      inc = "1";
-    }
-  }
-
-  static bool is_node_trans(scheduler::statement::container_type const & 
array, vcl_size_t root_idx, leaf_t leaf_type)
-  {
-    bool res = false;
-    scheduler::lhs_rhs_element scheduler::statement_node::*ptr;
-    if (leaf_type==LHS_NODE_TYPE)
-      ptr = &scheduler::statement_node::lhs;
-    else
-      ptr = &scheduler::statement_node::rhs;
-    scheduler::statement_node const * node = &array[root_idx];
-    while ((node->*ptr).type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
-    {
-      if 
(array[(node->*ptr).node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
-        res = !res;
-      node = &array[(node->*ptr).node_index];
-    }
-    return res;
-  }
-
-protected:
-
-  static std::string append_simd_suffix(std::string const & str, unsigned int 
i)
-  {
-    assert(i < 16);
-    static char suffixes[] = {'0','1','2','3','4','5','6','7','8','9',
-                             'a','b','c','d','e','f'};
-    return str + tools::to_string(suffixes[i]);
-  }
-
-  static bool is_striding_operator(scheduler::statement_node const & node)
-  {
-    return node.op.type==scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE
-            || node.op.type==scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE
-            || node.op.type==scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE;
-  }
-
-  static bool has_strided_access(statements_container const & statements)
-  {
-    for (statements_container::data_type::const_iterator it = 
statements.data().begin(); it != statements.data().end(); ++it)
-    {
-      //checks for vectors
-      std::vector<scheduler::lhs_rhs_element> vectors;
-      tree_parsing::traverse(*it, it->root(), 
tree_parsing::filter_elements(scheduler::DENSE_VECTOR_TYPE, vectors), true);
-      for (std::vector<scheduler::lhs_rhs_element>::iterator itt = 
vectors.begin(); itt != vectors.end(); ++itt)
-        if (utils::call_on_vector(*itt, utils::stride_fun())>1)
-          return true;
-
-      //checks for matrix
-      std::vector<scheduler::lhs_rhs_element> matrices;
-      tree_parsing::traverse(*it, it->root(), 
tree_parsing::filter_elements(scheduler::DENSE_MATRIX_TYPE, matrices), true);
-      for (std::vector<scheduler::lhs_rhs_element>::iterator itt = 
matrices.begin(); itt != matrices.end(); ++itt)
-        if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || 
utils::call_on_matrix(*itt, utils::stride2_fun())>2)
-          return true;
-
-      std::vector<vcl_size_t> striding_operators;
-      tree_parsing::traverse(*it, it->root(), 
tree_parsing::filter(&is_striding_operator, striding_operators), false);
-      if(striding_operators.size() > 0)
-          return true;
-    }
-    return false;
-  }
-
-  static vcl_size_t vector_size(scheduler::statement_node const & node, bool 
up_to_internal_size)
-  {
-    using namespace scheduler;
-    using namespace utils;
-    if (node.op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE)
-    {
-      vcl_size_t size1 = up_to_internal_size?call_on_matrix(node.lhs, 
internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
-      vcl_size_t size2 = up_to_internal_size?call_on_matrix(node.lhs, 
internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
-      return std::min<vcl_size_t>(size1, size2);
-    }
-    else if (node.op.type==OPERATION_BINARY_MATRIX_ROW_TYPE)
-      return up_to_internal_size?call_on_matrix(node.lhs, 
internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
-    else if (node.op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE)
-      return up_to_internal_size?call_on_matrix(node.lhs, 
internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
-    else
-      return up_to_internal_size?call_on_vector(node.lhs, 
internal_size_fun()):call_on_vector(node.lhs, size_fun());
-  }
-
-  //NB : templates are not used here because declaring a functor out of the 
generate() functions would be harder to read
-  struct loop_body_base
-  {
-    virtual void operator()(utils::kernel_generation_stream & stream, unsigned 
int simd_width) const = 0;
-    virtual ~loop_body_base() {}
-  };
-
-  static void element_wise_loop_1D(utils::kernel_generation_stream & stream, 
loop_body_base const & loop_body,
-                                   fetching_policy_type fetch, unsigned int 
simd_width, std::string const & i, std::string const & bound, std::string const 
& domain_id, std::string const & domain_size)
-  {
-    std::string strwidth = tools::to_string(simd_width);
-    std::string boundround = bound + "/" + strwidth;
-
-    std::string init, upper_bound, inc;
-    fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, 
domain_id, domain_size);
-    stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " 
<< upper_bound << "; " << i << " += " << inc << ")" << std::endl;
-    stream << "{" << std::endl;
-    stream.inc_tab();
-    loop_body(stream, simd_width);
-    stream.dec_tab();
-    stream << "}" << std::endl;
-
-    if (simd_width>1)
-    {
-      stream << "for(unsigned int " << i << " = " << boundround << "*" << 
strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " 
+= " + domain_size + ")" << std::endl;
-      stream << "{" << std::endl;
-      stream.inc_tab();
-      loop_body(stream, 1);
-      stream.dec_tab();
-      stream << "}" << std::endl;
-    }
-  }
-
-  static std::string vstore(unsigned int simd_width, std::string const & 
value, std::string const & offset, std::string const & ptr)
-  {
-    if (simd_width==1)
-      return "(" + ptr + ")[" + offset + "] = " + value;
-    else
-      return utils::append_width("vstore", simd_width) + "(" + value + ", " + 
offset + ", " + ptr + ")";
-  }
-
-  static std::string vload(unsigned int simd_width, std::string const & 
offset, std::string const & ptr)
-  {
-    if (simd_width==1)
-      return "(" + ptr + ")[" + offset + "]";
-    else
-      return utils::append_width("vload", simd_width) + "(" + offset + ", " + 
ptr + ")";
-  }
-
-private:
-  /** @brief Generates the body of the associated kernel function */
-  virtual std::vector<std::string> generate_impl(std::string const & 
kernel_prefix, statements_container const & statements, 
std::vector<mapping_type> const & mapping) const = 0;
-
-public:
-  template_base(binding_policy_t binding_policy) : 
binding_policy_(binding_policy) {}
-
-  virtual ~template_base(){ }
-
-  std::vector<std::string> generate(std::string const & kernel_prefix, 
statements_container const & statements, viennacl::ocl::device const & device)
-  {
-    statements_container::data_type::const_iterator sit;
-    std::vector<mapping_type>::iterator mit;
-
-    if(int err = check_invalid(statements, device))
-      throw generator_not_supported_exception("The supplied parameters for 
this template are invalid : err " + tools::to_string(err));
-
-    //Create mapping
-    std::vector<mapping_type> mappings(statements.data().size());
-    tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
-    for (mit = mappings.begin(), sit = statements.data().begin(); sit != 
statements.data().end(); ++sit, ++mit)
-      tree_parsing::traverse(*sit, sit->root(), map_functor(*binder,*mit), 
true);
-
-    return generate_impl(kernel_prefix, statements, mappings);
-  }
-
-  /** @brief returns whether or not the profile has undefined behavior on 
particular device */
-  virtual int check_invalid(statements_container const & statements, 
viennacl::ocl::device const & device) const = 0;
-
-  virtual void enqueue(std::string const & kernel_prefix, 
std::vector<lazy_program_compiler> & programs, statements_container const & 
statements) = 0;
-
-  virtual tools::shared_ptr<template_base> clone() const = 0;
-private:
-  binding_policy_t binding_policy_;
-};
-
-
-template<class TemplateType, class ParametersType>
-class template_base_impl : public template_base
-{
-private:
-  virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const 
{ return TEMPLATE_VALID; }
-
-  virtual unsigned int n_lmem_elements() const { return 0; }
-
-public:
-  typedef ParametersType parameters_type;
-
-  /** @brief The constructor */
-  template_base_impl(parameters_type const & parameters, binding_policy_t 
binding_policy) : template_base(binding_policy), p_(parameters){ }
-
-  parameters_type const & parameters() const
-  {
-    return p_;
-  }
-
-  tools::shared_ptr<template_base> clone() const
-  {
-    return tools::shared_ptr<template_base>(new 
TemplateType(*dynamic_cast<TemplateType const *>(this)));
-  }
-
-  /** @brief returns whether or not the profile has undefined behavior on 
particular device */
-  int check_invalid(statements_container const & statements, 
viennacl::ocl::device const & device) const
-  {
-    using namespace viennacl::tools;
-
-    scheduler::statement const & statement = statements.data().front();
-    unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), 
statement.root()).lhs.numeric_type);
-
-    //Query device informations
-    vcl_size_t lmem_available = 
static_cast<vcl_size_t>(device.local_mem_size());
-    vcl_size_t lmem_usage = scalartype_size*n_lmem_elements();
-    if (lmem_usage>lmem_available)
-      return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
-
-    //Invalid work group size
-    vcl_size_t max_workgroup_size = device.max_work_group_size();
-    std::vector<vcl_size_t> max_work_item_sizes = device.max_work_item_sizes();
-    if (p_.local_size_0*p_.local_size_1 > max_workgroup_size)
-      return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
-    if (p_.local_size_0 > max_work_item_sizes[0])
-      return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
-
-    if (p_.local_size_1 > max_work_item_sizes[1])
-      return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
-
-    //Advice from the Intel guide
-    unsigned int warp_size = 8;
-    if (device.type()==CL_DEVICE_TYPE_GPU)
-    {
-      //Advice from the nvidia guide
-      warp_size = 32;
-      //Advice from the AMD guide
-      if (device.vendor_id()==4098)
-        warp_size = 64;
-    }
-    if (((p_.local_size_0*p_.local_size_1)%warp_size)>0)
-      return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE;
-
-    //Invalid SIMD Width
-    if (p_.simd_width!=1 && p_.simd_width!=2 &&
-        p_.simd_width!=4 && p_.simd_width!=8 &&
-        p_.simd_width!=16)
-      return TEMPLATE_INVALID_SIMD_WIDTH;
-
-    return check_invalid_impl(device);
-  }
-
-protected:
-  parameters_type p_;
-};
-
-}
-}
-
-#endif

Reply via email to