http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp new file mode 100644 index 0000000..0b93bb8 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/svd.hpp @@ -0,0 +1,703 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP +#define VIENNACL_LINALG_OPENCL_KERNELS_SVD_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +/** @file viennacl/linalg/opencl/kernels/svd.hpp + * @brief OpenCL kernel file for singular value decomposition */ +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + +template <typename StringType> +void generate_svd_bidiag_pack(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + source.append("__kernel void bidiag_pack(__global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* D, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* S, \n"); + source.append(" uint size1, \n"); + source.append(" uint size2, \n"); + source.append(" uint stride \n"); + source.append(") { \n"); + source.append(" uint size = min(size1, size2); \n"); + + source.append(" if(get_global_id(0) == 0) \n"); + source.append(" S[0] = 0; \n"); + if(is_row_major) + { + source.append(" for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n"); + source.append(" D[i] = A[i*stride + i]; \n"); + source.append(" S[i + 1] = (i + 1 < size2) ? A[i*stride + (i + 1)] : 0; \n"); + } + else + { + source.append(" for(uint i = get_global_id(0); i < size ; i += get_global_size(0)) { \n"); + source.append(" D[i] = A[i*stride + i]; \n"); + source.append(" S[i + 1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0; \n"); + } + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_svd_col_reduce_lcl_array(StringT & source, std::string const & numeric_string) +{ + // calculates a sum of local array elements + source.append("void col_reduce_lcl_array(__local "); source.append(numeric_string); source.append("* sums, uint lcl_id, uint lcl_sz) { \n"); + source.append(" uint step = lcl_sz >> 1; \n"); + + source.append(" while (step > 0) { \n"); + source.append(" if (lcl_id < step) { \n"); + source.append(" sums[lcl_id] += sums[lcl_id + step]; \n"); + source.append(" } \n"); + source.append(" step >>= 1; \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template <typename StringType> +void generate_svd_copy_col(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + // probably, this is a ugly way + source.append("__kernel void copy_col(__global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); + source.append(" uint row_start, \n"); + source.append(" uint col_start, \n"); + source.append(" uint size, \n"); + source.append(" uint stride \n"); + source.append(" ) { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + if(is_row_major) + { + source.append(" for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n"); + source.append(" V[i - row_start] = A[i * stride + col_start]; \n"); + source.append(" } \n"); + } + else + { + source.append(" for(uint i = row_start + glb_id; i < size; i += glb_sz) { \n"); + source.append(" V[i - row_start] = A[i + col_start * stride]; \n"); + source.append(" } \n"); + } + + source.append("} \n"); +} + +template <typename StringType> +void generate_svd_copy_row(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + // probably, this is too + source.append("__kernel void copy_row(__global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); + source.append(" uint row_start, \n"); + source.append(" uint col_start, \n"); + source.append(" uint size, \n"); + source.append(" uint stride \n"); + source.append(" ) { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + if(is_row_major) + { + source.append(" for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n"); + source.append(" V[i - col_start] = A[row_start * stride + i]; \n"); + source.append(" } \n"); + } + else + { + source.append(" for(uint i = col_start + glb_id; i < size; i += glb_sz) { \n"); + source.append(" V[i - col_start] = A[row_start + i * stride]; \n"); + source.append(" } \n"); + } + + source.append("} \n"); +} + +template<typename StringT> +void generate_svd_final_iter_update(StringT & source, std::string const & numeric_string) +{ + source.append("__kernel void final_iter_update(__global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" uint stride, \n"); + source.append(" uint n, \n"); + source.append(" uint last_n, \n"); + source.append(" "); source.append(numeric_string); source.append(" q, \n"); + source.append(" "); source.append(numeric_string); source.append(" p \n"); + source.append(" ) \n"); + source.append("{ \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + + source.append(" for (uint px = glb_id; px < last_n; px += glb_sz) \n"); + source.append(" { \n"); + source.append(" "); source.append(numeric_string); source.append(" v_in = A[n * stride + px]; \n"); + source.append(" "); source.append(numeric_string); source.append(" z = A[(n - 1) * stride + px]; \n"); + source.append(" A[(n - 1) * stride + px] = q * z + p * v_in; \n"); + source.append(" A[n * stride + px] = q * v_in - p * z; \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template <typename StringType> +void generate_svd_givens_next(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + source.append("__kernel void givens_next(__global "); source.append(numeric_string); source.append("* matr, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* cs, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* ss, \n"); + source.append(" uint size, \n"); + source.append(" uint stride, \n"); + source.append(" uint start_i, \n"); + source.append(" uint end_i \n"); + source.append(" ) \n"); + source.append("{ \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + source.append(" uint j = glb_id; \n"); + + source.append(" __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n"); + source.append(" __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n"); + if(is_row_major) + { + + source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) + j * stride] : 0; \n"); + + source.append(" uint elems_num = end_i - start_i + 1; \n"); + source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n"); + + source.append(" for(uint block_id = 0; block_id < block_num; block_id++) \n"); + source.append(" { \n"); + source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n"); + + source.append(" if(lcl_id < to) \n"); + source.append(" { \n"); + source.append(" cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n"); + source.append(" ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n"); + source.append(" } \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if(j < size) \n"); + source.append(" { \n"); + source.append(" for(uint ind = 0; ind < to; ind++) \n"); + source.append(" { \n"); + source.append(" uint i = end_i - (ind + block_id * lcl_sz); \n"); + + source.append(" "); source.append(numeric_string); source.append(" z = matr[i + j * stride]; \n"); + + source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n"); + source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n"); + + source.append(" matr[(i + 1) + j * stride] = x * cs_val + z * ss_val; \n"); + source.append(" x = -x * ss_val + z * cs_val; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" } \n"); + source.append(" if(j < size) \n"); + source.append(" matr[(start_i) + j * stride] = x; \n"); + } + else + { + + source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(end_i + 1) * stride + j] : 0; \n"); + + source.append(" uint elems_num = end_i - start_i + 1; \n"); + source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n"); + + source.append(" for(uint block_id = 0; block_id < block_num; block_id++) \n"); + source.append(" { \n"); + source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n"); + + source.append(" if(lcl_id < to) \n"); + source.append(" { \n"); + source.append(" cs_lcl[lcl_id] = cs[end_i - (lcl_id + block_id * lcl_sz)]; \n"); + source.append(" ss_lcl[lcl_id] = ss[end_i - (lcl_id + block_id * lcl_sz)]; \n"); + source.append(" } \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if(j < size) \n"); + source.append(" { \n"); + source.append(" for(uint ind = 0; ind < to; ind++) \n"); + source.append(" { \n"); + source.append(" uint i = end_i - (ind + block_id * lcl_sz); \n"); + + source.append(" "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n"); + + source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind]; \n"); + source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind]; \n"); + + source.append(" matr[(i + 1) * stride + j] = x * cs_val + z * ss_val; \n"); + source.append(" x = -x * ss_val + z * cs_val; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" } \n"); + source.append(" if(j < size) \n"); + source.append(" matr[(start_i) * stride + j] = x; \n"); + } + source.append("} \n"); +} + +template<typename StringT> +void generate_svd_givens_prev(StringT & source, std::string const & numeric_string) +{ + source.append("__kernel void givens_prev(__global "); source.append(numeric_string); source.append("* matr, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* cs, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* ss, \n"); + source.append(" uint size, \n"); + source.append(" uint stride, \n"); + source.append(" uint start_i, \n"); + source.append(" uint end_i \n"); + source.append(" ) \n"); + source.append("{ \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + source.append(" uint j = glb_id; \n"); + + source.append(" __local "); source.append(numeric_string); source.append(" cs_lcl[256]; \n"); + source.append(" __local "); source.append(numeric_string); source.append(" ss_lcl[256]; \n"); + + source.append(" "); source.append(numeric_string); source.append(" x = (j < size) ? matr[(start_i - 1) * stride + j] : 0; \n"); + + source.append(" uint elems_num = end_i - start_i; \n"); + source.append(" uint block_num = (elems_num + lcl_sz - 1) / lcl_sz; \n"); + + source.append(" for (uint block_id = 0; block_id < block_num; block_id++) \n"); + source.append(" { \n"); + source.append(" uint to = min(elems_num - block_id * lcl_sz, lcl_sz); \n"); + + source.append(" if (lcl_id < to) \n"); + source.append(" { \n"); + source.append(" cs_lcl[lcl_id] = cs[lcl_id + start_i + block_id * lcl_sz]; \n"); + source.append(" ss_lcl[lcl_id] = ss[lcl_id + start_i + block_id * lcl_sz]; \n"); + source.append(" } \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if (j < size) \n"); + source.append(" { \n"); + source.append(" for (uint ind = 0; ind < to; ind++) \n"); + source.append(" { \n"); + source.append(" uint i = ind + start_i + block_id * lcl_sz; \n"); + + source.append(" "); source.append(numeric_string); source.append(" z = matr[i * stride + j]; \n"); + + source.append(" "); source.append(numeric_string); source.append(" cs_val = cs_lcl[ind];//cs[i]; \n"); + source.append(" "); source.append(numeric_string); source.append(" ss_val = ss_lcl[ind];//ss[i]; \n"); + + source.append(" matr[(i - 1) * stride + j] = x * cs_val + z * ss_val; \n"); + source.append(" x = -x * ss_val + z * cs_val; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" } \n"); + source.append(" if (j < size) \n"); + source.append(" matr[(end_i - 1) * stride + j] = x; \n"); + source.append("} \n"); +} + +template <typename StringType> +void generate_svd_house_update_A_left(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + source.append("__kernel void house_update_A_left( \n"); + source.append(" __global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector + source.append(" uint row_start, \n"); + source.append(" uint col_start, \n"); + source.append(" uint size1, \n"); + source.append(" uint size2, \n"); + source.append(" uint stride, \n"); + source.append(" __local "); source.append(numeric_string); source.append("* sums \n"); + source.append(" ) { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n"); + + // doing it in slightly different way to avoid cache misses + if(is_row_major) + { + source.append(" for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n"); + source.append(" ss = 0; \n"); + source.append(" for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j * stride + i]); \n"); + + source.append(" for(uint j = row_start; j < size1; j++) \n"); + source.append(" A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss); \n"); + source.append(" } \n"); + } + else + { + source.append(" for(uint i = glb_id + col_start; i < size2; i += glb_sz) { \n"); + source.append(" ss = 0; \n"); + source.append(" for(uint j = row_start; j < size1; j++) ss = ss + (V[j] * A[j + i * stride]); \n"); + + source.append(" for(uint j = row_start; j < size1; j++) \n"); + source.append(" A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss); \n"); + source.append(" } \n"); + } + source.append("} \n"); +} + +template <typename StringType> +void generate_svd_house_update_A_right(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + + source.append("__kernel void house_update_A_right( \n"); + source.append(" __global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector + source.append(" uint row_start, \n"); + source.append(" uint col_start, \n"); + source.append(" uint size1, \n"); + source.append(" uint size2, \n"); + source.append(" uint stride, \n"); + source.append(" __local "); source.append(numeric_string); source.append("* sums \n"); + source.append(" ) { \n"); + + source.append(" uint glb_id = get_global_id(0); \n"); + + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n"); + + // update of A matrix + if(is_row_major) + { + source.append(" for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n"); + source.append(" ss = 0; \n"); + + source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i * stride + j]); \n"); + source.append(" sums[lcl_id] = ss; \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n"); + + source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) \n"); + source.append(" A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av); \n"); + source.append(" } \n"); + } + else + { + source.append(" for(uint i = grp_id + row_start; i < size1; i += grp_nm) { \n"); + source.append(" ss = 0; \n"); + + source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * A[i + j * stride]); \n"); + source.append(" sums[lcl_id] = ss; \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" "); source.append(numeric_string); source.append(" sum_Av = sums[0]; \n"); + + source.append(" for(uint j = lcl_id; j < size2; j += lcl_sz) \n"); + source.append(" A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av); \n"); + source.append(" } \n"); + } + + source.append("} \n"); + +} + +template <typename StringType> +void generate_svd_house_update_QL(StringType & source, std::string const & numeric_string, bool is_row_major) +{ + source.append("__kernel void house_update_QL(\n"); + source.append(" __global "); source.append(numeric_string); source.append("* QL, \n"); + source.append(" __constant "); source.append(numeric_string); source.append("* V, \n"); //householder vector + source.append(" uint size1, \n"); + source.append(" uint strideQ, \n"); + source.append(" __local "); source.append(numeric_string); source.append("* sums \n"); + source.append(" ) { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n"); + + if(is_row_major) + { + source.append(" for(uint i = grp_id; i < size1; i += grp_nm) { \n"); + source.append(" ss = 0; \n"); + source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i * strideQ + j]); \n"); + source.append(" sums[lcl_id] = ss; \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n"); + + source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) \n"); + source.append(" QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv); \n"); + source.append(" } \n"); + } + else + { + source.append(" for(uint i = grp_id; i < size1; i += grp_nm) { \n"); + source.append(" ss = 0; \n"); + source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) ss = ss + (V[j] * QL[i + j * strideQ]); \n"); + source.append(" sums[lcl_id] = ss; \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n"); + + source.append(" for(uint j = lcl_id; j < size1; j += lcl_sz) \n"); + source.append(" QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv); \n"); + source.append(" } \n"); + } + source.append("} \n"); + +} + +template<typename StringT> +void generate_svd_house_update_QR(StringT & source, std::string const & numeric_string) +{ + source.append("__kernel void house_update_QR( \n"); + source.append(" __global "); source.append(numeric_string); source.append("* QR, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* V, \n"); // householder vector + source.append(" uint size1, \n"); + source.append(" uint size2, \n"); + source.append(" uint strideQ, \n"); + source.append(" __local "); source.append(numeric_string); source.append("* sums \n"); + source.append(" ) { \n"); + + source.append(" uint glb_id = get_global_id(0); \n"); + + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + source.append(" "); source.append(numeric_string); source.append(" ss = 0; \n"); + + // update of QR matrix + // Actually, we are calculating a transpose of right matrix. This allows to avoid cache + // misses. + source.append(" for (uint i = grp_id; i < size2; i += grp_nm) { \n"); + source.append(" ss = 0; \n"); + source.append(" for (uint j = lcl_id; j < size2; j += lcl_sz) ss = ss + (V[j] * QR[i * strideQ + j]); \n"); + source.append(" sums[lcl_id] = ss; \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" col_reduce_lcl_array(sums, lcl_id, lcl_sz); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" "); source.append(numeric_string); source.append(" sum_Qv = sums[0]; \n"); + source.append(" for (uint j = lcl_id; j < size2; j += lcl_sz) \n"); + source.append(" QR[i * strideQ + j] = QR[i * strideQ + j] - (2 * V[j] * sum_Qv); \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_svd_inverse_signs(StringT & source, std::string const & numeric_string) +{ + source.append("__kernel void inverse_signs(__global "); source.append(numeric_string); source.append("* v, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* signs, \n"); + source.append(" uint size, \n"); + source.append(" uint stride \n"); + source.append(" ) \n"); + source.append("{ \n"); + source.append(" uint glb_id_x = get_global_id(0); \n"); + source.append(" uint glb_id_y = get_global_id(1); \n"); + + source.append(" if ((glb_id_x < size) && (glb_id_y < size)) \n"); + source.append(" v[glb_id_x * stride + glb_id_y] *= signs[glb_id_x]; \n"); + source.append("} \n"); + +} + +template<typename StringT> +void generate_svd_transpose_inplace(StringT & source, std::string const & numeric_string) +{ + + source.append("__kernel void transpose_inplace(__global "); source.append(numeric_string); source.append("* input, \n"); + source.append(" unsigned int row_num, \n"); + source.append(" unsigned int col_num) { \n"); + source.append(" unsigned int size = row_num * col_num; \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size; i+= get_global_size(0)) { \n"); + source.append(" unsigned int row = i / col_num; \n"); + source.append(" unsigned int col = i - row*col_num; \n"); + + source.append(" unsigned int new_pos = col * row_num + row; \n"); + + //new_pos = (col < row) ? 0 : 1; + //input[i] = new_pos; + + source.append(" if (i < new_pos) { \n"); + source.append(" "); source.append(numeric_string); source.append(" val = input[i]; \n"); + source.append(" input[i] = input[new_pos]; \n"); + source.append(" input[new_pos] = val; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + +} + +template<typename StringT> +void generate_svd_update_qr_column(StringT & source, std::string const & numeric_string) +{ + source.append("__kernel void update_qr_column(__global "); source.append(numeric_string); source.append("* A, \n"); + source.append(" uint stride, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* buf, \n"); + source.append(" int m, \n"); + source.append(" int n, \n"); + source.append(" int last_n) \n"); + source.append("{ \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint glb_sz = get_global_size(0); \n"); + + source.append(" for (int i = glb_id; i < last_n; i += glb_sz) \n"); + source.append(" { \n"); + source.append(" "); source.append(numeric_string); source.append(" a_ik = A[m * stride + i], a_ik_1, a_ik_2; \n"); + + source.append(" a_ik_1 = A[(m + 1) * stride + i]; \n"); + + source.append(" for (int k = m; k < n; k++) \n"); + source.append(" { \n"); + source.append(" bool notlast = (k != n - 1); \n"); + + source.append(" "); source.append(numeric_string); source.append(" p = buf[5 * k] * a_ik + buf[5 * k + 1] * a_ik_1; \n"); + + source.append(" if (notlast) \n"); + source.append(" { \n"); + source.append(" a_ik_2 = A[(k + 2) * stride + i]; \n"); + source.append(" p = p + buf[5 * k + 2] * a_ik_2; \n"); + source.append(" a_ik_2 = a_ik_2 - p * buf[5 * k + 4]; \n"); + source.append(" } \n"); + + source.append(" A[k * stride + i] = a_ik - p; \n"); + source.append(" a_ik_1 = a_ik_1 - p * buf[5 * k + 3]; \n"); + + source.append(" a_ik = a_ik_1; \n"); + source.append(" a_ik_1 = a_ik_2; \n"); + source.append(" } \n"); + + source.append(" A[n * stride + i] = a_ik; \n"); + source.append(" } \n"); + + source.append("} \n"); +} + + + + +// main kernel class +/** @brief Main kernel class for generating OpenCL kernels for singular value decomposition of dense matrices. */ +template<typename NumericT, typename MatrixLayout = row_major> +struct svd +{ + static std::string program_name() + { + bool is_row = viennacl::is_row_major<MatrixLayout>::value; + return (viennacl::ocl::type_to_string<NumericT>::apply() + "_svd_") + (is_row ? "row" : "col"); + } + + static void init(viennacl::ocl::context & ctx) + { + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); + std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); + bool is_row_major = viennacl::is_row_major<MatrixLayout>::value; + + std::string source; + source.reserve(1024); + + viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); + + // only generate for floating points (forces error for integers) + if (numeric_string == "float" || numeric_string == "double") + { + //helper function used by multiple kernels: + generate_svd_col_reduce_lcl_array(source, numeric_string); + + //kernels: + generate_svd_bidiag_pack(source, numeric_string, is_row_major); + generate_svd_copy_col(source, numeric_string, is_row_major); + generate_svd_copy_row(source, numeric_string, is_row_major); + generate_svd_final_iter_update(source, numeric_string); + generate_svd_givens_next(source, numeric_string, is_row_major); + generate_svd_givens_prev(source, numeric_string); + generate_svd_house_update_A_left(source, numeric_string, is_row_major); + generate_svd_house_update_A_right(source, numeric_string, is_row_major); + generate_svd_house_update_QL(source, numeric_string, is_row_major); + generate_svd_house_update_QR(source, numeric_string); + generate_svd_inverse_signs(source, numeric_string); + generate_svd_transpose_inplace(source, numeric_string); + generate_svd_update_qr_column(source, numeric_string); + } + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init +}; + +} // namespace kernels +} // namespace opencl +} // namespace linalg +} // namespace viennacl +#endif +
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp new file mode 100644 index 0000000..b6a2b7d --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector.hpp @@ -0,0 +1,867 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP +#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +#include "viennacl/tools/tools.hpp" + +#include "viennacl/scheduler/forwards.h" +#include "viennacl/scheduler/io.hpp" +#include "viennacl/scheduler/preset.hpp" + +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + + + +/** @file viennacl/linalg/opencl/kernels/vector.hpp + * @brief OpenCL kernel file for vector operations */ +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + +//////////////////////////// Part 1: Kernel generation routines //////////////////////////////////// + +/** @brief Enumeration for the scalar type in avbv-like operations */ +enum avbv_scalar_type +{ + VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute + VIENNACL_AVBV_CPU, + VIENNACL_AVBV_GPU +}; + +/** @brief Configuration struct for generating OpenCL kernels for linear combinations of vectors */ +struct avbv_config +{ + avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_CPU), b(VIENNACL_AVBV_NONE) {} + + bool with_stride_and_range; + std::string assign_op; + avbv_scalar_type a; + avbv_scalar_type b; +}; + +// just returns the for-loop +template <typename StringType> +void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta) +{ + source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n"); + if (cfg.with_stride_and_range) + { + source.append(" vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] "); + if (mult_alpha) + source.append("* alpha "); + else + source.append("/ alpha "); + if (cfg.b != VIENNACL_AVBV_NONE) + { + source.append("+ vec3[i*size3.y+size3.x] "); + if (mult_beta) + source.append("* beta"); + else + source.append("/ beta"); + } + } + else + { + source.append(" vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] "); + if (mult_alpha) + source.append("* alpha "); + else + source.append("/ alpha "); + if (cfg.b != VIENNACL_AVBV_NONE) + { + source.append("+ vec3[i] "); + if (mult_beta) + source.append("* beta"); + else + source.append("/ beta"); + } + } + source.append("; \n"); +} + +template <typename StringType> +void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg) +{ + source.append("__kernel void av"); + if (cfg.b != VIENNACL_AVBV_NONE) + source.append("bv"); + if (cfg.assign_op != "=") + source.append("_v"); + + if (cfg.a == VIENNACL_AVBV_CPU) + source.append("_cpu"); + else if (cfg.a == VIENNACL_AVBV_GPU) + source.append("_gpu"); + + if (cfg.b == VIENNACL_AVBV_CPU) + source.append("_cpu"); + else if (cfg.b == VIENNACL_AVBV_GPU) + source.append("_gpu"); + source.append("( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" uint4 size1, \n"); + source.append(" \n"); + if (cfg.a == VIENNACL_AVBV_CPU) + { + source.append(" "); source.append(numeric_string); source.append(" fac2, \n"); + } + else if (cfg.a == VIENNACL_AVBV_GPU) + { + source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n"); + } + source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse + source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n"); + source.append(" uint4 size2"); + + if (cfg.b != VIENNACL_AVBV_NONE) + { + source.append(", \n\n"); + if (cfg.b == VIENNACL_AVBV_CPU) + { + source.append(" "); source.append(numeric_string); source.append(" fac3, \n"); + } + else if (cfg.b == VIENNACL_AVBV_GPU) + { + source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n"); + } + source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse + source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n"); + source.append(" uint4 size3 \n"); + } + source.append(") { \n"); + + if (cfg.a == VIENNACL_AVBV_CPU) + { + source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n"); + } + else if (cfg.a == VIENNACL_AVBV_GPU) + { + source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n"); + } + source.append(" if (options2 & (1 << 0)) \n"); + source.append(" alpha = -alpha; \n"); + source.append(" \n"); + + if (cfg.b == VIENNACL_AVBV_CPU) + { + source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n"); + } + else if (cfg.b == VIENNACL_AVBV_GPU) + { + source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n"); + } + if (cfg.b != VIENNACL_AVBV_NONE) + { + source.append(" if (options3 & (1 << 0)) \n"); + source.append(" beta = -beta; \n"); + source.append(" \n"); + } + source.append(" if (options2 & (1 << 1)) { \n"); + if (cfg.b != VIENNACL_AVBV_NONE) + { + source.append(" if (options3 & (1 << 1)) {\n"); + generate_avbv_impl2(source, numeric_string, cfg, false, false); + source.append(" } else {\n"); + generate_avbv_impl2(source, numeric_string, cfg, false, true); + source.append(" } \n"); + } + else + generate_avbv_impl2(source, numeric_string, cfg, false, true); + source.append(" } else { \n"); + if (cfg.b != VIENNACL_AVBV_NONE) + { + source.append(" if (options3 & (1 << 1)) {\n"); + generate_avbv_impl2(source, numeric_string, cfg, true, false); + source.append(" } else {\n"); + generate_avbv_impl2(source, numeric_string, cfg, true, true); + source.append(" } \n"); + } + else + generate_avbv_impl2(source, numeric_string, cfg, true, true); + source.append(" } \n"); + source.append("} \n"); +} + +template <typename StringType> +void generate_avbv(StringType & source, std::string const & numeric_string) +{ + avbv_config cfg; + cfg.assign_op = "="; + cfg.with_stride_and_range = true; + + // av + cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); + + // avbv + cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); + + // avbv + cfg.assign_op = "+="; + + cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg); + cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg); +} + +template <typename StringType> +void generate_plane_rotation(StringType & source, std::string const & numeric_string) +{ + source.append("__kernel void plane_rotation( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n"); + source.append(" unsigned int start2, \n"); + source.append(" unsigned int inc2, \n"); + source.append(" unsigned int size2, \n"); + source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); + source.append(" "); source.append(numeric_string); source.append(" beta) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp1 = 0; \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp2 = 0; \n"); + source.append(" \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" tmp1 = vec1[i*inc1+start1]; \n"); + source.append(" tmp2 = vec2[i*inc2+start2]; \n"); + source.append(" \n"); + source.append(" vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n"); + source.append(" vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n"); + source.append(" } \n"); + source.append(" \n"); + source.append("} \n"); +} + +template <typename StringType> +void generate_vector_swap(StringType & source, std::string const & numeric_string) +{ + source.append("__kernel void swap( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n"); + source.append(" unsigned int start2, \n"); + source.append(" unsigned int inc2, \n"); + source.append(" unsigned int size2 \n"); + source.append(" ) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp; \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" tmp = vec2[i*inc2+start2]; \n"); + source.append(" vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n"); + source.append(" vec1[i*inc1+start1] = tmp; \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template <typename StringType> +void generate_assign_cpu(StringType & source, std::string const & numeric_string) +{ + source.append("__kernel void assign_cpu( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" unsigned int internal_size1, \n"); + source.append(" "); source.append(numeric_string); source.append(" alpha) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n"); + source.append(" vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n"); + source.append("} \n"); + +} + +template <typename StringType> +void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num) +{ + std::stringstream ss; + ss << vector_num; + std::string vector_num_string = ss.str(); + + source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n"); + source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n"); + source.append(" uint4 params_x, \n"); + for (vcl_size_t i=0; i<vector_num; ++i) + { + ss.str(""); + ss << i; + source.append(" __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n"); + source.append(" uint4 params_y"); source.append(ss.str()); source.append(", \n"); + } + source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n"); + source.append("{ \n"); + source.append(" unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n"); + source.append(" unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n"); + source.append(" unsigned int vec_stop_index = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n"); + + // compute partial results within group: + for (vcl_size_t i=0; i<vector_num; ++i) + { + ss.str(""); + ss << i; + source.append(" "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n"); + } + source.append(" for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n"); + source.append(" "); source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n"); + for (vcl_size_t i=0; i<vector_num; ++i) + { + ss.str(""); + ss << i; + source.append(" tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n"); + } + source.append(" } \n"); + for (vcl_size_t i=0; i<vector_num; ++i) + { + ss.str(""); + ss << i; + source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n"); + } + + // now run reduction: + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) { \n"); + for (vcl_size_t i=0; i<vector_num; ++i) + { + ss.str(""); + ss << i; + source.append(" tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n"); + } + source.append(" } \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if (get_local_id(0) == 0) { \n"); + for (vcl_size_t i=0; i<vector_num; ++i) + { + ss.str(""); + ss << i; + source.append(" group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n"); + } + source.append(" } \n"); + source.append("} \n"); + +} + +template <typename StringType> +void generate_norm(StringType & source, std::string const & numeric_string) +{ + bool is_float_or_double = (numeric_string == "float" || numeric_string == "double"); + + source.append(numeric_string); source.append(" impl_norm( \n"); + source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" unsigned int norm_selector, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n"); + source.append(" if (norm_selector == 1) \n"); //norm_1 + source.append(" { \n"); + source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n"); + if (is_float_or_double) + source.append(" tmp += fabs(vec[i*inc1 + start1]); \n"); + else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types + source.append(" tmp += vec[i*inc1 + start1]; \n"); + else + source.append(" tmp += abs(vec[i*inc1 + start1]); \n"); + source.append(" } \n"); + source.append(" else if (norm_selector == 2) \n"); //norm_2 + source.append(" { \n"); + source.append(" "); source.append(numeric_string); source.append(" vec_entry = 0; \n"); + source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n"); + source.append(" { \n"); + source.append(" vec_entry = vec[i*inc1 + start1]; \n"); + source.append(" tmp += vec_entry * vec_entry; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" else if (norm_selector == 0) \n"); //norm_inf + source.append(" { \n"); + source.append(" for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n"); + if (is_float_or_double) + source.append(" tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n"); + else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types + source.append(" tmp = max(vec[i*inc1 + start1], tmp); \n"); + else + { + source.append(" tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), tmp); \n"); + } + source.append(" } \n"); + + source.append(" tmp_buffer[get_local_id(0)] = tmp; \n"); + + source.append(" if (norm_selector > 0) \n"); //norm_1 or norm_2: + source.append(" { \n"); + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) \n"); + source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n"); + source.append(" } \n"); + source.append(" return tmp_buffer[0]; \n"); + source.append(" } \n"); + + //norm_inf: + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) \n"); + if (is_float_or_double) + source.append(" tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n"); + else + source.append(" tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n"); + source.append(" } \n"); + + source.append(" return tmp_buffer[0]; \n"); + source.append("}; \n"); + + source.append("__kernel void norm( \n"); + source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" unsigned int norm_selector, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * group_buffer) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n"); + source.append(" ( get_group_id(0) * size1) / get_num_groups(0) * inc1 + start1, \n"); + source.append(" inc1, \n"); + source.append(" ( (1 + get_group_id(0)) * size1) / get_num_groups(0) \n"); + source.append(" - ( get_group_id(0) * size1) / get_num_groups(0), \n"); + source.append(" norm_selector, \n"); + source.append(" tmp_buffer); \n"); + + source.append(" if (get_local_id(0) == 0) \n"); + source.append(" group_buffer[get_group_id(0)] = tmp; \n"); + source.append("} \n"); + +} + +template <typename StringType> +void generate_inner_prod_sum(StringType & source, std::string const & numeric_string) +{ + // sums the array 'vec1' and writes to result. Each work group computes the inner product for a subvector of size 'size_per_workgroup'. + source.append("__kernel void sum_inner_prod( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int size_per_workgroup, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); + source.append(" unsigned int start_result, \n"); + source.append(" unsigned int inc_result) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" thread_sum = 0; \n"); + source.append(" for (unsigned int i = get_local_id(0); i<size_per_workgroup; i += get_local_size(0)) \n"); + source.append(" thread_sum += vec1[size_per_workgroup * get_group_id(0) + i]; \n"); + + source.append(" tmp_buffer[get_local_id(0)] = thread_sum; \n"); + + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) \n"); + source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if (get_local_id(0) == 0) \n"); + source.append(" result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n"); + source.append("} \n"); + +} + +template <typename StringType> +void generate_sum(StringType & source, std::string const & numeric_string) +{ + // sums the array 'vec1' and writes to result. Makes use of a single work-group only. + source.append("__kernel void sum( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" unsigned int option, \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum + source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * result) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" thread_sum = 0; \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp = 0; \n"); + source.append(" for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n"); + source.append(" { \n"); + source.append(" if (option > 0) \n"); + source.append(" thread_sum += vec1[i*inc1+start1]; \n"); + source.append(" else \n"); + source.append(" { \n"); + source.append(" tmp = vec1[i*inc1+start1]; \n"); + source.append(" tmp = (tmp < 0) ? -tmp : tmp; \n"); + source.append(" thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n"); + source.append(" } \n"); + source.append(" } \n"); + + source.append(" tmp_buffer[get_local_id(0)] = thread_sum; \n"); + + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) \n"); + source.append(" { \n"); + source.append(" if (option > 0) \n"); + source.append(" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n"); + source.append(" else \n"); + source.append(" tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if (get_global_id(0) == 0) \n"); + source.append(" { \n"); + if (numeric_string == "float" || numeric_string == "double") + { + source.append(" if (option == 2) \n"); + source.append(" *result = sqrt(tmp_buffer[0]); \n"); + source.append(" else \n"); + } + source.append(" *result = tmp_buffer[0]; \n"); + source.append(" } \n"); + source.append("} \n"); + +} + +template <typename StringType> +void generate_index_norm_inf(StringType & source, std::string const & numeric_string) +{ + //index_norm_inf: + source.append("unsigned int index_norm_inf_impl( \n"); + source.append(" __global const "); source.append(numeric_string); source.append(" * vec, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n"); + source.append(" __local unsigned int * index_buffer) \n"); + source.append("{ \n"); + //step 1: fill buffer: + source.append(" "); source.append(numeric_string); source.append(" cur_max = 0; \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp; \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); + source.append(" { \n"); + if (numeric_string == "float" || numeric_string == "double") + source.append(" tmp = fabs(vec[i*inc1+start1]); \n"); + else if (numeric_string[0] == 'u') // abs may not be defined for unsigned types + source.append(" tmp = vec[i*inc1+start1]; \n"); + else + source.append(" tmp = abs(vec[i*inc1+start1]); \n"); + source.append(" if (cur_max < tmp) \n"); + source.append(" { \n"); + source.append(" entry_buffer[get_global_id(0)] = tmp; \n"); + source.append(" index_buffer[get_global_id(0)] = i; \n"); + source.append(" cur_max = tmp; \n"); + source.append(" } \n"); + source.append(" } \n"); + + //step 2: parallel reduction: + source.append(" for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_global_id(0) < stride) \n"); + source.append(" { \n"); + //find the first occurring index + source.append(" if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n"); + source.append(" { \n"); + source.append(" index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n"); + source.append(" entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" \n"); + source.append(" return index_buffer[0]; \n"); + source.append("} \n"); + + source.append("__kernel void index_norm_inf( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * entry_buffer, \n"); + source.append(" __local unsigned int * index_buffer, \n"); + source.append(" __global unsigned int * result) \n"); + source.append("{ \n"); + source.append(" entry_buffer[get_global_id(0)] = 0; \n"); + source.append(" index_buffer[get_global_id(0)] = 0; \n"); + source.append(" unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n"); + source.append(" if (get_global_id(0) == 0) *result = tmp; \n"); + source.append("} \n"); + +} + +template <typename StringType> +void generate_maxmin(StringType & source, std::string const & numeric_string, bool is_max) +{ + // sums the array 'vec1' and writes to result. Makes use of a single work-group only. + if (is_max) + source.append("__kernel void max_kernel( \n"); + else + source.append("__kernel void min_kernel( \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * result) \n"); + source.append("{ \n"); + source.append(" "); source.append(numeric_string); source.append(" thread_result = vec1[start1]; \n"); + source.append(" for (unsigned int i = get_global_id(0); i<size1; i += get_global_size(0)) \n"); + source.append(" { \n"); + source.append(" "); source.append(numeric_string); source.append(" tmp = vec1[i*inc1+start1]; \n"); + if (is_max) + source.append(" thread_result = thread_result > tmp ? thread_result : tmp; \n"); + else + source.append(" thread_result = thread_result < tmp ? thread_result : tmp; \n"); + source.append(" } \n"); + + source.append(" tmp_buffer[get_local_id(0)] = thread_result; \n"); + + source.append(" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n"); + source.append(" { \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) < stride) \n"); + source.append(" { \n"); + if (is_max) + source.append(" tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n"); + else + source.append(" tmp_buffer[get_local_id(0)] = tmp_buffer[get_local_id(0)] < tmp_buffer[get_local_id(0) + stride] ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + + source.append(" if (get_local_id(0) == 0) \n"); + source.append(" result[get_group_id(0)] = tmp_buffer[0]; \n"); + source.append("} \n"); +} + +//////////////////////////// Part 2: Main kernel class //////////////////////////////////// + +// main kernel class +/** @brief Main kernel class for generating OpenCL kernels for operations on/with viennacl::vector<> without involving matrices, multiple inner products, or element-wise operations other than addition or subtraction. */ +template<typename NumericT> +struct vector +{ + static std::string program_name() + { + return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector"; + } + + static void init(viennacl::ocl::context & ctx) + { + viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); + std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); + + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + std::string source; + source.reserve(8192); + + viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); + + // fully parametrized kernels: + generate_avbv(source, numeric_string); + + // kernels with mostly predetermined skeleton: + generate_plane_rotation(source, numeric_string); + generate_vector_swap(source, numeric_string); + generate_assign_cpu(source, numeric_string); + + generate_inner_prod(source, numeric_string, 1); + generate_norm(source, numeric_string); + generate_sum(source, numeric_string); + generate_index_norm_inf(source, numeric_string); + generate_maxmin(source, numeric_string, true); + generate_maxmin(source, numeric_string, false); + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init +}; + +// class with kernels for multiple inner products. +/** @brief Main kernel class for generating OpenCL kernels for multiple inner products on/with viennacl::vector<>. */ +template<typename NumericT> +struct vector_multi_inner_prod +{ + static std::string program_name() + { + return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_multi"; + } + + static void init(viennacl::ocl::context & ctx) + { + viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); + std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); + + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + std::string source; + source.reserve(8192); + + viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); + + generate_inner_prod(source, numeric_string, 2); + generate_inner_prod(source, numeric_string, 3); + generate_inner_prod(source, numeric_string, 4); + generate_inner_prod(source, numeric_string, 8); + + generate_inner_prod_sum(source, numeric_string); + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init +}; + + +template<typename StringT> +void generate_vector_convert(StringT & source, std::string const & dest_type, std::string const & src_type) +{ + source.append(" __kernel void convert_" + dest_type + "_" + src_type + "( \n"); + source.append(" __global " + dest_type + " * dest, \n"); + source.append(" unsigned int start_dest, unsigned int inc_dest, unsigned int size_dest, \n"); + source.append(" __global const " + src_type + " * src, \n"); + source.append(" unsigned int start_src, unsigned int inc_src) \n"); + source.append(" { \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size_dest; i += get_global_size(0)) \n"); + source.append(" dest[start_dest + i * inc_dest] = src[start_src + i * inc_src]; \n"); + source.append(" } \n"); +} + +/** @brief Main kernel class for vector conversion routines (e.g. convert vector<int> to vector<float>). */ +struct vector_convert +{ + +public: + static std::string program_name() + { + return "vector_convert"; + } + + static void init(viennacl::ocl::context & ctx) + { + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + std::string source; + source.reserve(4096); + + // int + generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<float>::apply()); + + // unsigned int + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<float>::apply()); + + // long + generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<float>::apply()); + + // unsigned long + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<float>::apply()); + + // float + generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<float>::apply()); + + if (ctx.current_device().double_support()) + { + viennacl::ocl::append_double_precision_pragma<double>(ctx, source); + + generate_vector_convert(source, viennacl::ocl::type_to_string<int>::apply(), viennacl::ocl::type_to_string<double>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned int>::apply(), viennacl::ocl::type_to_string<double>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<long>::apply(), viennacl::ocl::type_to_string<double>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<unsigned long>::apply(), viennacl::ocl::type_to_string<double>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<float>::apply(), viennacl::ocl::type_to_string<double>::apply()); + + generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned int>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<unsigned long>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<float>::apply()); + generate_vector_convert(source, viennacl::ocl::type_to_string<double>::apply(), viennacl::ocl::type_to_string<double>::apply()); + } + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init + +}; + + +} // namespace kernels +} // namespace opencl +} // namespace linalg +} // namespace viennacl +#endif + http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp new file mode 100644 index 0000000..8445302 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/vector_element.hpp @@ -0,0 +1,163 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP +#define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_ELEMENT_HPP + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +/** @file viennacl/linalg/opencl/kernels/vector_element.hpp + * @brief OpenCL kernel file for element-wise vector operations */ +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + +//////////////////////////// Part 1: Kernel generation routines //////////////////////////////////// + + +//generate code for C = op1(A) * op2(B), where A, B, C can have different storage layouts and opX(D) = D or trans(D) +template <typename StringT> +void generate_vector_unary_element_ops(StringT & source, std::string const & numeric_string, + std::string const & funcname, std::string const & op, std::string const & op_name) +{ + source.append("__kernel void "); source.append(funcname); source.append("_"); source.append(op_name); source.append("(\n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" uint4 size1, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec2, \n"); + source.append(" uint4 size2) { \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n"); + if (numeric_string[0] == 'u' && funcname == "abs") // abs() on unsigned does not work on MacOS X 10.6.8, so we use the identity: + { + source.append(" vec1[i*size1.y+size1.x] "); source.append(op); source.append(" vec2[i*size2.y+size2.x]; \n"); + } + else + { + source.append(" vec1[i*size1.y+size1.x] "); source.append(op); source.append(" "); source.append(funcname); source.append("(vec2[i*size2.y+size2.x]); \n"); + } + source.append("} \n"); +} + +template <typename StringT> +void generate_vector_unary_element_ops(StringT & source, std::string const & numeric_string, std::string const & funcname) +{ + generate_vector_unary_element_ops(source, numeric_string, funcname, "=", "assign"); + //generate_vector_unary_element_ops(source, numeric_string, funcname, "+=", "plus"); + //generate_vector_unary_element_ops(source, numeric_string, funcname, "-=", "minus"); +} + +template <typename StringT> +void generate_vector_binary_element_ops(StringT & source, std::string const & numeric_string, int op_type) //op_type: {0: product, 1: division, 2: power} +{ + std::string kernel_name_suffix; + if (op_type == 0) + kernel_name_suffix = "prod"; + else if (op_type == 1) + kernel_name_suffix = "div"; + else + kernel_name_suffix = "pow"; + + // generic kernel for the vector operation v1 = alpha * v2 + beta * v3, where v1, v2, v3 are not necessarily distinct vectors + source.append("__kernel void element_" + kernel_name_suffix + "(\n"); + source.append(" __global "); source.append(numeric_string); source.append(" * vec1, \n"); + source.append(" unsigned int start1, \n"); + source.append(" unsigned int inc1, \n"); + source.append(" unsigned int size1, \n"); + + source.append(" __global const "); source.append(numeric_string); source.append(" * vec2, \n"); + source.append(" unsigned int start2, \n"); + source.append(" unsigned int inc2, \n"); + + source.append(" __global const "); source.append(numeric_string); source.append(" * vec3, \n"); + source.append(" unsigned int start3, \n"); + source.append(" unsigned int inc3, \n"); + + source.append(" unsigned int op_type) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n"); + if (op_type == 0) + source.append(" vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3]; \n"); + else if (op_type == 1) + source.append(" vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3]; \n"); + else if (op_type == 2) + source.append(" vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]); \n"); + + source.append("} \n"); +} + +//////////////////////////// Part 2: Main kernel class //////////////////////////////////// + +// main kernel class +/** @brief Main kernel class for generating OpenCL kernels for elementwise operations other than addition and subtraction on/with viennacl::vector<>. */ +template<typename NumericT> +struct vector_element +{ + static std::string program_name() + { + return viennacl::ocl::type_to_string<NumericT>::apply() + "_vector_element"; + } + + static void init(viennacl::ocl::context & ctx) + { + viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); + std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); + + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + std::string source; + source.reserve(8192); + + viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); + + // unary operations + if (numeric_string == "float" || numeric_string == "double") + { + generate_vector_unary_element_ops(source, numeric_string, "acos"); + generate_vector_unary_element_ops(source, numeric_string, "asin"); + generate_vector_unary_element_ops(source, numeric_string, "atan"); + generate_vector_unary_element_ops(source, numeric_string, "ceil"); + generate_vector_unary_element_ops(source, numeric_string, "cos"); + generate_vector_unary_element_ops(source, numeric_string, "cosh"); + generate_vector_unary_element_ops(source, numeric_string, "exp"); + generate_vector_unary_element_ops(source, numeric_string, "fabs"); + generate_vector_unary_element_ops(source, numeric_string, "floor"); + generate_vector_unary_element_ops(source, numeric_string, "log"); + generate_vector_unary_element_ops(source, numeric_string, "log10"); + generate_vector_unary_element_ops(source, numeric_string, "sin"); + generate_vector_unary_element_ops(source, numeric_string, "sinh"); + generate_vector_unary_element_ops(source, numeric_string, "sqrt"); + generate_vector_unary_element_ops(source, numeric_string, "tan"); + generate_vector_unary_element_ops(source, numeric_string, "tanh"); + } + else + { + generate_vector_unary_element_ops(source, numeric_string, "abs"); + } + + // binary operations + generate_vector_binary_element_ops(source, numeric_string, 0); + generate_vector_binary_element_ops(source, numeric_string, 1); + if (numeric_string == "float" || numeric_string == "double") + generate_vector_binary_element_ops(source, numeric_string, 2); + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init +}; + +} // namespace kernels +} // namespace opencl +} // namespace linalg +} // namespace viennacl +#endif +
