http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp new file mode 100644 index 0000000..dcd39ad --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scalar.hpp @@ -0,0 +1,283 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP +#define VIENNACL_LINALG_OPENCL_KERNELS_SCALAR_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +/** @file viennacl/linalg/opencl/kernels/scalar.hpp + * @brief OpenCL kernel file for scalar operations */ +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + +//////////////////////////// Part 1: Kernel generation routines //////////////////////////////////// + +/** @brief Enumeration for the scalar type in avbv-like operations */ +enum asbs_scalar_type +{ + VIENNACL_ASBS_NONE = 0, // scalar does not exist/contribute + VIENNACL_ASBS_CPU, + VIENNACL_ASBS_GPU +}; + +/** @brief Configuration struct for generating OpenCL kernels for linear combinations of viennacl::scalar<> objects 
*/
struct asbs_config
{
  asbs_config() : with_stride_and_range(true), a(VIENNACL_ASBS_CPU), b(VIENNACL_ASBS_NONE) {}

  bool with_stride_and_range;   // currently unused by the scalar kernels; kept for interface symmetry with the vector generators — TODO confirm
  std::string assign_op;        // "=" for plain assignment, "+=" for in-place accumulate
  asbs_scalar_type a;           // kind of the first scale factor (none/CPU value/GPU buffer)
  asbs_scalar_type b;           // kind of the second scale factor (none/CPU value/GPU buffer)
};

// Innermost generation step: emits the single assignment statement
// "*s1 <op> (+|-) *s2 (*|/) alpha [(+|-) *s3 (*|/) beta];" into the kernel body.
// mult_alpha/mult_beta select multiplication vs. division by the factor
// (division covers the "take inverse" option bit at runtime).
template<typename StringT>
void generate_asbs_impl3(StringT & source, char sign_a, char sign_b, asbs_config const & cfg, bool mult_alpha, bool mult_beta)
{
  source.append(" *s1 "); source.append(cfg.assign_op); source.append(1, sign_a); source.append(" *s2 ");
  if (mult_alpha)
    source.append("* alpha ");
  else
    source.append("/ alpha ");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    // second operand only present when a second scalar participates
    source.append(1, sign_b); source.append(" *s3 ");
    if (mult_beta)
      source.append("* beta");
    else
      source.append("/ beta");
  }
  source.append("; \n");
}

// Middle generation step: emits the runtime branch on bit 1 of options2/options3
// ("take inverse" flag) and delegates each of the four multiply/divide
// combinations to generate_asbs_impl3.
template<typename StringT>
void generate_asbs_impl2(StringT & source, char sign_a, char sign_b, asbs_config const & cfg)
{
  source.append(" if (options2 & (1 << 1)) { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(" if (options3 & (1 << 1)) \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, false);
    source.append(" else \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
  }
  else
    generate_asbs_impl3(source, sign_a, sign_b, cfg, false, true);
  source.append(" } else { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(" if (options3 & (1 << 1)) \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, false);
    source.append(" else \n");
    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
  }
  else
    generate_asbs_impl3(source, sign_a, sign_b, cfg, true, true);
  source.append(" } \n");

}

// Emits one complete __kernel for a single asbs_config. The kernel name is
// assembled from the configuration: "as"/"asbs" base, "_s" suffix for the
// "+=" variant, then "_cpu"/"_gpu" per scale factor, e.g. "asbs_s_cpu_gpu".
// The kernel body branches on bit 0 of options2/options3 ("flip sign") here
// and on bit 1 ("take inverse") inside generate_asbs_impl2.
template<typename StringT>
void generate_asbs_impl(StringT & source, std::string const & numeric_string, asbs_config const & cfg)
{
  source.append("__kernel void as");
  if (cfg.b != VIENNACL_ASBS_NONE)
    source.append("bs");
  if (cfg.assign_op != "=")
    source.append("_s");

  if (cfg.a == VIENNACL_ASBS_CPU)
    source.append("_cpu");
  else if (cfg.a == VIENNACL_ASBS_GPU)
    source.append("_gpu");

  if (cfg.b == VIENNACL_ASBS_CPU)
    source.append("_cpu");
  else if (cfg.b == VIENNACL_ASBS_GPU)
    source.append("_gpu");
  source.append("( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * s1, \n");
  source.append(" \n");
  // alpha is passed either by value (CPU) or as a single-element buffer (GPU)
  if (cfg.a == VIENNACL_ASBS_CPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" fac2, \n");
  }
  else if (cfg.a == VIENNACL_ASBS_GPU)
  {
    source.append(" __global "); source.append(numeric_string); source.append(" * fac2, \n");
  }
  source.append(" unsigned int options2, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
  source.append(" __global const "); source.append(numeric_string); source.append(" * s2");

  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(", \n\n");
    if (cfg.b == VIENNACL_ASBS_CPU)
    {
      source.append(" "); source.append(numeric_string); source.append(" fac3, \n");
    }
    else if (cfg.b == VIENNACL_ASBS_GPU)
    {
      source.append(" __global "); source.append(numeric_string); source.append(" * fac3, \n");
    }
    source.append(" unsigned int options3, \n"); // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
    source.append(" __global const "); source.append(numeric_string); source.append(" * s3");
  }
  source.append(") \n{ \n");

  // load the scale factors into local variables alpha/beta
  if (cfg.a == VIENNACL_ASBS_CPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" alpha = fac2; \n");
  }
  else if (cfg.a == VIENNACL_ASBS_GPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
  }
  source.append(" \n");

  if (cfg.b == VIENNACL_ASBS_CPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" beta = fac3; \n");
  }
  else if (cfg.b == VIENNACL_ASBS_GPU)
  {
    source.append(" "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
  }

  // runtime branch on bit 0 ("flip sign") of each options word
  source.append(" if (options2 & (1 << 0)) { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(" if (options3 & (1 << 0)) { \n");
    generate_asbs_impl2(source, '-', '-', cfg);
    source.append(" } else { \n");
    generate_asbs_impl2(source, '-', '+', cfg);
    source.append(" } \n");
  }
  else
    generate_asbs_impl2(source, '-', '+', cfg);
  source.append(" } else { \n");
  if (cfg.b != VIENNACL_ASBS_NONE)
  {
    source.append(" if (options3 & (1 << 0)) { \n");
    generate_asbs_impl2(source, '+', '-', cfg);
    source.append(" } else { \n");
    generate_asbs_impl2(source, '+', '+', cfg);
    source.append(" } \n");
  }
  else
    generate_asbs_impl2(source, '+', '+', cfg);

  source.append(" } \n");
  source.append("} \n");
}

// Generates the full family of scalar linear-combination kernels:
// "as" (one factor), "asbs" (two factors, assignment) and "asbs_s"
// (two factors, in-place +=), each for every CPU/GPU factor combination.
template<typename StringT>
void generate_asbs(StringT & source, std::string const & numeric_string)
{
  asbs_config cfg;
  cfg.assign_op = "=";
  cfg.with_stride_and_range = true;

  // as
  cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.b = VIENNACL_ASBS_NONE; cfg.a = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);

  // asbs
  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);

  // asbs_s (in-place "+=" variants; the "_s" name suffix comes from assign_op != "=")
  cfg.assign_op = "+=";

  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.a = VIENNACL_ASBS_CPU; cfg.b = VIENNACL_ASBS_GPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_CPU; generate_asbs_impl(source, numeric_string, cfg);
  cfg.a = VIENNACL_ASBS_GPU; cfg.b = VIENNACL_ASBS_GPU;
generate_asbs_impl(source, numeric_string, cfg);
}

// Generates a "swap" kernel that exchanges the contents of two
// single-element GPU scalar buffers via a temporary.
template<typename StringT>
void generate_scalar_swap(StringT & source, std::string const & numeric_string)
{
  source.append("__kernel void swap( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * s1, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * s2) \n");
  source.append("{ \n");
  source.append(" "); source.append(numeric_string); source.append(" tmp = *s2; \n");
  source.append(" *s2 = *s1; \n");
  source.append(" *s1 = tmp; \n");
  source.append("} \n");
}

//////////////////////////// Part 2: Main kernel class ////////////////////////////////////

// main kernel class
/** @brief Main kernel class for generating OpenCL kernels for operations involving viennacl::scalar<>, but not viennacl::vector<> or viennacl::matrix<>. */
template<typename NumericT>
struct scalar
{
  // OpenCL program name, e.g. "float_scalar" / "double_scalar"
  static std::string program_name()
  {
    return viennacl::ocl::type_to_string<NumericT>::apply() + "_scalar";
  }

  // Compiles all scalar kernels into 'ctx' exactly once per OpenCL context
  // (guarded by the function-local init_done map keyed on the raw cl_context).
  static void init(viennacl::ocl::context & ctx)
  {
    static std::map<cl_context, bool> init_done;
    if (!init_done[ctx.handle().get()])
    {
      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();

      std::string source;
      source.reserve(8192);   // avoid reallocations while concatenating kernel sources

      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);

      // fully parametrized kernels:
      generate_asbs(source, numeric_string);
      generate_scalar_swap(source, numeric_string);


      std::string prog_name = program_name();
      #ifdef VIENNACL_BUILD_INFO
      std::cout << "Creating program " << prog_name << std::endl;
      #endif
      ctx.add_program(source, prog_name);
      init_done[ctx.handle().get()] = true;
    } //if
  } //init
};

} // namespace kernels
} // namespace opencl
} // namespace linalg
} // namespace viennacl
#endif
http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp new file mode 100644 index 0000000..9626d2d --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/scan.hpp @@ -0,0 +1,194 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP +#define VIENNACL_LINALG_OPENCL_KERNELS_SCAN_HPP + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +/** @file viennacl/linalg/opencl/kernels/scan.hpp + * @brief OpenCL kernel file for scan operations. To be merged back to vector operations. 
*/
namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{


// Phase 1 of the three-phase prefix sum: each work-group scans its own
// contiguous section of X into Y and records the section total in carries[].
template <typename StringType>
void generate_scan_kernel_1(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void scan_1(__global "); source.append(numeric_string); source.append("* X, \n");
  source.append(" unsigned int startX, \n");
  source.append(" unsigned int incX, \n");
  source.append(" unsigned int sizeX, \n");

  source.append(" __global "); source.append(numeric_string); source.append("* Y, \n");
  source.append(" unsigned int startY, \n");
  source.append(" unsigned int incY, \n");

  source.append(" unsigned int scan_offset, \n"); // 0 for inclusive scan, 1 for exclusive scan
  source.append(" __global "); source.append(numeric_string); source.append("* carries) { \n");

  source.append(" __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n");
  source.append(" "); source.append(numeric_string); source.append(" my_value; \n");

  source.append(" unsigned int work_per_thread = (sizeX - 1) / get_global_size(0) + 1; \n");
  source.append(" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
  source.append(" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");
  source.append(" unsigned int block_offset = 0; \n");

  // run scan on each section:
  source.append(" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) { \n");

  // load data (out-of-range threads contribute a neutral 0)
  source.append(" my_value = (i < sizeX) ? X[i * incX + startX] : 0; \n");

  // inclusive scan in shared buffer (Hillis-Steele style doubling stride):
  source.append(" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_buffer[get_local_id(0)] = my_value; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) >= stride) \n");
  source.append(" my_value += shared_buffer[get_local_id(0) - stride]; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_buffer[get_local_id(0)] = my_value; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // write to output array (shift by one element for the exclusive variant):
  source.append(" if (scan_offset > 0) \n");
  source.append(" my_value = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");

  source.append(" if (i < sizeX) \n");
  source.append(" Y[i * incY + startY] = block_offset + my_value; \n");

  source.append(" block_offset += shared_buffer[get_local_size(0)-1]; \n");
  source.append(" } \n");

  // write carry:
  source.append(" if (get_local_id(0) == 0) carries[get_group_id(0)] = block_offset; \n");

  source.append("} \n");
}

// Phase 2: a single work-group turns carries[] into its exclusive scan,
// giving each group of phase 3 the offset contributed by preceding groups.
template <typename StringType>
void generate_scan_kernel_2(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void scan_2(__global "); source.append(numeric_string); source.append("* carries) { \n");

  source.append(" __local "); source.append(numeric_string); source.append(" shared_buffer[256]; \n"); //section size

  // load data
  source.append(" "); source.append(numeric_string); source.append(" my_carry = carries[get_local_id(0)]; \n");

  // scan in shared buffer:
  source.append(" for(unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_buffer[get_local_id(0)] = my_carry; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" if (get_local_id(0) >= stride) \n");
  source.append(" my_carry += shared_buffer[get_local_id(0) - stride]; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_buffer[get_local_id(0)] = my_carry; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  // write to output array (shifted: exclusive scan of the carries):
  source.append(" carries[get_local_id(0)] = (get_local_id(0) > 0) ? shared_buffer[get_local_id(0) - 1] : 0; \n");

  source.append("} \n");
}

// Phase 3: each work-group adds its (now exclusive-scanned) carry to every
// element of its section of Y, completing the global scan.
template <typename StringType>
void generate_scan_kernel_3(StringType & source, std::string const & numeric_string)
{
  source.append("__kernel void scan_3(__global "); source.append(numeric_string); source.append(" * Y, \n");
  source.append(" unsigned int startY, \n");
  source.append(" unsigned int incY, \n");
  source.append(" unsigned int sizeY, \n");

  source.append(" __global "); source.append(numeric_string); source.append("* carries) { \n");

  source.append(" unsigned int work_per_thread = (sizeY - 1) / get_global_size(0) + 1; \n");
  source.append(" unsigned int block_start = work_per_thread * get_local_size(0) * get_group_id(0); \n");
  source.append(" unsigned int block_stop = work_per_thread * get_local_size(0) * (get_group_id(0) + 1); \n");

  source.append(" __local "); source.append(numeric_string); source.append(" shared_offset; \n");

  source.append(" if (get_local_id(0) == 0) shared_offset = carries[get_group_id(0)]; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int i = block_start + get_local_id(0); i < block_stop; i += get_local_size(0)) \n");
  source.append(" if (i < sizeY) \n");
  source.append(" Y[i * incY + startY] += shared_offset; \n");

  source.append("} \n");
}




// main kernel class
/** @brief Main kernel class for generating OpenCL kernels for inclusive and exclusive scan (prefix sum) operations on vectors.
*/
template<typename NumericT>
struct scan
{
  // OpenCL program name, e.g. "float_scan" / "double_scan"
  static std::string program_name()
  {
    return viennacl::ocl::type_to_string<NumericT>::apply() + "_scan";
  }

  // Compiles the three scan-phase kernels into 'ctx' exactly once per
  // OpenCL context (guarded by the function-local init_done map).
  static void init(viennacl::ocl::context & ctx)
  {
    static std::map<cl_context, bool> init_done;
    if (!init_done[ctx.handle().get()])
    {
      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();

      std::string source;
      source.reserve(1024);

      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);

      generate_scan_kernel_1(source, numeric_string);
      generate_scan_kernel_2(source, numeric_string);
      generate_scan_kernel_3(source, numeric_string);

      std::string prog_name = program_name();
      #ifdef VIENNACL_BUILD_INFO
      std::cout << "Creating program " << prog_name << std::endl;
      #endif
      ctx.add_program(source, prog_name);
      init_done[ctx.handle().get()] = true;
    } //if
  } //init
};

} // namespace kernels
} // namespace opencl
} // namespace linalg
} // namespace viennacl
#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
new file mode 100644
index 0000000..562cb52
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp
@@ -0,0 +1,135 @@
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_SLICED_ELL_MATRIX_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
+ Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +#include "viennacl/linalg/opencl/common.hpp" + +/** @file viennacl/linalg/opencl/kernels/sliced_ell_matrix.hpp + * @brief OpenCL kernel file for sliced_ell_matrix operations */ +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + +//////////////////////////// Part 1: Kernel generation routines //////////////////////////////////// + +template<typename StringT> +void generate_sliced_ell_vec_mul(StringT & source, std::string const & numeric_string, bool with_alpha_beta) +{ + if (with_alpha_beta) + source.append("__kernel void vec_mul_alpha_beta( \n"); + else + source.append("__kernel void vec_mul( \n"); + source.append(" __global const unsigned int * columns_per_block, \n"); + source.append(" __global const unsigned int * column_indices, \n"); + source.append(" __global const unsigned int * block_start, \n"); + source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n"); + source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n"); + source.append(" uint4 layout_x, \n"); + if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" alpha, \n"); } + source.append(" __global "); source.append(numeric_string); source.append(" * result, \n"); + source.append(" uint4 layout_result, \n"); + if (with_alpha_beta) { source.append(" "); source.append(numeric_string); source.append(" beta, 
\n"); } + source.append(" unsigned int block_size) \n"); + source.append("{ \n"); + source.append(" uint blocks_per_workgroup = get_local_size(0) / block_size; \n"); + source.append(" uint id_in_block = get_local_id(0) % block_size; \n"); + source.append(" uint num_blocks = (layout_result.z - 1) / block_size + 1; \n"); + source.append(" uint global_warp_count = blocks_per_workgroup * get_num_groups(0); \n"); + source.append(" uint global_warp_id = blocks_per_workgroup * get_group_id(0) + get_local_id(0) / block_size; \n"); + + source.append(" for (uint block_idx = global_warp_id; block_idx < num_blocks; block_idx += global_warp_count) { \n"); + source.append(" "); source.append(numeric_string); source.append(" sum = 0; \n"); + + source.append(" uint row = block_idx * block_size + id_in_block; \n"); + source.append(" uint offset = block_start[block_idx]; \n"); + source.append(" uint num_columns = columns_per_block[block_idx]; \n"); + source.append(" for (uint item_id = 0; item_id < num_columns; item_id++) { \n"); + source.append(" uint index = offset + item_id * block_size + id_in_block; \n"); + source.append(" "); source.append(numeric_string); source.append(" val = elements[index]; \n"); + source.append(" sum += (val != 0) ? (x[column_indices[index] * layout_x.y + layout_x.x] * val) : 0; \n"); + source.append(" } \n"); + + source.append(" if (row < layout_result.z) \n"); + if (with_alpha_beta) + source.append(" result[row * layout_result.y + layout_result.x] = alpha * sum + ((beta != 0) ? beta * result[row * layout_result.y + layout_result.x] : 0); \n"); + else + source.append(" result[row * layout_result.y + layout_result.x] = sum; \n"); + source.append(" } \n"); + source.append("} \n"); +} + + +//////////////////////////// Part 2: Main kernel class //////////////////////////////////// + +// main kernel class +/** @brief Main kernel class for generating OpenCL kernels for ell_matrix. 
*/
template<typename NumericT, typename IndexT>
struct sliced_ell_matrix;

// Only the IndexT == unsigned int specialization is provided; the primary
// template above is intentionally left undefined.
template<typename NumericT>
struct sliced_ell_matrix<NumericT, unsigned int>
{
  // OpenCL program name, e.g. "floatuint_sliced_ell_matrix"
  static std::string program_name()
  {
    return viennacl::ocl::type_to_string<NumericT>::apply() + viennacl::ocl::type_to_string<unsigned int>::apply() + "_sliced_ell_matrix";
  }

  // Compiles both vec_mul variants (with and without alpha/beta scaling)
  // into 'ctx' exactly once per OpenCL context.
  static void init(viennacl::ocl::context & ctx)
  {
    static std::map<cl_context, bool> init_done;
    if (!init_done[ctx.handle().get()])
    {
      viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
      std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();

      std::string source;
      source.reserve(1024);

      viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);

      // fully parametrized kernels:
      generate_sliced_ell_vec_mul(source, numeric_string, true);
      generate_sliced_ell_vec_mul(source, numeric_string, false);

      std::string prog_name = program_name();
      #ifdef VIENNACL_BUILD_INFO
      std::cout << "Creating program " << prog_name << std::endl;
      #endif
      ctx.add_program(source, prog_name);
      init_done[ctx.handle().get()] = true;
    } //if
  } //init
};

} // namespace kernels
} // namespace opencl
} // namespace linalg
} // namespace viennacl
#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
----------------------------------------------------------------------
diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
new file mode 100644
index 0000000..19ac991
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/spai.hpp
@@ -0,0 +1,631 @@
#ifndef VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP
#define VIENNACL_LINALG_OPENCL_KERNELS_SPAI_HPP

/* =========================================================================
   Copyright (c) 2010-2016, Institute for
Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

   -----------------
   ViennaCL - The Vienna Computing Library
   -----------------

   Project Head: Karl Rupp [email protected]

   (A list of authors and contributors can be found in the manual)

   License: MIT (X11), see file LICENSE in the base directory
============================================================================= */

#include "viennacl/tools/tools.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/platform.hpp"
#include "viennacl/ocl/utils.hpp"

/** @file viennacl/linalg/opencl/kernels/spai.hpp
 *  @brief OpenCL kernel file for sparse approximate inverse operations */
namespace viennacl
{
namespace linalg
{
namespace opencl
{
namespace kernels
{

//////////////////////////// Part 1: Kernel generation routines ////////////////////////////////////

// Emits helper functions get_element (CSR lookup of A(row,col), exploiting
// sorted column indices for early exit) and block_assembly (gathers the dense
// column-major sub-block A(I,J) into com_A_I_J), plus the __kernel
// assemble_blocks that applies block_assembly to every active block.
template<typename StringT>
void generate_spai_assemble_blocks(StringT & source, std::string const & numeric_string)
{
  source.append("float get_element(__global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" unsigned int row, \n");
  source.append(" unsigned int col) \n");
  source.append("{ \n");
  source.append(" unsigned int row_end = row_indices[row+1]; \n");
  source.append(" for (unsigned int i = row_indices[row]; i < row_end; ++i){ \n");
  source.append(" if (column_indices[i] == col) \n");
  source.append(" return elements[i]; \n");
  source.append(" if (column_indices[i] > col) \n");
  source.append(" return 0; \n");
  source.append(" } \n");
  source.append(" return 0; \n");
  source.append("} \n");

  source.append("void block_assembly(__global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const unsigned int * matrix_dimensions, \n");
  source.append(" __global const unsigned int * set_I, \n");
  source.append(" __global const unsigned int * set_J, \n");
  source.append(" unsigned int matrix_ind, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J) \n");
  source.append("{ \n");
  source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n");
  source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n");

  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
  //start row index
  source.append(" for (unsigned int j = 0; j < row_n; j++){ \n");
  source.append(" com_A_I_J[ i*row_n + j] = get_element(row_indices, column_indices, elements, set_I[j], set_J[i]); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("__kernel void assemble_blocks( \n");
  source.append(" __global const unsigned int * row_indices, \n");
  source.append(" __global const unsigned int * column_indices, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const unsigned int * set_I, \n");
  source.append(" __global const unsigned int * set_J, \n");
  source.append(" __global const unsigned int * i_ind, \n");
  source.append(" __global const unsigned int * j_ind, \n");
  source.append(" __global const unsigned int * block_ind, \n");
  source.append(" __global const unsigned int * matrix_dimensions, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * com_A_I_J, \n");
  source.append(" __global unsigned int * g_is_update, \n");
  source.append(" unsigned int block_elems_num) \n");
  source.append("{ \n");
  // grid-wide loop: blocks with empty dimensions or g_is_update == 0 are skipped
  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
  source.append(" block_assembly(row_indices, column_indices, elements, matrix_dimensions, set_I + i_ind[i], set_J + j_ind[i], i, com_A_I_J + block_ind[i]); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
}

// Emits the __kernel block_bv_assembly (plus helpers assemble_bv and
// assemble_bv_block) which concatenates, per active block, the existing
// right-hand-side segment g_bv with the update segment g_bv_u into g_bv_r.
template<typename StringT>
void generate_spai_block_bv_assembly(StringT & source, std::string const & numeric_string)
{
  source.append(" void assemble_bv(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n){ \n");
  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
  source.append(" g_bv_r[i] = g_bv[ i]; \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append(" void assemble_bv_block(__global "); source.append(numeric_string); source.append(" * g_bv_r, __global "); source.append(numeric_string); source.append(" * g_bv, unsigned int col_n, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, unsigned int col_n_u) \n");
  source.append(" { \n");
  source.append(" assemble_bv(g_bv_r, g_bv, col_n); \n");
  source.append(" assemble_bv(g_bv_r + col_n, g_bv_u, col_n_u); \n");
  source.append(" } \n");

  source.append(" __kernel void block_bv_assembly(__global "); source.append(numeric_string); source.append(" * g_bv, \n");
  source.append(" __global unsigned int * start_bv_ind, \n");
  source.append(" __global unsigned int * matrix_dimensions, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_u, \n");
  source.append(" __global unsigned int * start_bv_u_ind, \n");
  source.append(" __global unsigned int * matrix_dimensions_u, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * g_bv_r, \n");
  source.append(" __global unsigned int * start_bv_r_ind, \n");
  source.append(" __global unsigned int * matrix_dimensions_r, \n");
  source.append(" __global unsigned int * g_is_update, \n");
  source.append(" //__local "); source.append(numeric_string); source.append(" * local_gb, \n");
  source.append(" unsigned int block_elems_num) \n");
  source.append(" { \n");
  source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n");
  source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n");
  source.append(" assemble_bv_block(g_bv_r + start_bv_r_ind[i], g_bv + start_bv_ind[i], matrix_dimensions[2*i + 1], g_bv_u + start_bv_u_ind[i], matrix_dimensions_u[2*i + 1]); \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
}

// Emits the __kernel block_least_squares and its helpers: given the QR data
// stored in global_R (Householder vectors below the diagonal, R on/above it
// — presumably; confirm against the companion QR kernels), applies Q^T to the
// right-hand side (apply_q_trans_vec_ls via custom_dot_prod_ls) and then
// back-substitutes (backwardSolve) to obtain the least-squares solution m_v.
template<typename StringT>
void generate_spai_block_least_squares(StringT & source, std::string const & numeric_string)
{
  source.append("void custom_dot_prod_ls(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __global "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n");
  source.append(" *res = 0.0; \n");
  source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n");
  source.append(" if (j == ind){ \n");
  source.append(" *res += v[ j]; \n");
  source.append(" }else{ \n");
  source.append(" *res += A[ j + ind*row_n]*v[ j]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append("} \n");

  source.append("void backwardSolve(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * y, __global "); source.append(numeric_string); source.append(" * x){ \n");
  source.append(" for (int i = col_n-1; i >= 0; i--) { \n");
  source.append(" x[ i] = y[ i]; \n");
  source.append(" for (int j = i+1; j < col_n; ++j) { \n");
  source.append(" x[ i] -= R[ i + j*row_n]*x[ j]; \n");
  source.append(" } \n");
  source.append(" x[i] /= R[ i + i*row_n]; \n");
  source.append(" } \n");
  source.append("} \n");


  source.append("void apply_q_trans_vec_ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global const "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * y){ \n");
  source.append(" "); source.append(numeric_string); source.append(" inn_prod = 0; \n");
  source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n");
  source.append(" custom_dot_prod_ls(R, row_n, y, i, &inn_prod); \n");
  source.append(" for (unsigned int j = i; j < row_n; ++j){ \n");
  source.append(" if (i == j){ \n");
  source.append(" y[ j] -= b_v[ i]*inn_prod; \n");
  source.append(" } \n");
  source.append(" else{ \n");
  source.append(" y[j] -= b_v[ i]*inn_prod*R[ j +i*row_n]; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");

  source.append("void ls(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __global "); source.append(numeric_string); source.append(" * m_v, __global "); source.append(numeric_string); source.append(" * y_v){ \n");
  source.append(" apply_q_trans_vec_ls(R, row_n, col_n, b_v, y_v); \n");
  source.append(" //m_new - is m_v now \n");
  source.append(" backwardSolve(R, row_n, col_n, y_v, m_v); \n");
  source.append("} \n");

  source.append("__kernel void block_least_squares( \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * global_R, \n");
  source.append(" __global unsigned int * block_ind, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n");
  source.append(" __global unsigned int *
start_bv_inds, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * m_v, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * y_v, \n"); + source.append(" __global unsigned int * start_y_inds, \n"); + source.append(" __global unsigned int * matrix_dimensions, \n"); + source.append(" __global unsigned int * g_is_update, \n"); + source.append(" unsigned int block_elems_num) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); + source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); + source.append(" ls(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v +start_bv_inds[i], m_v + start_bv_inds[i], y_v + start_y_inds[i] ); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_spai_block_q_mult(StringT & source, std::string const & numeric_string) +{ + source.append("void custom_dot_prod(__global "); source.append(numeric_string); source.append(" * A, unsigned int row_n, __local "); source.append(numeric_string); source.append(" * v, unsigned int ind, "); source.append(numeric_string); source.append(" *res){ \n"); + source.append(" *res = 0.0; \n"); + source.append(" for (unsigned int j = ind; j < row_n; ++j){ \n"); + source.append(" if (j == ind){ \n"); + source.append(" *res += v[j]; \n"); + source.append(" }else{ \n"); + source.append(" *res += A[j + ind*row_n]*v[j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void apply_q_trans_vec(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * y){ \n"); + source.append(" "); 
source.append(numeric_string); source.append(" inn_prod = 0; \n"); + source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n"); + source.append(" custom_dot_prod(R, row_n, y, i, &inn_prod); \n"); + source.append(" for (unsigned int j = i; j < row_n; ++j){ \n"); + source.append(" if (i == j){ \n"); + source.append(" y[j] -= b_v[ i]*inn_prod; \n"); + source.append(" } \n"); + source.append(" else{ \n"); + source.append(" y[j] -= b_v[ i]*inn_prod*R[ j + i*row_n]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void q_mult(__global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, unsigned int col_n, __global "); source.append(numeric_string); source.append(" * b_v, __local "); source.append(numeric_string); source.append(" * R_u, unsigned int col_n_u){ \n"); + source.append(" for (unsigned int i = get_local_id(0); i < col_n_u; i+= get_local_size(0)){ \n"); + source.append(" apply_q_trans_vec(R, row_n, col_n, b_v, R_u + row_n*i); \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void matrix_from_global_to_local(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); + source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); + source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n"); + source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void matrix_from_local_to_global(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); + source.append(" for (unsigned int i = get_local_id(0); 
i < col_n; i+= get_local_size(0)){ \n"); + source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n"); + source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("__kernel void block_q_mult(__global "); source.append(numeric_string); source.append(" * global_R, \n"); + source.append(" __global unsigned int * block_ind, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * global_R_u, \n"); + source.append(" __global unsigned int *block_ind_u, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * b_v, \n"); + source.append(" __global unsigned int * start_bv_inds, \n"); + source.append(" __global unsigned int * matrix_dimensions, \n"); + source.append(" __global unsigned int * matrix_dimensions_u, \n"); + source.append(" __global unsigned int * g_is_update, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" * local_R_u, \n"); + source.append(" unsigned int block_elems_num){ \n"); + source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n"); + source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && (g_is_update[i] > 0)){ \n"); + //matrix_from_global_to_local(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n"); + source.append(" matrix_from_global_to_local(global_R_u, local_R_u, matrix_dimensions_u[2*i], matrix_dimensions_u[2*i+ 1], block_ind_u[i]); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" q_mult(global_R + block_ind[i], matrix_dimensions[2*i], matrix_dimensions[2*i + 1], b_v + start_bv_inds[i], local_R_u, \n"); + source.append(" matrix_dimensions_u[2*i + 1]); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" matrix_from_local_to_global(global_R_u, local_R_u, 
matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], block_ind_u[i]); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_spai_block_qr(StringT & source, std::string const & numeric_string) +{ + source.append("void dot_prod(__local const "); source.append(numeric_string); source.append("* A, unsigned int n, unsigned int beg_ind, "); source.append(numeric_string); source.append("* res){ \n"); + source.append(" *res = 0; \n"); + source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n"); + source.append(" *res += A[(beg_ind-1)*n + i]*A[(beg_ind-1)*n + i]; \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void vector_div(__global "); source.append(numeric_string); source.append("* v, unsigned int beg_ind, "); source.append(numeric_string); source.append(" b, unsigned int n){ \n"); + source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n"); + source.append(" v[i] /= b; \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void copy_vector(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, const unsigned int beg_ind, const unsigned int n){ \n"); + source.append(" for (unsigned int i = beg_ind; i < n; ++i){ \n"); + source.append(" v[i] = A[(beg_ind-1)*n + i]; \n"); + source.append(" } \n"); + source.append("} \n"); + + + source.append("void householder_vector(__local const "); source.append(numeric_string); source.append("* A, unsigned int j, unsigned int n, __global "); source.append(numeric_string); source.append("* v, __global "); source.append(numeric_string); source.append("* b){ \n"); + source.append(" "); source.append(numeric_string); source.append(" sg; \n"); + source.append(" dot_prod(A, n, j+1, &sg); \n"); + source.append(" copy_vector(A, v, j+1, n); \n"); + source.append(" "); source.append(numeric_string); source.append(" mu; \n"); + 
source.append(" v[j] = 1.0; \n"); + //print_contigious_vector(v, v_start_ind, n); + source.append(" if (sg == 0){ \n"); + source.append(" *b = 0; \n"); + source.append(" } \n"); + source.append(" else{ \n"); + source.append(" mu = sqrt(A[j*n + j]*A[ j*n + j] + sg); \n"); + source.append(" if (A[ j*n + j] <= 0){ \n"); + source.append(" v[j] = A[ j*n + j] - mu; \n"); + source.append(" }else{ \n"); + source.append(" v[j] = -sg/(A[ j*n + j] + mu); \n"); + source.append(" } \n"); + source.append(" *b = 2*(v[j]*v[j])/(sg + v[j]*v[j]); \n"); + //*b = (2*v[j]*v[j])/(sg + (v[j])*(v[j])); + source.append(" vector_div(v, j, v[j], n); \n"); + //print_contigious_vector(v, v_start_ind, n); + source.append(" } \n"); + source.append("} \n"); + + source.append("void custom_inner_prod(__local const "); source.append(numeric_string); source.append("* A, __global "); source.append(numeric_string); source.append("* v, unsigned int col_ind, unsigned int row_num, unsigned int start_ind, "); source.append(numeric_string); source.append("* res){ \n"); + source.append(" for (unsigned int i = start_ind; i < row_num; ++i){ \n"); + source.append(" *res += A[col_ind*row_num + i]*v[i]; \n"); + source.append(" } \n"); + source.append("} \n"); + // + source.append("void apply_householder_reflection(__local "); source.append(numeric_string); source.append("* A, unsigned int row_n, unsigned int col_n, unsigned int iter_cnt, __global "); source.append(numeric_string); source.append("* v, "); source.append(numeric_string); source.append(" b){ \n"); + source.append(" "); source.append(numeric_string); source.append(" in_prod_res; \n"); + source.append(" for (unsigned int i= iter_cnt + get_local_id(0); i < col_n; i+=get_local_size(0)){ \n"); + source.append(" in_prod_res = 0.0; \n"); + source.append(" custom_inner_prod(A, v, i, row_n, iter_cnt, &in_prod_res); \n"); + source.append(" for (unsigned int j = iter_cnt; j < row_n; ++j){ \n"); + source.append(" A[ i*row_n + j] -= b*in_prod_res* v[j]; \n"); + 
source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void store_householder_vector(__local "); source.append(numeric_string); source.append("* A, unsigned int ind, unsigned int n, __global "); source.append(numeric_string); source.append("* v){ \n"); + source.append(" for (unsigned int i = ind; i < n; ++i){ \n"); + source.append(" A[ (ind-1)*n + i] = v[i]; \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void single_qr( __local "); source.append(numeric_string); source.append("* R, __global unsigned int* matrix_dimensions, __global "); source.append(numeric_string); source.append("* b_v, __global "); source.append(numeric_string); source.append("* v, unsigned int matrix_ind){ \n"); + //matrix_dimensions[0] - number of rows + //matrix_dimensions[1] - number of columns + source.append(" unsigned int col_n = matrix_dimensions[2*matrix_ind + 1]; \n"); + source.append(" unsigned int row_n = matrix_dimensions[2*matrix_ind]; \n"); + + source.append(" if ((col_n == row_n)&&(row_n == 1)){ \n"); + source.append(" b_v[0] = 0.0; \n"); + source.append(" return; \n"); + source.append(" } \n"); + source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n"); + source.append(" if (get_local_id(0) == 0){ \n"); + source.append(" householder_vector(R, i, row_n, v, b_v + i); \n"); + source.append(" } \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" apply_householder_reflection(R, row_n, col_n, i, v, b_v[i]); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" if (get_local_id(0) == 0){ \n"); + source.append(" if (i < matrix_dimensions[2*matrix_ind]){ \n"); + source.append(" store_householder_vector(R, i+1, row_n, v); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void matrix_from_global_to_local_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); 
source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); + source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); + source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n"); + source.append(" l_M[i*row_n + j] = g_M[mat_start_ind + i*row_n + j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + source.append("void matrix_from_local_to_global_qr(__global "); source.append(numeric_string); source.append("* g_M, __local "); source.append(numeric_string); source.append("* l_M, unsigned int row_n, unsigned int col_n, unsigned int mat_start_ind){ \n"); + source.append(" for (unsigned int i = get_local_id(0); i < col_n; i+= get_local_size(0)){ \n"); + source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n"); + source.append(" g_M[mat_start_ind + i*row_n + j] = l_M[i*row_n + j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + + source.append("__kernel void block_qr( \n"); + source.append(" __global "); source.append(numeric_string); source.append("* R, \n"); + source.append(" __global unsigned int* matrix_dimensions, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* b_v, \n"); + source.append(" __global "); source.append(numeric_string); source.append("* v, \n"); + source.append(" __global unsigned int* start_matrix_inds, \n"); + source.append(" __global unsigned int* start_bv_inds, \n"); + source.append(" __global unsigned int* start_v_inds, \n"); + source.append(" __global unsigned int * g_is_update, \n"); + source.append(" __local "); source.append(numeric_string); source.append("* local_buff_R, \n"); + source.append(" unsigned int block_elems_num){ \n"); + source.append(" for (unsigned int i = get_group_id(0); i < block_elems_num; i += get_num_groups(0)){ \n"); + source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] 
> 0) && g_is_update[i] > 0){ \n"); + source.append(" matrix_from_global_to_local_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" single_qr(local_buff_R, matrix_dimensions, b_v + start_bv_inds[i], v + start_v_inds[i], i); \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); + source.append(" matrix_from_local_to_global_qr(R, local_buff_R, matrix_dimensions[2*i], matrix_dimensions[2*i + 1], start_matrix_inds[i]); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_spai_block_qr_assembly(StringT & source, std::string const & numeric_string) +{ + source.append("void assemble_upper_part(__global "); source.append(numeric_string); source.append(" * R_q, \n"); + source.append(" unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n"); + source.append(" unsigned int row_n_u, unsigned int col_n_u, \n"); + source.append(" unsigned int col_n, unsigned int diff){ \n"); + source.append(" for (unsigned int i = 0; i < col_n_q; ++i){ \n"); + source.append(" for (unsigned int j = 0; j < diff; ++j){ \n"); + source.append(" R_q[ i*row_n_q + j] = R_u[ i*row_n_u + j + col_n ]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + + source.append("void assemble_lower_part(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); + source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u, \n"); + source.append(" unsigned int diff){ \n"); + source.append(" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n"); + source.append(" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n"); + source.append(" R_q[i*row_n_q + j + diff] = R_u_u[i*row_n_u_u + j]; \n"); + 
source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void assemble_qr_block(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n"); + source.append(" unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, unsigned int col_n_u_u, unsigned int col_n){ \n"); + source.append(" unsigned int diff = row_n_u - col_n; \n"); + source.append(" assemble_upper_part(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"); + source.append(" if (diff > 0){ \n"); + source.append(" assemble_lower_part(R_q, row_n_q, col_n_q, R_u_u, row_n_u_u, col_n_u_u, diff); \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("__kernel void block_qr_assembly( \n"); + source.append(" __global unsigned int * matrix_dimensions, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n"); + source.append(" __global unsigned int * block_ind_u, \n"); + source.append(" __global unsigned int * matrix_dimensions_u, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); + source.append(" __global unsigned int * block_ind_u_u, \n"); + source.append(" __global unsigned int * matrix_dimensions_u_u, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n"); + source.append(" __global unsigned int * block_ind_q, \n"); + source.append(" __global unsigned int * matrix_dimensions_q, \n"); + source.append(" __global unsigned int * g_is_update, \n"); + source.append(" //__local "); source.append(numeric_string); source.append(" * local_R_q, \n"); + source.append(" unsigned int block_elems_num) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += 
get_global_size(0)){ \n"); + source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); + source.append(" assemble_qr_block(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"); + source.append(" matrix_dimensions_u[2*i + 1], R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_spai_block_qr_assembly_1(StringT & source, std::string const & numeric_string) +{ + source.append("void assemble_upper_part_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, \n"); + source.append(" unsigned int row_n_u, unsigned int col_n_u, \n"); + source.append(" unsigned int col_n, unsigned int diff){ \n"); + source.append(" for (unsigned int i = 0; i < col_n_q; ++i){ \n"); + source.append(" for (unsigned int j = 0; j < diff; ++j){ \n"); + source.append(" R_q[ i*row_n_q + j] = R_u[i*row_n_u + j + col_n ]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + + + source.append("void assemble_qr_block_1(__global "); source.append(numeric_string); source.append(" * R_q, unsigned int row_n_q, unsigned int col_n_q, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, \n"); + source.append(" unsigned int col_n_u, unsigned int col_n){ \n"); + source.append(" unsigned int diff = row_n_u - col_n; \n"); + source.append(" assemble_upper_part_1(R_q, row_n_q, col_n_q, R_u, row_n_u, col_n_u, col_n, diff); \n"); + source.append("} \n"); + + source.append("__kernel void block_qr_assembly_1( \n"); + source.append(" __global unsigned int * matrix_dimensions, \n"); + source.append(" __global 
"); source.append(numeric_string); source.append(" * R_u, \n"); + source.append(" __global unsigned int * block_ind_u, \n"); + source.append(" __global unsigned int * matrix_dimensions_u, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * R_q, \n"); + source.append(" __global unsigned int * block_ind_q, \n"); + source.append(" __global unsigned int * matrix_dimensions_q, \n"); + source.append(" __global unsigned int * g_is_update, \n"); + source.append(" unsigned int block_elems_num) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); + source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); + source.append(" assemble_qr_block_1(R_q + block_ind_q[i], matrix_dimensions_q[2*i], matrix_dimensions_q[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], \n"); + source.append(" matrix_dimensions_u[2*i + 1], matrix_dimensions[2*i + 1]); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); +} + +template<typename StringT> +void generate_spai_block_r_assembly(StringT & source, std::string const & numeric_string) +{ + source.append("void assemble_r(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, \n"); + source.append(" unsigned int row_n, unsigned int col_n) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = 0; i < col_n; ++i){ \n"); + source.append(" for (unsigned int j = 0; j < row_n; ++j){ \n"); + source.append(" gR[i*row_n_r + j] = R[i*row_n + j ]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void assemble_r_u(__global "); source.append(numeric_string); source.append(" * gR, \n"); + source.append(" unsigned int row_n_r, unsigned int col_n_r, __global "); 
source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, \n"); + source.append(" unsigned int col_n) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = 0; i < col_n_u; ++i){ \n"); + source.append(" for (unsigned int j = 0; j < col_n; ++j){ \n"); + source.append(" gR[ (i+col_n)*row_n_r + j] = R_u[ i*row_n_u + j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + + source.append("void assemble_r_u_u(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R_u_u, unsigned int row_n_u_u, \n"); + source.append(" unsigned int col_n_u_u, unsigned int col_n) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = 0; i < col_n_u_u; ++i){ \n"); + source.append(" for (unsigned int j = 0; j < row_n_u_u; ++j){ \n"); + source.append(" gR[(col_n+i)*row_n_r + j + col_n] = R_u_u[i*row_n_u_u + j]; \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); + + source.append("void assemble_r_block(__global "); source.append(numeric_string); source.append(" * gR, unsigned int row_n_r, unsigned int col_n_r, __global "); source.append(numeric_string); source.append(" * R, unsigned int row_n, \n"); + source.append(" unsigned int col_n, __global "); source.append(numeric_string); source.append(" * R_u, unsigned int row_n_u, unsigned int col_n_u, __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); + source.append(" unsigned int row_n_u_u, unsigned int col_n_u_u){ \n"); + source.append(" assemble_r(gR, row_n_r, col_n_r, R, row_n, col_n); \n"); + source.append(" assemble_r_u(gR, row_n_r, col_n_r, R_u, row_n_u, col_n_u, col_n); \n"); + source.append(" assemble_r_u_u(gR, row_n_r, col_n_r, R_u_u, row_n_u_u, col_n_u_u, col_n); \n"); + source.append("} \n"); + + + source.append("__kernel void block_r_assembly( \n"); + 
source.append(" __global "); source.append(numeric_string); source.append(" * R, \n"); + source.append(" __global unsigned int * block_ind, \n"); + source.append(" __global unsigned int * matrix_dimensions, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * R_u, \n"); + source.append(" __global unsigned int * block_ind_u, \n"); + source.append(" __global unsigned int * matrix_dimensions_u, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * R_u_u, \n"); + source.append(" __global unsigned int * block_ind_u_u, \n"); + source.append(" __global unsigned int * matrix_dimensions_u_u, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" * g_R, \n"); + source.append(" __global unsigned int * block_ind_r, \n"); + source.append(" __global unsigned int * matrix_dimensions_r, \n"); + source.append(" __global unsigned int * g_is_update, \n"); + source.append(" unsigned int block_elems_num) \n"); + source.append("{ \n"); + source.append(" for (unsigned int i = get_global_id(0); i < block_elems_num; i += get_global_size(0)){ \n"); + source.append(" if ((matrix_dimensions[2*i] > 0) && (matrix_dimensions[2*i + 1] > 0) && g_is_update[i] > 0){ \n"); + + source.append(" assemble_r_block(g_R + block_ind_r[i], matrix_dimensions_r[2*i], matrix_dimensions_r[2*i + 1], R + block_ind[i], matrix_dimensions[2*i], \n"); + source.append(" matrix_dimensions[2*i + 1], R_u + block_ind_u[i], matrix_dimensions_u[2*i], matrix_dimensions_u[2*i + 1], \n"); + source.append(" R_u_u + block_ind_u_u[i], matrix_dimensions_u_u[2*i], matrix_dimensions_u_u[2*i + 1]); \n"); + + source.append(" } \n"); + source.append(" } \n"); + source.append("} \n"); +} + +//////////////////////////// Part 2: Main kernel class //////////////////////////////////// + +// main kernel class +/** @brief Main kernel class for generating OpenCL kernels for the sparse approximate inverse preconditioners. 
*/ +template<typename NumericT> +struct spai +{ + static std::string program_name() + { + return viennacl::ocl::type_to_string<NumericT>::apply() + "_spai"; + } + + static void init(viennacl::ocl::context & ctx) + { + static std::map<cl_context, bool> init_done; + if (!init_done[ctx.handle().get()]) + { + viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx); + std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply(); + + std::string source; + source.reserve(1024); + + viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source); + + generate_spai_assemble_blocks(source, numeric_string); + generate_spai_block_bv_assembly(source, numeric_string); + generate_spai_block_least_squares(source, numeric_string); + generate_spai_block_q_mult(source, numeric_string); + generate_spai_block_qr(source, numeric_string); + generate_spai_block_qr_assembly(source, numeric_string); + generate_spai_block_qr_assembly_1(source, numeric_string); + generate_spai_block_r_assembly(source, numeric_string); + + std::string prog_name = program_name(); + #ifdef VIENNACL_BUILD_INFO + std::cout << "Creating program " << prog_name << std::endl; + #endif + ctx.add_program(source, prog_name); + init_done[ctx.handle().get()] = true; + } //if + } //init +}; + +} // namespace kernels +} // namespace opencl +} // namespace linalg +} // namespace viennacl +#endif +
