http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp new file mode 100644 index 0000000..64c12b0 --- /dev/null +++ b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp @@ -0,0 +1,2645 @@ +#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_ +#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2016, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + Portions of this software are copyright by UChicago Argonne, LLC. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp [email protected] + + (A list of authors and contributors can be found in the manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + +/** @file viennacl/linalg/opencl/kernels/bisect.hpp + @brief OpenCL kernels for the bisection algorithm for eigenvalues + + Implementation based on the sample provided with the CUDA 6.0 SDK, for which + the creation of derivative works is allowed by including the following statement: + "This software contains source code provided by NVIDIA Corporation." 
+*/ + + + +#include "viennacl/tools/tools.hpp" +#include "viennacl/ocl/kernel.hpp" +#include "viennacl/ocl/platform.hpp" +#include "viennacl/ocl/utils.hpp" + +#include "viennacl/linalg/opencl/common.hpp" + +// declaration, forward + +namespace viennacl +{ +namespace linalg +{ +namespace opencl +{ +namespace kernels +{ + template <typename StringType> + void generate_bisect_kernel_config(StringType & source) + { + /* Global configuration parameter */ + source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK 256\n"); + source.append(" #define VIENNACL_BISECT_MAX_SMALL_MATRIX 256\n"); + source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256\n"); + source.append(" #define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37\n"); + + } + + //////////////////////////////////////////////////////////////////////////////// + // Compute the next lower power of two of n + // n number for which next higher power of two is seeked + //////////////////////////////////////////////////////////////////////////////// + + template <typename StringType> + void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string) + { + source.append(" \n"); + source.append(" inline int \n"); + source.append(" floorPow2(int n) \n"); + source.append(" { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + + // early out if already power of two + source.append(" if (0 == (n & (n-1))) \n"); + source.append(" { \n"); + source.append(" return n; \n"); + source.append(" } \n"); + + source.append(" int exp; \n"); + source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n"); + source.append(" return (1 << (exp - 1)); \n"); + source.append(" } \n"); + + } + + + 
////////////////////////////////////////////////////////////////////////////////
// Compute the next higher power of two of n
// n number for which next higher power of two is sought
////////////////////////////////////////////////////////////////////////////////

/** @brief Appends the device helper `ceilPow2` (smallest power of two not less than n).
 *
 * @param source          buffer the OpenCL program text is appended to
 * @param numeric_string  scalar type used in the generated kernel ("float" or "double")
 */
template <typename StringType>
void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string)
{
  source.append(" \n"
                " inline int \n"
                " ceilPow2(int n) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  // early out if already power of two
  source.append(" if (0 == (n & (n-1))) \n"
                " { \n"
                " return n; \n"
                " } \n");

  // otherwise round up via the exponent of the leading bit
  source.append(" int exp; \n");
  source.append(" frexp(( " + numeric_string + " )n, &exp); \n");
  source.append(" return (1 << exp); \n"
                " } \n");
}


////////////////////////////////////////////////////////////////////////////////
// Compute midpoint of interval [left, right] avoiding overflow if possible
//
// left left / lower limit of interval
// right right / upper limit of interval
////////////////////////////////////////////////////////////////////////////////

/** @brief Appends the device helper `computeMidpoint` (overflow-aware interval midpoint). */
template <typename StringType>
void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string)
{
  source.append(" \n");
  source.append(" inline " + numeric_string + " \n");
  source.append(" computeMidpoint(const " + numeric_string + " left,\n");
  source.append(" const " + numeric_string + " right) \n");
  source.append(" { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");
  source.append(" " + numeric_string + " mid; \n");

  // same sign: step from 'left' so that (left + right) cannot overflow
  source.append(" if (sign(left) == sign(right)) \n"
                " { \n"
                " mid = left + (right - left) * 0.5f; \n"
                " } \n"
                " else \n"
                " { \n"
                " mid = (left + right) * 0.5f; \n"
                " } \n");

  source.append(" return mid; \n"
                " } \n");
}


////////////////////////////////////////////////////////////////////////////////
// Check if interval converged and store appropriately
//
// addr address where to store the information of the interval
// s_left shared memory storage for left interval limits
// s_right shared memory storage for right interval limits
// s_left_count shared memory storage for number of eigenvalues less than left interval limits
// s_right_count shared memory storage for number of eigenvalues less than right interval limits
// left lower limit of interval
// right upper limit of interval
// left_count eigenvalues less than \a left
// right_count eigenvalues less than \a right
// precision desired precision for eigenvalues
////////////////////////////////////////////////////////////////////////////////

/** @brief Appends the device helper `storeInterval` (unsigned int counters). */
template<typename StringType>
void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string)
{
  source.append(" \n"
                " void \n"
                " storeInterval(unsigned int addr, \n");
  source.append(" __local " + numeric_string + " * s_left, \n");
  source.append(" __local " + numeric_string + " * s_right, \n");
  source.append(" __local unsigned int * s_left_count, \n"
                " __local unsigned int * s_right_count, \n");
  source.append(" " + numeric_string + " left, \n");
  source.append(" " + numeric_string + " right, \n");
  source.append(" unsigned int left_count, \n"
                " unsigned int right_count, \n");
  source.append(" " + numeric_string + " precision) \n");
  source.append(" { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  source.append(" s_left_count[addr] = left_count; \n"
                " s_right_count[addr] = right_count; \n");

  // check if interval converged
  source.append(" " + numeric_string + " t0 = fabs(right - left); \n");
  source.append(" " + numeric_string + " t1 = max(fabs(left), fabs(right)) * precision; \n");
  source.append(" if (t0 <= max(( " + numeric_string + " )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n"
                " { \n");
  // converged: collapse the interval onto its midpoint
  source.append(" " + numeric_string + " lambda = computeMidpoint(left, right); \n");
  source.append(" s_left[addr] = lambda; \n"
                " s_right[addr] = lambda; \n"
                " } \n"
                " else \n"
                " { \n");
  // not converged: keep the current limits
  source.append(" s_left[addr] = left; \n"
                " s_right[addr] = right; \n"
                " } \n"
                " } \n");
}

/** @brief Appends the device helper `storeIntervalShort` (unsigned short counters). */
template<typename StringType>
void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string)
{
  source.append(" \n"
                " void \n"
                " storeIntervalShort(unsigned int addr, \n");
  source.append(" __local " + numeric_string + " * s_left, \n");
  source.append(" __local " + numeric_string + " * s_right, \n");
  source.append(" __local unsigned short * s_left_count, \n"
                " __local unsigned short * s_right_count, \n");
  source.append(" " + numeric_string + " left, \n");
  source.append(" " + numeric_string + " right, \n");
  source.append(" unsigned int left_count, \n"
                " unsigned int right_count, \n");
  source.append(" " + numeric_string + " precision) \n");
  source.append(" { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  source.append(" s_left_count[addr] = left_count; \n"
                " s_right_count[addr] = right_count; \n");

  // check if interval converged
  source.append(" " + numeric_string + " t0 = fabs(right - left); \n");
  source.append(" " + numeric_string + " t1 = max(fabs(left), fabs(right)) * precision; \n");
  source.append(" if (t0 <= max(( " + numeric_string + " )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n"
                " { \n");
  // converged: collapse the interval onto its midpoint
  source.append(" " + numeric_string + " lambda = computeMidpoint(left, right); \n");
  source.append(" s_left[addr] = lambda; \n"
                " s_right[addr] = lambda; \n"
                " } \n"
                " else \n"
                " { \n");
  // not converged: keep the current limits
  source.append(" s_left[addr] = left; \n"
                " s_right[addr] = right; \n"
                " } \n"
                " } \n");
}


////////////////////////////////////////////////////////////////////////////////
// Compute number of eigenvalues that are smaller than x given a symmetric,
// real, and tridiagonal matrix
//
// g_d diagonal elements stored in global memory
// g_s superdiagonal elements stored in global memory
// n size of matrix
// x value for which the number of eigenvalues that are smaller is sought
// tid thread identified (e.g. threadIdx.x or gtid)
// num_intervals_active number of active intervals / threads that currently process an interval
// s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory
// s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
// converged flag if the current thread is already converged (that is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////

/** @brief Appends the device helper `computeNumSmallerEigenvals` (matrix fits into local memory). */
template <typename StringType>
void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string)
{
  source.append(" \n"
                " inline unsigned int \n");
  source.append(" computeNumSmallerEigenvals(__global " + numeric_string + " *g_d, \n");
  source.append(" __global " + numeric_string + " *g_s, \n");
  source.append(" const unsigned int n, \n");
  source.append(" const " + numeric_string + " x, \n");
  source.append(" const unsigned int tid, \n"
                " const unsigned int num_intervals_active, \n");
  source.append(" __local " + numeric_string + " *s_d, \n");
  source.append(" __local " + numeric_string + " *s_s, \n");
  source.append(" unsigned int converged \n"
                " ) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  source.append(" " + numeric_string + " delta = 1.0f; \n");
  source.append(" unsigned int count = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // read data into shared memory
  // NOTE(review): for lcl_id == 0 the generated code reads g_s[-1]; this mirrors the
  // original NVIDIA sample the code derives from — verify the buffer layout upstream.
  source.append(" if (lcl_id < n) \n"
                " { \n"
                " s_d[lcl_id] = *(g_d + lcl_id); \n"
                " s_s[lcl_id] = *(g_s + lcl_id - 1); \n"
                " } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // perform loop only for active threads
  source.append(" if ((tid < num_intervals_active) && (0 == converged)) \n"
                " { \n");
  // (optimized) Gaussian elimination to count eigenvalues smaller than x
  source.append(" for (unsigned int k = 0; k < n; ++k) \n"
                " { \n"
                " delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n"
                " count += (delta < 0) ? 1 : 0; \n"
                " } \n"
                " } \n"); // end if thread currently processing an interval
  source.append(" return count; \n"
                " } \n");
}


////////////////////////////////////////////////////////////////////////////////
// Compute number of eigenvalues that are smaller than x given a symmetric,
// real, and tridiagonal matrix
//
// g_d diagonal elements stored in global memory
// g_s superdiagonal elements stored in global memory
// n size of matrix
// x value for which the number of eigenvalues that are smaller is sought
// tid thread identified (e.g. threadIdx.x or gtid)
// num_intervals_active number of active intervals / threads that currently process an interval
// s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory
// s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory
// converged flag if the current thread is already converged (that is count does not have to be computed)
////////////////////////////////////////////////////////////////////////////////

/** @brief Appends the device helper `computeNumSmallerEigenvalsLarge` (matrix streamed in chunks of local size). */
template <typename StringType>
void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string)
{
  source.append(" \n"
                " inline unsigned int \n");
  source.append(" computeNumSmallerEigenvalsLarge(__global " + numeric_string + " *g_d, \n");
  source.append(" __global " + numeric_string + " *g_s, \n");
  source.append(" const unsigned int n, \n");
  source.append(" const " + numeric_string + " x, \n");
  source.append(" const unsigned int tid, \n"
                " const unsigned int num_intervals_active, \n");
  source.append(" __local " + numeric_string + " *s_d, \n");
  source.append(" __local " + numeric_string + " *s_s, \n");
  source.append(" unsigned int converged \n"
                " ) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  source.append(" " + numeric_string + " delta = 1.0f; \n");
  source.append(" unsigned int count = 0; \n");
  source.append(" unsigned int rem = n; \n");

  // do until whole diagonal and superdiagonal has been loaded and processed
  source.append(" for (unsigned int i = 0; i < n; i += lcl_sz) \n"
                " { \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  // read new chunk of data into shared memory
  source.append(" if ((i + lcl_id) < n) \n"
                " { \n"
                " s_d[lcl_id] = *(g_d + i + lcl_id); \n"
                " s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n"
                " } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n");

  source.append(" if (tid < num_intervals_active) \n"
                " { \n");
  // (optimized) Gaussian elimination over the chunk currently in local memory
  source.append(" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n"
                " { \n"
                " delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n"
                " count += (delta < 0) ? 1 : 0; \n"
                " } \n"
                " } \n"); // end if thread currently processing an interval

  source.append(" rem -= lcl_sz; \n"
                " } \n");
  source.append(" return count; \n"
                " } \n");
}

////////////////////////////////////////////////////////////////////////////////
// Store all non-empty intervals resulting from the subdivision of the interval
// currently processed by the thread
//
// addr base address for storing intervals
// num_threads_active number of threads / intervals in current sweep
// s_left shared memory storage for left interval limits
// s_right shared memory storage for right interval limits
// s_left_count shared memory storage for number of eigenvalues less than left interval limits
// s_right_count shared memory storage for number of eigenvalues less than right interval limits
// left lower limit of interval
// mid midpoint of interval
// right upper limit of interval
// left_count eigenvalues less
// than \a left
// mid_count eigenvalues less than \a mid
// right_count eigenvalues less than \a right
// precision desired precision for eigenvalues
// compact_second_chunk shared mem flag if second chunk is used and ergo requires compaction
// s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
// is_active_interval mark is thread has a second non-empty child interval
////////////////////////////////////////////////////////////////////////////////

/** @brief Appends the device helper `storeNonEmptyIntervals` (unsigned int counters).
 *
 * Stores the up-to-two child intervals produced by one bisection step; a second
 * child is only flagged here and written later, after stream compaction.
 */
template<typename StringType>
void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string)
{
  source.append(" \n"
                " void \n"
                " storeNonEmptyIntervals(unsigned int addr, \n"
                " const unsigned int num_threads_active, \n");
  source.append(" __local " + numeric_string + " *s_left, \n");
  source.append(" __local " + numeric_string + " *s_right, \n");
  source.append(" __local unsigned int *s_left_count, \n"
                " __local unsigned int *s_right_count, \n");
  source.append(" " + numeric_string + " left, \n ");
  source.append(" " + numeric_string + " mid, \n");
  source.append(" " + numeric_string + " right,\n");
  source.append(" const unsigned int left_count, \n"
                " const unsigned int mid_count, \n"
                " const unsigned int right_count, \n");
  source.append(" " + numeric_string + " precision, \n");
  source.append(" __local unsigned int *compact_second_chunk, \n"
                " __local unsigned int *s_compaction_list_exc, \n"
                " unsigned int *is_active_second) \n"
                " { \n"
                " uint glb_id = get_global_id(0); \n"
                " uint grp_id = get_group_id(0); \n"
                " uint grp_nm = get_num_groups(0); \n"
                " uint lcl_id = get_local_id(0); \n"
                " uint lcl_sz = get_local_size(0); \n");

  // check if both child intervals are valid
  source.append(" \n"
                " if ((left_count != mid_count) && (mid_count != right_count)) \n"
                " { \n");

  // store the left interval
  source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " left, mid, left_count, mid_count, precision); \n");

  // mark that a second interval has been generated, only stored after
  // stream compaction of second chunk
  source.append(" *is_active_second = 1; \n"
                " s_compaction_list_exc[lcl_id] = 1; \n"
                " *compact_second_chunk = 1; \n"
                " } \n"
                " else \n"
                " { \n");

  // only one non-empty child interval: mark that no second child exists
  source.append(" *is_active_second = 0; \n"
                " s_compaction_list_exc[lcl_id] = 0; \n");

  // store the one valid child interval
  source.append(" if (left_count != mid_count) \n"
                " { \n"
                " storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " left, mid, left_count, mid_count, precision); \n"
                " } \n"
                " else \n"
                " { \n"
                " storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"
                " mid, right, mid_count, right_count, precision); \n"
                " } \n");

  source.append(" } \n"
                " } \n");
}


////////////////////////////////////////////////////////////////////////////////
//! Store all non-empty intervals resulting from the subdivision of the interval
//!
currently processed by the thread + //////////////////////////////////////////////////////////////////////////////// + + template <typename StringType> + void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string) + { + source.append(" \n"); + source.append(" void \n"); + source.append(" storeNonEmptyIntervalsLarge(unsigned int addr, \n"); + source.append(" const unsigned int num_threads_active, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n"); + source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); + source.append(" __local unsigned short *s_left_count, \n"); + source.append(" __local unsigned short *s_right_count, \n"); + source.append(" "); source.append(numeric_string); source.append(" left, \n "); + source.append(" "); source.append(numeric_string); source.append(" mid, \n"); + source.append(" "); source.append(numeric_string); source.append(" right,\n"); + source.append(" const unsigned int left_count, \n"); + source.append(" const unsigned int mid_count, \n"); + source.append(" const unsigned int right_count, \n"); + source.append(" "); source.append(numeric_string); source.append(" epsilon, \n"); + source.append(" __local unsigned int *compact_second_chunk, \n"); + source.append(" __local unsigned short *s_compaction_list, \n"); + source.append(" unsigned int *is_active_second) \n"); + source.append(" { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + // check if both child intervals are valid + source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n"); + source.append(" { \n"); + + source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, 
s_right_count, \n"); + source.append(" left, mid, left_count, mid_count, epsilon); \n"); + + source.append(" *is_active_second = 1; \n"); + source.append(" s_compaction_list[lcl_id] = 1; \n"); + source.append(" *compact_second_chunk = 1; \n"); + source.append(" } \n"); + source.append(" else \n"); + source.append(" { \n"); + + // only one non-empty child interval + + // mark that no second child + source.append(" *is_active_second = 0; \n"); + source.append(" s_compaction_list[lcl_id] = 0; \n"); + + // store the one valid child interval + source.append(" if (left_count != mid_count) \n"); + source.append(" { \n"); + source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"); + source.append(" left, mid, left_count, mid_count, epsilon); \n"); + source.append(" } \n"); + source.append(" else \n"); + source.append(" { \n"); + source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"); + source.append(" mid, right, mid_count, right_count, epsilon); \n"); + source.append(" } \n"); + source.append(" } \n"); + source.append(" } \n"); + } + + //////////////////////////////////////////////////////////////////////////////// + // Create indices for compaction, that is process \a s_compaction_list_exc + // which is 1 for intervals that generated a second child and 0 otherwise + // and create for each of the non-zero elements the index where the new + // interval belongs to in a compact representation of all generated second children + // + // s_compaction_list_exc list containing the flags which threads generated two children + // num_threads_compaction number of threads to employ for compaction + //////////////////////////////////////////////////////////////////////////////// + + template<typename StringType> + void generate_bisect_kernel_createIndicesCompaction(StringType & source) + { + source.append(" \n"); + source.append(" void \n"); + source.append(" createIndicesCompaction(__local unsigned int 
*s_compaction_list_exc, \n"); + source.append(" unsigned int num_threads_compaction) \n"); + source.append(" { \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + + + source.append(" unsigned int offset = 1; \n"); + source.append(" const unsigned int tid = lcl_id; \n"); + // if(tid == 0) + // printf("num_threads_compaction = %u\n", num_threads_compaction); + + // higher levels of scan tree + source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"); + source.append(" { \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); + + source.append(" if (tid < d) \n"); + source.append(" { \n"); + + source.append(" unsigned int ai = offset*(2*tid+1)-1; \n"); + source.append(" unsigned int bi = offset*(2*tid+2)-1; \n"); + source.append(" \n"); + source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"); + source.append(" + s_compaction_list_exc[ai]; \n"); + source.append(" } \n"); + + source.append(" offset <<= 1; \n"); + source.append(" } \n"); + + // traverse down tree: first down to level 2 across + source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n"); + source.append(" { \n"); + + source.append(" offset >>= 1; \n"); + source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); + + source.append(" if (tid < (d-1)) \n"); + source.append(" { \n"); + + source.append(" unsigned int ai = offset*(tid+1) - 1; \n"); + source.append(" unsigned int bi = ai + (offset >> 1); \n"); + + source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"); + source.append(" + s_compaction_list_exc[ai]; \n"); + source.append(" } \n"); + source.append(" } \n"); + + source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); + + source.append(" } \n"); + } + + + template<typename StringType> + void 
generate_bisect_kernel_createIndicesCompactionShort(StringType & source)
  {
    // Emits the OpenCL device helper 'createIndicesCompactionShort': computes
    // write indices for stream compaction via an in-place exclusive prefix sum
    // (up-sweep / down-sweep in local memory) over the 'unsigned short' flag
    // array s_compaction_list_exc. num_threads_compaction must be a power of
    // two (callers obtain it via ceilPow2 — TODO confirm against callers).
    source.append("     \n");
    source.append("     void \n");
    source.append("     createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n");
    source.append("                             unsigned int num_threads_compaction) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");


    source.append("       unsigned int offset = 1; \n");
    source.append("       const unsigned int tid = lcl_id; \n");

    // higher levels of scan tree (up-sweep: build partial sums bottom-up)
    source.append("       for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n");
    source.append("       { \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("         if (tid < d) \n");
    source.append("         { \n");

    source.append("           unsigned int  ai = offset*(2*tid+1)-1; \n");
    source.append("           unsigned int  bi = offset*(2*tid+2)-1; \n");
    source.append("     \n");
    source.append("           s_compaction_list_exc[bi] =   s_compaction_list_exc[bi] \n");
    source.append("                                       + s_compaction_list_exc[ai]; \n");
    source.append("         } \n");

    source.append("         offset <<= 1; \n");
    source.append("       } \n");

    // traverse down tree: first down to level 2 across
    // (down-sweep: distribute the partial sums back to interior nodes)
    source.append("       for (int d = 2; d < num_threads_compaction; d <<= 1) \n");
    source.append("       { \n");

    source.append("         offset >>= 1; \n");
    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("         if (tid < (d-1)) \n");
    source.append("         { \n");

    source.append("           unsigned int  ai = offset*(tid+1) - 1; \n");
    source.append("           unsigned int  bi = ai + (offset >> 1); \n");

    source.append("           s_compaction_list_exc[bi] =   s_compaction_list_exc[bi] \n");
    source.append("                                       + s_compaction_list_exc[ai]; \n");
    source.append("         } \n");
    source.append("       } \n");

    source.append("       barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("     } \n");
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Perform stream compaction for second child intervals
  //
  // s_left              shared memory storage for left interval limits
  // s_right             shared memory storage for right interval limits
  // s_left_count        shared memory storage for number of eigenvalues less than left interval limits
  // s_right_count       shared memory storage for number of eigenvalues less than right interval limits
  // mid                 midpoint of current interval (left of new interval)
  // right               upper limit of interval
  // mid_count           eigenvalues less than \a mid
  // s_compaction_list   list containing the indices where the data has to be stored
  // num_threads_active  number of active threads / intervals
  // is_active_second    marks if the thread has a second non-empty child interval
  ///////////////////////////////////////////////////////////////////////////////


  /** @brief Emits the device helper compactIntervals(): copies each thread's
   *         second child interval to the slot computed by the preceding
   *         exclusive scan (second-chunk slots start at num_threads_active). */
  template<typename StringType>
  void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string)
  {
    source.append("     \n");
    source.append("     void \n");
    source.append("     compactIntervals(__local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append("                      __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append("                      __local unsigned int *s_left_count, \n");
    source.append("                      __local unsigned int *s_right_count, \n");
    source.append("                      "); source.append(numeric_string); source.append(" mid, \n");
    source.append("                      "); source.append(numeric_string); source.append(" right, \n");
    source.append("                      unsigned int mid_count, unsigned int right_count, \n");
    source.append("                      __local unsigned int *s_compaction_list, \n");
    source.append("                      unsigned int num_threads_active, \n");
    source.append("                      unsigned int is_active_second) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    source.append("       const unsigned int tid = lcl_id; \n");

    // perform compaction / copy data for all threads where the second
    // child is not dead
    source.append("       if ((tid < num_threads_active) && (1 == is_active_second)) \n");
    source.append("       { \n");
    source.append("         unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
    source.append("         s_left[addr_w] = mid; \n");
    source.append("         s_right[addr_w] = right; \n");
    source.append("         s_left_count[addr_w] = mid_count; \n");
    source.append("         s_right_count[addr_w] = right_count; \n");
    source.append("       } \n");
    source.append("     } \n");
  }




  /** @brief Emits compactIntervalsShort(): identical to compactIntervals()
   *         except the count / compaction-list arrays are 'unsigned short'
   *         (used by the small-matrix / short-index kernel variants). */
  template<typename StringType>
  void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string)
  {
    source.append("     \n");
    source.append("     void \n");
    source.append("     compactIntervalsShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append("                           __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append("                           __local unsigned short *s_left_count, \n");
    source.append("                           __local unsigned short *s_right_count, \n");
    source.append("                           "); source.append(numeric_string); source.append(" mid, \n");
    source.append("                           "); source.append(numeric_string); source.append(" right, \n");
    source.append("                           unsigned int mid_count, unsigned int right_count, \n");
    source.append("                           __local unsigned short *s_compaction_list, \n");
    source.append("                           unsigned int num_threads_active, \n");
    source.append("                           unsigned int is_active_second) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    source.append("       const unsigned int tid = lcl_id; \n");

    // perform compaction / copy data for all threads where the second
    // child is not dead
    source.append("       if ((tid < num_threads_active) && (1 == is_active_second)) \n");
    source.append("       { \n");
    source.append("         unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n");
    source.append("         s_left[addr_w] = mid; \n");
    source.append("         s_right[addr_w] = right; \n");
    source.append("         s_left_count[addr_w] = mid_count; \n");
    source.append("         s_right_count[addr_w] = right_count; \n");
    source.append("       } \n");
    source.append("     } \n");
  }



  /** @brief Emits storeIntervalConverged(): re-stores a converged interval.
   *         A simple eigenvalue (multiplicity 1) is stored as-is; a multiple
   *         eigenvalue is split at its median count so both halves are
   *         processed further (second half goes through compaction). */
  template<typename StringType>
  void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string)
  {
    source.append("     \n");
    source.append("     void \n");
    source.append("     storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append("                             __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append("                             __local unsigned int *s_left_count, \n");
    source.append("                             __local unsigned int *s_right_count, \n");
    source.append("                             "); source.append(numeric_string); source.append(" *left, \n");
    source.append("                             "); source.append(numeric_string); source.append(" *mid, \n");
    source.append("                             "); source.append(numeric_string); source.append(" *right, \n");
    source.append("                             unsigned int *left_count, \n");
    source.append("                             unsigned int *mid_count, \n");
    source.append("                             unsigned int *right_count, \n");
    source.append("                             __local unsigned int *s_compaction_list_exc, \n");
    source.append("                             __local unsigned int *compact_second_chunk, \n");
    source.append("                             const unsigned int num_threads_active, \n");
    source.append("                             unsigned int *is_active_second) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    source.append("       const unsigned int tid = lcl_id; \n");
    source.append("       const unsigned int multiplicity = *right_count - *left_count; \n");
    // check multiplicity of eigenvalue
    source.append("       if (1 == multiplicity) \n");
    source.append("       { \n");

    // just re-store intervals, simple eigenvalue
    source.append("         s_left[tid] = *left; \n");
    source.append("         s_right[tid] = *right; \n");
    source.append("         s_left_count[tid] = *left_count; \n");
    source.append("         s_right_count[tid] = *right_count; \n");
    source.append("       \n");

    // mark that no second child / clear
    source.append("         *is_active_second = 0; \n");
    source.append("         s_compaction_list_exc[tid] = 0; \n");
    source.append("       } \n");
    source.append("       else \n");
    source.append("       { \n");

    // number of eigenvalues after the split less than mid
    source.append("         *mid_count = *left_count + (multiplicity >> 1); \n");

    // store left interval
    source.append("         s_left[tid] = *left; \n");
    source.append("         s_right[tid] = *right; \n");
    source.append("         s_left_count[tid] = *left_count; \n");
    source.append("         s_right_count[tid] = *mid_count; \n");
    // left == right for a converged interval, so *mid = *left duplicates it
    source.append("         *mid = *left; \n");

    // mark that second child interval exists
    source.append("         *is_active_second = 1; \n");
    source.append("         s_compaction_list_exc[tid] = 1; \n");
    source.append("         *compact_second_chunk = 1; \n");
    source.append("       } \n");
    source.append("     } \n");
  }





  /** @brief Emits storeIntervalConvergedShort(): same logic as
   *         storeIntervalConverged() with 'unsigned short' count arrays. */
  template<typename StringType>
  void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string)
  {
    source.append("     \n");
    source.append("     void \n");
    source.append("     storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append("                                 __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append("                                 __local unsigned short *s_left_count, \n");
    source.append("                                 __local unsigned short *s_right_count, \n");
    source.append("                                 "); source.append(numeric_string); source.append(" *left, \n");
    source.append("                                 "); source.append(numeric_string); source.append(" *mid, \n");
    source.append("                                 "); source.append(numeric_string); source.append(" *right, \n");
    source.append("                                 unsigned int *left_count, \n");
    source.append("                                 unsigned int *mid_count, \n");
    source.append("                                 unsigned int *right_count, \n");
    source.append("                                 __local unsigned short *s_compaction_list_exc, \n");
    source.append("                                 __local unsigned int *compact_second_chunk, \n");
    source.append("                                 const unsigned int num_threads_active, \n");
    source.append("                                 unsigned int *is_active_second) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    source.append("       const unsigned int tid = lcl_id; \n");
    source.append("       const unsigned int multiplicity = *right_count - *left_count; \n");
    // check multiplicity of eigenvalue
    source.append("       if (1 == multiplicity) \n");
    source.append("       { \n");

    // just re-store intervals, simple eigenvalue
    source.append("         s_left[tid] = *left; \n");
    source.append("         s_right[tid] = *right; \n");
    source.append("         s_left_count[tid] = *left_count; \n");
    source.append("         s_right_count[tid] = *right_count; \n");
    source.append("       \n");

    // mark that no second child / clear
    source.append("         *is_active_second = 0; \n");
    source.append("         s_compaction_list_exc[tid] = 0; \n");
    source.append("       } \n");
    source.append("       else \n");
    source.append("       { \n");

    // number of eigenvalues after the split less than mid
    source.append("         *mid_count = *left_count + (multiplicity >> 1); \n");

    // store left interval
    source.append("         s_left[tid] = *left; \n");
    source.append("         s_right[tid] = *right; \n");
    source.append("         s_left_count[tid] = *left_count; \n");
    source.append("         s_right_count[tid] = *mid_count; \n");
    source.append("         *mid = *left; \n");

    // mark that second child interval exists
    source.append("         *is_active_second = 1; \n");
    source.append("         s_compaction_list_exc[tid] = 1; \n");
    source.append("         *compact_second_chunk = 1; \n");
    source.append("       } \n");
    source.append("     } \n");
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Subdivide interval if active and not already converged
  //
  // tid                     id of thread
  // s_left                  shared memory storage for left interval limits
  // s_right                 shared memory storage for right interval limits
  // s_left_count            shared memory storage for number of eigenvalues less than left interval limits
  // s_right_count           shared memory storage for number of eigenvalues less than right interval limits
  // num_threads_active      number of active threads in warp
  // left                    lower limit of interval
  // right                   upper limit of interval
  // left_count              eigenvalues less than \a left
  // right_count             eigenvalues less than \a right
  // all_threads_converged   shared memory flag if all threads are converged
  ///////////////////////////////////////////////////////////////////////////////


  /** @brief Emits subdivideActiveIntervalMulti(): loads a thread's interval
   *         from local memory, computes the midpoint if not yet converged,
   *         and clears the all-threads-converged flag while an unconverged
   *         interval or an unsplit multiple eigenvalue remains. */
  template<typename StringType>
  void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string)
  {
    source.append("     \n");
    source.append("     void \n");
    source.append("     subdivideActiveIntervalMulti(const unsigned int tid, \n");
    source.append("                                  __local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append("                                  __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append("                                  __local unsigned int *s_left_count, \n");
    source.append("                                  __local unsigned int *s_right_count, \n");
    source.append("                                  const unsigned int num_threads_active, \n");
    source.append("                                  "); source.append(numeric_string); source.append(" *left, \n");
    source.append("                                  "); source.append(numeric_string); source.append(" *right, \n");
    source.append("                                  unsigned int *left_count, unsigned int *right_count, \n");
    source.append("                                  "); source.append(numeric_string); source.append(" *mid, \n");
    source.append("                                  __local unsigned int *all_threads_converged) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    // for all active threads
    source.append("       if (tid < num_threads_active) \n");
    source.append("       { \n");

    source.append("         *left = s_left[tid]; \n");
    source.append("         *right = s_right[tid]; \n");
    source.append("         *left_count = s_left_count[tid]; \n");
    source.append("         *right_count = s_right_count[tid]; \n");

    // check if thread already converged
    source.append("         if (*left != *right) \n");
    source.append("         { \n");

    source.append("           *mid = computeMidpoint(*left, *right); \n");
    source.append("           *all_threads_converged = 0; \n");
    source.append("         } \n");
    source.append("         else if ((*right_count - *left_count) > 1) \n");
    source.append("         { \n");
    // mark as not converged if multiple eigenvalues enclosed
    // duplicate interval in storeIntervalsConverged()
    source.append("           *all_threads_converged = 0; \n");
    source.append("         } \n");

    source.append("       } \n");
    // end for all active threads
    source.append("     } \n");
  }


  /** @brief Emits subdivideActiveIntervalShort(): like the Multi variant but
   *         with 'unsigned short' count arrays and WITHOUT the extra
   *         multiple-eigenvalue branch — presumably intentional for the
   *         short-index code path (matches upstream; TODO confirm). */
  template<typename StringType>
  void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string)
  {
    source.append("     \n");
    source.append("     void \n");
    source.append("     subdivideActiveIntervalShort(const unsigned int tid, \n");
    source.append("                                  __local "); source.append(numeric_string); source.append(" *s_left, \n");
    source.append("                                  __local "); source.append(numeric_string); source.append(" *s_right, \n");
    source.append("                                  __local unsigned short *s_left_count, \n");
    source.append("                                  __local unsigned short *s_right_count, \n");
    source.append("                                  const unsigned int num_threads_active, \n");
    source.append("                                  "); source.append(numeric_string); source.append(" *left, \n");
    source.append("                                  "); source.append(numeric_string); source.append(" *right, \n");
    source.append("                                  unsigned int *left_count, unsigned int *right_count, \n");
    source.append("                                  "); source.append(numeric_string); source.append(" *mid, \n");
    source.append("                                  __local unsigned int *all_threads_converged) \n");
    source.append("     { \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    // for all active threads
    source.append("       if (tid < num_threads_active) \n");
    source.append("       { \n");

    source.append("         *left = s_left[tid]; \n");
    source.append("         *right = s_right[tid]; \n");
    source.append("         *left_count = s_left_count[tid]; \n");
    source.append("         *right_count = s_right_count[tid]; \n");

    // check if thread already converged
    source.append("         if (*left != *right) \n");
    source.append("         { \n");

    source.append("           *mid = computeMidpoint(*left, *right); \n");
    source.append("           *all_threads_converged = 0; \n");
    source.append("         } \n");

    source.append("       } \n");
    // end for all active threads
    source.append("     } \n");
  }

  // end of utilities
  // start of kernels


  ////////////////////////////////////////////////////////////////////////////////
  // Bisection to find eigenvalues of a real, symmetric, and
  // tridiagonal matrix
  //
  // g_d             diagonal elements in global memory
  // g_s             superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0
  // n               size of matrix
  // lg              lower bound of input interval (e.g. Gerschgorin interval)
  // ug              upper bound of input interval (e.g. Gerschgorin interval)
  // lg_eig_count    number of eigenvalues that are smaller than \a lg
  // lu_eig_count    number of eigenvalues that are smaller than \a lu
  // epsilon         desired accuracy of eigenvalues to compute
  ////////////////////////////////////////////////////////////////////////////////
  ///
  /** @brief Emits the kernel bisectKernelSmall(): complete bisection for a
   *         small matrix (n <= VIENNACL_BISECT_MAX_SMALL_MATRIX) handled by a
   *         single work-group; iteratively subdivides intervals until every
   *         interval encloses exactly one converged eigenvalue. */
  template <typename StringType>
  void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string)
  {
    source.append("     __kernel \n");
    source.append("     void \n");
    source.append("     bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d, \n");
    source.append("                       __global "); source.append(numeric_string); source.append(" *g_s, \n");
    source.append("                       const unsigned int n, \n");
    source.append("                       __global "); source.append(numeric_string); source.append(" *g_left, \n");
    source.append("                       __global "); source.append(numeric_string); source.append(" *g_right, \n");
    source.append("                       __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n");
    source.append("                       const "); source.append(numeric_string); source.append(" lg, \n");
    source.append("                       const "); source.append(numeric_string); source.append(" ug, \n");
    source.append("                       const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n");
    source.append("                       "); source.append(numeric_string); source.append(" epsilon \n");
    source.append("                      ) \n");
    source.append("     { \n");
    // shift so that the sentinel element *(g_s - 1) == 0 is addressable
    source.append("       g_s = g_s + 1; \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    // intervals (store left and right because the subdivision tree is in general
    // not dense
    source.append("       __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
    source.append("       __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");

    // number of eigenvalues that are smaller than s_left / s_right
    // (correspondence is realized via indices)
    source.append("       __local unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");
    source.append("       __local unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n");

    // helper for stream compaction
    source.append("       __local unsigned int \n");
    source.append("         s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n");

    // state variables for whole block
    // if 0 then compaction of second chunk of child intervals is not necessary
    // (because all intervals had exactly one non-dead child)
    source.append("       __local unsigned int compact_second_chunk; \n");
    source.append("       __local unsigned int all_threads_converged; \n");

    // number of currently active threads
    source.append("       __local unsigned int num_threads_active; \n");

    // number of threads to use for stream compaction
    source.append("       __local unsigned int num_threads_compaction; \n");

    // helper for exclusive scan
    source.append("       __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");


    // variables for currently processed interval
    // left and right limit of active interval
    source.append("       "); source.append(numeric_string); source.append(" left = 0.0f; \n");
    source.append("       "); source.append(numeric_string); source.append(" right = 0.0f; \n");
    source.append("       unsigned int left_count = 0; \n");
    source.append("       unsigned int right_count = 0; \n");
    // midpoint of active interval
    source.append("       "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
    // number of eigenvalues smaller then mid
    source.append("       unsigned int mid_count = 0; \n");
    // affected from compaction
    source.append("       unsigned int is_active_second = 0; \n");

    source.append("       s_compaction_list[lcl_id] = 0; \n");
    source.append("       s_left[lcl_id] = 0.0; \n");
    source.append("       s_right[lcl_id] = 0.0; \n");
    source.append("       s_left_count[lcl_id] = 0; \n");
    source.append("       s_right_count[lcl_id] = 0; \n");

    source.append("       barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // set up initial configuration
    source.append("       if (0 == lcl_id) \n");
    source.append("       { \n");
    source.append("         s_left[0] = lg; \n");
    source.append("         s_right[0] = ug; \n");
    source.append("         s_left_count[0] = lg_eig_count; \n");
    source.append("         s_right_count[0] = ug_eig_count; \n");

    source.append("         compact_second_chunk = 0; \n");
    source.append("         num_threads_active = 1; \n");

    source.append("         num_threads_compaction = 1; \n");
    source.append("       } \n");

    // for all active threads read intervals from the last level
    // the number of (worst case) active threads per level l is 2^l

    source.append("       while (true) \n");
    source.append("       { \n");

    source.append("         all_threads_converged = 1; \n");
    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("         is_active_second = 0; \n");
    source.append("         subdivideActiveIntervalMulti(lcl_id, \n");
    source.append("                                 s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                                 num_threads_active, \n");
    source.append("                                 &left, &right, &left_count, &right_count, \n");
    source.append("                                 &mid, &all_threads_converged); \n");
    // debugging aid left disabled upstream:
    //  source.append("  output[lcl_id] = s_left; \n");
    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // check if done
    source.append("         if (1 == all_threads_converged) \n");
    source.append("         { \n");
    source.append("           break; \n");
    source.append("         } \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // compute number of eigenvalues smaller than mid
    // use all threads for reading the necessary matrix data from global
    // memory
    // use s_left and s_right as scratch space for diagonal and
    // superdiagonal of matrix
    source.append("         mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n");
    source.append("                                                lcl_id, num_threads_active, \n");
    source.append("                                                s_left, s_right, \n");
    source.append("                                                (left == right)); \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // store intervals
    // for all threads store the first child interval in a continuous chunk of
    // memory, and the second child interval -- if it exists -- in a second
    // chunk; it is likely that all threads reach convergence up to
    // \a epsilon at the same level; furthermore, for higher level most / all
    // threads will have only one child, storing the first child compactly will
    // (first) avoid to perform a compaction step on the first chunk, (second)
    // make it for higher levels (when all threads / intervals have
    // exactly one child) unnecessary to perform a compaction of the second
    // chunk
    source.append("         if (lcl_id < num_threads_active) \n");
    source.append("         { \n");

    source.append("           if (left != right) \n");
    source.append("           { \n");

    // store intervals
    source.append("             storeNonEmptyIntervals(lcl_id, num_threads_active, \n");
    source.append("                                    s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                                    left, mid, right, \n");
    source.append("                                    left_count, mid_count, right_count, \n");
    source.append("                                    epsilon, &compact_second_chunk, \n");
    source.append("                                    s_compaction_list_exc, \n");
    source.append("                                    &is_active_second); \n");
    source.append("           } \n");
    source.append("           else \n");
    source.append("           { \n");

    source.append("             storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                                    &left, &mid, &right, \n");
    source.append("                                    &left_count, &mid_count, &right_count, \n");
    source.append("                                    s_compaction_list_exc, &compact_second_chunk, \n");
    source.append("                                    num_threads_active, \n");
    source.append("                                    &is_active_second); \n");
    source.append("           } \n");
    source.append("         } \n");

    // necessary so that compact_second_chunk is up-to-date
    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // perform compaction of chunk where second children are stored
    // scan of (num_threads_active / 2) elements, thus at most
    // (num_threads_active / 4) threads are needed
    source.append("         if (compact_second_chunk > 0) \n");
    source.append("         { \n");

    source.append("           createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");

    source.append("           compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                            mid, right, mid_count, right_count, \n");
    source.append("                            s_compaction_list, num_threads_active, \n");
    source.append("                            is_active_second); \n");
    source.append("         } \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("         if (0 == lcl_id) \n");
    source.append("         { \n");

    // update number of active threads with result of reduction
    source.append("           num_threads_active += s_compaction_list[num_threads_active]; \n");

    source.append("           num_threads_compaction = ceilPow2(num_threads_active); \n");

    source.append("           compact_second_chunk = 0; \n");
    source.append("         } \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("       } \n");

    source.append("       barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // write resulting intervals to global mem
    // for all threads write if they have been converged to an eigenvalue to
    // a separate array

    // at most n valid intervals
    source.append("       if (lcl_id < n) \n");
    source.append("       { \n");
    // intervals converged so left and right limit are identical
    source.append("         g_left[lcl_id]  = s_left[lcl_id]; \n");
    // left count is sufficient to have global order
    source.append("         g_left_count[lcl_id] = s_left_count[lcl_id]; \n");
    source.append("       } \n");
    source.append("     } \n");
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue
  //
  // g_d               diagonal elements of symmetric, tridiagonal matrix
  // g_s               superdiagonal elements of symmetric, tridiagonal matrix
  // n                 matrix size
  // blocks_mult       start addresses of blocks of intervals that are processed by one block of threads, each of the intervals contains more than one eigenvalue
  // blocks_mult_sum   total number of eigenvalues / singleton intervals in one block of intervals
  // g_left            left limits of intervals
  // g_right           right limits of intervals
  // g_left_count      number of eigenvalues less than left limits
  // g_right_count     number of eigenvalues less than right limits
  // g_lambda          final eigenvalue
  // g_pos             index of eigenvalue (in ascending order)
  // precision         desired precision of eigenvalues
  ////////////////////////////////////////////////////////////////////////////////

  template <typename StringType>
  void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string)
  {
    source.append("     __kernel \n");
    source.append("     void \n");
    source.append("     bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n");
    source.append("                                     __global "); source.append(numeric_string); source.append(" *g_s, \n");
    source.append("                                     const unsigned int n, \n");
    source.append("                                     __global unsigned int *blocks_mult, \n");
    source.append("                                     __global unsigned int *blocks_mult_sum, \n");
    source.append("                                     __global "); source.append(numeric_string); source.append(" *g_left, \n");
    source.append("                                     __global "); source.append(numeric_string); source.append(" *g_right, \n");
    source.append("                                     __global unsigned int *g_left_count, \n");
    source.append("                                     __global unsigned int *g_right_count, \n");
    source.append("                                     __global "); source.append(numeric_string); source.append(" *g_lambda, \n");
    source.append("                                     __global unsigned int *g_pos, \n");
    source.append("                                     "); source.append(numeric_string); source.append(" precision \n");
    source.append("                                    ) \n");
    source.append("     { \n");
    // shift so that the sentinel element *(g_s - 1) == 0 is addressable
    source.append("       g_s = g_s + 1; \n");
    source.append("       uint glb_id = get_global_id(0); \n");
    source.append("       uint grp_id = get_group_id(0); \n");
    source.append("       uint grp_nm = get_num_groups(0); \n");
    source.append("       uint lcl_id = get_local_id(0); \n");
    source.append("       uint lcl_sz = get_local_size(0); \n");

    source.append("       const unsigned int tid = lcl_id; \n");

    // left and right limits of interval
    source.append("       __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
    source.append("       __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");

    // number of eigenvalues smaller than interval limits
    source.append("       __local unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");
    source.append("       __local unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n");

    // helper array for chunk compaction of second chunk
    source.append("       __local unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n");
    // compaction list helper for exclusive scan
    source.append("       __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n");

    // flag if all threads are converged
    source.append("       __local unsigned int all_threads_converged; \n");
    // number of active threads
    source.append("       __local unsigned int num_threads_active; \n");
    // number of threads to employ for compaction
    source.append("       __local unsigned int num_threads_compaction; \n");
    // flag if second chunk has to be compacted
    source.append("       __local unsigned int compact_second_chunk; \n");

    // parameters of block of intervals processed by this block of threads
    source.append("       __local unsigned int c_block_start; \n");
    source.append("       __local unsigned int c_block_end; \n");
    source.append("       __local unsigned int c_block_offset_output; \n");

    // midpoint of currently active interval of the thread
    source.append("       "); source.append(numeric_string); source.append(" mid = 0.0f; \n");
    // number of eigenvalues smaller than \a mid
    source.append("       unsigned int mid_count = 0; \n");
    // current interval parameter
    source.append("       "); source.append(numeric_string); source.append(" left = 0.0f; \n");
    source.append("       "); source.append(numeric_string); source.append(" right = 0.0f; \n");
    source.append("       unsigned int left_count = 0; \n");
    source.append("       unsigned int right_count = 0; \n");
    // helper for compaction, keep track which threads have a second child
    source.append("       unsigned int is_active_second = 0; \n");

    source.append("       barrier(CLK_LOCAL_MEM_FENCE); \n");

    // initialize common start conditions
    source.append("       if (0 == tid) \n");
    source.append("       { \n");

    source.append("         c_block_start = blocks_mult[grp_id]; \n");
    source.append("         c_block_end = blocks_mult[grp_id + 1]; \n");
    source.append("         c_block_offset_output = blocks_mult_sum[grp_id]; \n");
    source.append("       \n");

    source.append("         num_threads_active = c_block_end - c_block_start; \n");
    source.append("         s_compaction_list[0] = 0; \n");
    source.append("         num_threads_compaction = ceilPow2(num_threads_active); \n");

    source.append("         all_threads_converged = 1; \n");
    source.append("         compact_second_chunk = 0; \n");
    source.append("       } \n");
    // 42 appears to be a debug sentinel marking "not yet loaded" slots —
    // TODO confirm; the second-chunk halves are zero-initialized below.
    source.append("      s_left_count [tid] = 42; \n");
    source.append("      s_right_count[tid] = 42; \n");
    source.append("      s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
    source.append("      s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");
    source.append("       \n");
    source.append("       barrier(CLK_LOCAL_MEM_FENCE) ; \n");
    source.append("       \n");

    // read data into shared memory
    source.append("       if (tid < num_threads_active) \n");
    source.append("       { \n");

    source.append("         s_left[tid]  = g_left[c_block_start + tid]; \n");
    source.append("         s_right[tid] = g_right[c_block_start + tid]; \n");
    source.append("         s_left_count[tid]  = g_left_count[c_block_start + tid]; \n");
    source.append("         s_right_count[tid] = g_right_count[c_block_start + tid]; \n");
    source.append("       \n");
    source.append("       } \n");
    source.append("      \n");
    source.append("       barrier(CLK_LOCAL_MEM_FENCE) ; \n");
    source.append("       unsigned int iter = 0; \n");
    // do until all threads converged
    source.append("       while (true) \n");
    source.append("       { \n");
    source.append("          iter++; \n");
    //for (int iter=0; iter < 0; iter++) {
    source.append("         s_compaction_list[lcl_id] = 0; \n");
    source.append("         s_compaction_list[lcl_id + lcl_sz] = 0; \n");
    source.append("         s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n");

    // subdivide interval if currently active and not already converged
    source.append("         subdivideActiveIntervalMulti(tid, s_left, s_right, \n");
    source.append("                                      s_left_count, s_right_count, \n");
    source.append("                                      num_threads_active, \n");
    source.append("                                      &left, &right, &left_count, &right_count, \n");
    source.append("                                      &mid, &all_threads_converged); \n");
    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // stop if all eigenvalues have been found
    source.append("         if (1 == all_threads_converged) \n");
    source.append("         { \n");
    source.append("            \n");
    source.append("           break; \n");
    source.append("         } \n");

    // compute number of eigenvalues smaller than mid for active and not
    // converged intervals, use all threads for loading data from gmem and
    // s_left and s_right as scratch space to store the data load from gmem
    // in shared memory
    source.append("         mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n");
    source.append("                                                     mid, tid, num_threads_active, \n");
    source.append("                                                     s_left, s_right, \n");
    source.append("                                                     (left == right)); \n");
    source.append("          \n");
    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("         if (tid < num_threads_active) \n");
    source.append("         { \n");
    source.append("            \n");
    // store intervals
    source.append("           if (left != right) \n");
    source.append("           { \n");

    source.append("             storeNonEmptyIntervals(tid, num_threads_active, \n");
    source.append("                                    s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                                    left, mid, right, \n");
    source.append("                                    left_count, mid_count, right_count, \n");
    source.append("                                    precision, &compact_second_chunk, \n");
    source.append("                                    s_compaction_list_exc, \n");
    source.append("                                    &is_active_second); \n");
    source.append("              \n");
    source.append("           } \n");
    source.append("           else \n");
    source.append("           { \n");

    source.append("             storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                                    &left, &mid, &right, \n");
    source.append("                                    &left_count, &mid_count, &right_count, \n");
    source.append("                                    s_compaction_list_exc, &compact_second_chunk, \n");
    source.append("                                    num_threads_active, \n");
    source.append("                                    &is_active_second); \n");
    source.append("            \n");
    source.append("           } \n");
    source.append("         } \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // compact second chunk of intervals if any of the threads generated
    // two child intervals
    source.append("         if (1 == compact_second_chunk) \n");
    source.append("         { \n");

    source.append("           createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n");
    source.append("           compactIntervals(s_left, s_right, s_left_count, s_right_count, \n");
    source.append("                            mid, right, mid_count, right_count, \n");
    source.append("                            s_compaction_list, num_threads_active, \n");
    source.append("                            is_active_second); \n");
    source.append("         } \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // update state variables
    source.append("         if (0 == tid) \n");
    source.append("         { \n");
    source.append("           num_threads_active += s_compaction_list[num_threads_active]; \n");
    source.append("           num_threads_compaction = ceilPow2(num_threads_active); \n");

    source.append("           compact_second_chunk = 0; \n");
    source.append("           all_threads_converged = 1; \n");
    source.append("         } \n");

    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    // clear
    source.append("         s_compaction_list_exc[lcl_id] = 0; \n");
    source.append("         s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n");
    source.append("          \n");
    // bail out if the active interval count outgrew the work-group —
    // presumably a safety valve; remaining intervals are handled elsewhere
    // (TODO confirm against the host-side driver code)
    source.append("         if (num_threads_compaction > lcl_sz) \n");
    source.append("         { \n");
    source.append("           break; \n");
    source.append("         } \n");


    source.append("         barrier(CLK_LOCAL_MEM_FENCE) ; \n");

    source.append("       } \n"); // end until all threads converged

    // write data back to global memory
    source.append("       if (tid < num_threads_active) \n");
    source.append("       { \n");

    source.append("         unsigned int addr = c_block_offset_output + tid; \n");
    source.append("          \n");
    source.append("         g_lambda[addr]  = s_left[tid]; \n");
    source.append("         g_pos[addr]   = s_right_count[tid]; \n");
    source.append("       } \n");
    source.append("     } \n");
  }


  ////////////////////////////////////////////////////////////////////////////////
  // Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue
  //
  // g_d             diagonal elements of symmetric, tridiagonal matrix
  // g_s             superdiagonal elements of symmetric, tridiagonal matrix
  // n               matrix size
  // num_intervals   total number of intervals containing one eigenvalue after the first step
  // g_left          left interval limits
  // g_right         right interval limits
  // g_pos           index of interval / number of intervals that are smaller than right interval limit
  // precision       desired
precision of eigenvalues + //////////////////////////////////////////////////////////////////////////////// + + template <typename StringType> + void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string) + { + source.append(" __kernel \n"); + source.append(" void \n"); + source.append(" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n"); + source.append(" const unsigned int n, \n"); + source.append(" unsigned int num_intervals, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n"); + source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n"); + source.append(" __global unsigned int *g_pos, \n"); + source.append(" "); source.append(numeric_string); source.append(" precision) \n"); + source.append(" { \n"); + source.append(" g_s = g_s + 1; \n"); + source.append(" uint glb_id = get_global_id(0); \n"); + source.append(" uint grp_id = get_group_id(0); \n"); + source.append(" uint grp_nm = get_num_groups(0); \n"); + source.append(" uint lcl_id = get_local_id(0); \n"); + source.append(" uint lcl_sz = get_local_size(0); \n"); + source.append(" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n"); + source.append(" __loca
<TRUNCATED>
