http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp ---------------------------------------------------------------------- diff --git a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp deleted file mode 100644 index 64c12b0..0000000 --- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp +++ /dev/null @@ -1,2645 +0,0 @@ -#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_ -#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2016, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - Portions of this software are copyright by UChicago Argonne, LLC. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp [email protected] - - (A list of authors and contributors can be found in the manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - -/** @file viennacl/linalg/opencl/kernels/bisect.hpp - @brief OpenCL kernels for the bisection algorithm for eigenvalues - - Implementation based on the sample provided with the CUDA 6.0 SDK, for which - the creation of derivative works is allowed by including the following statement: - "This software contains source code provided by NVIDIA Corporation." -*/ - - - -#include "viennacl/tools/tools.hpp" -#include "viennacl/ocl/kernel.hpp" -#include "viennacl/ocl/platform.hpp" -#include "viennacl/ocl/utils.hpp" - -#include "viennacl/linalg/opencl/common.hpp" - -// declaration, forward - -namespace viennacl -{ -namespace linalg -{ -namespace opencl -{ -namespace kernels -{ - template <typename StringType> - void generate_bisect_kernel_config(StringType & source) - { - /* Global configuration parameter */ - source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK 256\n"); - source.append(" #define VIENNACL_BISECT_MAX_SMALL_MATRIX 256\n"); - source.append(" #define VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX 256\n"); - source.append(" #define VIENNACL_BISECT_MIN_ABS_INTERVAL 5.0e-37\n"); - - } - - //////////////////////////////////////////////////////////////////////////////// - // Compute the next lower power of two of n - // n number for which next higher power of two is seeked - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_floorPow2(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" inline int \n"); - source.append(" floorPow2(int n) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - - // early out if already power of two - source.append(" if (0 == (n & (n-1))) \n"); - source.append(" { \n"); - source.append(" return n; \n"); - source.append(" } \n"); - - source.append(" int exp; \n"); - source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n"); - source.append(" return (1 << (exp - 1)); \n"); - source.append(" } \n"); - - } - - - //////////////////////////////////////////////////////////////////////////////// - // Compute the next higher power of two of n - // n number for which next higher power of two is seeked - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_ceilPow2(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" inline int \n"); - source.append(" ceilPow2(int n) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - - // early out if already power of two - source.append(" if (0 == (n & (n-1))) \n"); - source.append(" { \n"); - source.append(" return n; \n"); - source.append(" } \n"); - - source.append(" int exp; \n"); - source.append(" frexp(( "); source.append(numeric_string); source.append(" )n, &exp); \n"); - source.append(" return (1 << exp); \n"); - source.append(" } \n"); - } - - - //////////////////////////////////////////////////////////////////////////////// - // Compute midpoint of interval [\a left, \a right] avoiding overflow if possible - // - // left left / lower limit of interval - // right right / upper limit of interval - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_computeMidpoint(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" inline "); source.append(numeric_string); source.append(" \n"); - source.append(" computeMidpoint(const "); source.append(numeric_string); source.append(" left,\n"); - source.append(" const "); source.append(numeric_string); source.append(" right) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - source.append(" "); source.append(numeric_string); source.append(" mid; \n"); - - source.append(" if (sign(left) == sign(right)) \n"); - source.append(" { \n"); - source.append(" mid = left + (right - left) * 0.5f; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - source.append(" mid = (left + right) * 0.5f; \n"); - source.append(" } \n"); - - source.append(" return mid; \n"); - source.append(" } \n"); - - } - - - //////////////////////////////////////////////////////////////////////////////// - // Check if interval converged and store appropriately - // - // addr address where to store the information of the interval - // s_left shared memory storage for left interval limits - // s_right shared memory storage for right interval limits - // s_left_count shared memory storage for number of eigenvalues less than left interval limits - // s_right_count shared memory storage for number of eigenvalues less than right interval limits - // left lower limit of interval - // right upper limit of interval - // left_count eigenvalues less than \a left - // right_count eigenvalues less than \a right - // precision desired precision for eigenvalues - //////////////////////////////////////////////////////////////////////////////// - - template<typename StringType> - void generate_bisect_kernel_storeInterval(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" storeInterval(unsigned int addr, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n"); - source.append(" __local unsigned int * s_left_count, \n"); - source.append(" __local unsigned int * s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" left, \n"); - source.append(" "); source.append(numeric_string); source.append(" right, \n"); - source.append(" unsigned int left_count, \n"); - source.append(" unsigned int right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" precision) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" s_left_count[addr] = left_count; \n"); - source.append(" s_right_count[addr] = right_count; \n"); - - // check if interval converged - source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n"); - source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n"); - - source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n"); - source.append(" { \n"); - // compute mid point - source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n"); - - // mark as converged - source.append(" s_left[addr] = lambda; \n"); - source.append(" s_right[addr] = lambda; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - // store current limits - source.append(" s_left[addr] = left; \n"); - source.append(" s_right[addr] = right; \n"); - source.append(" } \n"); - - source.append(" } \n"); - - } - - template<typename StringType> - void generate_bisect_kernel_storeIntervalShort(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" storeIntervalShort(unsigned int addr, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" * s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" * s_right, \n"); - source.append(" __local unsigned short * s_left_count, \n"); - source.append(" __local unsigned short * s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" left, \n"); - source.append(" "); source.append(numeric_string); source.append(" right, \n"); - source.append(" unsigned int left_count, \n"); - source.append(" unsigned int right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" precision) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" s_left_count[addr] = left_count; \n"); - source.append(" s_right_count[addr] = right_count; \n"); - - // check if interval converged - source.append(" "); source.append(numeric_string); source.append(" t0 = fabs(right - left); \n"); - source.append(" "); source.append(numeric_string); source.append(" t1 = max(fabs(left), fabs(right)) * precision; \n"); - - source.append(" if (t0 <= max(( "); source.append(numeric_string); source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1)) \n"); - source.append(" { \n"); - // compute mid point - source.append(" "); source.append(numeric_string); source.append(" lambda = computeMidpoint(left, right); \n"); - - // mark as converged - source.append(" s_left[addr] = lambda; \n"); - source.append(" s_right[addr] = lambda; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - // store current limits - source.append(" s_left[addr] = left; \n"); - source.append(" s_right[addr] = right; \n"); - source.append(" } \n"); - - source.append(" } \n"); - - - } - - - //////////////////////////////////////////////////////////////////////////////// - // Compute number of eigenvalues that are smaller than x given a symmetric, - // real, and tridiagonal matrix - // - // g_d diagonal elements stored in global memory - // g_s superdiagonal elements stored in global memory - // n size of matrix - // x value for which the number of eigenvalues that are smaller is sought - // tid thread identified (e.g. threadIdx.x or gtid) - // num_intervals_active number of active intervals / threads that currently process an interval - // s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory - // s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory - // converged flag if the current thread is already converged (that is count does not have to be computed) - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" inline unsigned int \n"); - source.append(" computeNumSmallerEigenvals(__global "); source.append(numeric_string); source.append(" *g_d, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n"); - source.append(" const unsigned int n, \n"); - source.append(" const "); source.append(numeric_string); source.append(" x, \n"); - source.append(" const unsigned int tid, \n"); - source.append(" const unsigned int num_intervals_active, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n"); - source.append(" unsigned int converged \n"); - source.append(" ) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - - source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n"); - source.append(" unsigned int count = 0; \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // read data into shared memory - source.append(" if (lcl_id < n) \n"); - source.append(" { \n"); - source.append(" s_d[lcl_id] = *(g_d + lcl_id); \n"); - source.append(" s_s[lcl_id] = *(g_s + lcl_id - 1); \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // perform loop only for active threads - source.append(" if ((tid < num_intervals_active) && (0 == converged)) \n"); - source.append(" { \n"); - - // perform (optimized) Gaussian elimination to determine the number - // of eigenvalues that are smaller than n - source.append(" for (unsigned int k = 0; k < n; ++k) \n"); - source.append(" { \n"); - source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n"); - source.append(" count += (delta < 0) ? 1 : 0; \n"); - source.append(" } \n"); - - source.append(" } \n"); // end if thread currently processing an interval - - source.append(" return count; \n"); - source.append(" } \n"); - - } - - - //////////////////////////////////////////////////////////////////////////////// - // Compute number of eigenvalues that are smaller than x given a symmetric, - // real, and tridiagonal matrix - // - // g_d diagonal elements stored in global memory - // g_s superdiagonal elements stored in global memory - // n size of matrix - // x value for which the number of eigenvalues that are smaller is seeked - // tid thread identified (e.g. threadIdx.x or gtid) - // num_intervals_active number of active intervals / threads that currently process an interval - // s_d scratch space to store diagonal entries of the tridiagonal matrix in shared memory - // s_s scratch space to store superdiagonal entries of the tridiagonal matrix in shared memory - // converged flag if the current thread is already converged (that is count does not have to be computed) - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" inline unsigned int \n"); - source.append(" computeNumSmallerEigenvalsLarge(__global "); source.append(numeric_string); source.append(" *g_d, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n"); - source.append(" const unsigned int n, \n"); - source.append(" const "); source.append(numeric_string); source.append(" x, \n"); - source.append(" const unsigned int tid, \n"); - source.append(" const unsigned int num_intervals_active, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_d, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_s, \n"); - source.append(" unsigned int converged \n"); - source.append(" ) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" "); source.append(numeric_string); source.append(" delta = 1.0f; \n"); - source.append(" unsigned int count = 0; \n"); - - source.append(" unsigned int rem = n; \n"); - - // do until whole diagonal and superdiagonal has been loaded and processed - source.append(" for (unsigned int i = 0; i < n; i += lcl_sz) \n"); - source.append(" { \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // read new chunk of data into shared memory - source.append(" if ((i + lcl_id) < n) \n"); - source.append(" { \n"); - - source.append(" s_d[lcl_id] = *(g_d + i + lcl_id); \n"); - source.append(" s_s[lcl_id] = *(g_s + i + lcl_id - 1); \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - - source.append(" if (tid < num_intervals_active) \n"); - source.append(" { \n"); - - // perform (optimized) Gaussian elimination to determine the number - // of eigenvalues that are smaller than n - source.append(" for (unsigned int k = 0; k < min(rem,lcl_sz); ++k) \n"); - source.append(" { \n"); - source.append(" delta = s_d[k] - x - (s_s[k] * s_s[k]) / delta; \n"); - // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta; - source.append(" count += (delta < 0) ? 1 : 0; \n"); - source.append(" } \n"); - - source.append(" } \n"); // end if thread currently processing an interval - - source.append(" rem -= lcl_sz; \n"); - source.append(" } \n"); - - source.append(" return count; \n"); - source.append(" } \n"); - - - } - - //////////////////////////////////////////////////////////////////////////////// - // Store all non-empty intervals resulting from the subdivision of the interval - // currently processed by the thread - // - // addr base address for storing intervals - // num_threads_active number of threads / intervals in current sweep - // s_left shared memory storage for left interval limits - // s_right shared memory storage for right interval limits - // s_left_count shared memory storage for number of eigenvalues less than left interval limits - // s_right_count shared memory storage for number of eigenvalues less than right interval limits - // left lower limit of interval - // mid midpoint of interval - // right upper limit of interval - // left_count eigenvalues less than \a left - // mid_count eigenvalues less than \a mid - // right_count eigenvalues less than \a right - // precision desired precision for eigenvalues - // compact_second_chunk shared mem flag if second chunk is used and ergo requires compaction - // s_compaction_list_exc helper array for stream compaction, s_compaction_list_exc[tid] = 1 when the thread generated two child intervals - // is_active_interval mark is thread has a second non-empty child interval - //////////////////////////////////////////////////////////////////////////////// - - template<typename StringType> - void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" storeNonEmptyIntervals(unsigned int addr, \n"); - source.append(" const unsigned int num_threads_active, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned int *s_left_count, \n"); - source.append(" __local unsigned int *s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" left, \n "); - source.append(" "); source.append(numeric_string); source.append(" mid, \n"); - source.append(" "); source.append(numeric_string); source.append(" right,\n"); - source.append(" const unsigned int left_count, \n"); - source.append(" const unsigned int mid_count, \n"); - source.append(" const unsigned int right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" precision, \n"); - source.append(" __local unsigned int *compact_second_chunk, \n"); - source.append(" __local unsigned int *s_compaction_list_exc, \n"); - source.append(" unsigned int *is_active_second) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - // check if both child intervals are valid - source.append(" \n"); - source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n"); - source.append(" { \n"); - - // store the left interval - source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" left, mid, left_count, mid_count, precision); \n"); - - // mark that a second interval has been generated, only stored after - // stream compaction of second chunk - source.append(" *is_active_second = 1; \n"); - source.append(" s_compaction_list_exc[lcl_id] = 1; \n"); - source.append(" *compact_second_chunk = 1; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - // only one non-empty child interval - - // mark that no second child - source.append(" *is_active_second = 0; \n"); - source.append(" s_compaction_list_exc[lcl_id] = 0; \n"); - - // store the one valid child interval - source.append(" if (left_count != mid_count) \n"); - source.append(" { \n"); - source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" left, mid, left_count, mid_count, precision); \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - source.append(" storeInterval(addr, s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" mid, right, mid_count, right_count, precision); \n"); - source.append(" } \n"); - - source.append(" } \n"); - source.append(" } \n"); - - } - - - //////////////////////////////////////////////////////////////////////////////// - //! Store all non-empty intervals resulting from the subdivision of the interval - //! currently processed by the thread - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" storeNonEmptyIntervalsLarge(unsigned int addr, \n"); - source.append(" const unsigned int num_threads_active, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned short *s_left_count, \n"); - source.append(" __local unsigned short *s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" left, \n "); - source.append(" "); source.append(numeric_string); source.append(" mid, \n"); - source.append(" "); source.append(numeric_string); source.append(" right,\n"); - source.append(" const unsigned int left_count, \n"); - source.append(" const unsigned int mid_count, \n"); - source.append(" const unsigned int right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" epsilon, \n"); - source.append(" __local unsigned int *compact_second_chunk, \n"); - source.append(" __local unsigned short *s_compaction_list, \n"); - source.append(" unsigned int *is_active_second) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - // check if both child intervals are valid - source.append(" if ((left_count != mid_count) && (mid_count != right_count)) \n"); - source.append(" { \n"); - - source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" left, mid, left_count, mid_count, epsilon); \n"); - - source.append(" *is_active_second = 1; \n"); - source.append(" s_compaction_list[lcl_id] = 1; \n"); - source.append(" *compact_second_chunk = 1; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - // only one non-empty child interval - - // mark that no second child - source.append(" *is_active_second = 0; \n"); - source.append(" s_compaction_list[lcl_id] = 0; \n"); - - // store the one valid child interval - source.append(" if (left_count != mid_count) \n"); - source.append(" { \n"); - source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" left, mid, left_count, mid_count, epsilon); \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - source.append(" storeIntervalShort(addr, s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" mid, right, mid_count, right_count, epsilon); \n"); - source.append(" } \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - //////////////////////////////////////////////////////////////////////////////// - // Create indices for compaction, that is process \a s_compaction_list_exc - // which is 1 for intervals that generated a second child and 0 otherwise - // and create for each of the non-zero elements the index where the new - // interval belongs to in a compact representation of all generated second children - // - // s_compaction_list_exc list containing the flags which threads generated two children - // num_threads_compaction number of threads to employ for compaction - //////////////////////////////////////////////////////////////////////////////// - - template<typename StringType> - void generate_bisect_kernel_createIndicesCompaction(StringType & source) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" createIndicesCompaction(__local unsigned int *s_compaction_list_exc, \n"); - source.append(" unsigned int num_threads_compaction) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - - source.append(" unsigned int offset = 1; \n"); - source.append(" const unsigned int tid = lcl_id; \n"); - // if(tid == 0) - // printf("num_threads_compaction = %u\n", num_threads_compaction); - - // higher levels of scan tree - source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"); - source.append(" { \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" if (tid < d) \n"); - source.append(" { \n"); - - source.append(" unsigned int ai = offset*(2*tid+1)-1; \n"); - source.append(" unsigned int bi = offset*(2*tid+2)-1; \n"); - source.append(" \n"); - source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"); - source.append(" + s_compaction_list_exc[ai]; \n"); - source.append(" } \n"); - - source.append(" offset <<= 1; \n"); - source.append(" } \n"); - - // traverse down tree: first down to level 2 across - source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n"); - source.append(" { \n"); - - source.append(" offset >>= 1; \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" if (tid < (d-1)) \n"); - source.append(" { \n"); - - source.append(" unsigned int ai = offset*(tid+1) - 1; \n"); - source.append(" unsigned int bi = ai + (offset >> 1); \n"); - - source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"); - source.append(" + s_compaction_list_exc[ai]; \n"); - source.append(" } \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" } \n"); - } - - - template<typename StringType> - void generate_bisect_kernel_createIndicesCompactionShort(StringType & source) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" createIndicesCompactionShort(__local unsigned short *s_compaction_list_exc, \n"); - source.append(" unsigned int num_threads_compaction) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - - source.append(" unsigned int offset = 1; \n"); - source.append(" const unsigned int tid = lcl_id; \n"); - - // higher levels of scan tree - source.append(" for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1) \n"); - source.append(" { \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" if (tid < d) \n"); - source.append(" { \n"); - - source.append(" unsigned int ai = offset*(2*tid+1)-1; \n"); - source.append(" unsigned int bi = offset*(2*tid+2)-1; \n"); - source.append(" \n"); - source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"); - source.append(" + s_compaction_list_exc[ai]; \n"); - source.append(" } \n"); - - source.append(" offset <<= 1; \n"); - source.append(" } \n"); - - // traverse down tree: first down to level 2 across - source.append(" for (int d = 2; d < num_threads_compaction; d <<= 1) \n"); - source.append(" { \n"); - - source.append(" offset >>= 1; \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" if (tid < (d-1)) \n"); - source.append(" { \n"); - - source.append(" unsigned int ai = offset*(tid+1) - 1; \n"); - source.append(" unsigned int bi = ai + (offset >> 1); \n"); - - source.append(" s_compaction_list_exc[bi] = s_compaction_list_exc[bi] \n"); - source.append(" + s_compaction_list_exc[ai]; \n"); - source.append(" } \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" } \n"); - } - - /////////////////////////////////////////////////////////////////////////////// - // Perform stream compaction for second child intervals - // - // s_left shared memory storage for left interval limits - // s_right shared memory storage for right interval limits - // s_left_count shared memory storage for number of eigenvalues less than left interval limits - // s_right_count shared memory storage for number of eigenvalues less than right interval limits - // mid midpoint of current interval (left of new interval) - // right upper limit of interval - // mid_count eigenvalues less than \a mid - // s_compaction_list list containing the indices where the data has to be stored - // num_threads_active number of active threads / intervals - // is_active_interval mark is thread has a second non-empty child interval - /////////////////////////////////////////////////////////////////////////////// - - - template<typename StringType> - void generate_bisect_kernel_compactIntervals(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" compactIntervals(__local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned int *s_left_count, \n"); - source.append(" __local unsigned int *s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" mid, \n"); - source.append(" "); source.append(numeric_string); source.append(" right, \n"); - source.append(" unsigned int mid_count, unsigned int right_count, \n"); - source.append(" __local unsigned int *s_compaction_list, \n"); - source.append(" unsigned int num_threads_active, \n"); - source.append(" unsigned int is_active_second) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" const unsigned int tid = lcl_id; \n"); - - // perform compaction / copy data for all threads where the second - // child is not dead - source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n"); - source.append(" { \n"); - source.append(" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n"); - source.append(" s_left[addr_w] = mid; \n"); - source.append(" s_right[addr_w] = right; \n"); - source.append(" s_left_count[addr_w] = mid_count; \n"); - source.append(" s_right_count[addr_w] = right_count; \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - - - - template<typename StringType> - void generate_bisect_kernel_compactIntervalsShort(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" compactIntervalsShort(__local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned short *s_left_count, \n"); - source.append(" __local unsigned short *s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" mid, \n"); - source.append(" "); source.append(numeric_string); source.append(" right, \n"); - source.append(" unsigned int mid_count, unsigned int right_count, \n"); - source.append(" __local unsigned short *s_compaction_list, \n"); - source.append(" unsigned int num_threads_active, \n"); - source.append(" unsigned int is_active_second) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" const unsigned int tid = lcl_id; \n"); - - // perform compaction / copy data for all threads where the second - // child is not dead - source.append(" if ((tid < num_threads_active) && (1 == is_active_second)) \n"); - source.append(" { \n"); - source.append(" unsigned int addr_w = num_threads_active + s_compaction_list[tid]; \n"); - source.append(" s_left[addr_w] = mid; \n"); - source.append(" s_right[addr_w] = right; \n"); - source.append(" s_left_count[addr_w] = mid_count; \n"); - source.append(" s_right_count[addr_w] = right_count; \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - - - template<typename StringType> - void generate_bisect_kernel_storeIntervalConverged(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" storeIntervalConverged( __local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned int *s_left_count, \n"); - source.append(" __local unsigned int *s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" *left, \n"); - source.append(" "); source.append(numeric_string); source.append(" *mid, \n"); - source.append(" "); source.append(numeric_string); source.append(" *right, \n"); - source.append(" unsigned int *left_count, \n"); - source.append(" unsigned int *mid_count, \n"); - source.append(" unsigned int *right_count, \n"); - source.append(" __local unsigned int *s_compaction_list_exc, \n"); - source.append(" __local unsigned int *compact_second_chunk, \n"); - source.append(" const unsigned int num_threads_active, \n"); - source.append(" unsigned int *is_active_second) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" const unsigned int tid = lcl_id; \n"); - source.append(" const unsigned int multiplicity = *right_count - *left_count; \n"); - // check multiplicity of eigenvalue - source.append(" if (1 == multiplicity) \n"); - source.append(" { \n"); - - // just re-store intervals, simple eigenvalue - source.append(" s_left[tid] = *left; \n"); - source.append(" s_right[tid] = *right; \n"); - source.append(" s_left_count[tid] = *left_count; \n"); - source.append(" s_right_count[tid] = *right_count; \n"); - source.append(" \n"); - - // mark that no second child / clear - source.append(" *is_active_second = 0; \n"); - source.append(" s_compaction_list_exc[tid] = 0; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - // number of eigenvalues after the split less than mid - source.append(" *mid_count = *left_count + (multiplicity >> 1); \n"); - - // store left interval - source.append(" s_left[tid] = *left; \n"); - source.append(" s_right[tid] = *right; \n"); - source.append(" s_left_count[tid] = *left_count; \n"); - source.append(" s_right_count[tid] = *mid_count; \n"); - source.append(" *mid = *left; \n"); - - // mark that second child interval exists - source.append(" *is_active_second = 1; \n"); - source.append(" s_compaction_list_exc[tid] = 1; \n"); - source.append(" *compact_second_chunk = 1; \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - - - - - template<typename StringType> - void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" storeIntervalConvergedShort(__local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned short *s_left_count, \n"); - source.append(" __local unsigned short *s_right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" *left, \n"); - source.append(" "); source.append(numeric_string); source.append(" *mid, \n"); - source.append(" "); source.append(numeric_string); source.append(" *right, \n"); - source.append(" unsigned int *left_count, \n"); - source.append(" unsigned int *mid_count, \n"); - source.append(" unsigned int *right_count, \n"); - source.append(" __local unsigned short *s_compaction_list_exc, \n"); - source.append(" __local unsigned int *compact_second_chunk, \n"); - source.append(" const unsigned int num_threads_active, \n"); - source.append(" unsigned int *is_active_second) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" const unsigned int tid = lcl_id; \n"); - source.append(" const unsigned int multiplicity = *right_count - *left_count; \n"); - // check multiplicity of eigenvalue - source.append(" if (1 == multiplicity) \n"); - source.append(" { \n"); - - // just re-store intervals, simple eigenvalue - source.append(" s_left[tid] = *left; \n"); - source.append(" s_right[tid] = *right; \n"); - source.append(" s_left_count[tid] = *left_count; \n"); - source.append(" s_right_count[tid] = *right_count; \n"); - source.append(" \n"); - - // mark that no second child / clear - source.append(" *is_active_second = 0; \n"); - source.append(" s_compaction_list_exc[tid] = 0; \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - // number of eigenvalues after the split less than mid - source.append(" *mid_count = *left_count + (multiplicity >> 1); \n"); - - // store left interval - source.append(" s_left[tid] = *left; \n"); - source.append(" s_right[tid] = *right; \n"); - source.append(" s_left_count[tid] = *left_count; \n"); - source.append(" s_right_count[tid] = *mid_count; \n"); - source.append(" *mid = *left; \n"); - - // mark that second child interval exists - source.append(" *is_active_second = 1; \n"); - source.append(" s_compaction_list_exc[tid] = 1; \n"); - source.append(" *compact_second_chunk = 1; \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - /////////////////////////////////////////////////////////////////////////////// - // Subdivide interval if active and not already converged - // - // tid id of thread - // s_left shared memory storage for left interval limits - // s_right shared memory storage for right interval limits - // s_left_count shared memory storage for number of eigenvalues less than left interval limits - // s_right_count shared memory storage for number of eigenvalues less than right interval limits - // num_threads_active number of active threads in warp - // left lower limit of interval - // right upper limit of interval - // left_count eigenvalues less than \a left - // right_count eigenvalues less than \a right - // all_threads_converged shared memory flag if all threads are converged - /////////////////////////////////////////////////////////////////////////////// - - - template<typename StringType> - void generate_bisect_kernel_subdivideActiveInterval(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" subdivideActiveIntervalMulti(const unsigned int tid, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned int *s_left_count, \n"); - source.append(" __local unsigned int *s_right_count, \n"); - source.append(" const unsigned int num_threads_active, \n"); - source.append(" "); source.append(numeric_string); source.append(" *left, \n"); - source.append(" "); source.append(numeric_string); source.append(" *right, \n"); - source.append(" unsigned int *left_count, unsigned int *right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" *mid, \n"); - source.append(" __local unsigned int *all_threads_converged) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - // for all active threads - source.append(" if (tid < num_threads_active) \n"); - source.append(" { \n"); - - source.append(" *left = s_left[tid]; \n"); - source.append(" *right = s_right[tid]; \n"); - source.append(" *left_count = s_left_count[tid]; \n"); - source.append(" *right_count = s_right_count[tid]; \n"); - - // check if thread already converged - source.append(" if (*left != *right) \n"); - source.append(" { \n"); - - source.append(" *mid = computeMidpoint(*left, *right); \n"); - source.append(" *all_threads_converged = 0; \n"); - source.append(" } \n"); - source.append(" else if ((*right_count - *left_count) > 1) \n"); - source.append(" { \n"); - // mark as not converged if multiple eigenvalues enclosed - // duplicate interval in storeIntervalsConverged() - source.append(" *all_threads_converged = 0; \n"); - source.append(" } \n"); - - source.append(" } \n"); - // end for all active threads - source.append(" } \n"); - } - - - template<typename StringType> - void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & source, std::string const & numeric_string) - { - source.append(" \n"); - source.append(" void \n"); - source.append(" subdivideActiveIntervalShort(const unsigned int tid, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_left, \n"); - source.append(" __local "); source.append(numeric_string); source.append(" *s_right, \n"); - source.append(" __local unsigned short *s_left_count, \n"); - source.append(" __local unsigned short *s_right_count, \n"); - source.append(" const unsigned int num_threads_active, \n"); - source.append(" "); source.append(numeric_string); source.append(" *left, \n"); - source.append(" "); source.append(numeric_string); source.append(" *right, \n"); - source.append(" unsigned int *left_count, unsigned int *right_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" *mid, \n"); - source.append(" __local unsigned int *all_threads_converged) \n"); - source.append(" { \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - // for all active threads - source.append(" if (tid < num_threads_active) \n"); - source.append(" { \n"); - - source.append(" *left = s_left[tid]; \n"); - source.append(" *right = s_right[tid]; \n"); - source.append(" *left_count = s_left_count[tid]; \n"); - source.append(" *right_count = s_right_count[tid]; \n"); - - // check if thread already converged - source.append(" if (*left != *right) \n"); - source.append(" { \n"); - - source.append(" *mid = computeMidpoint(*left, *right); \n"); - source.append(" *all_threads_converged = 0; \n"); - source.append(" } \n"); - - source.append(" } \n"); - // end for all active threads - source.append(" } \n"); - } - - // end of utilities - // start of kernels - - - //////////////////////////////////////////////////////////////////////////////// - // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix - // - // g_d diagonal elements in global memory - // g_s superdiagonal elements in global elements (stored so that the element *(g_s - 1) can be accessed an equals 0 - // n size of matrix - // lg lower bound of input interval (e.g. Gerschgorin interval) - // ug upper bound of input interval (e.g. Gerschgorin interval) - // lg_eig_count number of eigenvalues that are smaller than \a lg - // lu_eig_count number of eigenvalues that are smaller than \a lu - // epsilon desired accuracy of eigenvalues to compute - //////////////////////////////////////////////////////////////////////////////// - /// - template <typename StringType> - void generate_bisect_kernel_bisectKernel(StringType & source, std::string const & numeric_string) - { - source.append(" __kernel \n"); - source.append(" void \n"); - source.append(" bisectKernelSmall(__global "); source.append(numeric_string); source.append(" *g_d, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n"); - source.append(" const unsigned int n, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n"); - source.append(" __global unsigned int *g_left_count, __global unsigned int *g_right_count, \n"); - source.append(" const "); source.append(numeric_string); source.append(" lg, \n"); - source.append(" const "); source.append(numeric_string); source.append(" ug, \n"); - source.append(" const unsigned int lg_eig_count, const unsigned int ug_eig_count, \n"); - source.append(" "); source.append(numeric_string); source.append(" epsilon \n"); - source.append(" ) \n"); - source.append(" { \n"); - source.append(" g_s = g_s + 1; \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - // intervals (store left and right because the subdivision tree is in general - // not dense - source.append(" __local "); source.append(numeric_string); source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n"); - source.append(" __local "); source.append(numeric_string); source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n"); - - // number of eigenvalues that are smaller than s_left / s_right - // (correspondence is realized via indices) - source.append(" __local unsigned int s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n"); - source.append(" __local unsigned int s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX]; \n"); - - // helper for stream compaction - source.append(" __local unsigned int \n"); - source.append(" s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1]; \n"); - - // state variables for whole block - // if 0 then compaction of second chunk of child intervals is not necessary - // (because all intervals had exactly one non-dead child) - source.append(" __local unsigned int compact_second_chunk; \n"); - source.append(" __local unsigned int all_threads_converged; \n"); - - // number of currently active threads - source.append(" __local unsigned int num_threads_active; \n"); - - // number of threads to use for stream compaction - source.append(" __local unsigned int num_threads_compaction; \n"); - - // helper for exclusive scan - source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n"); - - - // variables for currently processed interval - // left and right limit of active interval - source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n"); - source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n"); - source.append(" unsigned int left_count = 0; \n"); - source.append(" unsigned int right_count = 0; \n"); - // midpoint of active interval - source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n"); - // number of eigenvalues smaller then mid - source.append(" unsigned int mid_count = 0; \n"); - // affected from compaction - source.append(" unsigned int is_active_second = 0; \n"); - - source.append(" s_compaction_list[lcl_id] = 0; \n"); - source.append(" s_left[lcl_id] = 0.0; \n"); - source.append(" s_right[lcl_id] = 0.0; \n"); - source.append(" s_left_count[lcl_id] = 0; \n"); - source.append(" s_right_count[lcl_id] = 0; \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // set up initial configuration - source.append(" if (0 == lcl_id) \n"); - source.append(" { \n"); - source.append(" s_left[0] = lg; \n"); - source.append(" s_right[0] = ug; \n"); - source.append(" s_left_count[0] = lg_eig_count; \n"); - source.append(" s_right_count[0] = ug_eig_count; \n"); - - source.append(" compact_second_chunk = 0; \n"); - source.append(" num_threads_active = 1; \n"); - - source.append(" num_threads_compaction = 1; \n"); - source.append(" } \n"); - - // for all active threads read intervals from the last level - // the number of (worst case) active threads per level l is 2^l - - source.append(" while (true) \n"); - source.append(" { \n"); - - source.append(" all_threads_converged = 1; \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" is_active_second = 0; \n"); - source.append(" subdivideActiveIntervalMulti(lcl_id, \n"); - source.append(" s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" num_threads_active, \n"); - source.append(" &left, &right, &left_count, &right_count, \n"); - source.append(" &mid, &all_threads_converged); \n"); - // source.append(" output[lcl_id] = s_left; \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // check if done - source.append(" if (1 == all_threads_converged) \n"); - source.append(" { \n"); - source.append(" break; \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // compute number of eigenvalues smaller than mid - // use all threads for reading the necessary matrix data from global - // memory - // use s_left and s_right as scratch space for diagonal and - // superdiagonal of matrix - source.append(" mid_count = computeNumSmallerEigenvals(g_d, g_s, n, mid, \n"); - source.append(" lcl_id, num_threads_active, \n"); - source.append(" s_left, s_right, \n"); - source.append(" (left == right)); \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // store intervals - // for all threads store the first child interval in a continuous chunk of - // memory, and the second child interval -- if it exists -- in a second - // chunk; it is likely that all threads reach convergence up to - // \a epsilon at the same level; furthermore, for higher level most / all - // threads will have only one child, storing the first child compactly will - // (first) avoid to perform a compaction step on the first chunk, (second) - // make it for higher levels (when all threads / intervals have - // exactly one child) unnecessary to perform a compaction of the second - // chunk - source.append(" if (lcl_id < num_threads_active) \n"); - source.append(" { \n"); - - source.append(" if (left != right) \n"); - source.append(" { \n"); - - // store intervals - source.append(" storeNonEmptyIntervals(lcl_id, num_threads_active, \n"); - source.append(" s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" left, mid, right, \n"); - source.append(" left_count, mid_count, right_count, \n"); - source.append(" epsilon, &compact_second_chunk, \n"); - source.append(" s_compaction_list_exc, \n"); - source.append(" &is_active_second); \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" &left, &mid, &right, \n"); - source.append(" &left_count, &mid_count, &right_count, \n"); - source.append(" s_compaction_list_exc, &compact_second_chunk, \n"); - source.append(" num_threads_active, \n"); - source.append(" &is_active_second); \n"); - source.append(" } \n"); - source.append(" } \n"); - - // necessary so that compact_second_chunk is up-to-date - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // perform compaction of chunk where second children are stored - // scan of (num_threads_actieigenvaluesve / 2) elements, thus at most - // (num_threads_active / 4) threads are needed - source.append(" if (compact_second_chunk > 0) \n"); - source.append(" { \n"); - - source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n"); - - source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" mid, right, mid_count, right_count, \n"); - source.append(" s_compaction_list, num_threads_active, \n"); - source.append(" is_active_second); \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" if (0 == lcl_id) \n"); - source.append(" { \n"); - - // update number of active threads with result of reduction - source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n"); - - source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n"); - - source.append(" compact_second_chunk = 0; \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // write resulting intervals to global mem - // for all threads write if they have been converged to an eigenvalue to - // a separate array - - // at most n valid intervals - source.append(" if (lcl_id < n) \n"); - source.append(" { \n"); - // intervals converged so left and right limit are identical - source.append(" g_left[lcl_id] = s_left[lcl_id]; \n"); - // left count is sufficient to have global order - source.append(" g_left_count[lcl_id] = s_left_count[lcl_id]; \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - //////////////////////////////////////////////////////////////////////////////// - // Perform second step of bisection algorithm for large matrices for intervals that after the first step contained more than one eigenvalue - // - // g_d diagonal elements of symmetric, tridiagonal matrix - // g_s superdiagonal elements of symmetric, tridiagonal matrix - // n matrix size - // blocks_mult start addresses of blocks of intervals that are processed by one block of threads, each of the intervals contains more than one eigenvalue - // blocks_mult_sum total number of eigenvalues / singleton intervals in one block of intervals - // g_left left limits of intervals - // g_right right limits of intervals - // g_left_count number of eigenvalues less than left limits - // g_right_count number of eigenvalues less than right limits - // g_lambda final eigenvalue - // g_pos index of eigenvalue (in ascending order) - // precision desired precision of eigenvalues - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & source, std::string const & numeric_string) - { - source.append(" __kernel \n"); - source.append(" void \n"); - source.append(" bisectKernelLarge_MultIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n"); - source.append(" const unsigned int n, \n"); - source.append(" __global unsigned int *blocks_mult, \n"); - source.append(" __global unsigned int *blocks_mult_sum, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n"); - source.append(" __global unsigned int *g_left_count, \n"); - source.append(" __global unsigned int *g_right_count, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_lambda, \n"); - source.append(" __global unsigned int *g_pos, \n"); - source.append(" "); source.append(numeric_string); source.append(" precision \n"); - source.append(" ) \n"); - source.append(" { \n"); - source.append(" g_s = g_s + 1; \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - - source.append(" const unsigned int tid = lcl_id; \n"); - - // left and right limits of interval - source.append(" __local "); source.append(numeric_string); source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n"); - source.append(" __local "); source.append(numeric_string); source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n"); - - // number of eigenvalues smaller than interval limits - source.append(" __local unsigned int s_left_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n"); - source.append(" __local unsigned int s_right_count[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK]; \n"); - - // helper array for chunk compaction of second chunk - source.append(" __local unsigned int s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1]; \n"); - // compaction list helper for exclusive scan - source.append(" __local unsigned int *s_compaction_list_exc = s_compaction_list + 1; \n"); - - // flag if all threads are converged - source.append(" __local unsigned int all_threads_converged; \n"); - // number of active threads - source.append(" __local unsigned int num_threads_active; \n"); - // number of threads to employ for compaction - source.append(" __local unsigned int num_threads_compaction; \n"); - // flag if second chunk has to be compacted - source.append(" __local unsigned int compact_second_chunk; \n"); - - // parameters of block of intervals processed by this block of threads - source.append(" __local unsigned int c_block_start; \n"); - source.append(" __local unsigned int c_block_end; \n"); - source.append(" __local unsigned int c_block_offset_output; \n"); - - // midpoint of currently active interval of the thread - source.append(" "); source.append(numeric_string); source.append(" mid = 0.0f; \n"); - // number of eigenvalues smaller than \a mid - source.append(" unsigned int mid_count = 0; \n"); - // current interval parameter - source.append(" "); source.append(numeric_string); source.append(" left = 0.0f; \n"); - source.append(" "); source.append(numeric_string); source.append(" right = 0.0f; \n"); - source.append(" unsigned int left_count = 0; \n"); - source.append(" unsigned int right_count = 0; \n"); - // helper for compaction, keep track which threads have a second child - source.append(" unsigned int is_active_second = 0; \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n"); - - // initialize common start conditions - source.append(" if (0 == tid) \n"); - source.append(" { \n"); - - source.append(" c_block_start = blocks_mult[grp_id]; \n"); - source.append(" c_block_end = blocks_mult[grp_id + 1]; \n"); - source.append(" c_block_offset_output = blocks_mult_sum[grp_id]; \n"); - source.append(" \n"); - - source.append(" num_threads_active = c_block_end - c_block_start; \n"); - source.append(" s_compaction_list[0] = 0; \n"); - source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n"); - - source.append(" all_threads_converged = 1; \n"); - source.append(" compact_second_chunk = 0; \n"); - source.append(" } \n"); - source.append(" s_left_count [tid] = 42; \n"); - source.append(" s_right_count[tid] = 42; \n"); - source.append(" s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n"); - source.append(" s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n"); - source.append(" \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - source.append(" \n"); - - // read data into shared memory - source.append(" if (tid < num_threads_active) \n"); - source.append(" { \n"); - - source.append(" s_left[tid] = g_left[c_block_start + tid]; \n"); - source.append(" s_right[tid] = g_right[c_block_start + tid]; \n"); - source.append(" s_left_count[tid] = g_left_count[c_block_start + tid]; \n"); - source.append(" s_right_count[tid] = g_right_count[c_block_start + tid]; \n"); - source.append(" \n"); - source.append(" } \n"); - source.append(" \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - source.append(" unsigned int iter = 0; \n"); - // do until all threads converged - source.append(" while (true) \n"); - source.append(" { \n"); - source.append(" iter++; \n"); - //for (int iter=0; iter < 0; iter++) { - source.append(" s_compaction_list[lcl_id] = 0; \n"); - source.append(" s_compaction_list[lcl_id + lcl_sz] = 0; \n"); - source.append(" s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0; \n"); - - // subdivide interval if currently active and not already converged - source.append(" subdivideActiveIntervalMulti(tid, s_left, s_right, \n"); - source.append(" s_left_count, s_right_count, \n"); - source.append(" num_threads_active, \n"); - source.append(" &left, &right, &left_count, &right_count, \n"); - source.append(" &mid, &all_threads_converged); \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // stop if all eigenvalues have been found - source.append(" if (1 == all_threads_converged) \n"); - source.append(" { \n"); - source.append(" \n"); - source.append(" break; \n"); - source.append(" } \n"); - - // compute number of eigenvalues smaller than mid for active and not - // converged intervals, use all threads for loading data from gmem and - // s_left and s_right as scratch space to store the data load from gmem - // in shared memory - source.append(" mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n, \n"); - source.append(" mid, tid, num_threads_active, \n"); - source.append(" s_left, s_right, \n"); - source.append(" (left == right)); \n"); - source.append(" \n"); - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" if (tid < num_threads_active) \n"); - source.append(" { \n"); - source.append(" \n"); - // store intervals - source.append(" if (left != right) \n"); - source.append(" { \n"); - - source.append(" storeNonEmptyIntervals(tid, num_threads_active, \n"); - source.append(" s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" left, mid, right, \n"); - source.append(" left_count, mid_count, right_count, \n"); - source.append(" precision, &compact_second_chunk, \n"); - source.append(" s_compaction_list_exc, \n"); - source.append(" &is_active_second); \n"); - source.append(" \n"); - source.append(" } \n"); - source.append(" else \n"); - source.append(" { \n"); - - source.append(" storeIntervalConverged(s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" &left, &mid, &right, \n"); - source.append(" &left_count, &mid_count, &right_count, \n"); - source.append(" s_compaction_list_exc, &compact_second_chunk, \n"); - source.append(" num_threads_active, \n"); - source.append(" &is_active_second); \n"); - source.append(" \n"); - source.append(" } \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // compact second chunk of intervals if any of the threads generated - // two child intervals - source.append(" if (1 == compact_second_chunk) \n"); - source.append(" { \n"); - - source.append(" createIndicesCompaction(s_compaction_list_exc, num_threads_compaction); \n"); - source.append(" compactIntervals(s_left, s_right, s_left_count, s_right_count, \n"); - source.append(" mid, right, mid_count, right_count, \n"); - source.append(" s_compaction_list, num_threads_active, \n"); - source.append(" is_active_second); \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // update state variables - source.append(" if (0 == tid) \n"); - source.append(" { \n"); - source.append(" num_threads_active += s_compaction_list[num_threads_active]; \n"); - source.append(" num_threads_compaction = ceilPow2(num_threads_active); \n"); - - source.append(" compact_second_chunk = 0; \n"); - source.append(" all_threads_converged = 1; \n"); - source.append(" } \n"); - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - // clear - source.append(" s_compaction_list_exc[lcl_id] = 0; \n"); - source.append(" s_compaction_list_exc[lcl_id + lcl_sz] = 0; \n"); - source.append(" \n"); - source.append(" if (num_threads_compaction > lcl_sz) \n"); - source.append(" { \n"); - source.append(" break; \n"); - source.append(" } \n"); - - - source.append(" barrier(CLK_LOCAL_MEM_FENCE) ; \n"); - - source.append(" } \n"); // end until all threads converged - - // write data back to global memory - source.append(" if (tid < num_threads_active) \n"); - source.append(" { \n"); - - source.append(" unsigned int addr = c_block_offset_output + tid; \n"); - source.append(" \n"); - source.append(" g_lambda[addr] = s_left[tid]; \n"); - source.append(" g_pos[addr] = s_right_count[tid]; \n"); - source.append(" } \n"); - source.append(" } \n"); - } - - - //////////////////////////////////////////////////////////////////////////////// - // Determine eigenvalues for large matrices for intervals that after the first step contained one eigenvalue - // - // g_d diagonal elements of symmetric, tridiagonal matrix - // g_s superdiagonal elements of symmetric, tridiagonal matrix - // n matrix size - // num_intervals total number of intervals containing one eigenvalue after the first step - // g_left left interval limits - // g_right right interval limits - // g_pos index of interval / number of intervals that are smaller than right interval limit - // precision desired precision of eigenvalues - //////////////////////////////////////////////////////////////////////////////// - - template <typename StringType> - void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & source, std::string const & numeric_string) - { - source.append(" __kernel \n"); - source.append(" void \n"); - source.append(" bisectKernelLarge_OneIntervals(__global "); source.append(numeric_string); source.append(" *g_d, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_s, \n"); - source.append(" const unsigned int n, \n"); - source.append(" unsigned int num_intervals, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_left, \n"); - source.append(" __global "); source.append(numeric_string); source.append(" *g_right, \n"); - source.append(" __global unsigned int *g_pos, \n"); - source.append(" "); source.append(numeric_string); source.append(" precision) \n"); - source.append(" { \n"); - source.append(" g_s = g_s + 1; \n"); - source.append(" uint glb_id = get_global_id(0); \n"); - source.append(" uint grp_id = get_group_id(0); \n"); - source.append(" uint grp_nm = get_num_groups(0); \n"); - source.append(" uint lcl_id = get_local_id(0); \n"); - source.append(" uint lcl_sz = get_local_size(0); \n"); - source.append(" const unsigned int gtid = (lcl_sz * grp_id) + lcl_id; \n"); - source.append(" __
<TRUNCATED>
