http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
deleted file mode 100644
index 64c12b0..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/opencl/kernels/bisect.hpp
+++ /dev/null
@@ -1,2645 +0,0 @@
-#ifndef VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
-#define VIENNACL_LINALG_OPENCL_KERNELS_BISECT_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/opencl/kernels/bisect.hpp
-    @brief OpenCL kernels for the bisection algorithm for eigenvalues
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-
-
-#include "viennacl/tools/tools.hpp"
-#include "viennacl/ocl/kernel.hpp"
-#include "viennacl/ocl/platform.hpp"
-#include "viennacl/ocl/utils.hpp"
-
-#include "viennacl/linalg/opencl/common.hpp"
-
-// declaration, forward
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace opencl
-{
-namespace kernels
-{
-  template <typename StringType>
-  void generate_bisect_kernel_config(StringType & source)
-  {
-    /* Global configuration parameter */
-    source.append("     #define  VIENNACL_BISECT_MAX_THREADS_BLOCK             
   256\n");
-    source.append("     #define  VIENNACL_BISECT_MAX_SMALL_MATRIX              
   256\n");
-    source.append("     #define  
VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX   256\n");
-    source.append("     #define  VIENNACL_BISECT_MIN_ABS_INTERVAL              
   5.0e-37\n");
-
-  }
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Compute the next lower power of two of n
-  // n    number for which next higher power of two is seeked
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_floorPow2(StringType & source, std::string const 
& numeric_string)
-  {
-  source.append("       \n");
-  source.append("     inline int  \n");
-  source.append("     floorPow2(int n)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0);  \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-
-      // early out if already power of two
-  source.append("         if (0 == (n & (n-1)))  \n");
-  source.append("         {  \n");
-  source.append("             return n;  \n");
-  source.append("         }  \n");
-
-  source.append("         int exp;  \n");
-  source.append("         frexp(( "); source.append(numeric_string); 
source.append(" )n, &exp);  \n");
-  source.append("         return (1 << (exp - 1));  \n");
-  source.append("     }  \n");
-
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Compute the next higher power of two of n
-  // n  number for which next higher power of two is seeked
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_ceilPow2(StringType & source, std::string const 
& numeric_string)
-  {
-  source.append("       \n");
-  source.append("     inline int  \n");
-  source.append("     ceilPow2(int n)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-
-      // early out if already power of two
-  source.append("         if (0 == (n & (n-1)))  \n");
-  source.append("         {  \n");
-  source.append("             return n;  \n");
-  source.append("         }  \n");
-
-  source.append("         int exp;  \n");
-  source.append("         frexp(( "); source.append(numeric_string); 
source.append(" )n, &exp);  \n");
-  source.append("         return (1 << exp);  \n");
-  source.append("     }  \n");
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Compute midpoint of interval [\a left, \a right] avoiding overflow if 
possible
-  //
-  // left     left  / lower limit of interval
-  // right    right / upper limit of interval
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_computeMidpoint(StringType & source, std::string 
const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     inline "); source.append(numeric_string); 
source.append(" \n");
-  source.append("     computeMidpoint(const "); source.append(numeric_string); 
source.append(" left,\n");
-  source.append("       const "); source.append(numeric_string); 
source.append("  right)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-  source.append("          "); source.append(numeric_string); source.append("  
mid;  \n");
-
-  source.append("         if (sign(left) == sign(right))  \n");
-  source.append("         {  \n");
-  source.append("             mid = left + (right - left) * 0.5f;  \n");
-  source.append("         }  \n");
-  source.append("         else  \n");
-  source.append("         {  \n");
-  source.append("             mid = (left + right) * 0.5f;  \n");
-  source.append("         }  \n");
-
-  source.append("         return mid;  \n");
-  source.append("     }  \n");
-
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Check if interval converged and store appropriately
-  //
-  // addr           address where to store the information of the interval
-  // s_left         shared memory storage for left interval limits
-  // s_right        shared memory storage for right interval limits
-  // s_left_count   shared memory storage for number of eigenvalues less than 
left interval limits
-  // s_right_count  shared memory storage for number of eigenvalues less than 
right interval limits
-  // left           lower limit of interval
-  // right          upper limit of interval
-  // left_count     eigenvalues less than \a left
-  // right_count    eigenvalues less than \a right
-  // precision      desired precision for eigenvalues
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template<typename StringType>
-  void generate_bisect_kernel_storeInterval(StringType & source, std::string 
const & numeric_string)
-  {
-  source.append("     \n");
-  source.append("     void  \n");
-  source.append("     storeInterval(unsigned int addr,  \n");
-  source.append("                   __local "); source.append(numeric_string); 
source.append(" * s_left,   \n");
-  source.append("                   __local "); source.append(numeric_string); 
source.append(" * s_right,  \n");
-  source.append("                   __local unsigned int * s_left_count,  \n");
-  source.append("                   __local unsigned int * s_right_count,  
\n");
-  source.append("                    "); source.append(numeric_string); 
source.append(" left,   \n");
-  source.append("                    "); source.append(numeric_string); 
source.append(" right,  \n");
-  source.append("                   unsigned int left_count, \n");
-  source.append("                   unsigned int right_count,  \n");
-  source.append("                    "); source.append(numeric_string); 
source.append("  precision)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("         s_left_count[addr] = left_count;  \n");
-  source.append("         s_right_count[addr] = right_count;  \n");
-
-      // check if interval converged
-  source.append("          "); source.append(numeric_string); source.append(" 
t0 = fabs(right - left);  \n");
-  source.append("          "); source.append(numeric_string); source.append(" 
t1 = max(fabs(left), fabs(right)) * precision;  \n");
-
-  source.append("         if (t0 <= max(( "); source.append(numeric_string); 
source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
-  source.append("         {  \n");
-          // compute mid point
-  source.append("              "); source.append(numeric_string); 
source.append(" lambda = computeMidpoint(left, right);  \n");
-
-          // mark as converged
-  source.append("             s_left[addr] = lambda;  \n");
-  source.append("             s_right[addr] = lambda;  \n");
-  source.append("         }  \n");
-  source.append("         else  \n");
-  source.append("         {  \n");
-
-          // store current limits
-  source.append("             s_left[addr] = left;  \n");
-  source.append("             s_right[addr] = right;  \n");
-  source.append("         }  \n");
-
-  source.append("     }  \n");
-
-  }
-
-  template<typename StringType>
-  void generate_bisect_kernel_storeIntervalShort(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("     \n");
-  source.append("     void  \n");
-  source.append("     storeIntervalShort(unsigned int addr,  \n");
-  source.append("                   __local "); source.append(numeric_string); 
source.append(" * s_left,   \n");
-  source.append("                   __local "); source.append(numeric_string); 
source.append(" * s_right,  \n");
-  source.append("                   __local unsigned short * s_left_count,  
\n");
-  source.append("                   __local unsigned short * s_right_count,  
\n");
-  source.append("                    "); source.append(numeric_string); 
source.append(" left,   \n");
-  source.append("                    "); source.append(numeric_string); 
source.append(" right,  \n");
-  source.append("                   unsigned int left_count, \n");
-  source.append("                   unsigned int right_count,  \n");
-  source.append("                    "); source.append(numeric_string); 
source.append("  precision)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("         s_left_count[addr] = left_count;  \n");
-  source.append("         s_right_count[addr] = right_count;  \n");
-
-      // check if interval converged
-  source.append("          "); source.append(numeric_string); source.append(" 
t0 = fabs(right - left);  \n");
-  source.append("          "); source.append(numeric_string); source.append(" 
t1 = max(fabs(left), fabs(right)) * precision;  \n");
-
-  source.append("         if (t0 <= max(( "); source.append(numeric_string); 
source.append(" )VIENNACL_BISECT_MIN_ABS_INTERVAL, t1))  \n");
-  source.append("         {  \n");
-          // compute mid point
-  source.append("              "); source.append(numeric_string); 
source.append(" lambda = computeMidpoint(left, right);  \n");
-
-          // mark as converged
-  source.append("             s_left[addr] = lambda;  \n");
-  source.append("             s_right[addr] = lambda;  \n");
-  source.append("         }  \n");
-  source.append("         else  \n");
-  source.append("         {  \n");
-
-          // store current limits
-  source.append("             s_left[addr] = left;  \n");
-  source.append("             s_right[addr] = right;  \n");
-  source.append("         }  \n");
-
-  source.append("     }  \n");
-
-
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Compute number of eigenvalues that are smaller than x given a symmetric,
-  // real, and tridiagonal matrix
-  //
-  // g_d                   diagonal elements stored in global memory
-  // g_s                   superdiagonal elements stored in global memory
-  // n                     size of matrix
-  // x                     value for which the number of eigenvalues that are 
smaller is sought
-  // tid                   thread identified (e.g. threadIdx.x or gtid)
-  // num_intervals_active  number of active intervals / threads that currently 
process an interval
-  // s_d                   scratch space to store diagonal entries of the 
tridiagonal matrix in shared memory
-  // s_s                   scratch space to store superdiagonal entries of the 
tridiagonal matrix in shared memory
-  // converged             flag if the current thread is already converged 
(that is count does not have to be computed)
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_computeNumSmallerEigenvals(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     inline unsigned int  \n");
-  source.append("     computeNumSmallerEigenvals(__global "); 
source.append(numeric_string); source.append(" *g_d,   \n");
-  source.append("                                __global "); 
source.append(numeric_string); source.append(" *g_s,   \n");
-  source.append("                                const unsigned int n,  \n");
-  source.append("                                const "); 
source.append(numeric_string); source.append(" x,         \n");
-  source.append("                                const unsigned int tid,  \n");
-  source.append("                                const unsigned int 
num_intervals_active,  \n");
-  source.append("                                __local "); 
source.append(numeric_string); source.append(" *s_d,  \n");
-  source.append("                                __local "); 
source.append(numeric_string); source.append(" *s_s,  \n");
-  source.append("                                unsigned int converged  \n");
-  source.append("                               )  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-
-  source.append("          "); source.append(numeric_string); source.append(" 
delta = 1.0f;  \n");
-  source.append("         unsigned int count = 0;  \n");
-
-  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      // read data into shared memory
-  source.append("         if (lcl_id < n)  \n");
-  source.append("         {  \n");
-  source.append("             s_d[lcl_id] = *(g_d + lcl_id);  \n");
-  source.append("             s_s[lcl_id] = *(g_s + lcl_id - 1);  \n");
-  source.append("         }  \n");
-
-  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      // perform loop only for active threads
-  source.append("         if ((tid < num_intervals_active) && (0 == 
converged))  \n");
-  source.append("         {  \n");
-
-          // perform (optimized) Gaussian elimination to determine the number
-          // of eigenvalues that are smaller than n
-  source.append("             for (unsigned int k = 0; k < n; ++k)  \n");
-  source.append("             {  \n");
-  source.append("                 delta = s_d[k] - x - (s_s[k] * s_s[k]) / 
delta;  \n");
-  source.append("                 count += (delta < 0) ? 1 : 0;  \n");
-  source.append("             }  \n");
-
-  source.append("         } \n"); // end if thread currently processing an 
interval
-
-  source.append("         return count;  \n");
-  source.append("     }  \n");
-
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Compute number of eigenvalues that are smaller than x given a symmetric,
-  // real, and tridiagonal matrix
-  //
-  // g_d                   diagonal elements stored in global memory
-  // g_s                   superdiagonal elements stored in global memory
-  // n                     size of matrix
-  // x                     value for which the number of eigenvalues that are 
smaller is seeked
-  // tid                   thread identified (e.g. threadIdx.x or gtid)
-  // num_intervals_active  number of active intervals / threads that currently 
process an interval
-  // s_d                   scratch space to store diagonal entries of the 
tridiagonal matrix in shared memory
-  // s_s                   scratch space to store superdiagonal entries of the 
tridiagonal matrix in shared memory
-  // converged             flag if the current thread is already converged 
(that is count does not have to be computed)
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_computeNumSmallerEigenvalsLarge(StringType & 
source, std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     inline unsigned int  \n");
-  source.append("     computeNumSmallerEigenvalsLarge(__global "); 
source.append(numeric_string); source.append(" *g_d,   \n");
-  source.append("                                __global "); 
source.append(numeric_string); source.append(" *g_s,   \n");
-  source.append("                                const unsigned int n,  \n");
-  source.append("                                const "); 
source.append(numeric_string); source.append(" x,         \n");
-  source.append("                                const unsigned int tid,  \n");
-  source.append("                                const unsigned int 
num_intervals_active,  \n");
-  source.append("                                __local "); 
source.append(numeric_string); source.append(" *s_d,  \n");
-  source.append("                                __local "); 
source.append(numeric_string); source.append(" *s_s,  \n");
-  source.append("                                unsigned int converged  \n");
-  source.append("                               )  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("          "); source.append(numeric_string); source.append(" 
delta = 1.0f;  \n");
-  source.append("         unsigned int count = 0;  \n");
-
-  source.append("         unsigned int rem = n;  \n");
-
-      // do until whole diagonal and superdiagonal has been loaded and 
processed
-  source.append("         for (unsigned int i = 0; i < n; i += lcl_sz)  \n");
-  source.append("         {  \n");
-
-  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-          // read new chunk of data into shared memory
-  source.append("             if ((i + lcl_id) < n)  \n");
-  source.append("             {  \n");
-
-  source.append("                 s_d[lcl_id] = *(g_d + i + lcl_id);  \n");
-  source.append("                 s_s[lcl_id] = *(g_s + i + lcl_id - 1);  \n");
-  source.append("             }  \n");
-
-  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-
-  source.append("             if (tid < num_intervals_active)  \n");
-  source.append("             {  \n");
-
-              // perform (optimized) Gaussian elimination to determine the 
number
-              // of eigenvalues that are smaller than n
-  source.append("                 for (unsigned int k = 0; k < 
min(rem,lcl_sz); ++k)  \n");
-  source.append("                 {  \n");
-  source.append("                     delta = s_d[k] - x - (s_s[k] * s_s[k]) / 
delta;  \n");
-                  // delta = (abs( delta) < (1.0e-10)) ? -(1.0e-10) : delta;
-  source.append("                     count += (delta < 0) ? 1 : 0;  \n");
-  source.append("                 }  \n");
-
-  source.append("             } \n"); // end if thread currently processing an 
interval
-
-  source.append("             rem -= lcl_sz;  \n");
-  source.append("         }  \n");
-
-  source.append("         return count;  \n");
-  source.append("     }  \n");
-
-
-  }
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Store all non-empty intervals resulting from the subdivision of the 
interval
-  // currently processed by the thread
-  //
-  // addr                     base address for storing intervals
-  // num_threads_active       number of threads / intervals in current sweep
-  // s_left                   shared memory storage for left interval limits
-  // s_right                  shared memory storage for right interval limits
-  // s_left_count             shared memory storage for number of eigenvalues 
less than left interval limits
-  // s_right_count            shared memory storage for number of eigenvalues 
less than right interval limits
-  // left                     lower limit of interval
-  // mid                      midpoint of interval
-  // right                    upper limit of interval
-  // left_count               eigenvalues less than \a left
-  // mid_count                eigenvalues less than \a mid
-  // right_count              eigenvalues less than \a right
-  // precision                desired precision for eigenvalues
-  // compact_second_chunk     shared mem flag if second chunk is used and ergo 
requires compaction
-  // s_compaction_list_exc    helper array for stream compaction, 
s_compaction_list_exc[tid] = 1 when the thread generated two child intervals
-  // is_active_interval       mark is thread has a second non-empty child 
interval
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template<typename StringType>
-  void generate_bisect_kernel_storeNonEmptyIntervals(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     storeNonEmptyIntervals(unsigned int addr,  \n");
-  source.append("                            const unsigned int 
num_threads_active,  \n");
-  source.append("                            __local "); 
source.append(numeric_string); source.append(" *s_left,   \n");
-  source.append("                            __local "); 
source.append(numeric_string); source.append(" *s_right,  \n");
-  source.append("                            __local unsigned int 
*s_left_count,  \n");
-  source.append("                            __local unsigned int 
*s_right_count,  \n");
-  source.append("                             "); 
source.append(numeric_string); source.append(" left, \n ");
-  source.append("                             "); 
source.append(numeric_string); source.append(" mid,  \n");
-  source.append("                             "); 
source.append(numeric_string); source.append(" right,\n");
-  source.append("                            const unsigned int left_count,  
\n");
-  source.append("                            const unsigned int mid_count,  
\n");
-  source.append("                            const unsigned int right_count,  
\n");
-  source.append("                             "); 
source.append(numeric_string); source.append(" precision,  \n");
-  source.append("                            __local unsigned int 
*compact_second_chunk,  \n");
-  source.append("                            __local unsigned int 
*s_compaction_list_exc,  \n");
-  source.append("                            unsigned int *is_active_second)  
\n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-      // check if both child intervals are valid
-  source.append("          \n");
-  source.append("         if ((left_count != mid_count) && (mid_count != 
right_count))  \n");
-  source.append("         {  \n");
-
-          // store the left interval
-  source.append("             storeInterval(addr, s_left, s_right, 
s_left_count, s_right_count,  \n");
-  source.append("                           left, mid, left_count, mid_count, 
precision);  \n");
-
-          // mark that a second interval has been generated, only stored after
-          // stream compaction of second chunk
-  source.append("             *is_active_second = 1;  \n");
-  source.append("             s_compaction_list_exc[lcl_id] = 1;  \n");
-  source.append("             *compact_second_chunk = 1;  \n");
-  source.append("         }  \n");
-  source.append("         else  \n");
-  source.append("         {  \n");
-
-          // only one non-empty child interval
-
-          // mark that no second child
-  source.append("             *is_active_second = 0;  \n");
-  source.append("             s_compaction_list_exc[lcl_id] = 0;  \n");
-
-          // store the one valid child interval
-  source.append("             if (left_count != mid_count)  \n");
-  source.append("             {  \n");
-  source.append("                 storeInterval(addr, s_left, s_right, 
s_left_count, s_right_count,  \n");
-  source.append("                               left, mid, left_count, 
mid_count, precision);  \n");
-  source.append("             }  \n");
-  source.append("             else  \n");
-  source.append("             {  \n");
-  source.append("                 storeInterval(addr, s_left, s_right, 
s_left_count, s_right_count,  \n");
-  source.append("                               mid, right, mid_count, 
right_count, precision);  \n");
-  source.append("             }  \n");
-
-  source.append("         }  \n");
-  source.append("     }  \n");
-
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  //! Store all non-empty intervals resulting from the subdivision of the 
interval
-  //! currently processed by the thread
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_storeNonEmptyIntervalsLarge(StringType & source, 
std::string const & numeric_string)
-  {
-      source.append("       \n");
-      source.append("     void  \n");
-      source.append("     storeNonEmptyIntervalsLarge(unsigned int addr,  \n");
-      source.append("                            const unsigned int 
num_threads_active,  \n");
-      source.append("                            __local "); 
source.append(numeric_string); source.append(" *s_left,   \n");
-      source.append("                            __local "); 
source.append(numeric_string); source.append(" *s_right,  \n");
-      source.append("                            __local unsigned short 
*s_left_count,  \n");
-      source.append("                            __local unsigned short 
*s_right_count,  \n");
-      source.append("                             "); 
source.append(numeric_string); source.append(" left, \n ");
-      source.append("                             "); 
source.append(numeric_string); source.append(" mid,  \n");
-      source.append("                             "); 
source.append(numeric_string); source.append(" right,\n");
-      source.append("                            const unsigned int 
left_count,  \n");
-      source.append("                            const unsigned int mid_count, 
 \n");
-      source.append("                            const unsigned int 
right_count,  \n");
-      source.append("                             "); 
source.append(numeric_string); source.append(" epsilon,  \n");
-      source.append("                            __local unsigned int 
*compact_second_chunk,  \n");
-      source.append("                            __local unsigned short 
*s_compaction_list,  \n");
-      source.append("                            unsigned int 
*is_active_second)  \n");
-      source.append("     {  \n");
-      source.append("         uint glb_id = get_global_id(0); \n");
-      source.append("         uint grp_id = get_group_id(0); \n");
-      source.append("         uint grp_nm = get_num_groups(0); \n");
-      source.append("         uint lcl_id = get_local_id(0); \n");
-      source.append("         uint lcl_sz = get_local_size(0); \n");
-
-          // check if both child intervals are valid
-      source.append("         if ((left_count != mid_count) && (mid_count != 
right_count))  \n");
-      source.append("         {  \n");
-
-      source.append("             storeIntervalShort(addr, s_left, s_right, 
s_left_count, s_right_count,  \n");
-      source.append("                           left, mid, left_count, 
mid_count, epsilon);  \n");
-
-      source.append("             *is_active_second = 1;  \n");
-      source.append("             s_compaction_list[lcl_id] = 1;  \n");
-      source.append("             *compact_second_chunk = 1;  \n");
-      source.append("         }  \n");
-      source.append("         else  \n");
-      source.append("         {  \n");
-
-              // only one non-empty child interval
-
-              // mark that no second child
-      source.append("             *is_active_second = 0;  \n");
-      source.append("             s_compaction_list[lcl_id] = 0;  \n");
-
-              // store the one valid child interval
-      source.append("             if (left_count != mid_count)  \n");
-      source.append("             {  \n");
-      source.append("                 storeIntervalShort(addr, s_left, 
s_right, s_left_count, s_right_count,  \n");
-      source.append("                               left, mid, left_count, 
mid_count, epsilon);  \n");
-      source.append("             }  \n");
-      source.append("             else  \n");
-      source.append("             {  \n");
-      source.append("                 storeIntervalShort(addr, s_left, 
s_right, s_left_count, s_right_count,  \n");
-      source.append("                               mid, right, mid_count, 
right_count, epsilon);  \n");
-      source.append("             }  \n");
-      source.append("         }  \n");
-      source.append("     }  \n");
-  }
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Create indices for compaction, that is process \a s_compaction_list_exc
-  // which is 1 for intervals that generated a second child and 0 otherwise
-  // and create for each of the non-zero elements the index where the new
-  // interval belongs to in a compact representation of all generated second 
children
-  //
-  // s_compaction_list_exc      list containing the flags which threads 
generated two children
-  // num_threads_compaction     number of threads to employ for compaction
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template<typename StringType>
-  void generate_bisect_kernel_createIndicesCompaction(StringType & source)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     createIndicesCompaction(__local unsigned int 
*s_compaction_list_exc,  \n");
-  source.append("                             unsigned int 
num_threads_compaction)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-
-  source.append("         unsigned int offset = 1;  \n");
-  source.append("         const unsigned int tid = lcl_id;  \n");
-     // if(tid == 0)
-       // printf("num_threads_compaction = %u\n", num_threads_compaction);
-
-      // higher levels of scan tree
-  source.append("         for (int d = (num_threads_compaction >> 1); d > 0; d 
>>= 1)  \n");
-  source.append("         {  \n");
-
-  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-  source.append("             if (tid < d)  \n");
-  source.append("             {  \n");
-
-  source.append("                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
-  source.append("                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
-  source.append("              \n");
-  source.append("                 s_compaction_list_exc[bi] =   
s_compaction_list_exc[bi]  \n");
-  source.append("                                               + 
s_compaction_list_exc[ai];  \n");
-  source.append("             }  \n");
-
-  source.append("             offset <<= 1;  \n");
-  source.append("         }  \n");
-
-      // traverse down tree: first down to level 2 across
-  source.append("         for (int d = 2; d < num_threads_compaction; d <<= 1) 
 \n");
-  source.append("         {  \n");
-
-  source.append("             offset >>= 1;  \n");
-  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-  source.append("             if (tid < (d-1))  \n");
-  source.append("             {  \n");
-
-  source.append("                 unsigned int  ai = offset*(tid+1) - 1;  \n");
-  source.append("                 unsigned int  bi = ai + (offset >> 1);  \n");
-
-  source.append("                 s_compaction_list_exc[bi] =   
s_compaction_list_exc[bi]  \n");
-  source.append("                                               + 
s_compaction_list_exc[ai];  \n");
-  source.append("             }  \n");
-  source.append("         }  \n");
-
-  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-  source.append("     }  \n");
-  }
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_createIndicesCompactionShort(StringType & source)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     createIndicesCompactionShort(__local unsigned short 
*s_compaction_list_exc,  \n");
-  source.append("                             unsigned int 
num_threads_compaction)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-
-  source.append("         unsigned int offset = 1;  \n");
-  source.append("         const unsigned int tid = lcl_id;  \n");
-
-      // higher levels of scan tree
-  source.append("         for (int d = (num_threads_compaction >> 1); d > 0; d 
>>= 1)  \n");
-  source.append("         {  \n");
-
-  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-  source.append("             if (tid < d)  \n");
-  source.append("             {  \n");
-
-  source.append("                 unsigned int  ai = offset*(2*tid+1)-1;  \n");
-  source.append("                 unsigned int  bi = offset*(2*tid+2)-1;  \n");
-  source.append("              \n");
-  source.append("                 s_compaction_list_exc[bi] =   
s_compaction_list_exc[bi]  \n");
-  source.append("                                               + 
s_compaction_list_exc[ai];  \n");
-  source.append("             }  \n");
-
-  source.append("             offset <<= 1;  \n");
-  source.append("         }  \n");
-
-      // traverse down tree: first down to level 2 across
-  source.append("         for (int d = 2; d < num_threads_compaction; d <<= 1) 
 \n");
-  source.append("         {  \n");
-
-  source.append("             offset >>= 1;  \n");
-  source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-  source.append("             if (tid < (d-1))  \n");
-  source.append("             {  \n");
-
-  source.append("                 unsigned int  ai = offset*(tid+1) - 1;  \n");
-  source.append("                 unsigned int  bi = ai + (offset >> 1);  \n");
-
-  source.append("                 s_compaction_list_exc[bi] =   
s_compaction_list_exc[bi]  \n");
-  source.append("                                               + 
s_compaction_list_exc[ai];  \n");
-  source.append("             }  \n");
-  source.append("         }  \n");
-
-  source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-  source.append("     }  \n");
-  }
-
-  
///////////////////////////////////////////////////////////////////////////////
-  // Perform stream compaction for second child intervals
-  //
-  // s_left              shared memory storage for left interval limits
-  // s_right             shared memory storage for right interval limits
-  // s_left_count        shared memory storage for number of eigenvalues less 
than left interval limits
-  // s_right_count       shared memory storage for number of eigenvalues less 
than right interval limits
-  // mid                 midpoint of current interval (left of new interval)
-  // right               upper limit of interval
-  // mid_count           eigenvalues less than \a mid
-  // s_compaction_list   list containing the indices where the data has to be 
stored
-  // num_threads_active  number of active threads / intervals
-  // is_active_interval  mark is thread has a second non-empty child interval
-  
///////////////////////////////////////////////////////////////////////////////
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_compactIntervals(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     compactIntervals(__local "); 
source.append(numeric_string); source.append(" *s_left,  \n");
-  source.append("                      __local "); 
source.append(numeric_string); source.append(" *s_right, \n");
-  source.append("                      __local unsigned int *s_left_count, 
\n");
-  source.append("                      __local unsigned int *s_right_count,  
\n");
-  source.append("                       "); source.append(numeric_string); 
source.append(" mid,  \n");
-  source.append("                       "); source.append(numeric_string); 
source.append(" right, \n");
-  source.append("                      unsigned int mid_count, unsigned int 
right_count,  \n");
-  source.append("                      __local unsigned int 
*s_compaction_list,  \n");
-  source.append("                      unsigned int num_threads_active,  \n");
-  source.append("                      unsigned int is_active_second)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("         const unsigned int tid = lcl_id;  \n");
-
-      // perform compaction / copy data for all threads where the second
-      // child is not dead
-  source.append("         if ((tid < num_threads_active) && (1 == 
is_active_second))  \n");
-  source.append("         {  \n");
-  source.append("             unsigned int addr_w = num_threads_active + 
s_compaction_list[tid];  \n");
-  source.append("             s_left[addr_w] = mid;  \n");
-  source.append("             s_right[addr_w] = right;  \n");
-  source.append("             s_left_count[addr_w] = mid_count;  \n");
-  source.append("             s_right_count[addr_w] = right_count;  \n");
-  source.append("         }  \n");
-  source.append("     }  \n");
-  }
-
-
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_compactIntervalsShort(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     compactIntervalsShort(__local "); 
source.append(numeric_string); source.append(" *s_left,  \n");
-  source.append("                      __local "); 
source.append(numeric_string); source.append(" *s_right,  \n");
-  source.append("                      __local unsigned short *s_left_count, 
\n");
-  source.append("                      __local unsigned short *s_right_count,  
\n");
-  source.append("                      "); source.append(numeric_string); 
source.append(" mid,   \n");
-  source.append("                      "); source.append(numeric_string); 
source.append(" right, \n");
-  source.append("                      unsigned int mid_count, unsigned int 
right_count,  \n");
-  source.append("                      __local unsigned short 
*s_compaction_list,  \n");
-  source.append("                      unsigned int num_threads_active,  \n");
-  source.append("                      unsigned int is_active_second)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("         const unsigned int tid = lcl_id;  \n");
-
-      // perform compaction / copy data for all threads where the second
-      // child is not dead
-  source.append("         if ((tid < num_threads_active) && (1 == 
is_active_second))  \n");
-  source.append("         {  \n");
-  source.append("             unsigned int addr_w = num_threads_active + 
s_compaction_list[tid];  \n");
-  source.append("             s_left[addr_w] = mid;  \n");
-  source.append("             s_right[addr_w] = right;  \n");
-  source.append("             s_left_count[addr_w] = mid_count;  \n");
-  source.append("             s_right_count[addr_w] = right_count;  \n");
-  source.append("         }  \n");
-  source.append("     }  \n");
-  }
-
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_storeIntervalConverged(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     storeIntervalConverged( __local "); 
source.append(numeric_string); source.append(" *s_left,   \n");
-  source.append("                             __local "); 
source.append(numeric_string); source.append(" *s_right,   \n");
-  source.append("                            __local unsigned int 
*s_left_count, \n");
-  source.append("                            __local unsigned int 
*s_right_count,  \n");
-  source.append("                            "); 
source.append(numeric_string); source.append(" *left,   \n");
-  source.append("                            "); 
source.append(numeric_string); source.append(" *mid,   \n");
-  source.append("                            "); 
source.append(numeric_string); source.append(" *right,   \n");
-  source.append("                            unsigned int *left_count,     
\n");
-  source.append("                            unsigned int *mid_count,      
\n");
-  source.append("                            unsigned int *right_count,     
\n");
-  source.append("                            __local unsigned int 
*s_compaction_list_exc,  \n");
-  source.append("                            __local unsigned int 
*compact_second_chunk,  \n");
-  source.append("                            const unsigned int 
num_threads_active,  \n");
-  source.append("                            unsigned int *is_active_second)  
\n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("         const unsigned int tid = lcl_id;  \n");
-  source.append("         const unsigned int multiplicity = *right_count - 
*left_count;  \n");
-      // check multiplicity of eigenvalue
-  source.append("         if (1 == multiplicity)  \n");
-  source.append("         {  \n");
-
-          // just re-store intervals, simple eigenvalue
-  source.append("             s_left[tid] = *left;  \n");
-  source.append("             s_right[tid] = *right;  \n");
-  source.append("             s_left_count[tid] = *left_count;  \n");
-  source.append("             s_right_count[tid] = *right_count;  \n");
-  source.append("             \n");
-
-          // mark that no second child / clear
-  source.append("             *is_active_second = 0;  \n");
-  source.append("             s_compaction_list_exc[tid] = 0;  \n");
-  source.append("         }  \n");
-  source.append("         else  \n");
-  source.append("         {  \n");
-
-          // number of eigenvalues after the split less than mid
-  source.append("             *mid_count = *left_count + (multiplicity >> 1);  
\n");
-
-          // store left interval
-  source.append("             s_left[tid] = *left;  \n");
-  source.append("             s_right[tid] = *right;  \n");
-  source.append("             s_left_count[tid] = *left_count;  \n");
-  source.append("             s_right_count[tid] = *mid_count;  \n");
-  source.append("             *mid = *left;  \n");
-
-          // mark that second child interval exists
-  source.append("             *is_active_second = 1;  \n");
-  source.append("             s_compaction_list_exc[tid] = 1;  \n");
-  source.append("             *compact_second_chunk = 1;  \n");
-  source.append("         }  \n");
-  source.append("     }  \n");
-  }
-
-
-
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_storeIntervalConvergedShort(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     storeIntervalConvergedShort(__local "); 
source.append(numeric_string); source.append(" *s_left,   \n");
-  source.append("                             __local "); 
source.append(numeric_string); source.append(" *s_right,   \n");
-  source.append("                            __local unsigned short 
*s_left_count, \n");
-  source.append("                            __local unsigned short 
*s_right_count,  \n");
-  source.append("                            "); 
source.append(numeric_string); source.append(" *left,   \n");
-  source.append("                            "); 
source.append(numeric_string); source.append(" *mid,   \n");
-  source.append("                            "); 
source.append(numeric_string); source.append(" *right,   \n");
-  source.append("                            unsigned int *left_count,     
\n");
-  source.append("                            unsigned int *mid_count,      
\n");
-  source.append("                            unsigned int *right_count,     
\n");
-  source.append("                            __local unsigned short 
*s_compaction_list_exc,  \n");
-  source.append("                            __local unsigned int 
*compact_second_chunk,  \n");
-  source.append("                            const unsigned int 
num_threads_active,  \n");
-  source.append("                            unsigned int *is_active_second)  
\n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-  source.append("         const unsigned int tid = lcl_id;  \n");
-  source.append("         const unsigned int multiplicity = *right_count - 
*left_count;  \n");
-      // check multiplicity of eigenvalue
-  source.append("         if (1 == multiplicity)  \n");
-  source.append("         {  \n");
-
-          // just re-store intervals, simple eigenvalue
-  source.append("             s_left[tid] = *left;  \n");
-  source.append("             s_right[tid] = *right;  \n");
-  source.append("             s_left_count[tid] = *left_count;  \n");
-  source.append("             s_right_count[tid] = *right_count;  \n");
-  source.append("             \n");
-
-          // mark that no second child / clear
-  source.append("             *is_active_second = 0;  \n");
-  source.append("             s_compaction_list_exc[tid] = 0;  \n");
-  source.append("         }  \n");
-  source.append("         else  \n");
-  source.append("         {  \n");
-
-          // number of eigenvalues after the split less than mid
-  source.append("             *mid_count = *left_count + (multiplicity >> 1);  
\n");
-
-          // store left interval
-  source.append("             s_left[tid] = *left;  \n");
-  source.append("             s_right[tid] = *right;  \n");
-  source.append("             s_left_count[tid] = *left_count;  \n");
-  source.append("             s_right_count[tid] = *mid_count;  \n");
-  source.append("             *mid = *left;  \n");
-
-          // mark that second child interval exists
-  source.append("             *is_active_second = 1;  \n");
-  source.append("             s_compaction_list_exc[tid] = 1;  \n");
-  source.append("             *compact_second_chunk = 1;  \n");
-  source.append("         }  \n");
-  source.append("     }  \n");
-  }
-
-  
///////////////////////////////////////////////////////////////////////////////
-  // Subdivide interval if active and not already converged
-  //
-  // tid                    id of thread
-  // s_left                 shared memory storage for left interval limits
-  // s_right                shared memory storage for right interval limits
-  // s_left_count           shared memory storage for number of eigenvalues 
less than left interval limits
-  // s_right_count          shared memory storage for number of eigenvalues 
less than right interval limits
-  // num_threads_active     number of active threads in warp
-  // left                   lower limit of interval
-  // right                  upper limit of interval
-  // left_count             eigenvalues less than \a left
-  // right_count            eigenvalues less than \a right
-  // all_threads_converged  shared memory flag if all threads are converged
-  
///////////////////////////////////////////////////////////////////////////////
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_subdivideActiveInterval(StringType & source, 
std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     subdivideActiveIntervalMulti(const unsigned int tid,  
\n");
-  source.append("                             __local "); 
source.append(numeric_string); source.append(" *s_left,    \n");
-  source.append("                             __local "); 
source.append(numeric_string); source.append(" *s_right,   \n");
-  source.append("                             __local unsigned int 
*s_left_count,   \n");
-  source.append("                             __local unsigned int 
*s_right_count,  \n");
-  source.append("                             const unsigned int 
num_threads_active,  \n");
-  source.append("                              "); 
source.append(numeric_string); source.append(" *left,   \n");
-  source.append("                              "); 
source.append(numeric_string); source.append(" *right,   \n");
-  source.append("                             unsigned int *left_count, 
unsigned int *right_count,  \n");
-  source.append("                              "); 
source.append(numeric_string); source.append(" *mid,    \n");
-  source.append("                              __local unsigned int 
*all_threads_converged)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-      // for all active threads
-  source.append("         if (tid < num_threads_active)  \n");
-  source.append("         {  \n");
-
-  source.append("             *left = s_left[tid];  \n");
-  source.append("             *right = s_right[tid];  \n");
-  source.append("             *left_count = s_left_count[tid];  \n");
-  source.append("             *right_count = s_right_count[tid];  \n");
-
-          // check if thread already converged
-  source.append("             if (*left != *right)  \n");
-  source.append("             {  \n");
-
-  source.append("                 *mid = computeMidpoint(*left, *right);  \n");
-  source.append("                 *all_threads_converged = 0;  \n");
-  source.append("             }  \n");
-  source.append("             else if ((*right_count - *left_count) > 1)  \n");
-  source.append("             {  \n");
-              // mark as not converged if multiple eigenvalues enclosed
-              // duplicate interval in storeIntervalsConverged()
-  source.append("                 *all_threads_converged = 0;  \n");
-  source.append("             }  \n");
-
-  source.append("         }    \n");
-  // end for all active threads
-  source.append("     }  \n");
-  }
-
-
-  template<typename StringType>
-  void generate_bisect_kernel_subdivideActiveIntervalShort(StringType & 
source, std::string const & numeric_string)
-  {
-  source.append("       \n");
-  source.append("     void  \n");
-  source.append("     subdivideActiveIntervalShort(const unsigned int tid,  
\n");
-  source.append("                             __local "); 
source.append(numeric_string); source.append(" *s_left,    \n");
-  source.append("                             __local "); 
source.append(numeric_string); source.append(" *s_right,   \n");
-  source.append("                             __local unsigned short 
*s_left_count,   \n");
-  source.append("                             __local unsigned short 
*s_right_count,  \n");
-  source.append("                             const unsigned int 
num_threads_active,  \n");
-  source.append("                             "); 
source.append(numeric_string); source.append(" *left,   \n");
-  source.append("                             "); 
source.append(numeric_string); source.append(" *right,   \n");
-  source.append("                             unsigned int *left_count, 
unsigned int *right_count,  \n");
-  source.append("                             "); 
source.append(numeric_string); source.append(" *mid,    \n");
-  source.append("                             __local unsigned int 
*all_threads_converged)  \n");
-  source.append("     {  \n");
-  source.append("         uint glb_id = get_global_id(0); \n");
-  source.append("         uint grp_id = get_group_id(0); \n");
-  source.append("         uint grp_nm = get_num_groups(0); \n");
-  source.append("         uint lcl_id = get_local_id(0); \n");
-  source.append("         uint lcl_sz = get_local_size(0); \n");
-
-      // for all active threads
-  source.append("         if (tid < num_threads_active)  \n");
-  source.append("         {  \n");
-
-  source.append("             *left = s_left[tid];  \n");
-  source.append("             *right = s_right[tid];  \n");
-  source.append("             *left_count = s_left_count[tid];  \n");
-  source.append("             *right_count = s_right_count[tid];  \n");
-
-          // check if thread already converged
-  source.append("             if (*left != *right)  \n");
-  source.append("             {  \n");
-
-  source.append("                 *mid = computeMidpoint(*left, *right);  \n");
-  source.append("                 *all_threads_converged = 0;  \n");
-  source.append("             }  \n");
-
-  source.append("         }    \n");
-  // end for all active threads
-  source.append("     }  \n");
-  }
-
-  // end of utilities
-  // start of kernels
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix
-  //
-  // g_d             diagonal elements in global memory
-  // g_s             superdiagonal elements in global elements (stored so that 
the element *(g_s - 1) can be accessed an equals 0
-  // n               size of matrix
-  // lg              lower bound of input interval (e.g. Gerschgorin interval)
-  // ug              upper bound of input interval (e.g. Gerschgorin interval)
-  // lg_eig_count    number of eigenvalues that are smaller than \a lg
-  // lu_eig_count    number of eigenvalues that are smaller than \a lu
-  // epsilon         desired accuracy of eigenvalues to compute
-  
////////////////////////////////////////////////////////////////////////////////
-  ///
-  template <typename StringType>
-  void generate_bisect_kernel_bisectKernel(StringType & source, std::string 
const & numeric_string)
-  {
-      source.append("     __kernel  \n");
-      source.append("     void  \n");
-      source.append("     bisectKernelSmall(__global "); 
source.append(numeric_string); source.append(" *g_d,   \n");
-      source.append("                  __global "); 
source.append(numeric_string); source.append(" *g_s,   \n");
-      source.append("                  const unsigned int n,  \n");
-      source.append("                  __global "); 
source.append(numeric_string); source.append(" *g_left,   \n");
-      source.append("                  __global "); 
source.append(numeric_string); source.append(" *g_right,  \n");
-      source.append("                  __global unsigned int *g_left_count, 
__global unsigned int *g_right_count,  \n");
-      source.append("                  const "); 
source.append(numeric_string); source.append(" lg,  \n");
-      source.append("                  const "); 
source.append(numeric_string); source.append(" ug,  \n");
-      source.append("                  const unsigned int lg_eig_count, const 
unsigned int ug_eig_count, \n");
-      source.append("                  "); source.append(numeric_string); 
source.append(" epsilon  \n");
-      source.append("                 )  \n");
-      source.append("     {  \n");
-      source.append("         g_s = g_s + 1; \n");
-      source.append("         uint glb_id = get_global_id(0); \n");
-      source.append("         uint grp_id = get_group_id(0); \n");
-      source.append("         uint grp_nm = get_num_groups(0); \n");
-      source.append("         uint lcl_id = get_local_id(0); \n");
-      source.append("         uint lcl_sz = get_local_size(0); \n");
-
-          // intervals (store left and right because the subdivision tree is 
in general
-          // not dense
-      source.append("         __local "); source.append(numeric_string); 
source.append(" s_left[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
-      source.append("         __local "); source.append(numeric_string); 
source.append(" s_right[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
-
-          // number of eigenvalues that are smaller than s_left / s_right
-          // (correspondence is realized via indices)
-      source.append("         __local  unsigned int  
s_left_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
-      source.append("         __local  unsigned int  
s_right_count[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX];  \n");
-
-          // helper for stream compaction
-      source.append("         __local  unsigned int  \n");
-      source.append("           
s_compaction_list[VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX + 1];  \n");
-
-          // state variables for whole block
-          // if 0 then compaction of second chunk of child intervals is not 
necessary
-          // (because all intervals had exactly one non-dead child)
-      source.append("         __local  unsigned int compact_second_chunk;  
\n");
-      source.append("         __local  unsigned int all_threads_converged;  
\n");
-
-          // number of currently active threads
-      source.append("         __local  unsigned int num_threads_active;  \n");
-
-          // number of threads to use for stream compaction
-      source.append("         __local  unsigned int num_threads_compaction;  
\n");
-
-          // helper for exclusive scan
-      source.append("         __local unsigned int *s_compaction_list_exc = 
s_compaction_list + 1;  \n");
-
-
-          // variables for currently processed interval
-          // left and right limit of active interval
-      source.append("          "); source.append(numeric_string); 
source.append(" left = 0.0f;  \n");
-      source.append("          "); source.append(numeric_string); 
source.append(" right = 0.0f;  \n");
-      source.append("         unsigned int left_count = 0;  \n");
-      source.append("         unsigned int right_count = 0;  \n");
-          // midpoint of active interval
-      source.append("          "); source.append(numeric_string); 
source.append(" mid = 0.0f;  \n");
-          // number of eigenvalues smaller then mid
-      source.append("         unsigned int mid_count = 0;  \n");
-          // affected from compaction
-      source.append("         unsigned int  is_active_second = 0;  \n");
-
-      source.append("         s_compaction_list[lcl_id] = 0;  \n");
-      source.append("         s_left[lcl_id] = 0.0;  \n");
-      source.append("         s_right[lcl_id] = 0.0;  \n");
-      source.append("         s_left_count[lcl_id] = 0;  \n");
-      source.append("         s_right_count[lcl_id] = 0;  \n");
-
-      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-          // set up initial configuration
-      source.append("         if (0 == lcl_id)  \n");
-      source.append("         {  \n");
-      source.append("             s_left[0] = lg;  \n");
-      source.append("             s_right[0] = ug;  \n");
-      source.append("             s_left_count[0] = lg_eig_count;  \n");
-      source.append("             s_right_count[0] = ug_eig_count;  \n");
-
-      source.append("             compact_second_chunk = 0;  \n");
-      source.append("             num_threads_active = 1;  \n");
-
-      source.append("             num_threads_compaction = 1;  \n");
-      source.append("         }  \n");
-
-          // for all active threads read intervals from the last level
-          // the number of (worst case) active threads per level l is 2^l
-
-      source.append("         while (true)  \n");
-      source.append("         {  \n");
-
-      source.append("             all_threads_converged = 1;  \n");
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      source.append("             is_active_second = 0;  \n");
-      source.append("             subdivideActiveIntervalMulti(lcl_id,  \n");
-      source.append("                                     s_left, s_right, 
s_left_count, s_right_count,  \n");
-      source.append("                                     num_threads_active,  
\n");
-      source.append("                                     &left, &right, 
&left_count, &right_count,  \n");
-      source.append("                                     &mid, 
&all_threads_converged);  \n");
-   //   source.append("             output[lcl_id] = s_left;  \n");
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // check if done
-      source.append("             if (1 == all_threads_converged)  \n");
-      source.append("             {  \n");
-      source.append("                 break;  \n");
-      source.append("             }  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // compute number of eigenvalues smaller than mid
-              // use all threads for reading the necessary matrix data from 
global
-              // memory
-              // use s_left and s_right as scratch space for diagonal and
-              // superdiagonal of matrix
-      source.append("             mid_count = computeNumSmallerEigenvals(g_d, 
g_s, n, mid,  \n");
-      source.append("                                                    
lcl_id, num_threads_active,  \n");
-      source.append("                                                    
s_left, s_right,  \n");
-      source.append("                                                    (left 
== right));  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // store intervals
-              // for all threads store the first child interval in a 
continuous chunk of
-              // memory, and the second child interval -- if it exists -- in a 
second
-              // chunk; it is likely that all threads reach convergence up to
-              // \a epsilon at the same level; furthermore, for higher level 
most / all
-              // threads will have only one child, storing the first child 
compactly will
-              // (first) avoid to perform a compaction step on the first 
chunk, (second)
-              // make it for higher levels (when all threads / intervals have
-              // exactly one child)  unnecessary to perform a compaction of 
the second
-              // chunk
-      source.append("             if (lcl_id < num_threads_active)  \n");
-      source.append("             {  \n");
-
-      source.append("                 if (left != right)  \n");
-      source.append("                 {  \n");
-
-                      // store intervals
-      source.append("                     storeNonEmptyIntervals(lcl_id, 
num_threads_active,  \n");
-      source.append("                                            s_left, 
s_right, s_left_count, s_right_count,  \n");
-      source.append("                                            left, mid, 
right,  \n");
-      source.append("                                            left_count, 
mid_count, right_count,  \n");
-      source.append("                                            epsilon, 
&compact_second_chunk,  \n");
-      source.append("                                            
s_compaction_list_exc,  \n");
-      source.append("                                            
&is_active_second);  \n");
-      source.append("                 }  \n");
-      source.append("                 else  \n");
-      source.append("                 {  \n");
-
-      source.append("                     storeIntervalConverged(s_left, 
s_right, s_left_count, s_right_count,  \n");
-      source.append("                                            &left, &mid, 
&right,  \n");
-      source.append("                                            &left_count, 
&mid_count, &right_count,  \n");
-      source.append("                                            
s_compaction_list_exc, &compact_second_chunk,  \n");
-      source.append("                                            
num_threads_active,  \n");
-      source.append("                                            
&is_active_second);  \n");
-      source.append("                 }  \n");
-      source.append("             }  \n");
-
-              // necessary so that compact_second_chunk is up-to-date
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // perform compaction of chunk where second children are stored
-              // scan of (num_threads_actieigenvaluesve / 2) elements, thus at 
most
-              // (num_threads_active / 4) threads are needed
-      source.append("             if (compact_second_chunk > 0)  \n");
-      source.append("             {  \n");
-
-      source.append("                 
createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
-
-      source.append("                 compactIntervals(s_left, s_right, 
s_left_count, s_right_count,  \n");
-      source.append("                                  mid, right, mid_count, 
right_count,  \n");
-      source.append("                                  s_compaction_list, 
num_threads_active,  \n");
-      source.append("                                  is_active_second);  
\n");
-      source.append("             }  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      source.append("             if (0 == lcl_id)  \n");
-      source.append("             {  \n");
-
-                  // update number of active threads with result of reduction
-      source.append("                 num_threads_active += 
s_compaction_list[num_threads_active];  \n");
-
-      source.append("                 num_threads_compaction = 
ceilPow2(num_threads_active);  \n");
-
-      source.append("                 compact_second_chunk = 0;  \n");
-      source.append("             }  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      source.append("         }  \n");
-
-      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-          // write resulting intervals to global mem
-          // for all threads write if they have been converged to an 
eigenvalue to
-          // a separate array
-
-          // at most n valid intervals
-      source.append("         if (lcl_id < n)  \n");
-      source.append("         {  \n");
-              // intervals converged so left and right limit are identical
-      source.append("             g_left[lcl_id]  = s_left[lcl_id];  \n");
-              // left count is sufficient to have global order
-      source.append("             g_left_count[lcl_id]  = 
s_left_count[lcl_id];  \n");
-      source.append("         }  \n");
-      source.append("     }  \n");
-     }
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Perform second step of bisection algorithm for large matrices for 
intervals that after the first step contained more than one eigenvalue
-  //
-  // g_d              diagonal elements of symmetric, tridiagonal matrix
-  // g_s              superdiagonal elements of symmetric, tridiagonal matrix
-  // n                matrix size
-  // blocks_mult      start addresses of blocks of intervals that are 
processed by one block of threads, each of the intervals contains more than one 
eigenvalue
-  // blocks_mult_sum  total number of eigenvalues / singleton intervals in one 
block of intervals
-  // g_left           left limits of intervals
-  // g_right          right limits of intervals
-  // g_left_count     number of eigenvalues less than left limits
-  // g_right_count    number of eigenvalues less than right limits
-  // g_lambda         final eigenvalue
-  // g_pos            index of eigenvalue (in ascending order)
-  // precision         desired precision of eigenvalues
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_bisectKernelLarge_MultIntervals(StringType & 
source, std::string const & numeric_string)
-  {
-      source.append("     __kernel  \n");
-      source.append("     void  \n");
-      source.append("     bisectKernelLarge_MultIntervals(__global "); 
source.append(numeric_string); source.append(" *g_d,   \n");
-      source.append("                                     __global "); 
source.append(numeric_string); source.append(" *g_s,   \n");
-      source.append("                                     const unsigned int 
n,  \n");
-      source.append("                                     __global unsigned 
int *blocks_mult,  \n");
-      source.append("                                     __global unsigned 
int *blocks_mult_sum,  \n");
-      source.append("                                     __global "); 
source.append(numeric_string); source.append(" *g_left,   \n");
-      source.append("                                     __global "); 
source.append(numeric_string); source.append(" *g_right,  \n");
-      source.append("                                     __global unsigned 
int *g_left_count,  \n");
-      source.append("                                     __global unsigned 
int *g_right_count,  \n");
-      source.append("                                     __global  "); 
source.append(numeric_string); source.append(" *g_lambda, \n");
-      source.append("                                     __global unsigned 
int *g_pos,  \n");
-      source.append("                                     "); 
source.append(numeric_string); source.append(" precision  \n");
-      source.append("                                    )  \n");
-      source.append("     {  \n");
-      source.append("         g_s = g_s + 1; \n");
-      source.append("         uint glb_id = get_global_id(0); \n");
-      source.append("         uint grp_id = get_group_id(0); \n");
-      source.append("         uint grp_nm = get_num_groups(0); \n");
-      source.append("         uint lcl_id = get_local_id(0); \n");
-      source.append("         uint lcl_sz = get_local_size(0); \n");
-
-      source.append("       const unsigned int tid = lcl_id;  \n");
-
-          // left and right limits of interval
-      source.append("         __local "); source.append(numeric_string); 
source.append(" s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
-      source.append("         __local "); source.append(numeric_string); 
source.append(" s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
-
-          // number of eigenvalues smaller than interval limits
-      source.append("         __local  unsigned int  s_left_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
-      source.append("         __local  unsigned int  s_right_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK];  \n");
-
-          // helper array for chunk compaction of second chunk
-      source.append("         __local  unsigned int  s_compaction_list[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];  \n");
-          // compaction list helper for exclusive scan
-      source.append("         __local unsigned int *s_compaction_list_exc = 
s_compaction_list + 1;  \n");
-
-          // flag if all threads are converged
-      source.append("         __local  unsigned int  all_threads_converged;  
\n");
-          // number of active threads
-      source.append("         __local  unsigned int  num_threads_active;  \n");
-          // number of threads to employ for compaction
-      source.append("         __local  unsigned int  num_threads_compaction;  
\n");
-          // flag if second chunk has to be compacted
-      source.append("         __local  unsigned int compact_second_chunk;  
\n");
-
-          // parameters of block of intervals processed by this block of 
threads
-      source.append("         __local  unsigned int  c_block_start;  \n");
-      source.append("         __local  unsigned int  c_block_end;  \n");
-      source.append("         __local  unsigned int  c_block_offset_output;  
\n");
-
-          // midpoint of currently active interval of the thread
-      source.append("         "); source.append(numeric_string); 
source.append(" mid = 0.0f;  \n");
-          // number of eigenvalues smaller than \a mid
-      source.append("         unsigned int  mid_count = 0;  \n");
-          // current interval parameter
-      source.append("         "); source.append(numeric_string); 
source.append(" left = 0.0f;  \n");
-      source.append("         "); source.append(numeric_string); 
source.append(" right = 0.0f;  \n");
-      source.append("         unsigned int  left_count = 0;  \n");
-      source.append("         unsigned int  right_count = 0;  \n");
-          // helper for compaction, keep track which threads have a second 
child
-      source.append("         unsigned int  is_active_second = 0;  \n");
-
-      source.append("         barrier(CLK_LOCAL_MEM_FENCE);            \n");
-
-          // initialize common start conditions
-      source.append("         if (0 == tid)  \n");
-      source.append("         {  \n");
-
-      source.append("             c_block_start = blocks_mult[grp_id];  \n");
-      source.append("             c_block_end = blocks_mult[grp_id + 1];  \n");
-      source.append("             c_block_offset_output = 
blocks_mult_sum[grp_id];  \n");
-      source.append("               \n");
-
-      source.append("             num_threads_active = c_block_end - 
c_block_start;  \n");
-      source.append("             s_compaction_list[0] = 0;  \n");
-      source.append("             num_threads_compaction = 
ceilPow2(num_threads_active);  \n");
-
-      source.append("             all_threads_converged = 1;  \n");
-      source.append("             compact_second_chunk = 0;  \n");
-      source.append("         }  \n");
-      source.append("          s_left_count [tid] = 42;  \n");
-      source.append("          s_right_count[tid] = 42;  \n");
-      source.append("          s_left_count [tid + 
VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
-      source.append("          s_right_count[tid + 
VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
-      source.append("           \n");
-      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-      source.append("           \n");
-
-          // read data into shared memory
-      source.append("         if (tid < num_threads_active)  \n");
-      source.append("         {  \n");
-
-      source.append("             s_left[tid]  = g_left[c_block_start + tid];  
\n");
-      source.append("             s_right[tid] = g_right[c_block_start + tid]; 
 \n");
-      source.append("             s_left_count[tid]  = 
g_left_count[c_block_start + tid];  \n");
-      source.append("             s_right_count[tid] = 
g_right_count[c_block_start + tid];  \n");
-      source.append("               \n");
-      source.append("         }  \n");
-      source.append("        \n");
-      source.append("         barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-      source.append("         unsigned int iter = 0;  \n");
-          // do until all threads converged
-      source.append("         while (true)  \n");
-      source.append("         {  \n");
-      source.append("             iter++;  \n");
-              //for (int iter=0; iter < 0; iter++) {
-      source.append("             s_compaction_list[lcl_id] = 0;  \n");
-      source.append("             s_compaction_list[lcl_id + lcl_sz] = 0;  
\n");
-      source.append("             s_compaction_list[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;  \n");
-
-              // subdivide interval if currently active and not already 
converged
-      source.append("             subdivideActiveIntervalMulti(tid, s_left, 
s_right,  \n");
-      source.append("                                     s_left_count, 
s_right_count,  \n");
-      source.append("                                     num_threads_active,  
\n");
-      source.append("                                     &left, &right, 
&left_count, &right_count,  \n");
-      source.append("                                     &mid, 
&all_threads_converged);  \n");
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // stop if all eigenvalues have been found
-      source.append("             if (1 == all_threads_converged)  \n");
-      source.append("             {  \n");
-      source.append("                  \n");
-      source.append("                 break;  \n");
-      source.append("             }  \n");
-
-              // compute number of eigenvalues smaller than mid for active and 
not
-              // converged intervals, use all threads for loading data from 
gmem and
-              // s_left and s_right as scratch space to store the data load 
from gmem
-              // in shared memory
-      source.append("             mid_count = 
computeNumSmallerEigenvalsLarge(g_d, g_s, n,  \n");
-      source.append("                                                         
mid, tid, num_threads_active,  \n");
-      source.append("                                                         
s_left, s_right,  \n");
-      source.append("                                                         
(left == right));  \n");
-      source.append("                                                \n");
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      source.append("             if (tid < num_threads_active)  \n");
-      source.append("             {  \n");
-      source.append("                   \n");
-                  // store intervals
-      source.append("                 if (left != right)  \n");
-      source.append("                 {  \n");
-
-      source.append("                     storeNonEmptyIntervals(tid, 
num_threads_active,  \n");
-      source.append("                                            s_left, 
s_right, s_left_count, s_right_count,  \n");
-      source.append("                                            left, mid, 
right,  \n");
-      source.append("                                            left_count, 
mid_count, right_count,  \n");
-      source.append("                                            precision, 
&compact_second_chunk,  \n");
-      source.append("                                            
s_compaction_list_exc,  \n");
-      source.append("                                            
&is_active_second);  \n");
-      source.append("                      \n");
-      source.append("                 }  \n");
-      source.append("                 else  \n");
-      source.append("                 {  \n");
-
-      source.append("                     storeIntervalConverged(s_left, 
s_right, s_left_count, s_right_count,  \n");
-      source.append("                                            &left, &mid, 
&right,  \n");
-      source.append("                                            &left_count, 
&mid_count, &right_count,  \n");
-      source.append("                                            
s_compaction_list_exc, &compact_second_chunk,  \n");
-      source.append("                                            
num_threads_active,  \n");
-      source.append("                                            
&is_active_second);  \n");
-      source.append("                   \n");
-      source.append("                 }  \n");
-      source.append("             }  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // compact second chunk of intervals if any of the threads 
generated
-              // two child intervals
-      source.append("             if (1 == compact_second_chunk)  \n");
-      source.append("             {  \n");
-
-      source.append("                 
createIndicesCompaction(s_compaction_list_exc, num_threads_compaction);  \n");
-      source.append("                 compactIntervals(s_left, s_right, 
s_left_count, s_right_count,  \n");
-      source.append("                                  mid, right, mid_count, 
right_count,  \n");
-      source.append("                                  s_compaction_list, 
num_threads_active,  \n");
-      source.append("                                  is_active_second);  
\n");
-      source.append("             }  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // update state variables
-      source.append("             if (0 == tid)  \n");
-      source.append("             {  \n");
-      source.append("                 num_threads_active += 
s_compaction_list[num_threads_active];  \n");
-      source.append("                 num_threads_compaction = 
ceilPow2(num_threads_active);  \n");
-
-      source.append("                 compact_second_chunk = 0;  \n");
-      source.append("                 all_threads_converged = 1;  \n");
-      source.append("             }  \n");
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-              // clear
-      source.append("             s_compaction_list_exc[lcl_id] = 0;  \n");
-      source.append("             s_compaction_list_exc[lcl_id + lcl_sz] = 0;  
 \n");
-      source.append("               \n");
-      source.append("             if (num_threads_compaction > lcl_sz)         
     \n");
-      source.append("             {  \n");
-      source.append("               break;  \n");
-      source.append("             }  \n");
-
-
-      source.append("             barrier(CLK_LOCAL_MEM_FENCE)  ;  \n");
-
-      source.append("    } \n"); // end until all threads converged
-
-          // write data back to global memory
-      source.append("         if (tid < num_threads_active)  \n");
-      source.append("         {  \n");
-
-      source.append("             unsigned int addr = c_block_offset_output + 
tid;  \n");
-      source.append("               \n");
-      source.append("             g_lambda[addr]  = s_left[tid];  \n");
-      source.append("             g_pos[addr]   = s_right_count[tid];  \n");
-      source.append("         }  \n");
-      source.append("     }  \n");
-  }
-
-
-  
////////////////////////////////////////////////////////////////////////////////
-  // Determine eigenvalues for large matrices for intervals that after the 
first step contained one eigenvalue
-  //
-  // g_d            diagonal elements of symmetric, tridiagonal matrix
-  // g_s            superdiagonal elements of symmetric, tridiagonal matrix
-  // n              matrix size
-  // num_intervals  total number of intervals containing one eigenvalue after 
the first step
-  // g_left         left interval limits
-  // g_right        right interval limits
-  // g_pos          index of interval / number of intervals that are smaller 
than right interval limit
-  // precision      desired precision of eigenvalues
-  
////////////////////////////////////////////////////////////////////////////////
-
-  template <typename StringType>
-  void generate_bisect_kernel_bisectKernelLarge_OneIntervals(StringType & 
source, std::string const & numeric_string)
-  {
-      source.append("     __kernel  \n");
-      source.append("     void  \n");
-      source.append("     bisectKernelLarge_OneIntervals(__global "); 
source.append(numeric_string); source.append(" *g_d,   \n");
-      source.append("                                    __global "); 
source.append(numeric_string); source.append(" *g_s,    \n");
-      source.append("                                    const unsigned int n, 
 \n");
-      source.append("                                    unsigned int 
num_intervals,  \n");
-      source.append("                                    __global "); 
source.append(numeric_string); source.append(" *g_left,  \n");
-      source.append("                                    __global "); 
source.append(numeric_string); source.append(" *g_right,  \n");
-      source.append("                                    __global unsigned int 
*g_pos,  \n");
-      source.append("                                    "); 
source.append(numeric_string); source.append(" precision)  \n");
-      source.append("     {  \n");
-      source.append("         g_s = g_s + 1; \n");
-      source.append("         uint glb_id = get_global_id(0); \n");
-      source.append("         uint grp_id = get_group_id(0); \n");
-      source.append("         uint grp_nm = get_num_groups(0); \n");
-      source.append("         uint lcl_id = get_local_id(0); \n");
-      source.append("         uint lcl_sz = get_local_size(0); \n");
-      source.append("         const unsigned int gtid = (lcl_sz * grp_id) + 
lcl_id;  \n");
-      source.append("         __

<TRUNCATED>

Reply via email to