http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
new file mode 100644
index 0000000..65b323e
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
@@ -0,0 +1,821 @@
+#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
+#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the PDF manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+/** @file cuda/amg_operations.hpp
+    @brief Implementations of routines for AMG using CUDA.
+*/
+
+#include <cstdlib>
+#include <cmath>
+#include "viennacl/linalg/detail/amg/amg_base.hpp"
+
+#include <map>
+#include <set>
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+namespace amg
+{
+
+
+///////////////////////////////////////////
+
/** @brief CUDA kernel copying the sparsity pattern of a CSR matrix into the influence arrays (every connection treated as strong).
*
*  Uses grid-stride loops; thread 0 additionally writes the CSR finalizer entry influences_row[size1].
*/
__global__ void amg_influence_trivial_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          unsigned int size1,
          unsigned int nnz,
          unsigned int *influences_row,
          unsigned int *influences_id,
          unsigned int *influences_values
          )
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  // per row: copy the row offset and record the number of entries in that row
  for (unsigned int row = thread_id; row < size1; row += num_threads)
  {
    unsigned int row_start = row_indices[row];
    influences_row[row]    = row_start;
    influences_values[row] = row_indices[row + 1] - row_start;
  }

  // per nonzero: copy the column index verbatim
  for (unsigned int idx = thread_id; idx < nnz; idx += num_threads)
    influences_id[idx] = column_indices[idx];

  // single thread writes the trailing CSR finalizer entry
  if (thread_id == 0)
    influences_row[size1] = row_indices[size1];
}
+
+
/** @brief Routine for taking all connections in the matrix as strong
*
* Copies the sparsity pattern of A into the influence arrays held by the AMG context.
*
* @param A             Operator matrix (CSR layout)
* @param amg_context   AMG hierarchy datastructures receiving the influence information
* @param tag           AMG preconditioner tag (unused by the trivial influence measure)
*/
template<typename NumericT>
void amg_influence_trivial(compressed_matrix<NumericT> const & A,
                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
                           viennacl::linalg::amg_tag & tag)
{
  (void)tag; // no tag information needed when every connection counts as strong

  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()), // row pointers
                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()), // column indices
                                             static_cast<unsigned int>(A.size1()),
                                             static_cast<unsigned int>(A.nnz()),
                                             viennacl::cuda_arg(amg_context.influence_jumper_),
                                             viennacl::cuda_arg(amg_context.influence_ids_),
                                             viennacl::cuda_arg(amg_context.influence_values_));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
}
+
+
/** @brief Routine for extracting strongly connected points considering a user-provided threshold value
*
* Not yet implemented for the CUDA backend.
*
* @param A             Operator matrix (unused)
* @param amg_context   AMG hierarchy datastructures (unused)
* @param tag           AMG preconditioner tag (unused)
* @throws std::runtime_error  always, since no CUDA implementation is available yet
*/
template<typename NumericT>
void amg_influence_advanced(compressed_matrix<NumericT> const & A,
                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
                            viennacl::linalg::amg_tag & tag)
{
  // suppress unused-parameter warnings, consistent with '(void)tag;' in amg_influence_trivial():
  (void)A; (void)amg_context; (void)tag;
  throw std::runtime_error("not implemented yet");
}
+
/** @brief Dispatcher for influence processing
*
* Currently always selects the trivial influence measure (all connections strong).
*
* @param A             Operator matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_influence(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  // TODO: dispatch based on influence tolerance provided
  amg_influence_trivial(A, amg_context, tag);
}
+
/** @brief Assign IDs to coarse points.
*
*  Performs a host-side exclusive scan over the point types: each point receives
*  the number of coarse points preceding it, so coarse points obtain consecutive IDs.
*
*  TODO: Use exclusive_scan on GPU for this.
*/
inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
{
  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(),    amg_context.coarse_id_.size());
  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
  // Note: no device read of coarse_id_ needed here: every entry of coarse_ids is
  // overwritten by the scan below before being written back to the device.

  unsigned int coarse_id = 0;
  for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
  {
    coarse_ids.set(i, coarse_id);
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      ++coarse_id;
  }

  amg_context.num_coarse_ = coarse_id; // total number of coarse points on this level

  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
}
+
+//////////////////////////////////////
+
/** @brief CUDA kernel initializing the work vectors at each PMIS iteration
*
*  Maps point types to PMIS states (UNDECIDED -> 1, FINE -> 0, COARSE -> 2),
*  copies the random weights, and records each node's own index.
*/
template<typename IndexT>
__global__ void amg_pmis2_init_workdata(IndexT *work_state,
                                        IndexT *work_random,
                                        IndexT *work_index,
                                        IndexT const *point_types,
                                        IndexT const *random_weights,
                                        unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    IndexT pt = point_types[i];
    if (pt == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      work_state[i] = 1;
    else if (pt == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE)
      work_state[i] = 0;
    else if (pt == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      work_state[i] = 2;
    // any other point type leaves work_state[i] untouched (matches the original switch default)

    work_random[i] = random_weights[i];
    work_index[i]  = i;
  }
}
+
/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation
*
*  For each node, computes the lexicographic maximum of (state, random, index) over the node
*  itself and all of its neighbors, and stores the winning triple into the second work arrays.
*/
template<typename IndexT>
__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
                                           IndexT const *work_random,
                                           IndexT const *work_index,
                                           IndexT       *work_state2,
                                           IndexT       *work_random2,
                                           IndexT       *work_index2,
                                           IndexT const *influences_row,
                                           IndexT const *influences_id,
                                           unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    // start with the node's own triple
    unsigned int best_state  = work_state[i];
    unsigned int best_random = work_random[i];
    unsigned int best_index  = work_index[i];

    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int neighbor = influences_id[j];

      unsigned int nbr_state  = work_state[neighbor];
      unsigned int nbr_random = work_random[neighbor];
      unsigned int nbr_index  = work_index[neighbor];

      // lexicographic comparison of (state, random, index); take the neighbor's triple if it is strictly larger
      bool neighbor_wins =  (nbr_state  > best_state)
                         || (nbr_state == best_state && (   (nbr_random  > best_random)
                                                         || (nbr_random == best_random && nbr_index > best_index)));
      if (neighbor_wins)
      {
        best_state  = nbr_state;
        best_random = nbr_random;
        best_index  = nbr_index;
      }
    }

    // write the winning triple to the output arrays
    work_state2[i]  = best_state;
    work_random2[i] = best_random;
    work_index2[i]  = best_index;
  }
}
+
/** @brief CUDA kernel for marking MIS and non-MIS nodes
*
*  An undecided node becomes coarse if it won the neighborhood maximum (its own index
*  came back), fine if a coarse node dominates it (state 2, cf. mapping in
*  amg_pmis2_init_workdata), and stays undecided otherwise. Each block writes its
*  count of still-undecided nodes to undecided_buffer[blockIdx.x].
*
*  NOTE(review): the shared buffer is sized for blockDim.x <= 256 and the tree
*  reduction assumes blockDim.x is a power of two -- both hold for the fixed
*  <<<128, 128>>> launches used in this file; confirm before changing the launch config.
*/
template<typename IndexT>
__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
                                         IndexT const *work_index,
                                         IndexT *point_types,
                                         IndexT *undecided_buffer,
                                         unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  unsigned int undecided_count = 0;
  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      continue; // only undecided points change state here

    if (work_index[i] == i)       // this node carries the maximum -> MIS (coarse) node
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
    else if (work_state[i] == 2)  // dominated by a coarse node
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
    else
      ++undecided_count;
  }

  // block-wide tree reduction of the number of undecided nodes:
  __shared__ unsigned int shared_buffer[256];
  shared_buffer[threadIdx.x] = undecided_count;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    undecided_buffer[blockIdx.x] = shared_buffer[0];
}
+
/** @brief CUDA kernel for resetting non-MIS (i.e. coarse) points to undecided so that subsequent kernels work */
__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  // everything that is not coarse goes back to 'undecided':
  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    unsigned int pt = point_types[i];
    if (pt != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
  }
}
+
/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1
*
*  Runs a parallel maximal-independent-set (PMIS/MIS-2) iteration: random weights
*  are assigned on the host, then each pass propagates the maximum (state, weight, index)
*  triple twice through the neighborhood and marks MIS/non-MIS nodes until no
*  undecided points remain.
*
* @param A             Operator matrix on all levels
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
                               viennacl::linalg::amg_tag & tag)
{
  (void)tag; // MIS-2 stage 1 does not read tag parameters

  // random tie-breaker weights, generated on the host and then moved to the device:
  // (note: rand() is not reseeded here, so runs are reproducible given the process-wide seed)
  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
  for (std::size_t i=0; i<random_weights.size(); ++i)
    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
  random_weights.switch_memory_context(viennacl::traits::context(A));

  // work vectors (double-buffered for the neighborhood-max propagation):
  viennacl::vector<unsigned int> work_state(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index(A.size1(),  viennacl::traits::context(A));

  viennacl::vector<unsigned int> work_state2(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index2(A.size1(),  viennacl::traits::context(A));

  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
  // one counter slot per block (kernels below launch 128 blocks; 256 gives headroom):
  viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());

  unsigned int pmis_iters = 0;
  while (num_undecided > 0)
  {
    ++pmis_iters;

    //
    // init temporary work data:
    //
    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                          viennacl::cuda_arg(work_random),
                                          viennacl::cuda_arg(work_index),
                                          viennacl::cuda_arg(amg_context.point_types_),
                                          viennacl::cuda_arg(random_weights),
                                          static_cast<unsigned int>(A.size1())
                                         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata"); // fixed: label previously said amg_pmis2_reset_state


    //
    // Propagate maximum tuple twice
    //
    for (unsigned int r = 0; r < 2; ++r)
    {
      // max operation over neighborhood
      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                               viennacl::cuda_arg(work_random),
                                               viennacl::cuda_arg(work_index),
                                               viennacl::cuda_arg(work_state2),
                                               viennacl::cuda_arg(work_random2),
                                               viennacl::cuda_arg(work_index2),
                                               viennacl::cuda_arg(amg_context.influence_jumper_),
                                               viennacl::cuda_arg(amg_context.influence_ids_),
                                               static_cast<unsigned int>(A.size1())
                                              );
      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");

      // copy work array (can be fused into a single kernel if needed. Previous kernel is in most cases sufficiently heavy)
      work_state  = work_state2;
      work_random = work_random2;
      work_index  = work_index2;
    }

    //
    // mark MIS and non-MIS nodes:
    //
    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                           viennacl::cuda_arg(work_index),
                                           viennacl::cuda_arg(amg_context.point_types_),
                                           viennacl::cuda_arg(undecided_buffer),
                                           static_cast<unsigned int>(A.size1())
                                          );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes"); // fixed: label previously said amg_pmis2_reset_state

    // get number of undecided points on host:
    viennacl::backend::memory_read(undecided_buffer.handle(),   0, undecided_buffer_host.raw_size(),  undecided_buffer_host.get());
    num_undecided = 0;
    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
      num_undecided += undecided_buffer_host[i];

  } //while

  // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                      static_cast<unsigned int>(amg_context.point_types_.size())
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
}
+
+
+
+
+
/** @brief CUDA kernel assigning each coarse point's aggregate index to all of its neighbors and flagging those neighbors as fine */
template<typename IndexT>
__global__ void amg_agg_propagate_coarse_indices(IndexT       *point_types,
                                                 IndexT       *coarse_ids,
                                                 IndexT const *influences_row,
                                                 IndexT const *influences_id,
                                                 unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      continue; // only coarse points propagate their aggregate

    unsigned int my_aggregate = coarse_ids[i];

    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int neighbor = influences_id[j];
      coarse_ids[neighbor] = my_aggregate; // attach fine neighbor to this aggregate

      // Note: any write races between threads are harmless here
      if (neighbor != i)
        point_types[neighbor] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
    }
  }
}
+
+
/** @brief CUDA kernel merging still-undecided points into the aggregate of the first decided (coarse or fine) neighbor found */
template<typename IndexT>
__global__ void amg_agg_merge_undecided(IndexT       *point_types,
                                        IndexT       *coarse_ids,
                                        IndexT const *influences_row,
                                        IndexT const *influences_id,
                                        unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      continue; // only undecided points need an aggregate

    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int neighbor = influences_id[j];
      // first decided neighbor (either coarse or fine) wins:
      if (point_types[neighbor] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      {
        coarse_ids[i] = coarse_ids[neighbor];
        break;
      }
    }
  }
}
+
+
/** @brief CUDA kernel turning any remaining undecided points into fine points (their coarse ID was already set in the previous stage) */
__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
                                          unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int i = thread_id; i < size; i += num_threads)
  {
    unsigned int pt = point_types[i];
    if (pt == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
  }
}
+
+
/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
*
*  Pipeline: influence extraction -> MIS-2 aggregate construction -> coarse-ID
*  enumeration -> propagation of aggregate IDs to neighbors -> merging of leftovers.
*
* @param A             Operator matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_coarse_ag(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  amg_influence_trivial(A, amg_context, tag);

  //
  // Stage 1: Build aggregates:
  //
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
  else
    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");

  // assign consecutive IDs to the coarse points found in stage 1:
  viennacl::linalg::cuda::amg::enumerate_coarse_points(amg_context);

  //
  // Stage 2: Propagate coarse aggregate indices to neighbors:
  //
  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                                 viennacl::cuda_arg(amg_context.coarse_id_),
                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
                                                 viennacl::cuda_arg(amg_context.influence_ids_),
                                                 static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");

  //
  // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy)
  //
  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                        viennacl::cuda_arg(amg_context.coarse_id_),
                                        viennacl::cuda_arg(amg_context.influence_jumper_),
                                        viennacl::cuda_arg(amg_context.influence_ids_),
                                        static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");

  //
  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
  //          Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
  //
  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                          static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
}
+
+
+
+
/** @brief Calls the right coarsening procedure
*
* @param A            Operator matrix on all levels
* @param amg_context  AMG hierarchy datastructures
* @param tag          AMG preconditioner tag
*/
template<typename InternalT1>
void amg_coarse(InternalT1 & A,
                viennacl::linalg::detail::amg::amg_level_context & amg_context,
                viennacl::linalg::amg_tag & tag)
{
  // only MIS2-based aggregation is available with the CUDA backend:
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag(A, amg_context, tag);
  else
    throw std::runtime_error("not implemented yet");
}
+
+
+
+
+////////////////////////////////////// Interpolation 
/////////////////////////////
+
/** @brief CUDA kernel filling the prolongation matrix for plain aggregation: row i contains a single 1 in column coarse_ids[i] */
template<typename NumericT>
__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
                                       unsigned int *P_col_buffer,
                                       NumericT *P_elements,
                                       unsigned int *coarse_ids,
                                       unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  // one nonzero per row: P(i, coarse_ids[i]) = 1
  for (unsigned int row = thread_id; row < size; row += num_threads)
  {
    P_row_buffer[row] = row;
    P_col_buffer[row] = coarse_ids[row];
    P_elements[row]   = NumericT(1);
  }

  // set last entry (CSR finalizer) as well:
  if (thread_id == 0)
    P_row_buffer[size] = size;
}
+
/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_AG)
 *
 * Builds the piecewise-constant prolongation matrix P with exactly one unit
 * entry per row, located at the row's aggregate (coarse) index.
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag (unused)
*/
template<typename NumericT>
void amg_interpol_ag(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  // P has A.size1() rows, num_coarse_ columns, and one nonzero per row:
  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
                                       viennacl::cuda_arg(amg_context.coarse_id_),
                                       static_cast<unsigned int>(A.size1()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");

  P.generate_row_block_information();
}
+
+
+
/** @brief CUDA kernel assembling the damped Jacobi smoother matrix I - omega * D^{-1} * A, reusing A's sparsity pattern
*
*  NOTE(review): 'diag' remains 0 if a row of A has no explicit diagonal entry,
*  which would produce a division by zero below -- assumes A stores its diagonal; confirm upstream.
*/
template<typename NumericT>
__global__ void amg_interpol_sa_kernel(
          const unsigned int *A_row_indices,
          const unsigned int *A_col_indices,
          const NumericT     *A_elements,
          unsigned int A_size1,
          unsigned int A_nnz,
          unsigned int *Jacobi_row_indices,
          unsigned int *Jacobi_col_indices,
          NumericT     *Jacobi_elements,
          NumericT     omega
          )
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int row = thread_id; row < A_size1; row += num_threads)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    // Jacobi shares A's row structure:
    Jacobi_row_indices[row] = row_begin;

    // Step 1: Extract diagonal:
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
      if (A_col_indices[j] == row)
      {
        diag = A_elements[j];
        break;
      }

    // Step 2: Write entries: diagonal -> 1 - omega, off-diagonal -> -omega * a_ij / a_ii
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col_index = A_col_indices[j];
      Jacobi_col_indices[j] = col_index;

      Jacobi_elements[j] = (col_index == row) ? (NumericT(1) - omega)
                                              : (- omega * A_elements[j] / diag);
    }
  }

  // don't forget finalizer:
  if (thread_id == 0)
    Jacobi_row_indices[A_size1] = A_nnz;
}
+
+
+
/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
 *
 * Builds the tentative (plain aggregation) prolongation, then smooths it with
 * one damped Jacobi step: P = (I - omega * D^{-1} * A) * P_tentative.
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag (supplies the Jacobi weight)
*/
template<typename NumericT>
void amg_interpol_sa(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;

  // tentative prolongation from plain aggregation:
  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));
  amg_interpol_ag(A, P_tentative, amg_context, tag);

  // assemble the Jacobi smoother matrix (same sparsity pattern as A):
  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));

  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                       static_cast<unsigned int>(A.size1()),
                                       static_cast<unsigned int>(A.nnz()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
                                       NumericT(tag.get_jacobi_weight()));
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");

  // smooth the tentative prolongation:
  P = viennacl::linalg::prod(Jacobi, P_tentative);

  P.generate_row_block_information();
}
+
+
/** @brief Dispatcher for building the interpolation matrix
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag selecting the interpolation method
*/
template<typename MatrixT>
void amg_interpol(MatrixT const & A,
                  MatrixT & P,
                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
                  viennacl::linalg::amg_tag & tag)
{
  if (tag.get_interpolation_method() == viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION)
    amg_interpol_ag(A, P, amg_context, tag);
  else if (tag.get_interpolation_method() == viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION)
    amg_interpol_sa(A, P, amg_context, tag);
  else
    throw std::runtime_error("Not implemented yet!");
}
+
+
/** @brief CUDA kernel scattering the nonzeros of a CSR matrix into a (possibly strided) dense row-major matrix B */
template<typename NumericT>
__global__ void compressed_matrix_assign_to_dense(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT *B,
          unsigned int B_row_start,     unsigned int B_col_start,
          unsigned int B_row_inc,       unsigned int B_col_inc,
          unsigned int B_row_size,      unsigned int B_col_size,
          unsigned int B_internal_rows, unsigned int B_internal_cols)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int row = thread_id; row < B_row_size; row += num_threads)
  {
    // precompute the dense base offset for this row (start/inc encode sub-matrix views):
    unsigned int dense_row_offset = (B_row_start + row * B_row_inc) * B_internal_cols + B_col_start;

    unsigned int row_end = row_indices[row + 1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
      B[dense_row_offset + column_indices[j] * B_col_inc] = elements[j];
  }
}
+
+
/** @brief Copies a sparse CSR matrix A into the dense matrix B, honoring B's start/stride/size view parameters */
template<typename NumericT, unsigned int AlignmentV>
void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     viennacl::matrix_base<NumericT> & B)
{
  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(B),
                                                  static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)));
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
}
+
+
+
+
/** @brief CUDA kernel performing one damped Jacobi sweep: x_new = weight * D^{-1} * (rhs - R*x_old) + (1 - weight) * x_old
*
*  'diag' defaults to 1 so rows without an explicit diagonal entry do not divide by zero.
*/
template<typename NumericT>
__global__ void compressed_matrix_smooth_jacobi_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT weight,
          const NumericT * x_old,
          NumericT * x_new,
          const NumericT * rhs,
          unsigned int size)
{
  unsigned int thread_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int num_threads = gridDim.x * blockDim.x;

  for (unsigned int row = thread_id; row < size; row += num_threads)
  {
    NumericT off_diag_sum = NumericT(0);
    NumericT diag         = NumericT(1);

    unsigned int row_end = row_indices[row + 1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == row)
        diag = elements[j];                       // diagonal entry a_ii
      else
        off_diag_sum += elements[j] * x_old[col]; // accumulate R*x_old
    }

    x_new[row] = weight * (rhs[row] - off_diag_sum) / diag + (NumericT(1) - weight) * x_old[row];
  }
}
+
+
+
+
/** @brief Damped Jacobi Smoother (CUDA version)
*
* Each iteration snapshots x into x_backup and writes the updated values back into x.
*
* @param iterations  Number of smoother iterations
* @param A           Operator matrix for the smoothing
* @param x           The vector smoothing is applied to
* @param x_backup    (Different) Vector holding the same values as x
* @param rhs_smooth  The right hand side of the equation for the smoother
* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
*/
template<typename NumericT>
void smooth_jacobi(unsigned int iterations,
                   compressed_matrix<NumericT> const & A,
                   vector<NumericT> & x,
                   vector<NumericT> & x_backup,
                   vector<NumericT> const & rhs_smooth,
                   NumericT weight)
{
  for (unsigned int iter = 0; iter < iterations; ++iter)
  {
    x_backup = x; // keep the previous iterate; the kernel reads x_backup and writes x

    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                         static_cast<NumericT>(weight),
                                                         viennacl::cuda_arg(x_backup),
                                                         viennacl::cuda_arg(x),
                                                         viennacl::cuda_arg(rhs_smooth),
                                                         static_cast<unsigned int>(rhs_smooth.size()));
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
  }
}
+
+
+} //namespace amg
+} //namespace cuda
+} //namespace linalg
+} //namespace viennacl
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
new file mode 100644
index 0000000..39f0015
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
@@ -0,0 +1,166 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_calls.hpp
+    @brief CUDA kernel calls for the bisection algorithm
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+#include "viennacl/linalg/detail/bisect/structs.hpp"
+
+// includes, kernels
+#include "viennacl/linalg/cuda/bisect_kernel_small.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large_onei.hpp"
+#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"
+
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+/** @brief Launches the single-block bisection kernel for small symmetric tridiagonal matrices.
+*
+* @param input     Diagonal (g_a) and superdiagonal (g_b) of the matrix on the device
+* @param result    Receives interval bounds and eigenvalue counts per interval
+* @param mat_size  Dimension of the matrix
+* @param lg        Lower bound of the search interval (e.g. from Gerschgorin circles)
+* @param ug        Upper bound of the search interval
+* @param precision Desired accuracy of the computed eigenvalues
+*/
+template<typename NumericT>
+void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
+                       const unsigned int mat_size,
+                       const NumericT lg, const NumericT ug,
+                       const NumericT precision)
+{
+
+
+  // One block; thread count fixed for the small-matrix kernel variant.
+  dim3  blocks(1, 1, 1);
+  dim3  threads(VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);
+
+  bisectKernelSmall<<< blocks, threads >>>(
+    viennacl::cuda_arg(input.g_a),
+    // superdiagonal shifted by one so the kernel can address element -1
+    viennacl::cuda_arg(input.g_b) + 1,
+    mat_size,
+    viennacl::cuda_arg(result.vcl_g_left),
+    viennacl::cuda_arg(result.vcl_g_right),
+    viennacl::cuda_arg(result.vcl_g_left_count),
+    viennacl::cuda_arg(result.vcl_g_right_count),
+    lg, ug, 0, mat_size,
+    precision
+    );
+  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch 
failed");
+}
+
+
+/** @brief Launches the first bisection step for large matrices: classifies intervals
+*          into those holding one eigenvalue and those holding several.
+*
+* @param input     Diagonal (g_a) and superdiagonal (g_b) of the matrix on the device
+* @param result    Receives one-eigenvalue intervals, multi-eigenvalue intervals and block lists
+* @param mat_size  Dimension of the matrix
+* @param lg        Lower bound of the search interval
+* @param ug        Upper bound of the search interval
+* @param precision Desired accuracy of the computed eigenvalues
+*/
+template<typename NumericT>
+void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                   const unsigned int mat_size,
+                   const NumericT lg, const NumericT ug,
+                   const NumericT precision)
+ {
+
+  // One block; use the full thread budget only for matrices larger than 512.
+  dim3  blocks(1, 1, 1);
+  dim3  threads(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2 , 1, 1);
+  bisectKernelLarge<<< blocks, threads >>>
+    (viennacl::cuda_arg(input.g_a),
+     // superdiagonal shifted by one so the kernel can address element -1
+     viennacl::cuda_arg(input.g_b) + 1,
+     mat_size,
+     // eigenvalue counts below lg/ug: 0 and mat_size for the full spectrum
+     lg, ug, static_cast<unsigned int>(0), mat_size, precision,
+     viennacl::cuda_arg(result.g_num_one),
+     viennacl::cuda_arg(result.g_num_blocks_mult),
+     viennacl::cuda_arg(result.g_left_one),
+     viennacl::cuda_arg(result.g_right_one),
+     viennacl::cuda_arg(result.g_pos_one),
+     viennacl::cuda_arg(result.g_left_mult),
+     viennacl::cuda_arg(result.g_right_mult),
+     viennacl::cuda_arg(result.g_left_count_mult),
+     viennacl::cuda_arg(result.g_right_count_mult),
+     viennacl::cuda_arg(result.g_blocks_mult),
+     viennacl::cuda_arg(result.g_blocks_mult_sum)
+     );
+  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch 
failed.");
+}
+
+
+// compute eigenvalues for intervals that contained only one eigenvalue
+// after the first processing step
+/** @brief Second bisection step: refines intervals known to contain exactly one eigenvalue.
+*
+* @param input     Diagonal (g_a) and superdiagonal (g_b) of the matrix on the device
+* @param result    In/out: one-eigenvalue interval data produced by bisectLarge()
+* @param mat_size  Dimension of the matrix
+* @param precision Desired accuracy of the computed eigenvalues
+*/
+template<typename NumericT>
+void bisectLarge_OneIntervals(const 
viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                   const unsigned int mat_size,
+                   const NumericT precision)
+ {
+
+  // NOTE(review): g_num_one is read directly on the host here -- presumably
+  // the result struct keeps this count host-accessible; verify against
+  // ResultDataLarge's definition.
+  unsigned int num_one_intervals = result.g_num_one;
+  unsigned int num_blocks = 
viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
+                                                                         
mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
+  dim3 grid_onei;
+  grid_onei.x = num_blocks;
+  grid_onei.y = 1, grid_onei.z = 1;
+  dim3 threads_onei(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
+
+
+  bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
+    (viennacl::cuda_arg(input.g_a),
+     // superdiagonal shifted by one so the kernel can address element -1
+     viennacl::cuda_arg(input.g_b) + 1,
+     mat_size, num_one_intervals,
+     viennacl::cuda_arg(result.g_left_one),
+     viennacl::cuda_arg(result.g_right_one),
+     viennacl::cuda_arg(result.g_pos_one),
+     precision
+     );
+  
viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_OneIntervals()
 FAILED.");
+}
+
+
+// process intervals that contained more than one eigenvalue after
+// the first processing step
+/** @brief Second bisection step for intervals containing several eigenvalues:
+*          one thread block per interval block produced by bisectLarge().
+*
+* @param input     Diagonal (g_a) and superdiagonal (g_b) of the matrix on the device
+* @param result    In/out: multi-eigenvalue interval data; eigenvalues land in g_lambda_mult
+* @param mat_size  Dimension of the matrix
+* @param precision Desired accuracy of the computed eigenvalues
+*/
+template<typename NumericT>
+void bisectLarge_MultIntervals(const 
viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
+                   const unsigned int mat_size,
+                   const NumericT precision)
+ {
+    // get the number of blocks of intervals that contain, in total when
+    // each interval contains only one eigenvalue, not more than
+    // MAX_THREADS_BLOCK threads
+    unsigned int  num_blocks_mult = result.g_num_blocks_mult;
+
+    // setup the execution environment
+    dim3  grid_mult(num_blocks_mult, 1, 1);
+    dim3  threads_mult(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
+
+    bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>>
+      (viennacl::cuda_arg(input.g_a),
+       // superdiagonal shifted by one so the kernel can address element -1
+       viennacl::cuda_arg(input.g_b) + 1,
+       mat_size,
+       viennacl::cuda_arg(result.g_blocks_mult),
+       viennacl::cuda_arg(result.g_blocks_mult_sum),
+       viennacl::cuda_arg(result.g_left_mult),
+       viennacl::cuda_arg(result.g_right_mult),
+       viennacl::cuda_arg(result.g_left_count_mult),
+       viennacl::cuda_arg(result.g_right_count_mult),
+       viennacl::cuda_arg(result.g_lambda_mult),
+       viennacl::cuda_arg(result.g_pos_mult),
+       precision
+      );
+    
viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_MultIntervals()
 FAILED.");
+}
+}
+}
+}
+
+#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
new file mode 100755
index 0000000..77c9773
--- /dev/null
+++ b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
@@ -0,0 +1,928 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp
+    @brief First step of the bisection algorithm for the computation of 
eigenvalues.
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+
+/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
+  step of the computation. */
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+// declaration, forward
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Write data to global memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void writeToGmem(const unsigned int tid, const unsigned int tid_2,
+                 const unsigned int num_threads_active,
+                 const unsigned int num_blocks_mult,
+                 NumericT *g_left_one, NumericT *g_right_one,
+                 unsigned int *g_pos_one,
+                 NumericT *g_left_mult, NumericT *g_right_mult,
+                 unsigned int *g_left_count_mult,
+                 unsigned int *g_right_count_mult,
+                 NumericT *s_left, NumericT *s_right,
+                 unsigned short *s_left_count, unsigned short *s_right_count,
+                 unsigned int *g_blocks_mult,
+                 unsigned int *g_blocks_mult_sum,
+                 unsigned short *s_compaction_list,
+                 unsigned short *s_cl_helper,
+                 unsigned int offset_mult_lambda
+                )
+{
+
+    // Each thread flushes two shared-memory entries: one at tid and one at
+    // tid_2 (the caller passes tid_2 = tid + blockDim.x). Entries below
+    // offset_mult_lambda go to the one-eigenvalue lists, the rest to the
+    // multi-eigenvalue lists.
+    if (tid < offset_mult_lambda)
+    {
+
+        g_left_one[tid] = s_left[tid];
+        g_right_one[tid] = s_right[tid];
+        // right count can be used to order eigenvalues without sorting
+        g_pos_one[tid] = s_right_count[tid];
+    }
+    else
+    {
+
+
+        g_left_mult[tid - offset_mult_lambda] = s_left[tid];
+        g_right_mult[tid - offset_mult_lambda] = s_right[tid];
+        g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
+        g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
+    }
+
+    if (tid_2 < num_threads_active)
+    {
+
+        if (tid_2 < offset_mult_lambda)
+        {
+
+            g_left_one[tid_2] = s_left[tid_2];
+            g_right_one[tid_2] = s_right[tid_2];
+            // right count can be used to order eigenvalues without sorting
+            g_pos_one[tid_2] = s_right_count[tid_2];
+        }
+        else
+        {
+
+            g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
+            g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
+            g_left_count_mult[tid_2 - offset_mult_lambda] = 
s_left_count[tid_2];
+            g_right_count_mult[tid_2 - offset_mult_lambda] = 
s_right_count[tid_2];
+        }
+
+    } // end writing out data
+
+    __syncthreads();
+
+    // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out
+    // s_compaction_list we write the exclusive scan result
+    // (num_blocks_mult + 1 entries: block start offsets plus the end marker)
+    if (tid <= num_blocks_mult)
+    {
+        g_blocks_mult[tid] = s_compaction_list[tid];
+        g_blocks_mult_sum[tid] = s_cl_helper[tid];
+    }
+
+    if (tid_2 <= num_blocks_mult)
+    {
+        g_blocks_mult[tid_2] = s_compaction_list[tid_2];
+        g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform final stream compaction before writing data to global memory
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void
+compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
+                    const unsigned int num_threads_active,
+                    unsigned int &offset_mult_lambda,
+                    NumericT *s_left, NumericT *s_right,
+                    unsigned short *s_left_count, unsigned short 
*s_right_count,
+                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
+                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
+                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
+                    NumericT &left, NumericT &right, NumericT &left_2, 
NumericT &right_2,
+                    unsigned int &left_count, unsigned int &right_count,
+                    unsigned int &left_count_2, unsigned int &right_count_2,
+                    unsigned int c_block_iend, unsigned int c_sum_block,
+                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
+                   )
+{
+    // cache data before performing compaction
+    left = s_left[tid];
+    right = s_right[tid];
+
+    if (tid_2 < num_threads_active)
+    {
+
+        left_2 = s_left[tid_2];
+        right_2 = s_right[tid_2];
+    }
+
+    __syncthreads();
+
+    // determine addresses for intervals containing multiple eigenvalues and
+    // addresses for blocks of intervals
+    unsigned int ptr_w = 0;
+    unsigned int ptr_w_2 = 0;
+    unsigned int ptr_blocking_w = 0;
+    unsigned int ptr_blocking_w_2 = 0;
+
+
+
+    // one-eigenvalue intervals are packed at the front via s_cl_one; the
+    // multi-eigenvalue intervals follow, offset by offset_mult_lambda
+    ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
+            : s_cl_mult[tid] + offset_mult_lambda;
+
+    if (0 != c_block_iend)
+    {
+        ptr_blocking_w = s_cl_blocking[tid];
+    }
+
+    if (tid_2 < num_threads_active)
+    {
+        ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
+                  : s_cl_mult[tid_2] + offset_mult_lambda;
+
+        if (0 != c_block_iend_2)
+        {
+            ptr_blocking_w_2 = s_cl_blocking[tid_2];
+        }
+    }
+
+
+    __syncthreads();
+      if(tid < num_threads_active)
+      {
+        // store compactly in shared mem
+        s_left[ptr_w] = left;
+        s_right[ptr_w] = right;
+        s_left_count[ptr_w] = left_count;
+        s_right_count[ptr_w] = right_count;
+      }
+
+
+    __syncthreads();
+    // NOTE(review): this tid == 1 store repeats the write already performed
+    // in the guarded block above (tid 1 < num_threads_active in the usual
+    // case); it looks redundant or like a workaround -- confirm against the
+    // upstream CUDA SDK bisection sample before touching it.
+    if(tid == 1)
+    {
+      s_left[ptr_w] = left;
+      s_right[ptr_w] = right;
+      s_left_count[ptr_w] = left_count;
+      s_right_count[ptr_w] = right_count;
+    }
+          if (0 != c_block_iend)
+    {
+        // record block end (inclusive) and the running eigenvalue sum
+        s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
+        s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
+    }
+
+    if (tid_2 < num_threads_active)
+    {
+
+        // store compactly in shared mem
+        s_left[ptr_w_2] = left_2;
+        s_right[ptr_w_2] = right_2;
+        s_left_count[ptr_w_2] = left_count_2;
+        s_right_count[ptr_w_2] = right_count_2;
+
+        if (0 != c_block_iend_2)
+        {
+            s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
+            s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
+        }
+    }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute addresses to obtain compact list of block start addresses
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
+                              const unsigned int num_threads_compaction,
+                              unsigned short *s_cl_blocking,
+                              unsigned short *s_cl_helper
+                             )
+{
+    // prepare for second step of block generation: compaction of the block
+    // list itself to efficiently write out these
+    s_cl_blocking[tid] = s_cl_helper[tid];
+
+    if (tid_2 < num_threads_compaction)
+    {
+        s_cl_blocking[tid_2] = s_cl_helper[tid_2];
+    }
+
+    __syncthreads();
+
+    // additional scan to compact s_cl_blocking that permits to generate a
+    // compact list of eigenvalue blocks each one containing about
+    // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these 
blocks may be
+    // processed by one thread block in a subsequent processing step)
+
+    unsigned int offset = 1;
+
+    // build scan tree (up-sweep; num_threads_compaction is a power of two)
+    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int  ai = offset*(2*tid+1)-1;
+            unsigned int  bi = offset*(2*tid+2)-1;
+            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
+        }
+
+        offset <<= 1;
+    }
+
+    // traverse down tree: first down to level 2 across
+    for (int d = 2; d < num_threads_compaction; d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        // propagate partial sums to the interior nodes (down-sweep)
+        if (tid < (d-1))
+        {
+
+            unsigned int  ai = offset*(tid+1) - 1;
+            unsigned int  bi = ai + (offset >> 1);
+            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
+        }
+    }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform scan to obtain number of eigenvalues before a specific block
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
+              const unsigned int num_threads_active,
+              const unsigned int num_threads_compaction,
+              unsigned short *s_cl_blocking,
+              unsigned short *s_cl_helper)
+{
+    unsigned int offset = 1;
+
+    // first step of scan to build the sum of elements within each block
+    // build up tree
+    for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int ai = offset*(2*tid+1)-1;
+            unsigned int bi = offset*(2*tid+2)-1;
+
+            s_cl_blocking[bi] += s_cl_blocking[ai];
+        }
+
+        offset *= 2;
+    }
+
+    // second step of scan: traverse down the tree to distribute the
+    // partial sums to the remaining nodes
+    for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        if (tid < (d-1))
+        {
+
+            unsigned int ai = offset*(tid+1) - 1;
+            unsigned int bi = ai + (offset >> 1);
+
+            s_cl_blocking[bi] += s_cl_blocking[ai];
+        }
+    }
+
+    __syncthreads();
+
+    if (0 == tid)
+    {
+
+        // move last element of scan to last element that is valid
+        // necessary because the number of threads employed for scan is a power
+        // of two and not necessarily the number of active threads
+        s_cl_helper[num_threads_active - 1] =
+            s_cl_helper[num_threads_compaction - 1];
+        s_cl_blocking[num_threads_active - 1] =
+            s_cl_blocking[num_threads_compaction - 1];
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform initial scan for compaction of intervals containing one and
+//! multiple eigenvalues; also do initial scan to build blocks
+////////////////////////////////////////////////////////////////////////////////
+inline __device__
+void
+scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned 
int mat_size,
+            const unsigned int num_threads_active,
+            const unsigned int num_threads_compaction,
+            unsigned short *s_cl_one, unsigned short *s_cl_mult,
+            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
+           )
+{
+
+    // perform scan to compactly write out the intervals containing one and
+    // multiple eigenvalues
+    // also generate tree for blocking of intervals containing multiple
+    // eigenvalues
+
+    unsigned int offset = 1;
+
+    // build scan tree (up-sweep over a power-of-two number of entries)
+    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
+    {
+
+        __syncthreads();
+
+        if (tid < d)
+        {
+
+            unsigned int  ai = offset*(2*tid+1);
+            unsigned int  bi = offset*(2*tid+2)-1;
+
+            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
+            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
+
+            // s_cl_helper is binary and zero for an internal node and 1 for a
+            // root node of a tree corresponding to a block
+            // s_cl_blocking contains the number of nodes in each sub-tree at 
each
+            // iteration, the data has to be kept to compute the total number 
of
+            // eigenvalues per block that, in turn, is needed to efficiently
+            // write out data in the second step
+            if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
+            {
+
+                // check how many children are non-terminated
+                if (s_cl_helper[ai - 1] == 1)
+                {
+                    // mark as terminated
+                    s_cl_helper[bi] = 1;
+                }
+                else if (s_cl_helper[bi] == 1)
+                {
+                    // mark as terminated
+                    s_cl_helper[ai - 1] = 1;
+                }
+                else    // both children are non-terminated
+                {
+
+                    unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 
1];
+
+                    // merged subtree would exceed the per-block eigenvalue
+                    // budget (same threshold the kernel launches use)
+                    if (temp > (mat_size > 512 ? 
VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
+                    {
+
+                        // the two child trees have to form separate blocks, 
terminate trees
+                        s_cl_helper[ai - 1] = 1;
+                        s_cl_helper[bi] = 1;
+                    }
+                    else
+                    {
+                        // build up tree by joining subtrees
+                        s_cl_blocking[bi] = temp;
+                        s_cl_blocking[ai - 1] = 0;
+                    }
+                }
+            }  // end s_cl_helper update
+
+        }
+
+        offset <<= 1;
+    }
+
+
+    // traverse down tree, this only for stream compaction, not for block
+    // construction
+    for (int d = 2; d < num_threads_compaction; d <<= 1)
+    {
+
+        offset >>= 1;
+        __syncthreads();
+
+        // down-sweep: finalize the exclusive-scan values for compaction
+        if (tid < (d-1))
+        {
+
+            unsigned int  ai = offset*(tid+1) - 1;
+            unsigned int  bi = ai + (offset >> 1);
+
+            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
+            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
+        }
+    }
+
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Store all non-empty intervals resulting from the subdivision of the 
interval
+//! currently processed by the thread
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__device__
+void
+storeNonEmptyIntervalsLarge(unsigned int addr,
+                            const unsigned int num_threads_active,
+                            NumericT  *s_left, NumericT *s_right,
+                            unsigned short  *s_left_count,
+                            unsigned short *s_right_count,
+                            NumericT left, NumericT mid, NumericT right,
+                            const unsigned short left_count,
+                            const unsigned short mid_count,
+                            const unsigned short right_count,
+                            NumericT epsilon,
+                            unsigned int &compact_second_chunk,
+                            unsigned short *s_compaction_list,
+                            unsigned int &is_active_second)
+{
+    // check if both child intervals are valid
+    // (an interval [a,b) is non-empty iff the eigenvalue counts at its
+    // endpoints differ)
+    if ((left_count != mid_count) && (mid_count != right_count))
+    {
+
+        // first child goes to this thread's slot; the second child is
+        // flagged here and written later after the compaction scan
+        storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                      left, mid, left_count, mid_count, epsilon);
+
+        is_active_second = 1;
+        s_compaction_list[threadIdx.x] = 1;
+        compact_second_chunk = 1;
+    }
+    else
+    {
+
+        // only one non-empty child interval
+
+        // mark that no second child
+        is_active_second = 0;
+        s_compaction_list[threadIdx.x] = 0;
+
+        // store the one valid child interval
+        if (left_count != mid_count)
+        {
+            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                          left, mid, left_count, mid_count, epsilon);
+        }
+        else
+        {
+            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
+                          mid, right, mid_count, right_count, epsilon);
+        }
+    }
+}
+
+/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal 
matrix
+*  g_d  diagonal elements in global memory
+*  g_s  superdiagonal elements in global elements (stored so that the element 
*(g_s - 1) can be accessed and equals 0
+*  n   size of matrix
+*  lg  lower bound of input interval (e.g. Gerschgorin interval)
+*  ug  upper bound of input interval (e.g. Gerschgorin interval)
+*  lg_eig_count  number of eigenvalues that are smaller than lg
+*  lu_eig_count  number of eigenvalues that are smaller than lu
+*  epsilon  desired accuracy of eigenvalues to compute
+*/
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int 
n,
+                  const NumericT lg, const NumericT ug,
+                  const unsigned int lg_eig_count,
+                  const unsigned int ug_eig_count,
+                  NumericT epsilon,
+                  unsigned int *g_num_one,
+                  unsigned int *g_num_blocks_mult,
+                  NumericT *g_left_one, NumericT *g_right_one,
+                  unsigned int *g_pos_one,
+                  NumericT *g_left_mult, NumericT *g_right_mult,
+                  unsigned int *g_left_count_mult,
+                  unsigned int *g_right_count_mult,
+                  unsigned int *g_blocks_mult,
+                  unsigned int *g_blocks_mult_sum
+                 )
+{
+    const unsigned int tid = threadIdx.x;
+
+    // intervals (store left and right because the subdivision tree is in 
general
+    // not dense
+    __shared__  NumericT  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+    __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    // number of eigenvalues that are smaller than s_left / s_right
+    // (correspondence is realized via indices)
+    __shared__  unsigned short  s_left_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+    __shared__  unsigned short  s_right_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    // helper for stream compaction
+    __shared__  unsigned short  s_compaction_list[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    // state variables for whole block
+    // if 0 then compaction of second chunk of child intervals is not necessary
+    // (because all intervals had exactly one non-dead child)
+    __shared__  unsigned int compact_second_chunk;
+    // if 1 then all threads are converged
+    __shared__  unsigned int all_threads_converged;
+
+    // number of currently active threads
+    __shared__  unsigned int num_threads_active;
+
+    // number of threads to use for stream compaction
+    __shared__  unsigned int num_threads_compaction;
+
+    // helper for exclusive scan
+    unsigned short *s_compaction_list_exc = s_compaction_list + 1;
+
+
+    // variables for currently processed interval
+    // left and right limit of active interval
+    NumericT left = 0.0f;
+    NumericT right = 0.0f;
+    unsigned int left_count = 0;
+    unsigned int right_count = 0;
+    // midpoint of active interval
+    NumericT  mid = 0.0f;
+    // number of eigenvalues smaller then mid
+    unsigned int mid_count = 0;
+    // helper for stream compaction (tracking of threads generating second 
child)
+    unsigned int is_active_second = 0;
+
+    // initialize lists
+    s_compaction_list[tid] = 0;
+    s_left[tid] = 0;
+    s_right[tid] = 0;
+    s_left_count[tid] = 0;
+    s_right_count[tid] = 0;
+
+    __syncthreads();
+
+    // set up initial configuration
+    if (0 == tid)
+    {
+
+        s_left[0] = lg;
+        s_right[0] = ug;
+        s_left_count[0] = lg_eig_count;
+        s_right_count[0] = ug_eig_count;
+
+        compact_second_chunk = 0;
+        num_threads_active = 1;
+
+        num_threads_compaction = 1;
+
+        all_threads_converged = 1;
+    }
+
+    __syncthreads();
+
+    // for all active threads read intervals from the last level
+    // the number of (worst case) active threads per level l is 2^l
+    // determine coarse intervals. On these intervals the kernel for one or 
for multiple eigenvalues
+    // will be executed in the second step
+    while(true)
+    {
+        s_compaction_list[tid] = 0;
+        s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+        s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+        subdivideActiveInterval(tid, s_left, s_right, s_left_count, 
s_right_count,
+                                num_threads_active,
+                                left, right, left_count, right_count,
+                                mid, all_threads_converged);
+
+        __syncthreads();
+
+        // check if done
+        if (1 == all_threads_converged)
+        {
+            break;
+        }
+
+        // compute number of eigenvalues smaller than mid
+        // use all threads for reading the necessary matrix data from global
+        // memory
+        // use s_left and s_right as scratch space for diagonal and
+        // superdiagonal of matrix
+        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+                                                    mid, threadIdx.x,
+                                                    num_threads_active,
+                                                    s_left, s_right,
+                                                    (left == right));
+
+        __syncthreads();
+
+        // store intervals
+        // for all threads store the first child interval in a continuous 
chunk of
+        // memory, and the second child interval -- if it exists -- in a second
+        // chunk; it is likely that all threads reach convergence up to
+        // \a epsilon at the same level; furthermore, for higher level most / 
all
+        // threads will have only one child, storing the first child compactly 
will
+        // (first) avoid to perform a compaction step on the first chunk, 
(second)
+        // make it for higher levels (when all threads / intervals have
+        // exactly one child)  unnecessary to perform a compaction of the 
second
+        // chunk
+        if (tid < num_threads_active)
+        {
+
+            if (left != right)
+            {
+
+                // store intervals
+                storeNonEmptyIntervalsLarge(tid, num_threads_active,
+                                            s_left, s_right,
+                                            s_left_count, s_right_count,
+                                            left, mid, right,
+                                            left_count, mid_count, right_count,
+                                            epsilon, compact_second_chunk,
+                                            s_compaction_list_exc,
+                                            is_active_second);
+            }
+            else
+            {
+
+                // re-write converged interval (has to be stored again because 
s_left
+                // and s_right are used as scratch space for
+                // computeNumSmallerEigenvalsLarge()
+                s_left[tid] = left;
+                s_right[tid] = left;
+                s_left_count[tid] = left_count;
+                s_right_count[tid] = right_count;
+
+                is_active_second = 0;
+            }
+        }
+
+        // necessary so that compact_second_chunk is up-to-date
+        __syncthreads();
+
+        // perform compaction of chunk where second children are stored
+        // scan of (num_threads_active / 2) elements, thus at most
+        // (num_threads_active / 4) threads are needed
+        if (compact_second_chunk > 0)
+        {
+
+            // create indices for compaction
+            createIndicesCompaction(s_compaction_list_exc, 
num_threads_compaction);
+        }
+        __syncthreads();
+
+        if (compact_second_chunk > 0)
+        {
+            compactIntervals(s_left, s_right, s_left_count, s_right_count,
+                             mid, right, mid_count, right_count,
+                             s_compaction_list, num_threads_active,
+                             is_active_second);
+        }
+
+        __syncthreads();
+
+        // update state variables
+        if (0 == tid)
+        {
+
+            // update number of active threads with result of reduction
+            num_threads_active += s_compaction_list[num_threads_active];
+            num_threads_compaction = ceilPow2(num_threads_active);
+
+            compact_second_chunk = 0;
+            all_threads_converged = 1;
+        }
+
+        __syncthreads();
+
+        if (num_threads_compaction > blockDim.x)
+        {
+            break;
+        }
+
+    }
+
+    __syncthreads();
+
+    // generate two lists of intervals; one with intervals that contain one
+    // eigenvalue (or are converged), and one with intervals that need further
+    // subdivision
+
+    // perform two scans in parallel
+
+    unsigned int left_count_2;
+    unsigned int right_count_2;
+
+    unsigned int tid_2 = tid + blockDim.x;
+
+    // cache in per thread registers so that s_left_count and s_right_count
+    // can be used for scans
+    left_count = s_left_count[tid];
+    right_count = s_right_count[tid];
+
+    // some threads have to cache data for two intervals
+    if (tid_2 < num_threads_active)
+    {
+        left_count_2 = s_left_count[tid_2];
+        right_count_2 = s_right_count[tid_2];
+    }
+
+    // compaction list for intervals containing one and multiple eigenvalues
+    // do not affect first element for exclusive scan
+    unsigned short  *s_cl_one = s_left_count + 1;
+    unsigned short  *s_cl_mult = s_right_count + 1;
+
+    // compaction list for generating blocks of intervals containing multiple
+    // eigenvalues
+    unsigned short  *s_cl_blocking = s_compaction_list_exc;
+    // helper compaction list for generating blocks of intervals
+    __shared__ unsigned short  s_cl_helper[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+
+    if (0 == tid)
+    {
+        // set to 0 for exclusive scan
+        s_left_count[0] = 0;
+        s_right_count[0] = 0;
+
+    }
+
+    __syncthreads();
+
+    // flag if interval contains one or multiple eigenvalues
+    unsigned int is_one_lambda = 0;
+    unsigned int is_one_lambda_2 = 0;
+
+    // number of eigenvalues in the interval
+    unsigned int multiplicity = right_count - left_count;
+    is_one_lambda = (1 == multiplicity);
+
+    s_cl_one[tid] = is_one_lambda;
+    s_cl_mult[tid] = (! is_one_lambda);
+
+    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
+    s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
+    s_cl_helper[tid] = 0;
+
+    if (tid_2 < num_threads_active)
+    {
+
+        unsigned int multiplicity = right_count_2 - left_count_2;
+        is_one_lambda_2 = (1 == multiplicity);
+
+        s_cl_one[tid_2] = is_one_lambda_2;
+        s_cl_mult[tid_2] = (! is_one_lambda_2);
+
+        // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
+        s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
+        s_cl_helper[tid_2] = 0;
+    }
+    else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
+    {
+
+        // clear
+        s_cl_blocking[tid_2] = 0;
+        s_cl_helper[tid_2] = 0;
+    }
+
+
+    scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
+                s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
+
+    __syncthreads();
+
+    scanSumBlocks(tid, tid_2, num_threads_active,
+                  num_threads_compaction, s_cl_blocking, s_cl_helper);
+
+    // end down sweep of scan
+    __syncthreads();
+
+    unsigned int  c_block_iend = 0;
+    unsigned int  c_block_iend_2 = 0;
+    unsigned int  c_sum_block = 0;
+    unsigned int  c_sum_block_2 = 0;
+
+    // for each thread / interval that corresponds to root node of interval 
block
+    // store start address of block and total number of eigenvalues in all 
blocks
+    // before this block (particular thread is irrelevant, constraint is to
+    // have a subset of threads so that one and only one of them is in each
+    // interval)
+    if (1 == s_cl_helper[tid])
+    {
+
+        c_block_iend = s_cl_mult[tid] + 1;
+        c_sum_block = s_cl_blocking[tid];
+    }
+
+    if (1 == s_cl_helper[tid_2])
+    {
+
+        c_block_iend_2 = s_cl_mult[tid_2] + 1;
+        c_sum_block_2 = s_cl_blocking[tid_2];
+    }
+
+    scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
+                                  s_cl_blocking, s_cl_helper);
+
+
+    // finished second scan for s_cl_blocking
+    __syncthreads();
+
+    // determine the global results
+    __shared__  unsigned int num_blocks_mult;
+    __shared__  unsigned int num_mult;
+    __shared__  unsigned int offset_mult_lambda;
+
+    if (0 == tid)
+    {
+
+        num_blocks_mult = s_cl_blocking[num_threads_active - 1];
+        offset_mult_lambda = s_cl_one[num_threads_active - 1];
+        num_mult = s_cl_mult[num_threads_active - 1];
+
+        *g_num_one = offset_mult_lambda;
+        *g_num_blocks_mult = num_blocks_mult;
+    }
+
+    __syncthreads();
+
+    NumericT left_2, right_2;
+    --s_cl_one;
+    --s_cl_mult;
+    --s_cl_blocking;
+
+    __syncthreads();
+    compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
+                        s_left, s_right, s_left_count, s_right_count,
+                        s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
+                        is_one_lambda, is_one_lambda_2,
+                        left, right, left_2, right_2,
+                        left_count, right_count, left_count_2, right_count_2,
+                        c_block_iend, c_sum_block, c_block_iend_2, 
c_sum_block_2
+                       );
+
+    __syncthreads();
+
+    // final adjustment before writing out data to global memory
+    if (0 == tid)
+    {
+        s_cl_blocking[num_blocks_mult] = num_mult;
+        s_cl_helper[0] = 0;
+    }
+
+    __syncthreads();
+
+    // write to global memory
+    writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
+                g_left_one, g_right_one, g_pos_one,
+                g_left_mult, g_right_mult, g_left_count_mult, 
g_right_count_mult,
+                s_left, s_right, s_left_count, s_right_count,
+                g_blocks_mult, g_blocks_mult_sum,
+                s_compaction_list, s_cl_helper, offset_mult_lambda);
+
+}
+}
+}
+}
+#endif // #ifndef _BISECT_KERNEL_LARGE_H_

http://git-wip-us.apache.org/repos/asf/mahout/blob/f7c1f802/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
new file mode 100755
index 0000000..a670256
--- /dev/null
+++ 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
@@ -0,0 +1,277 @@
+#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
+#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
+
+/* =========================================================================
+   Copyright (c) 2010-2016, Institute for Microelectronics,
+                            Institute for Analysis and Scientific Computing,
+                            TU Wien.
+   Portions of this software are copyright by UChicago Argonne, LLC.
+
+                            -----------------
+                  ViennaCL - The Vienna Computing Library
+                            -----------------
+
+   Project Head:    Karl Rupp                   [email protected]
+
+   (A list of authors and contributors can be found in the manual)
+
+   License:         MIT (X11), see file LICENSE in the base directory
+============================================================================= 
*/
+
+
+/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
+    @brief Second step of the bisection algorithm for the computation of 
eigenvalues for large matrices.
+
+    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
+    the creation of derivative works is allowed by including the following 
statement:
+    "This software contains source code provided by NVIDIA Corporation."
+*/
+
+/* Perform second step of bisection algorithm for large matrices for
+ * intervals that, after the first step, contained more than one eigenvalue
+ */
+
+// includes, project
+#include "viennacl/linalg/detail/bisect/config.hpp"
+#include "viennacl/linalg/detail/bisect/util.hpp"
+// additional kernel
+#include "viennacl/linalg/cuda/bisect_util.hpp"
+
+namespace viennacl
+{
+namespace linalg
+{
+namespace cuda
+{
+////////////////////////////////////////////////////////////////////////////////
+//! Perform second step of bisection algorithm for large matrices for
+//! intervals that after the first step contained more than one eigenvalue
+//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
+//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
+//! @param  n    matrix size
+//! @param  blocks_mult  start addresses of blocks of intervals that are
+//!                      processed by one block of threads, each of the
+//!                      intervals contains more than one eigenvalue
+//! @param  blocks_mult_sum  total number of eigenvalues / singleton intervals
+//!                          in one block of intervals
+//! @param  g_left  left limits of intervals
+//! @param  g_right  right limits of intervals
+//! @param  g_left_count  number of eigenvalues less than left limits
+//! @param  g_right_count  number of eigenvalues less than right limits
+//! @param  g_lambda  final eigenvalue
+//! @param  g_pos  index of eigenvalue (in ascending order)
+//! @param  precision  desired precision of eigenvalues
+////////////////////////////////////////////////////////////////////////////////
+template<typename NumericT>
+__global__
+void
+bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, 
const unsigned int n,
+                                unsigned int *blocks_mult,
+                                unsigned int *blocks_mult_sum,
+                                NumericT *g_left, NumericT *g_right,
+                                unsigned int *g_left_count,
+                                unsigned int *g_right_count,
+                                NumericT *g_lambda, unsigned int *g_pos,
+                                NumericT precision
+                               )
+{
+  // One thread block processes one block of intervals [c_block_start,
+  // c_block_end); each thread owns at most one interval per iteration.
+  const unsigned int tid = threadIdx.x;
+
+    // left and right limits of interval
+    __shared__  NumericT  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+    __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+    // number of eigenvalues smaller than interval limits
+    __shared__  unsigned int  s_left_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK];
+    __shared__  unsigned int  s_right_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK];
+
+    // helper array for chunk compaction of second chunk
+    __shared__  unsigned int  s_compaction_list[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
+    // compaction list helper for exclusive scan
+    unsigned int *s_compaction_list_exc = s_compaction_list + 1;
+
+    // flag if all threads are converged
+    __shared__  unsigned int  all_threads_converged;
+    // number of active threads
+    __shared__  unsigned int  num_threads_active;
+    // number of threads to employ for compaction
+    __shared__  unsigned int  num_threads_compaction;
+    // flag if second chunk has to be compacted
+    __shared__  unsigned int compact_second_chunk;
+
+    // parameters of block of intervals processed by this block of threads
+    __shared__  unsigned int  c_block_start;
+    __shared__  unsigned int  c_block_end;
+    __shared__  unsigned int  c_block_offset_output;
+
+    // midpoint of currently active interval of the thread
+    NumericT mid = 0.0f;
+    // number of eigenvalues smaller than \a mid
+    unsigned int  mid_count = 0;
+    // current interval parameter
+    NumericT  left = 0.0f;
+    NumericT  right = 0.0f;
+    unsigned int  left_count = 0;
+    unsigned int  right_count = 0;
+    // helper for compaction, keep track which threads have a second child
+    unsigned int  is_active_second = 0;
+
+
+    __syncthreads();
+    // initialize common start conditions
+    if (0 == tid)
+    {
+
+        c_block_start = blocks_mult[blockIdx.x];
+        c_block_end = blocks_mult[blockIdx.x + 1];
+        c_block_offset_output = blocks_mult_sum[blockIdx.x];
+
+
+        num_threads_active = c_block_end - c_block_start;
+        s_compaction_list[0] = 0;
+        num_threads_compaction = ceilPow2(num_threads_active);
+
+        all_threads_converged = 1;
+        compact_second_chunk = 0;
+    }
+
+     // NOTE(review): 42 looks like a debug sentinel; entries for active
+     // threads are overwritten from global memory below, and the upper
+     // halves are zeroed here.
+     s_left_count [tid] = 42;
+     s_right_count[tid] = 42;
+     s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+     s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+
+    __syncthreads();
+
+
+    // read data into shared memory
+    if (tid < num_threads_active)
+    {
+        s_left[tid]  = g_left[c_block_start + tid];
+        s_right[tid] = g_right[c_block_start + tid];
+        s_left_count[tid]  = g_left_count[c_block_start + tid];
+        s_right_count[tid] = g_right_count[c_block_start + tid];
+    }
+
+    __syncthreads();
+    // NOTE(review): 'iter' is only incremented, never read — debug leftover.
+    unsigned int iter = 0;
+    // do until all threads converged
+    while (true)
+    {
+        iter++;
+        //for (int iter=0; iter < 0; iter++) {
+        // reset the compaction list before this iteration's subdivision
+        s_compaction_list[threadIdx.x] = 0;
+        s_compaction_list[threadIdx.x + blockDim.x] = 0;
+        s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
+
+        // subdivide interval if currently active and not already converged
+        subdivideActiveIntervalMulti(tid, s_left, s_right,
+                                s_left_count, s_right_count,
+                                num_threads_active,
+                                left, right, left_count, right_count,
+                                mid, all_threads_converged);
+        __syncthreads();
+
+        // stop if all eigenvalues have been found
+        // (all threads read the same shared flag, so this break is uniform
+        // and cannot desynchronize the __syncthreads() calls below)
+        if (1 == all_threads_converged)
+        {
+
+            break;
+        }
+
+        // compute number of eigenvalues smaller than mid for active and not
+        // converged intervals, use all threads for loading data from gmem and
+        // s_left and s_right as scratch space to store the data load from gmem
+        // in shared memory
+        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
+                                                    mid, tid, 
num_threads_active,
+                                                    s_left, s_right,
+                                                    (left == right));
+
+        __syncthreads();
+
+        if (tid < num_threads_active)
+        {
+
+            // store intervals
+            if (left != right)
+            {
+
+                storeNonEmptyIntervals(tid, num_threads_active,
+                                       s_left, s_right, s_left_count, 
s_right_count,
+                                       left, mid, right,
+                                       left_count, mid_count, right_count,
+                                       precision, compact_second_chunk,
+                                       s_compaction_list_exc,
+                                       is_active_second);
+
+            }
+            else
+            {
+
+                // converged interval: re-store it (s_left / s_right were used
+                // as scratch space by computeNumSmallerEigenvalsLarge above)
+                storeIntervalConverged(s_left, s_right, s_left_count, 
s_right_count,
+                                       left, mid, right,
+                                       left_count, mid_count, right_count,
+                                       s_compaction_list_exc, 
compact_second_chunk,
+                                       num_threads_active,
+                                       is_active_second);
+
+            }
+        }
+
+        __syncthreads();
+
+        // compact second chunk of intervals if any of the threads generated
+        // two child intervals
+        if (1 == compact_second_chunk)
+        {
+
+            createIndicesCompaction(s_compaction_list_exc, 
num_threads_compaction);
+            compactIntervals(s_left, s_right, s_left_count, s_right_count,
+                             mid, right, mid_count, right_count,
+                             s_compaction_list, num_threads_active,
+                             is_active_second);
+        }
+
+        __syncthreads();
+
+        // update state variables
+        if (0 == tid)
+        {
+            // grow active-thread count by the number of new second children
+            // (total produced by the exclusive-scan compaction above)
+            num_threads_active += s_compaction_list[num_threads_active];
+            num_threads_compaction = ceilPow2(num_threads_active);
+
+            compact_second_chunk = 0;
+            all_threads_converged = 1;
+        }
+
+        __syncthreads();
+
+        // clear
+        s_compaction_list_exc[threadIdx.x] = 0;
+        s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
+
+        // more intervals than this block can handle: stop subdividing.
+        // Uniform shared value, so all threads break together; note the exit
+        // happens before the final __syncthreads() of the iteration.
+        // NOTE(review): presumably the remaining unconverged intervals are
+        // dealt with by the host / another pass — confirm against callers.
+        if (num_threads_compaction > blockDim.x)
+        {
+          break;
+        }
+
+
+        __syncthreads();
+
+    }  // end until all threads converged
+
+    // write data back to global memory
+    // (per the doc header: g_lambda receives the eigenvalue — the left limit
+    // of the converged singleton interval — and g_pos its ascending index,
+    // taken from the right count)
+    if (tid < num_threads_active)
+    {
+
+        unsigned int addr = c_block_offset_output + tid;
+
+        g_lambda[addr]  = s_left[tid];
+        g_pos[addr]   = s_right_count[tid];
+    }
+}
+} // namespace cuda
+} // namespace linalg
+} // namespace viennacl
+
+#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_

Reply via email to