http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
deleted file mode 100644
index 65b323e..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/amg_operations.hpp
+++ /dev/null
@@ -1,821 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
-#define VIENNACL_LINALG_CUDA_AMG_OPERATIONS_HPP
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the PDF manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-/** @file cuda/amg_operations.hpp
-    @brief Implementations of routines for AMG in CUDA.
-*/
-
#include <cmath>
#include <cstdlib>
#include <map>
#include <set>
#include <stdexcept>

#include "viennacl/linalg/detail/amg/amg_base.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-namespace amg
-{
-
-
-///////////////////////////////////////////
-
/** @brief CUDA kernel copying the CSR sparsity pattern of A into the influence arrays (every connection treated as strong).
*
*  Grid-stride loops: correct for any launch configuration.
*/
__global__ void amg_influence_trivial_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          unsigned int size1,
          unsigned int nnz,
          unsigned int *influences_row,
          unsigned int *influences_id,
          unsigned int *influences_values
          )
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  // Row pointers carry over; the per-row influence count is simply the row length:
  for (unsigned int row = gid; row < size1; row += stride)
  {
    unsigned int row_start   = row_indices[row];
    influences_row[row]      = row_start;
    influences_values[row]   = row_indices[row + 1] - row_start;
  }

  // Column indices carry over unchanged:
  for (unsigned int idx = gid; idx < nnz; idx += stride)
    influences_id[idx] = column_indices[idx];

  // CSR finalizer entry (written once):
  if (gid == 0)
    influences_row[size1] = row_indices[size1];
}
-
-
/** @brief Routine for taking all connections in the matrix as strong
*
* Copies the sparsity pattern of A into the influence arrays of amg_context via a single kernel launch.
*
* @param A             Operator matrix (CSR)
* @param amg_context   AMG hierarchy datastructures receiving the influence information
* @param tag           AMG preconditioner tag (unused here)
*/
template<typename NumericT>
void amg_influence_trivial(compressed_matrix<NumericT> const & A,
                           viennacl::linalg::detail::amg::amg_level_context & amg_context,
                           viennacl::linalg::amg_tag & tag)
{
  (void)tag;

  amg_influence_trivial_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                             viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                             static_cast<unsigned int>(A.size1()),
                                             static_cast<unsigned int>(A.nnz()),
                                             viennacl::cuda_arg(amg_context.influence_jumper_),
                                             viennacl::cuda_arg(amg_context.influence_ids_),
                                             viennacl::cuda_arg(amg_context.influence_values_)
                                            );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_influence_trivial_kernel");
}
-
-
/** @brief Routine for extracting strongly connected points considering a user-provided threshold value
*
* Not yet available with the CUDA backend.
*
* @param A             Operator matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag carrying the strength threshold
* @throws std::runtime_error always (not implemented for CUDA)
*/
template<typename NumericT>
void amg_influence_advanced(compressed_matrix<NumericT> const & A,
                            viennacl::linalg::detail::amg::amg_level_context & amg_context,
                            viennacl::linalg::amg_tag & tag)
{
  // Silence unused-parameter warnings (consistent with the (void)tag convention used elsewhere in this file):
  (void)A;
  (void)amg_context;
  (void)tag;
  throw std::runtime_error("not implemented yet");
}
-
/** @brief Dispatcher for influence processing
*
* Currently always delegates to the trivial variant (all connections strong).
*
* @param A             Operator matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
*/
template<typename NumericT>
void amg_influence(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{
  // TODO: dispatch based on influence tolerance provided
  amg_influence_trivial(A, amg_context, tag);
}
-
/** @brief Assign IDs to coarse points.
*
*  Performed on the host: point types and coarse IDs are read back, an exclusive
*  scan over the coarse flags assigns consecutive IDs 0, 1, 2, ... to coarse
*  points, and the result is written back to the device.
*
*  TODO: Use exclusive_scan on GPU for this.
*/
inline void enumerate_coarse_points(viennacl::linalg::detail::amg::amg_level_context & amg_context)
{
  // Host-side staging buffers and device -> host transfer:
  viennacl::backend::typesafe_host_array<unsigned int> point_types(amg_context.point_types_.handle(), amg_context.point_types_.size());
  viennacl::backend::typesafe_host_array<unsigned int> coarse_ids(amg_context.coarse_id_.handle(),    amg_context.coarse_id_.size());
  viennacl::backend::memory_read(amg_context.point_types_.handle(), 0, point_types.raw_size(), point_types.get());
  viennacl::backend::memory_read(amg_context.coarse_id_.handle(),   0, coarse_ids.raw_size(),  coarse_ids.get());

  // Exclusive scan: every point stores the number of coarse points preceding it.
  unsigned int coarse_id = 0;
  for (std::size_t i=0; i<amg_context.point_types_.size(); ++i)
  {
    coarse_ids.set(i, coarse_id);
    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      ++coarse_id;
  }

  // Total number of coarse points is the final running count:
  amg_context.num_coarse_ = coarse_id;

  // Host -> device transfer of the enumerated IDs:
  viennacl::backend::memory_write(amg_context.coarse_id_.handle(), 0, coarse_ids.raw_size(), coarse_ids.get());
}
-
-//////////////////////////////////////
-
/** @brief CUDA kernel initializing the work vectors at each PMIS iteration
*
*  Maps point types to PMIS states (fine: 0, undecided: 1, coarse: 2) and attaches
*  the per-node random weight and the node index for later tie-breaking.
*/
template<typename IndexT>
__global__ void amg_pmis2_init_workdata(IndexT *work_state,
                                        IndexT *work_random,
                                        IndexT *work_index,
                                        IndexT const *point_types,
                                        IndexT const *random_weights,
                                        unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int i = gid; i < size; i += stride)
  {
    IndexT type = point_types[i];
    if (type == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      work_state[i] = 1;
    else if (type == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE)
      work_state[i] = 0;
    else if (type == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      work_state[i] = 2;
    // any other value: leave work_state[i] untouched (matches previous 'default: do nothing')

    work_random[i] = random_weights[i];
    work_index[i]  = i;
  }
}
-
/** @brief CUDA kernel propagating the state triple (status, weight, nodeID) to neighbors using a max()-operation
*
*  For each node, the lexicographic maximum of (state, random, index) over the node
*  itself and its influence neighborhood is written to the second set of work arrays.
*/
template<typename IndexT>
__global__ void amg_pmis2_max_neighborhood(IndexT const *work_state,
                                           IndexT const *work_random,
                                           IndexT const *work_index,
                                           IndexT       *work_state2,
                                           IndexT       *work_random2,
                                           IndexT       *work_index2,
                                           IndexT const *influences_row,
                                           IndexT const *influences_id,
                                           unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int i = gid; i < size; i += stride)
  {
    // start from node i's own triple:
    unsigned int best_state  = work_state[i];
    unsigned int best_random = work_random[i];
    unsigned int best_index  = work_index[i];

    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int nbr = influences_id[j];
      unsigned int s   = work_state[nbr];
      unsigned int r   = work_random[nbr];
      unsigned int x   = work_index[nbr];

      // strict lexicographic "greater than" on the triple (state, random, index):
      bool neighbor_wins =  (s >  best_state)
                         || (s == best_state && r >  best_random)
                         || (s == best_state && r == best_random && x > best_index);
      if (neighbor_wins)
      {
        best_state  = s;
        best_random = r;
        best_index  = x;
      }
    }

    // publish the neighborhood maximum:
    work_state2[i]  = best_state;
    work_random2[i] = best_random;
    work_index2[i]  = best_index;
  }
}
-
/** @brief CUDA kernel for marking MIS and non-MIS nodes
*
*  A previously undecided node becomes coarse if it is its own neighborhood maximum,
*  fine if a coarse node (state 2) dominates it, and stays undecided otherwise.
*  Each block writes its count of still-undecided nodes to undecided_buffer[blockIdx.x].
*
*  NOTE(review): the shared-memory reduction assumes blockDim.x <= 256 and a
*  power-of-two block size; callers in this file launch with <<<128, 128>>>.
*/
template<typename IndexT>
__global__ void amg_pmis2_mark_mis_nodes(IndexT const *work_state,
                                         IndexT const *work_index,
                                         IndexT *point_types,
                                         IndexT *undecided_buffer,
                                         unsigned int size)
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  // per-thread count of nodes that remain undecided after this pass:
  unsigned int num_undecided = 0;
  for (unsigned int i = global_id; i < size; i += global_size)
  {
    unsigned int max_state  = work_state[i];
    unsigned int max_index  = work_index[i];

    if (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
    {
      if (i == max_index) // make this a MIS node
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE;
      else if (max_state == 2) // mind the mapping of viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE above!
        point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
      else
        num_undecided += 1;
    }
  }

  // reduction of the number of undecided nodes:
  __shared__ unsigned int shared_buffer[256];
  shared_buffer[threadIdx.x] = num_undecided;
  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
  {
    // barrier before each reduction step so all writes of the previous step are visible:
    __syncthreads();
    if (threadIdx.x < stride)
      shared_buffer[threadIdx.x] += shared_buffer[threadIdx.x+stride];
  }

  // thread 0 performed the final (stride == 1) addition itself, so no extra barrier is needed here:
  if (threadIdx.x == 0)
    undecided_buffer[blockIdx.x] = shared_buffer[0];

}
-
/** @brief CUDA kernel for resetting non-MIS (i.e. coarse) points to undecided so that subsequent kernels work */
__global__ void amg_pmis2_reset_state(unsigned int *point_types, unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int i = gid; i < size; i += stride)
  {
    // only coarse points keep their classification; everything else is reset:
    bool is_coarse = (point_types[i] == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE);
    if (!is_coarse)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED;
  }
}
-
/** @brief AG (aggregation based) coarsening, single-threaded version of stage 1
*
* Runs PMIS iterations on the device: initialize work triples, propagate the
* neighborhood maximum twice, then classify nodes, until no undecided nodes remain.
*
* @param A             Operator matrix on all levels
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag (unused here)
*/
template<typename NumericT>
void amg_coarse_ag_stage1_mis2(compressed_matrix<NumericT> const & A,
                               viennacl::linalg::detail::amg::amg_level_context & amg_context,
                               viennacl::linalg::amg_tag & tag)
{
  (void)tag; // consistent with the convention used elsewhere in this file

  // Random tie-breaking weights, filled on the host and then moved to the device context of A:
  viennacl::vector<unsigned int> random_weights(A.size1(), viennacl::context(viennacl::MAIN_MEMORY));
  unsigned int *random_weights_ptr = viennacl::linalg::host_based::detail::extract_raw_pointer<unsigned int>(random_weights.handle());
  for (std::size_t i=0; i<random_weights.size(); ++i)
    random_weights_ptr[i] = static_cast<unsigned int>(rand()) % static_cast<unsigned int>(A.size1());
  random_weights.switch_memory_context(viennacl::traits::context(A));

  // work vectors:
  viennacl::vector<unsigned int> work_state(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index(A.size1(),  viennacl::traits::context(A));

  viennacl::vector<unsigned int> work_state2(A.size1(),  viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_random2(A.size1(), viennacl::traits::context(A));
  viennacl::vector<unsigned int> work_index2(A.size1(),  viennacl::traits::context(A));

  unsigned int num_undecided = static_cast<unsigned int>(A.size1());
  // one partial count per CUDA block; kernels below launch 128 blocks, so 256 entries suffice:
  viennacl::vector<unsigned int> undecided_buffer(256, viennacl::traits::context(A));
  viennacl::backend::typesafe_host_array<unsigned int> undecided_buffer_host(undecided_buffer.handle(), undecided_buffer.size());

  while (num_undecided > 0)
  {
    //
    // init temporary work data:
    //
    amg_pmis2_init_workdata<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                          viennacl::cuda_arg(work_random),
                                          viennacl::cuda_arg(work_index),
                                          viennacl::cuda_arg(amg_context.point_types_),
                                          viennacl::cuda_arg(random_weights),
                                          static_cast<unsigned int>(A.size1())
                                         );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_init_workdata"); // fixed: label previously named the wrong kernel


    //
    // Propagate maximum tuple twice
    //
    for (unsigned int r = 0; r < 2; ++r)
    {
      // max operation over neighborhood
      amg_pmis2_max_neighborhood<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                               viennacl::cuda_arg(work_random),
                                               viennacl::cuda_arg(work_index),
                                               viennacl::cuda_arg(work_state2),
                                               viennacl::cuda_arg(work_random2),
                                               viennacl::cuda_arg(work_index2),
                                               viennacl::cuda_arg(amg_context.influence_jumper_),
                                               viennacl::cuda_arg(amg_context.influence_ids_),
                                               static_cast<unsigned int>(A.size1())
                                              );
      VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_max_neighborhood");

      // copy work array (can be fused into a single kernel if needed. Previous kernel is in most cases sufficiently heavy)
      work_state  = work_state2;
      work_random = work_random2;
      work_index  = work_index2;
    }

    //
    // mark MIS and non-MIS nodes:
    //
    amg_pmis2_mark_mis_nodes<<<128, 128>>>(viennacl::cuda_arg(work_state),
                                           viennacl::cuda_arg(work_index),
                                           viennacl::cuda_arg(amg_context.point_types_),
                                           viennacl::cuda_arg(undecided_buffer),
                                           static_cast<unsigned int>(A.size1())
                                          );
    VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_mark_mis_nodes"); // fixed: label previously named the wrong kernel

    // get number of undecided points on host:
    viennacl::backend::memory_read(undecided_buffer.handle(),   0, undecided_buffer_host.raw_size(),  undecided_buffer_host.get());
    num_undecided = 0;
    for (std::size_t i=0; i<undecided_buffer.size(); ++i)
      num_undecided += undecided_buffer_host[i];

  } //while

  // consistency with sequential MIS: reset state for non-coarse points, so that coarse indices are correctly picked up later
  amg_pmis2_reset_state<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                      static_cast<unsigned int>(amg_context.point_types_.size())
                                     );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_pmis2_reset_state");
}
-
-
-
-
-
/** @brief CUDA kernel propagating each coarse point's aggregate ID to its influence neighborhood,
*         marking the influenced points as fine.
*/
template<typename IndexT>
__global__ void amg_agg_propagate_coarse_indices(IndexT       *point_types,
                                                 IndexT       *coarse_ids,
                                                 IndexT const *influences_row,
                                                 IndexT const *influences_id,
                                                 unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int i = gid; i < size; i += stride)
  {
    // only coarse points propagate their aggregate index:
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_COARSE)
      continue;

    unsigned int my_aggregate = coarse_ids[i];

    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int neighbor = influences_id[j];
      coarse_ids[neighbor] = my_aggregate; // assign neighbor to this aggregate

      if (neighbor != i) // Note: Any write races between threads are harmless here
        point_types[neighbor] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
    }
  }
}
-
-
/** @brief CUDA kernel merging still-undecided points into the aggregate of the first
*         decided (coarse or fine) neighbor found in their influence list.
*/
template<typename IndexT>
__global__ void amg_agg_merge_undecided(IndexT       *point_types,
                                        IndexT       *coarse_ids,
                                        IndexT const *influences_row,
                                        IndexT const *influences_id,
                                        unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int i = gid; i < size; i += stride)
  {
    if (point_types[i] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      continue;

    unsigned int j_stop = influences_row[i + 1];
    for (unsigned int j = influences_row[i]; j < j_stop; ++j)
    {
      unsigned int neighbor = influences_id[j];
      // adopt the aggregate of the first neighbor that is already decided (coarse or fine):
      if (point_types[neighbor] != viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      {
        coarse_ids[i] = coarse_ids[neighbor];
        break;
      }
    }
  }
}
-
-
/** @brief CUDA kernel turning any remaining undecided points into fine points
*         (their coarse ID was already assigned by amg_agg_merge_undecided).
*/
__global__ void amg_agg_merge_undecided_2(unsigned int *point_types,
                                          unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int i = gid; i < size; i += stride)
  {
    unsigned int t = point_types[i];
    if (t == viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_UNDECIDED)
      point_types[i] = viennacl::linalg::detail::amg::amg_level_context::POINT_TYPE_FINE;
  }
}
-
-
/** @brief AG (aggregation based) coarsening. Partially single-threaded version (VIENNACL_AMG_COARSE_AG)
*
* Pipeline: influence extraction -> MIS-2 aggregate seeding -> coarse ID enumeration
* -> index propagation -> merging of leftover undecided points. The stage order matters;
* each kernel consumes the state produced by the previous one.
*
* @param A             Operator matrix
* @param amg_context   AMG hierarchy datastructures
* @param tag           AMG preconditioner tag
* @throws std::runtime_error for coarsening methods other than MIS2 aggregation
*/
template<typename NumericT>
void amg_coarse_ag(compressed_matrix<NumericT> const & A,
                   viennacl::linalg::detail::amg::amg_level_context & amg_context,
                   viennacl::linalg::amg_tag & tag)
{

  amg_influence_trivial(A, amg_context, tag);

  //
  // Stage 1: Build aggregates:
  //
  if (tag.get_coarsening_method() == viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION)
    amg_coarse_ag_stage1_mis2(A, amg_context, tag);
  else
    throw std::runtime_error("Only MIS2 coarsening implemented. Selected coarsening not available with CUDA backend!");

  // Host-side exclusive scan assigning consecutive IDs to coarse points:
  viennacl::linalg::cuda::amg::enumerate_coarse_points(amg_context);

  //
  // Stage 2: Propagate coarse aggregate indices to neighbors:
  //
  amg_agg_propagate_coarse_indices<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                                 viennacl::cuda_arg(amg_context.coarse_id_),
                                                 viennacl::cuda_arg(amg_context.influence_jumper_),
                                                 viennacl::cuda_arg(amg_context.influence_ids_),
                                                 static_cast<unsigned int>(A.size1())
                                                );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_propagate_coarse_indices");


  //
  // Stage 3: Merge remaining undecided points (merging to first aggregate found when cycling over the hierarchy
  //
  amg_agg_merge_undecided<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                        viennacl::cuda_arg(amg_context.coarse_id_),
                                        viennacl::cuda_arg(amg_context.influence_jumper_),
                                        viennacl::cuda_arg(amg_context.influence_ids_),
                                        static_cast<unsigned int>(A.size1())
                                       );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided");

  //
  // Stage 4: Set undecided points to fine points (coarse ID already set in Stage 3)
  //          Note: Stage 3 and Stage 4 were initially fused, but are now split in order to avoid race conditions (or a fallback to sequential execution).
  //
  amg_agg_merge_undecided_2<<<128, 128>>>(viennacl::cuda_arg(amg_context.point_types_),
                                         static_cast<unsigned int>(A.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_agg_merge_undecided_2");
}
-
-
-
-
/** @brief Calls the right coarsening procedure
*
* @param A            Operator matrix on all levels
* @param amg_context  AMG hierarchy datastructures
* @param tag          AMG preconditioner tag
* @throws std::runtime_error for coarsening methods not implemented in the CUDA backend
*/
template<typename InternalT1>
void amg_coarse(InternalT1 & A,
                viennacl::linalg::detail::amg::amg_level_context & amg_context,
                viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_coarsening_method())
  {
  case viennacl::linalg::AMG_COARSENING_METHOD_MIS2_AGGREGATION: amg_coarse_ag(A, amg_context, tag); break;
  default: throw std::runtime_error("not implemented yet");
  }
}
-
-
-
-
-////////////////////////////////////// Interpolation 
/////////////////////////////
-
/** @brief CUDA kernel assembling the aggregation-based prolongation operator P:
*         one unit entry per row, in the column of the row's aggregate.
*/
template<typename NumericT>
__global__ void amg_interpol_ag_kernel(unsigned int *P_row_buffer,
                                       unsigned int *P_col_buffer,
                                       NumericT *P_elements,
                                       unsigned int *coarse_ids,
                                       unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  // exactly one nonzero per row, so row pointer == row index:
  for (unsigned int row = gid; row < size; row += stride)
  {
    P_row_buffer[row] = row;
    P_col_buffer[row] = coarse_ids[row];
    P_elements[row]   = NumericT(1);
  }

  // set last entry as well:
  if (gid == 0)
    P_row_buffer[size] = size;
}
-
/** @brief AG (aggregation based) interpolation. Multi-Threaded! (VIENNACL_INTERPOL_SA)
 *
 * Builds P of size (A.size1() x num_coarse) with exactly one unit entry per row,
 * placed in the column of the row's aggregate ID.
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures (provides num_coarse_ and coarse_id_)
 * @param tag          AMG configuration tag (unused here)
*/
template<typename NumericT>
void amg_interpol_ag(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  (void)tag;
  // P has one nonzero per row, hence nnz == A.size1():
  P = compressed_matrix<NumericT>(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  amg_interpol_ag_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(P.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(P.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(P.handle().cuda_handle()),
                                       viennacl::cuda_arg(amg_context.coarse_id_),
                                       static_cast<unsigned int>(A.size1())
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_ag_kernel");

  P.generate_row_block_information();
}
-
-
-
/** @brief CUDA kernel assembling the damped Jacobi operator (I - omega * D^{-1} * A)
*         with the same sparsity pattern as A, used for smoothed-aggregation interpolation.
*
*  NOTE(review): if a row of A has no diagonal entry, diag stays 0 and the off-diagonal
*  entries divide by zero — assumes every row carries a diagonal entry; verify for the
*  matrices fed into this kernel.
*/
template<typename NumericT>
__global__ void amg_interpol_sa_kernel(
          const unsigned int *A_row_indices,
          const unsigned int *A_col_indices,
          const NumericT     *A_elements,
          unsigned int A_size1,
          unsigned int A_nnz,
          unsigned int *Jacobi_row_indices,
          unsigned int *Jacobi_col_indices,
          NumericT     *Jacobi_elements,
          NumericT     omega
          )
{
  unsigned int global_id   = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int global_size = gridDim.x * blockDim.x;

  for (unsigned int row = global_id; row < A_size1; row += global_size)
  {
    unsigned int row_begin = A_row_indices[row];
    unsigned int row_end   = A_row_indices[row+1];

    // Jacobi inherits A's row pointers (same sparsity pattern):
    Jacobi_row_indices[row] = row_begin;

    // Step 1: Extract diagonal:
    NumericT diag = 0;
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      if (A_col_indices[j] == row)
      {
        diag = A_elements[j];
        break;
      }
    }

    // Step 2: Write entries:
    for (unsigned int j = row_begin; j < row_end; ++j)
    {
      unsigned int col_index = A_col_indices[j];
      Jacobi_col_indices[j] = col_index;

      // diagonal entry: 1 - omega; off-diagonal: -omega * a_ij / a_ii
      if (col_index == row)
        Jacobi_elements[j] = NumericT(1) - omega;
      else
        Jacobi_elements[j] = - omega * A_elements[j] / diag;
    }
  }

  if (global_id == 0)
    Jacobi_row_indices[A_size1] = A_nnz; // don't forget finalizer
}
-
-
-
/** @brief Smoothed aggregation interpolation. (VIENNACL_INTERPOL_SA)
 *
 * Builds P = (I - omega * D^{-1} * A) * P_tentative, where P_tentative is the
 * plain aggregation operator and omega is the Jacobi damping weight from the tag.
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag (provides the Jacobi weight)
*/
template<typename NumericT>
void amg_interpol_sa(compressed_matrix<NumericT> const & A,
                     compressed_matrix<NumericT> & P,
                     viennacl::linalg::detail::amg::amg_level_context & amg_context,
                     viennacl::linalg::amg_tag & tag)
{
  // (removed stale '(void)tag;' — tag is used below via tag.get_jacobi_weight())
  viennacl::compressed_matrix<NumericT> P_tentative(A.size1(), amg_context.num_coarse_, A.size1(), viennacl::traits::context(A));

  // form tentative operator:
  amg_interpol_ag(A, P_tentative, amg_context, tag);

  // damped Jacobi operator with the same sparsity pattern as A:
  viennacl::compressed_matrix<NumericT> Jacobi(A.size1(), A.size1(), A.nnz(), viennacl::traits::context(A));

  amg_interpol_sa_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                       static_cast<unsigned int>(A.size1()),
                                       static_cast<unsigned int>(A.nnz()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle1().cuda_handle()),
                                       viennacl::cuda_arg<unsigned int>(Jacobi.handle2().cuda_handle()),
                                       viennacl::cuda_arg<NumericT>(Jacobi.handle().cuda_handle()),
                                       NumericT(tag.get_jacobi_weight())
                                      );
  VIENNACL_CUDA_LAST_ERROR_CHECK("amg_interpol_sa_kernel");

  // smooth the tentative prolongation:
  P = viennacl::linalg::prod(Jacobi, P_tentative);

  P.generate_row_block_information();
}
-
-
/** @brief Dispatcher for building the interpolation matrix
 *
 * @param A            Operator matrix
 * @param P            Prolongation matrix
 * @param amg_context  AMG hierarchy datastructures
 * @param tag          AMG configuration tag
 * @throws std::runtime_error for interpolation methods not implemented in the CUDA backend
*/
template<typename MatrixT>
void amg_interpol(MatrixT const & A,
                  MatrixT & P,
                  viennacl::linalg::detail::amg::amg_level_context & amg_context,
                  viennacl::linalg::amg_tag & tag)
{
  switch (tag.get_interpolation_method())
  {
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_AGGREGATION:          amg_interpol_ag     (A, P, amg_context, tag); break;
  case viennacl::linalg::AMG_INTERPOLATION_METHOD_SMOOTHED_AGGREGATION: amg_interpol_sa     (A, P, amg_context, tag); break;
  default: throw std::runtime_error("Not implemented yet!");
  }
}
-
-
/** @brief CUDA kernel scattering the nonzeros of a CSR matrix into a (possibly strided)
*         dense matrix buffer B. Entries of B not covered by the sparsity pattern are left untouched.
*/
template<typename NumericT>
__global__ void compressed_matrix_assign_to_dense(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT *B,
          unsigned int B_row_start,     unsigned int B_col_start,
          unsigned int B_row_inc,       unsigned int B_col_inc,
          unsigned int B_row_size,      unsigned int B_col_size,
          unsigned int B_internal_rows, unsigned int B_internal_cols)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int row = gid; row < B_row_size; row += stride)
  {
    unsigned int dense_row_offset = (B_row_start + row * B_row_inc) * B_internal_cols;
    unsigned int row_end          = row_indices[row + 1];

    for (unsigned int j = row_indices[row]; j < row_end; j++)
    {
      unsigned int dense_col = B_col_start + column_indices[j] * B_col_inc;
      B[dense_row_offset + dense_col] = elements[j];
    }
  }
}
-
-
/** @brief Writes the nonzeros of the sparse matrix A into the dense matrix B.
*
* @param A  Source matrix in CSR format
* @param B  Destination dense matrix (start/stride/size taken from its traits)
*/
template<typename NumericT, unsigned int AlignmentV>
void assign_to_dense(viennacl::compressed_matrix<NumericT, AlignmentV> const & A,
                     viennacl::matrix_base<NumericT> & B)
{
  compressed_matrix_assign_to_dense<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                  viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                  viennacl::cuda_arg<NumericT>(B),
                                                  static_cast<unsigned int>(viennacl::traits::start1(B)),           static_cast<unsigned int>(viennacl::traits::start2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::stride1(B)),          static_cast<unsigned int>(viennacl::traits::stride2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::size1(B)),            static_cast<unsigned int>(viennacl::traits::size2(B)),
                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(B)),   static_cast<unsigned int>(viennacl::traits::internal_size2(B))
                                                 );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_assign_to_dense");
}
-
-
-
-
/** @brief CUDA kernel performing one damped Jacobi sweep:
*         x_new = weight * D^{-1} * (rhs - (A - D) * x_old) + (1 - weight) * x_old.
*
*  Reads x_old and writes x_new; the two buffers must not alias.
*/
template<typename NumericT>
__global__ void compressed_matrix_smooth_jacobi_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT weight,
          const NumericT * x_old,
          NumericT * x_new,
          const NumericT * rhs,
          unsigned int size)
{
  unsigned int gid    = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int stride = gridDim.x * blockDim.x;

  for (unsigned int row = gid; row < size; row += stride)
  {
    NumericT off_diag_sum = NumericT(0);
    NumericT diag         = NumericT(1); // fallback if no diagonal entry is present

    unsigned int row_end = row_indices[row + 1];
    for (unsigned int j = row_indices[row]; j < row_end; ++j)
    {
      unsigned int col = column_indices[j];
      if (col == row)
        diag = elements[j];
      else
        off_diag_sum += elements[j] * x_old[col];
    }

    // damped Jacobi update:
    x_new[row] = weight * (rhs[row] - off_diag_sum) / diag + (NumericT(1) - weight) * x_old[row];
  }
}
-
-
-
-
/** @brief Damped Jacobi Smoother (CUDA version)
*
* Each iteration snapshots x into x_backup and then writes the updated values into x,
* so the sweep reads a consistent old state.
*
* @param iterations  Number of smoother iterations
* @param A           Operator matrix for the smoothing
* @param x           The vector smoothing is applied to
* @param x_backup    (Different) Vector holding the same values as x
* @param rhs_smooth  The right hand side of the equation for the smoother
* @param weight      Damping factor. 0: No effect of smoother. 1: Undamped Jacobi iteration
*/
template<typename NumericT>
void smooth_jacobi(unsigned int iterations,
                   compressed_matrix<NumericT> const & A,
                   vector<NumericT> & x,
                   vector<NumericT> & x_backup,
                   vector<NumericT> const & rhs_smooth,
                   NumericT weight)
{
  for (unsigned int i=0; i<iterations; ++i)
  {
    // keep the previous iterate so the kernel can read x_old while writing x:
    x_backup = x;

    compressed_matrix_smooth_jacobi_kernel<<<128, 128>>>(viennacl::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                         viennacl::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                         viennacl::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                         static_cast<NumericT>(weight),
                                                         viennacl::cuda_arg(x_backup),
                                                         viennacl::cuda_arg(x),
                                                         viennacl::cuda_arg(rhs_smooth),
                                                         static_cast<unsigned int>(rhs_smooth.size())
                                                        );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_smooth_jacobi_kernel");
  }
}
-
-
-} //namespace amg
-} //namespace cuda
-} //namespace linalg
-} //namespace viennacl
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
deleted file mode 100644
index 39f0015..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_calls.hpp
+++ /dev/null
@@ -1,166 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
-#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_CALLS_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/cuda/bisect_kernel_calls.hpp
-    @brief CUDA kernel calls for the bisection algorithm
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-#include "viennacl/linalg/detail/bisect/structs.hpp"
-
-// includes, kernels
-#include "viennacl/linalg/cuda/bisect_kernel_small.hpp"
-#include "viennacl/linalg/cuda/bisect_kernel_large.hpp"
-#include "viennacl/linalg/cuda/bisect_kernel_large_onei.hpp"
-#include "viennacl/linalg/cuda/bisect_kernel_large_multi.hpp"
-
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-template<typename NumericT>
-void bisectSmall(const viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataSmall<NumericT> &result,
-                       const unsigned int mat_size,
-                       const NumericT lg, const NumericT ug,
-                       const NumericT precision)
-{
-
-
-  dim3  blocks(1, 1, 1);
-  dim3  threads(VIENNACL_BISECT_MAX_THREADS_BLOCK_SMALL_MATRIX, 1, 1);
-
-  bisectKernelSmall<<< blocks, threads >>>(
-    viennacl::cuda_arg(input.g_a),
-    viennacl::cuda_arg(input.g_b) + 1,
-    mat_size,
-    viennacl::cuda_arg(result.vcl_g_left),
-    viennacl::cuda_arg(result.vcl_g_right),
-    viennacl::cuda_arg(result.vcl_g_left_count),
-    viennacl::cuda_arg(result.vcl_g_right_count),
-    lg, ug, 0, mat_size,
-    precision
-    );
-  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch 
failed");
-}
-
-
-template<typename NumericT>
-void bisectLarge(const viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
-                   const unsigned int mat_size,
-                   const NumericT lg, const NumericT ug,
-                   const NumericT precision)
- {
-
-  dim3  blocks(1, 1, 1);
-  dim3  threads(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2 , 1, 1);
-  bisectKernelLarge<<< blocks, threads >>>
-    (viennacl::cuda_arg(input.g_a),
-     viennacl::cuda_arg(input.g_b) + 1,
-     mat_size,
-     lg, ug, static_cast<unsigned int>(0), mat_size, precision,
-     viennacl::cuda_arg(result.g_num_one),
-     viennacl::cuda_arg(result.g_num_blocks_mult),
-     viennacl::cuda_arg(result.g_left_one),
-     viennacl::cuda_arg(result.g_right_one),
-     viennacl::cuda_arg(result.g_pos_one),
-     viennacl::cuda_arg(result.g_left_mult),
-     viennacl::cuda_arg(result.g_right_mult),
-     viennacl::cuda_arg(result.g_left_count_mult),
-     viennacl::cuda_arg(result.g_right_count_mult),
-     viennacl::cuda_arg(result.g_blocks_mult),
-     viennacl::cuda_arg(result.g_blocks_mult_sum)
-     );
-  viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("Kernel launch 
failed.");
-}
-
-
-// compute eigenvalues for intervals that contained only one eigenvalue
-// after the first processing step
-template<typename NumericT>
-void bisectLarge_OneIntervals(const 
viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
-                   const unsigned int mat_size,
-                   const NumericT precision)
- {
-
-  unsigned int num_one_intervals = result.g_num_one;
-  unsigned int num_blocks = 
viennacl::linalg::detail::getNumBlocksLinear(num_one_intervals,
-                                                                         
mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2);
-  dim3 grid_onei;
-  grid_onei.x = num_blocks;
-  grid_onei.y = 1, grid_onei.z = 1;
-  dim3 threads_onei(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
-
-
-  bisectKernelLarge_OneIntervals<<< grid_onei , threads_onei >>>
-    (viennacl::cuda_arg(input.g_a),
-     viennacl::cuda_arg(input.g_b) + 1,
-     mat_size, num_one_intervals,
-     viennacl::cuda_arg(result.g_left_one),
-     viennacl::cuda_arg(result.g_right_one),
-     viennacl::cuda_arg(result.g_pos_one),
-     precision
-     );
-  
viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_OneIntervals()
 FAILED.");
-}
-
-
-// process intervals that contained more than one eigenvalue after
-// the first processing step
-template<typename NumericT>
-void bisectLarge_MultIntervals(const 
viennacl::linalg::detail::InputData<NumericT> &input, 
viennacl::linalg::detail::ResultDataLarge<NumericT> &result,
-                   const unsigned int mat_size,
-                   const NumericT precision)
- {
-    // get the number of blocks of intervals that contain, in total when
-    // each interval contains only one eigenvalue, not more than
-    // MAX_THREADS_BLOCK threads
-    unsigned int  num_blocks_mult = result.g_num_blocks_mult;
-
-    // setup the execution environment
-    dim3  grid_mult(num_blocks_mult, 1, 1);
-    dim3  threads_mult(mat_size > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2, 1, 1);
-
-    bisectKernelLarge_MultIntervals<<< grid_mult, threads_mult >>>
-      (viennacl::cuda_arg(input.g_a),
-       viennacl::cuda_arg(input.g_b) + 1,
-       mat_size,
-       viennacl::cuda_arg(result.g_blocks_mult),
-       viennacl::cuda_arg(result.g_blocks_mult_sum),
-       viennacl::cuda_arg(result.g_left_mult),
-       viennacl::cuda_arg(result.g_right_mult),
-       viennacl::cuda_arg(result.g_left_count_mult),
-       viennacl::cuda_arg(result.g_right_count_mult),
-       viennacl::cuda_arg(result.g_lambda_mult),
-       viennacl::cuda_arg(result.g_pos_mult),
-       precision
-      );
-    
viennacl::linalg::cuda::VIENNACL_CUDA_LAST_ERROR_CHECK("bisectKernelLarge_MultIntervals()
 FAILED.");
-}
-}
-}
-}
-
-#endif

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
deleted file mode 100755
index 77c9773..0000000
--- a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large.hpp
+++ /dev/null
@@ -1,928 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
-#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/cuda/bisect_kernel_large.hpp
-    @brief First step of the bisection algorithm for the computation of 
eigenvalues.
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-
-/* Determine eigenvalues for large symmetric, tridiagonal matrix. First
-  step of the computation. */
-
-// includes, project
-#include "viennacl/linalg/detail/bisect/config.hpp"
-#include "viennacl/linalg/detail/bisect/util.hpp"
-// additional kernel
-#include "viennacl/linalg/cuda/bisect_util.hpp"
-
-// declaration, forward
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-////////////////////////////////////////////////////////////////////////////////
-//! Write data to global memory
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__device__
-void writeToGmem(const unsigned int tid, const unsigned int tid_2,
-                 const unsigned int num_threads_active,
-                 const unsigned int num_blocks_mult,
-                 NumericT *g_left_one, NumericT *g_right_one,
-                 unsigned int *g_pos_one,
-                 NumericT *g_left_mult, NumericT *g_right_mult,
-                 unsigned int *g_left_count_mult,
-                 unsigned int *g_right_count_mult,
-                 NumericT *s_left, NumericT *s_right,
-                 unsigned short *s_left_count, unsigned short *s_right_count,
-                 unsigned int *g_blocks_mult,
-                 unsigned int *g_blocks_mult_sum,
-                 unsigned short *s_compaction_list,
-                 unsigned short *s_cl_helper,
-                 unsigned int offset_mult_lambda
-                )
-{
-
-    if (tid < offset_mult_lambda)
-    {
-
-        g_left_one[tid] = s_left[tid];
-        g_right_one[tid] = s_right[tid];
-        // right count can be used to order eigenvalues without sorting
-        g_pos_one[tid] = s_right_count[tid];
-    }
-    else
-    {
-
-
-        g_left_mult[tid - offset_mult_lambda] = s_left[tid];
-        g_right_mult[tid - offset_mult_lambda] = s_right[tid];
-        g_left_count_mult[tid - offset_mult_lambda] = s_left_count[tid];
-        g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
-    }
-
-    if (tid_2 < num_threads_active)
-    {
-
-        if (tid_2 < offset_mult_lambda)
-        {
-
-            g_left_one[tid_2] = s_left[tid_2];
-            g_right_one[tid_2] = s_right[tid_2];
-            // right count can be used to order eigenvalues without sorting
-            g_pos_one[tid_2] = s_right_count[tid_2];
-        }
-        else
-        {
-
-            g_left_mult[tid_2 - offset_mult_lambda] = s_left[tid_2];
-            g_right_mult[tid_2 - offset_mult_lambda] = s_right[tid_2];
-            g_left_count_mult[tid_2 - offset_mult_lambda] = 
s_left_count[tid_2];
-            g_right_count_mult[tid_2 - offset_mult_lambda] = 
s_right_count[tid_2];
-        }
-
-    } // end writing out data
-
-    __syncthreads();
-
-    // note that s_cl_blocking = s_compaction_list + 1;, that is by writing out
-    // s_compaction_list we write the exclusive scan result
-    if (tid <= num_blocks_mult)
-    {
-        g_blocks_mult[tid] = s_compaction_list[tid];
-        g_blocks_mult_sum[tid] = s_cl_helper[tid];
-    }
-
-    if (tid_2 <= num_blocks_mult)
-    {
-        g_blocks_mult[tid_2] = s_compaction_list[tid_2];
-        g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform final stream compaction before writing data to global memory
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__device__
-void
-compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
-                    const unsigned int num_threads_active,
-                    unsigned int &offset_mult_lambda,
-                    NumericT *s_left, NumericT *s_right,
-                    unsigned short *s_left_count, unsigned short 
*s_right_count,
-                    unsigned short *s_cl_one, unsigned short *s_cl_mult,
-                    unsigned short *s_cl_blocking, unsigned short *s_cl_helper,
-                    unsigned int is_one_lambda, unsigned int is_one_lambda_2,
-                    NumericT &left, NumericT &right, NumericT &left_2, 
NumericT &right_2,
-                    unsigned int &left_count, unsigned int &right_count,
-                    unsigned int &left_count_2, unsigned int &right_count_2,
-                    unsigned int c_block_iend, unsigned int c_sum_block,
-                    unsigned int c_block_iend_2, unsigned int c_sum_block_2
-                   )
-{
-    // cache data before performing compaction
-    left = s_left[tid];
-    right = s_right[tid];
-
-    if (tid_2 < num_threads_active)
-    {
-
-        left_2 = s_left[tid_2];
-        right_2 = s_right[tid_2];
-    }
-
-    __syncthreads();
-
-    // determine addresses for intervals containing multiple eigenvalues and
-    // addresses for blocks of intervals
-    unsigned int ptr_w = 0;
-    unsigned int ptr_w_2 = 0;
-    unsigned int ptr_blocking_w = 0;
-    unsigned int ptr_blocking_w_2 = 0;
-
-
-
-    ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
-            : s_cl_mult[tid] + offset_mult_lambda;
-
-    if (0 != c_block_iend)
-    {
-        ptr_blocking_w = s_cl_blocking[tid];
-    }
-
-    if (tid_2 < num_threads_active)
-    {
-        ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
-                  : s_cl_mult[tid_2] + offset_mult_lambda;
-
-        if (0 != c_block_iend_2)
-        {
-            ptr_blocking_w_2 = s_cl_blocking[tid_2];
-        }
-    }
-
-
-    __syncthreads();
-      if(tid < num_threads_active)
-      {
-        // store compactly in shared mem
-        s_left[ptr_w] = left;
-        s_right[ptr_w] = right;
-        s_left_count[ptr_w] = left_count;
-        s_right_count[ptr_w] = right_count;
-      }
-
-
-    __syncthreads();
-    if(tid == 1)
-    {
-      s_left[ptr_w] = left;
-      s_right[ptr_w] = right;
-      s_left_count[ptr_w] = left_count;
-      s_right_count[ptr_w] = right_count;
-    }
-          if (0 != c_block_iend)
-    {
-        s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
-        s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
-    }
-
-    if (tid_2 < num_threads_active)
-    {
-
-        // store compactly in shared mem
-        s_left[ptr_w_2] = left_2;
-        s_right[ptr_w_2] = right_2;
-        s_left_count[ptr_w_2] = left_count_2;
-        s_right_count[ptr_w_2] = right_count_2;
-
-        if (0 != c_block_iend_2)
-        {
-            s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
-            s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
-        }
-    }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compute addresses to obtain compact list of block start addresses
-////////////////////////////////////////////////////////////////////////////////
-inline __device__
-void
-scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
-                              const unsigned int num_threads_compaction,
-                              unsigned short *s_cl_blocking,
-                              unsigned short *s_cl_helper
-                             )
-{
-    // prepare for second step of block generation: compaction of the block
-    // list itself to efficiently write out these
-    s_cl_blocking[tid] = s_cl_helper[tid];
-
-    if (tid_2 < num_threads_compaction)
-    {
-        s_cl_blocking[tid_2] = s_cl_helper[tid_2];
-    }
-
-    __syncthreads();
-
-    // additional scan to compact s_cl_blocking that permits to generate a
-    // compact list of eigenvalue blocks each one containing about
-    // VIENNACL_BISECT_MAX_THREADS_BLOCK eigenvalues (so that each of these 
blocks may be
-    // processed by one thread block in a subsequent processing step
-
-    unsigned int offset = 1;
-
-    // build scan tree
-    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
-    {
-
-        __syncthreads();
-
-        if (tid < d)
-        {
-
-            unsigned int  ai = offset*(2*tid+1)-1;
-            unsigned int  bi = offset*(2*tid+2)-1;
-            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
-        }
-
-        offset <<= 1;
-    }
-
-    // traverse down tree: first down to level 2 across
-    for (int d = 2; d < num_threads_compaction; d <<= 1)
-    {
-
-        offset >>= 1;
-        __syncthreads();
-
-        //
-        if (tid < (d-1))
-        {
-
-            unsigned int  ai = offset*(tid+1) - 1;
-            unsigned int  bi = ai + (offset >> 1);
-            s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
-        }
-    }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform scan to obtain number of eigenvalues before a specific block
-////////////////////////////////////////////////////////////////////////////////
-inline __device__
-void
-scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
-              const unsigned int num_threads_active,
-              const unsigned int num_threads_compaction,
-              unsigned short *s_cl_blocking,
-              unsigned short *s_cl_helper)
-{
-    unsigned int offset = 1;
-
-    // first step of scan to build the sum of elements within each block
-    // build up tree
-    for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
-    {
-
-        __syncthreads();
-
-        if (tid < d)
-        {
-
-            unsigned int ai = offset*(2*tid+1)-1;
-            unsigned int bi = offset*(2*tid+2)-1;
-
-            s_cl_blocking[bi] += s_cl_blocking[ai];
-        }
-
-        offset *= 2;
-    }
-
-    // first step of scan to build the sum of elements within each block
-    // traverse down tree
-    for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
-    {
-
-        offset >>= 1;
-        __syncthreads();
-
-        if (tid < (d-1))
-        {
-
-            unsigned int ai = offset*(tid+1) - 1;
-            unsigned int bi = ai + (offset >> 1);
-
-            s_cl_blocking[bi] += s_cl_blocking[ai];
-        }
-    }
-
-    __syncthreads();
-
-    if (0 == tid)
-    {
-
-        // move last element of scan to last element that is valid
-        // necessary because the number of threads employed for scan is a power
-        // of two and not necessarily the number of active threasd
-        s_cl_helper[num_threads_active - 1] =
-            s_cl_helper[num_threads_compaction - 1];
-        s_cl_blocking[num_threads_active - 1] =
-            s_cl_blocking[num_threads_compaction - 1];
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Perform initial scan for compaction of intervals containing one and
-//! multiple eigenvalues; also do initial scan to build blocks
-////////////////////////////////////////////////////////////////////////////////
-inline __device__
-void
-scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned 
int mat_size,
-            const unsigned int num_threads_active,
-            const unsigned int num_threads_compaction,
-            unsigned short *s_cl_one, unsigned short *s_cl_mult,
-            unsigned short *s_cl_blocking, unsigned short *s_cl_helper
-           )
-{
-
-    // perform scan to compactly write out the intervals containing one and
-    // multiple eigenvalues
-    // also generate tree for blocking of intervals containing multiple
-    // eigenvalues
-
-    unsigned int offset = 1;
-
-    // build scan tree
-    for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
-    {
-
-        __syncthreads();
-
-        if (tid < d)
-        {
-
-            unsigned int  ai = offset*(2*tid+1);
-            unsigned int  bi = offset*(2*tid+2)-1;
-
-            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
-            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];
-
-            // s_cl_helper is binary and zero for an internal node and 1 for a
-            // root node of a tree corresponding to a block
-            // s_cl_blocking contains the number of nodes in each sub-tree at 
each
-            // iteration, the data has to be kept to compute the total number 
of
-            // eigenvalues per block that, in turn, is needed to efficiently
-            // write out data in the second step
-            if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
-            {
-
-                // check how many childs are non terminated
-                if (s_cl_helper[ai - 1] == 1)
-                {
-                    // mark as terminated
-                    s_cl_helper[bi] = 1;
-                }
-                else if (s_cl_helper[bi] == 1)
-                {
-                    // mark as terminated
-                    s_cl_helper[ai - 1] = 1;
-                }
-                else    // both childs are non-terminated
-                {
-
-                    unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 
1];
-
-                    if (temp > (mat_size > 512 ? 
VIENNACL_BISECT_MAX_THREADS_BLOCK : VIENNACL_BISECT_MAX_THREADS_BLOCK / 2))
-                    {
-
-                        // the two child trees have to form separate blocks, 
terminate trees
-                        s_cl_helper[ai - 1] = 1;
-                        s_cl_helper[bi] = 1;
-                    }
-                    else
-                    {
-                        // build up tree by joining subtrees
-                        s_cl_blocking[bi] = temp;
-                        s_cl_blocking[ai - 1] = 0;
-                    }
-                }
-            }  // end s_cl_helper update
-
-        }
-
-        offset <<= 1;
-    }
-
-
-    // traverse down tree, this only for stream compaction, not for block
-    // construction
-    for (int d = 2; d < num_threads_compaction; d <<= 1)
-    {
-
-        offset >>= 1;
-        __syncthreads();
-
-        //
-        if (tid < (d-1))
-        {
-
-            unsigned int  ai = offset*(tid+1) - 1;
-            unsigned int  bi = ai + (offset >> 1);
-
-            s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
-            s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
-        }
-    }
-
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Store all non-empty intervals resulting from the subdivision of the 
interval
-//! currently processed by the thread
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__device__
-void
-storeNonEmptyIntervalsLarge(unsigned int addr,
-                            const unsigned int num_threads_active,
-                            NumericT  *s_left, NumericT *s_right,
-                            unsigned short  *s_left_count,
-                            unsigned short *s_right_count,
-                            NumericT left, NumericT mid, NumericT right,
-                            const unsigned short left_count,
-                            const unsigned short mid_count,
-                            const unsigned short right_count,
-                            NumericT epsilon,
-                            unsigned int &compact_second_chunk,
-                            unsigned short *s_compaction_list,
-                            unsigned int &is_active_second)
-{
-    // check if both child intervals are valid
-    if ((left_count != mid_count) && (mid_count != right_count))
-    {
-
-        storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                      left, mid, left_count, mid_count, epsilon);
-
-        is_active_second = 1;
-        s_compaction_list[threadIdx.x] = 1;
-        compact_second_chunk = 1;
-    }
-    else
-    {
-
-        // only one non-empty child interval
-
-        // mark that no second child
-        is_active_second = 0;
-        s_compaction_list[threadIdx.x] = 0;
-
-        // store the one valid child interval
-        if (left_count != mid_count)
-        {
-            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                          left, mid, left_count, mid_count, epsilon);
-        }
-        else
-        {
-            storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
-                          mid, right, mid_count, right_count, epsilon);
-        }
-    }
-}
-
-/** @brief Bisection to find eigenvalues of a real, symmetric, and tridiagonal 
matrix
-*  g_d  diagonal elements in global memory
-*  g_s  superdiagonal elements in global elements (stored so that the element 
*(g_s - 1) can be accessed and equals 0
-*  n   size of matrix
-*  lg  lower bound of input interval (e.g. Gerschgorin interval)
-*  ug  upper bound of input interval (e.g. Gerschgorin interval)
-*  lg_eig_count  number of eigenvalues that are smaller than lg
-*  lu_eig_count  number of eigenvalues that are smaller than lu
-*  epsilon  desired accuracy of eigenvalues to compute
-*/
-template<typename NumericT>
-__global__
-void
-bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int 
n,
-                  const NumericT lg, const NumericT ug,
-                  const unsigned int lg_eig_count,
-                  const unsigned int ug_eig_count,
-                  NumericT epsilon,
-                  unsigned int *g_num_one,
-                  unsigned int *g_num_blocks_mult,
-                  NumericT *g_left_one, NumericT *g_right_one,
-                  unsigned int *g_pos_one,
-                  NumericT *g_left_mult, NumericT *g_right_mult,
-                  unsigned int *g_left_count_mult,
-                  unsigned int *g_right_count_mult,
-                  unsigned int *g_blocks_mult,
-                  unsigned int *g_blocks_mult_sum
-                 )
-{
-    const unsigned int tid = threadIdx.x;
-
-    // intervals (store left and right because the subdivision tree is in 
general
-    // not dense
-    __shared__  NumericT  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-    __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-    // number of eigenvalues that are smaller than s_left / s_right
-    // (correspondence is realized via indices)
-    __shared__  unsigned short  s_left_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-    __shared__  unsigned short  s_right_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-    // helper for stream compaction
-    __shared__  unsigned short  s_compaction_list[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-    // state variables for whole block
-    // if 0 then compaction of second chunk of child intervals is not necessary
-    // (because all intervals had exactly one non-dead child)
-    __shared__  unsigned int compact_second_chunk;
-    // if 1 then all threads are converged
-    __shared__  unsigned int all_threads_converged;
-
-    // number of currently active threads
-    __shared__  unsigned int num_threads_active;
-
-    // number of threads to use for stream compaction
-    __shared__  unsigned int num_threads_compaction;
-
-    // helper for exclusive scan
-    unsigned short *s_compaction_list_exc = s_compaction_list + 1;
-
-
-    // variables for currently processed interval
-    // left and right limit of active interval
-    NumericT left = 0.0f;
-    NumericT right = 0.0f;
-    unsigned int left_count = 0;
-    unsigned int right_count = 0;
-    // midpoint of active interval
-    NumericT  mid = 0.0f;
-    // number of eigenvalues smaller then mid
-    unsigned int mid_count = 0;
-    // helper for stream compaction (tracking of threads generating second 
child)
-    unsigned int is_active_second = 0;
-
-    // initialize lists
-    s_compaction_list[tid] = 0;
-    s_left[tid] = 0;
-    s_right[tid] = 0;
-    s_left_count[tid] = 0;
-    s_right_count[tid] = 0;
-
-    __syncthreads();
-
-    // set up initial configuration
-    if (0 == tid)
-    {
-
-        s_left[0] = lg;
-        s_right[0] = ug;
-        s_left_count[0] = lg_eig_count;
-        s_right_count[0] = ug_eig_count;
-
-        compact_second_chunk = 0;
-        num_threads_active = 1;
-
-        num_threads_compaction = 1;
-
-        all_threads_converged = 1;
-    }
-
-    __syncthreads();
-
-    // for all active threads read intervals from the last level
-    // the number of (worst case) active threads per level l is 2^l
-    // determine coarse intervals. On these intervals the kernel for one or 
for multiple eigenvalues
-    // will be executed in the second step
-    while(true)
-    {
-        s_compaction_list[tid] = 0;
-        s_compaction_list[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-        s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-        subdivideActiveInterval(tid, s_left, s_right, s_left_count, 
s_right_count,
-                                num_threads_active,
-                                left, right, left_count, right_count,
-                                mid, all_threads_converged);
-
-        __syncthreads();
-
-        // check if done
-        if (1 == all_threads_converged)
-        {
-            break;
-        }
-
-        // compute number of eigenvalues smaller than mid
-        // use all threads for reading the necessary matrix data from global
-        // memory
-        // use s_left and s_right as scratch space for diagonal and
-        // superdiagonal of matrix
-        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
-                                                    mid, threadIdx.x,
-                                                    num_threads_active,
-                                                    s_left, s_right,
-                                                    (left == right));
-
-        __syncthreads();
-
-        // store intervals
-        // for all threads store the first child interval in a continuous 
chunk of
-        // memory, and the second child interval -- if it exists -- in a second
-        // chunk; it is likely that all threads reach convergence up to
-        // \a epsilon at the same level; furthermore, for higher level most / 
all
-        // threads will have only one child, storing the first child compactly 
will
-        // (first) avoid to perform a compaction step on the first chunk, 
(second)
-        // make it for higher levels (when all threads / intervals have
-        // exactly one child)  unnecessary to perform a compaction of the 
second
-        // chunk
-        if (tid < num_threads_active)
-        {
-
-            if (left != right)
-            {
-
-                // store intervals
-                storeNonEmptyIntervalsLarge(tid, num_threads_active,
-                                            s_left, s_right,
-                                            s_left_count, s_right_count,
-                                            left, mid, right,
-                                            left_count, mid_count, right_count,
-                                            epsilon, compact_second_chunk,
-                                            s_compaction_list_exc,
-                                            is_active_second);
-            }
-            else
-            {
-
-                // re-write converged interval (has to be stored again because 
s_left
-                // and s_right are used as scratch space for
-                // computeNumSmallerEigenvalsLarge()
-                s_left[tid] = left;
-                s_right[tid] = left;
-                s_left_count[tid] = left_count;
-                s_right_count[tid] = right_count;
-
-                is_active_second = 0;
-            }
-        }
-
-        // necessary so that compact_second_chunk is up-to-date
-        __syncthreads();
-
-        // perform compaction of chunk where second children are stored
-        // scan of (num_threads_active / 2) elements, thus at most
-        // (num_threads_active / 4) threads are needed
-        if (compact_second_chunk > 0)
-        {
-
-            // create indices for compaction
-            createIndicesCompaction(s_compaction_list_exc, 
num_threads_compaction);
-        }
-        __syncthreads();
-
-        if (compact_second_chunk > 0)
-        {
-            compactIntervals(s_left, s_right, s_left_count, s_right_count,
-                             mid, right, mid_count, right_count,
-                             s_compaction_list, num_threads_active,
-                             is_active_second);
-        }
-
-        __syncthreads();
-
-        // update state variables
-        if (0 == tid)
-        {
-
-            // update number of active threads with result of reduction
-            num_threads_active += s_compaction_list[num_threads_active];
-            num_threads_compaction = ceilPow2(num_threads_active);
-
-            compact_second_chunk = 0;
-            all_threads_converged = 1;
-        }
-
-        __syncthreads();
-
-        if (num_threads_compaction > blockDim.x)
-        {
-            break;
-        }
-
-    }
-
-    __syncthreads();
-
-    // generate two lists of intervals; one with intervals that contain one
-    // eigenvalue (or are converged), and one with intervals that need further
-    // subdivision
-
-    // perform two scans in parallel
-
-    unsigned int left_count_2;
-    unsigned int right_count_2;
-
-    unsigned int tid_2 = tid + blockDim.x;
-
-    // cache in per thread registers so that s_left_count and s_right_count
-    // can be used for scans
-    left_count = s_left_count[tid];
-    right_count = s_right_count[tid];
-
-    // some threads have to cache data for two intervals
-    if (tid_2 < num_threads_active)
-    {
-        left_count_2 = s_left_count[tid_2];
-        right_count_2 = s_right_count[tid_2];
-    }
-
-    // compaction list for intervals containing one and multiple eigenvalues
-    // do not affect first element for exclusive scan
-    unsigned short  *s_cl_one = s_left_count + 1;
-    unsigned short  *s_cl_mult = s_right_count + 1;
-
-    // compaction list for generating blocks of intervals containing multiple
-    // eigenvalues
-    unsigned short  *s_cl_blocking = s_compaction_list_exc;
-    // helper compaction list for generating blocks of intervals
-    __shared__ unsigned short  s_cl_helper[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-
-    if (0 == tid)
-    {
-        // set to 0 for exclusive scan
-        s_left_count[0] = 0;
-        s_right_count[0] = 0;
-
-    }
-
-    __syncthreads();
-
-    // flag if interval contains one or multiple eigenvalues
-    unsigned int is_one_lambda = 0;
-    unsigned int is_one_lambda_2 = 0;
-
-    // number of eigenvalues in the interval
-    unsigned int multiplicity = right_count - left_count;
-    is_one_lambda = (1 == multiplicity);
-
-    s_cl_one[tid] = is_one_lambda;
-    s_cl_mult[tid] = (! is_one_lambda);
-
-    // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
-    s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
-    s_cl_helper[tid] = 0;
-
-    if (tid_2 < num_threads_active)
-    {
-
-        unsigned int multiplicity = right_count_2 - left_count_2;
-        is_one_lambda_2 = (1 == multiplicity);
-
-        s_cl_one[tid_2] = is_one_lambda_2;
-        s_cl_mult[tid_2] = (! is_one_lambda_2);
-
-        // (note: s_cl_blocking is non-zero only where s_cl_mult[] is non-zero)
-        s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
-        s_cl_helper[tid_2] = 0;
-    }
-    else if (tid_2 < (2 * (n > 512 ? VIENNACL_BISECT_MAX_THREADS_BLOCK : 
VIENNACL_BISECT_MAX_THREADS_BLOCK / 2) + 1))
-    {
-
-        // clear
-        s_cl_blocking[tid_2] = 0;
-        s_cl_helper[tid_2] = 0;
-    }
-
-
-    scanInitial(tid, tid_2, n, num_threads_active, num_threads_compaction,
-                s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
-
-    __syncthreads();
-
-    scanSumBlocks(tid, tid_2, num_threads_active,
-                  num_threads_compaction, s_cl_blocking, s_cl_helper);
-
-    // end down sweep of scan
-    __syncthreads();
-
-    unsigned int  c_block_iend = 0;
-    unsigned int  c_block_iend_2 = 0;
-    unsigned int  c_sum_block = 0;
-    unsigned int  c_sum_block_2 = 0;
-
-    // for each thread / interval that corresponds to root node of interval 
block
-    // store start address of block and total number of eigenvalues in all 
blocks
-    // before this block (particular thread is irrelevant, constraint is to
-    // have a subset of threads so that one and only one of them is in each
-    // interval)
-    if (1 == s_cl_helper[tid])
-    {
-
-        c_block_iend = s_cl_mult[tid] + 1;
-        c_sum_block = s_cl_blocking[tid];
-    }
-
-    if (1 == s_cl_helper[tid_2])
-    {
-
-        c_block_iend_2 = s_cl_mult[tid_2] + 1;
-        c_sum_block_2 = s_cl_blocking[tid_2];
-    }
-
-    scanCompactBlocksStartAddress(tid, tid_2, num_threads_compaction,
-                                  s_cl_blocking, s_cl_helper);
-
-
-    // finished second scan for s_cl_blocking
-    __syncthreads();
-
-    // determine the global results
-    __shared__  unsigned int num_blocks_mult;
-    __shared__  unsigned int num_mult;
-    __shared__  unsigned int offset_mult_lambda;
-
-    if (0 == tid)
-    {
-
-        num_blocks_mult = s_cl_blocking[num_threads_active - 1];
-        offset_mult_lambda = s_cl_one[num_threads_active - 1];
-        num_mult = s_cl_mult[num_threads_active - 1];
-
-        *g_num_one = offset_mult_lambda;
-        *g_num_blocks_mult = num_blocks_mult;
-    }
-
-    __syncthreads();
-
-    NumericT left_2, right_2;
-    --s_cl_one;
-    --s_cl_mult;
-    --s_cl_blocking;
-
-    __syncthreads();
-    compactStreamsFinal(tid, tid_2, num_threads_active, offset_mult_lambda,
-                        s_left, s_right, s_left_count, s_right_count,
-                        s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
-                        is_one_lambda, is_one_lambda_2,
-                        left, right, left_2, right_2,
-                        left_count, right_count, left_count_2, right_count_2,
-                        c_block_iend, c_sum_block, c_block_iend_2, 
c_sum_block_2
-                       );
-
-    __syncthreads();
-
-    // final adjustment before writing out data to global memory
-    if (0 == tid)
-    {
-        s_cl_blocking[num_blocks_mult] = num_mult;
-        s_cl_helper[0] = 0;
-    }
-
-    __syncthreads();
-
-    // write to global memory
-    writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
-                g_left_one, g_right_one, g_pos_one,
-                g_left_mult, g_right_mult, g_left_count_mult, 
g_right_count_mult,
-                s_left, s_right, s_left_count, s_right_count,
-                g_blocks_mult, g_blocks_mult_sum,
-                s_compaction_list, s_cl_helper, offset_mult_lambda);
-
-}
-}
-}
-}
-#endif // #ifndef _BISECT_KERNEL_LARGE_H_

http://git-wip-us.apache.org/repos/asf/mahout/blob/7ae549fa/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
----------------------------------------------------------------------
diff --git 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
 
b/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
deleted file mode 100755
index a670256..0000000
--- 
a/native-viennaCL/src/main/cpp/viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
+++ /dev/null
@@ -1,277 +0,0 @@
-#ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
-#define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_
-
-/* =========================================================================
-   Copyright (c) 2010-2016, Institute for Microelectronics,
-                            Institute for Analysis and Scientific Computing,
-                            TU Wien.
-   Portions of this software are copyright by UChicago Argonne, LLC.
-
-                            -----------------
-                  ViennaCL - The Vienna Computing Library
-                            -----------------
-
-   Project Head:    Karl Rupp                   [email protected]
-
-   (A list of authors and contributors can be found in the manual)
-
-   License:         MIT (X11), see file LICENSE in the base directory
-============================================================================= 
*/
-
-
-/** @file viennacl/linalg/cuda/bisect_kernel_large_multi.hpp
-    @brief Second step of the bisection algorithm for the computation of 
eigenvalues for large matrices.
-
-    Implementation based on the sample provided with the CUDA 6.0 SDK, for 
which
-    the creation of derivative works is allowed by including the following 
statement:
-    "This software contains source code provided by NVIDIA Corporation."
-*/
-
-/* Perform second step of bisection algorithm for large matrices for
- * intervals that contained after the first step more than one eigenvalue
- */
-
-// includes, project
-#include "viennacl/linalg/detail/bisect/config.hpp"
-#include "viennacl/linalg/detail/bisect/util.hpp"
-// additional kernel
-#include "viennacl/linalg/cuda/bisect_util.hpp"
-
-namespace viennacl
-{
-namespace linalg
-{
-namespace cuda
-{
-////////////////////////////////////////////////////////////////////////////////
-//! Perform second step of bisection algorithm for large matrices for
-//! intervals that after the first step contained more than one eigenvalue
-//! @param  g_d  diagonal elements of symmetric, tridiagonal matrix
-//! @param  g_s  superdiagonal elements of symmetric, tridiagonal matrix
-//! @param  n    matrix size
-//! @param  blocks_mult  start addresses of blocks of intervals that are
-//!                      processed by one block of threads, each of the
-//!                      intervals contains more than one eigenvalue
-//! @param  blocks_mult_sum  total number of eigenvalues / singleton intervals
-//!                          in one block of intervals
-//! @param  g_left  left limits of intervals
-//! @param  g_right  right limits of intervals
-//! @param  g_left_count  number of eigenvalues less than left limits
-//! @param  g_right_count  number of eigenvalues less than right limits
-//! @param  g_lambda  final eigenvalue
-//! @param  g_pos  index of eigenvalue (in ascending order)
-//! @param  precision  desired precision of eigenvalues
-////////////////////////////////////////////////////////////////////////////////
-template<typename NumericT>
-__global__
-void
-bisectKernelLarge_MultIntervals(const NumericT *g_d, const NumericT *g_s, 
const unsigned int n,
-                                unsigned int *blocks_mult,
-                                unsigned int *blocks_mult_sum,
-                                NumericT *g_left, NumericT *g_right,
-                                unsigned int *g_left_count,
-                                unsigned int *g_right_count,
-                                NumericT *g_lambda, unsigned int *g_pos,
-                                NumericT precision
-                               )
-{
-  const unsigned int tid = threadIdx.x;
-
-    // left and right limits of interval
-    __shared__  NumericT  s_left[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
-    __shared__  NumericT  s_right[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK];
-
-    // number of eigenvalues smaller than interval limits
-    __shared__  unsigned int  s_left_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK];
-    __shared__  unsigned int  s_right_count[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK];
-
-    // helper array for chunk compaction of second chunk
-    __shared__  unsigned int  s_compaction_list[2 * 
VIENNACL_BISECT_MAX_THREADS_BLOCK + 1];
-    // compaction list helper for exclusive scan
-    unsigned int *s_compaction_list_exc = s_compaction_list + 1;
-
-    // flag if all threads are converged
-    __shared__  unsigned int  all_threads_converged;
-    // number of active threads
-    __shared__  unsigned int  num_threads_active;
-    // number of threads to employ for compaction
-    __shared__  unsigned int  num_threads_compaction;
-    // flag if second chunk has to be compacted
-    __shared__  unsigned int compact_second_chunk;
-
-    // parameters of block of intervals processed by this block of threads
-    __shared__  unsigned int  c_block_start;
-    __shared__  unsigned int  c_block_end;
-    __shared__  unsigned int  c_block_offset_output;
-
-    // midpoint of currently active interval of the thread
-    NumericT mid = 0.0f;
-    // number of eigenvalues smaller than \a mid
-    unsigned int  mid_count = 0;
-    // current interval parameter
-    NumericT  left = 0.0f;
-    NumericT  right = 0.0f;
-    unsigned int  left_count = 0;
-    unsigned int  right_count = 0;
-    // helper for compaction, keep track which threads have a second child
-    unsigned int  is_active_second = 0;
-
-
-    __syncthreads();
-    // initialize common start conditions
-    if (0 == tid)
-    {
-
-        c_block_start = blocks_mult[blockIdx.x];
-        c_block_end = blocks_mult[blockIdx.x + 1];
-        c_block_offset_output = blocks_mult_sum[blockIdx.x];
-
-
-        num_threads_active = c_block_end - c_block_start;
-        s_compaction_list[0] = 0;
-        num_threads_compaction = ceilPow2(num_threads_active);
-
-        all_threads_converged = 1;
-        compact_second_chunk = 0;
-    }
-
-     s_left_count [tid] = 42;
-     s_right_count[tid] = 42;
-     s_left_count [tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-     s_right_count[tid + VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-
-    __syncthreads();
-
-
-    // read data into shared memory
-    if (tid < num_threads_active)
-    {
-        s_left[tid]  = g_left[c_block_start + tid];
-        s_right[tid] = g_right[c_block_start + tid];
-        s_left_count[tid]  = g_left_count[c_block_start + tid];
-        s_right_count[tid] = g_right_count[c_block_start + tid];
-    }
-
-    __syncthreads();
-    unsigned int iter = 0;
-    // do until all threads converged
-    while (true)
-    {
-        iter++;
-        //for (int iter=0; iter < 0; iter++) {
-        s_compaction_list[threadIdx.x] = 0;
-        s_compaction_list[threadIdx.x + blockDim.x] = 0;
-        s_compaction_list[2 * VIENNACL_BISECT_MAX_THREADS_BLOCK] = 0;
-
-        // subdivide interval if currently active and not already converged
-        subdivideActiveIntervalMulti(tid, s_left, s_right,
-                                s_left_count, s_right_count,
-                                num_threads_active,
-                                left, right, left_count, right_count,
-                                mid, all_threads_converged);
-        __syncthreads();
-
-        // stop if all eigenvalues have been found
-        if (1 == all_threads_converged)
-        {
-
-            break;
-        }
-
-        // compute number of eigenvalues smaller than mid for active and not
-        // converged intervals, use all threads for loading data from gmem and
-        // s_left and s_right as scratch space to store the data load from gmem
-        // in shared memory
-        mid_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
-                                                    mid, tid, 
num_threads_active,
-                                                    s_left, s_right,
-                                                    (left == right));
-
-        __syncthreads();
-
-        if (tid < num_threads_active)
-        {
-
-            // store intervals
-            if (left != right)
-            {
-
-                storeNonEmptyIntervals(tid, num_threads_active,
-                                       s_left, s_right, s_left_count, 
s_right_count,
-                                       left, mid, right,
-                                       left_count, mid_count, right_count,
-                                       precision, compact_second_chunk,
-                                       s_compaction_list_exc,
-                                       is_active_second);
-
-            }
-            else
-            {
-
-                storeIntervalConverged(s_left, s_right, s_left_count, 
s_right_count,
-                                       left, mid, right,
-                                       left_count, mid_count, right_count,
-                                       s_compaction_list_exc, 
compact_second_chunk,
-                                       num_threads_active,
-                                       is_active_second);
-
-            }
-        }
-
-        __syncthreads();
-
-        // compact second chunk of intervals if any of the threads generated
-        // two child intervals
-        if (1 == compact_second_chunk)
-        {
-
-            createIndicesCompaction(s_compaction_list_exc, 
num_threads_compaction);
-            compactIntervals(s_left, s_right, s_left_count, s_right_count,
-                             mid, right, mid_count, right_count,
-                             s_compaction_list, num_threads_active,
-                             is_active_second);
-        }
-
-        __syncthreads();
-
-        // update state variables
-        if (0 == tid)
-        {
-            num_threads_active += s_compaction_list[num_threads_active];
-            num_threads_compaction = ceilPow2(num_threads_active);
-
-            compact_second_chunk = 0;
-            all_threads_converged = 1;
-        }
-
-        __syncthreads();
-
-        // clear
-        s_compaction_list_exc[threadIdx.x] = 0;
-        s_compaction_list_exc[threadIdx.x + blockDim.x] = 0;
-
-        if (num_threads_compaction > blockDim.x)
-        {
-          break;
-        }
-
-
-        __syncthreads();
-
-    }  // end until all threads converged
-
-    // write data back to global memory
-    if (tid < num_threads_active)
-    {
-
-        unsigned int addr = c_block_offset_output + tid;
-
-        g_lambda[addr]  = s_left[tid];
-        g_pos[addr]   = s_right_count[tid];
-    }
-}
-} // namespace cuda
-} // namespace linalg
-} // namespace viennacl
-
-#endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_MULTI_HPP_

Reply via email to